## Factor Count Evaluation using CV

This workflow evaluates the number of factors for a single dataset using cross-validation on synthetic data. The workflow has the following steps:
1. Generate synthetic dataset
2. Split the synthetic dataset into a train and a test dataset by randomly sampling, without replacement, a given % of the samples for training; the remaining samples form the test dataset.
3. Create a SA instance (base) using the train dataset for k factors.
4. Take the base H matrix and run a new SA instance holding H constant on the test dataset (V_test).
   1. Evaluate the loss of a direct calculation of W using V_test and H_base.
5. Keep track of the RMSE of the test model.
6. Repeat steps 3-5 increasing k.
7. Evaluate/plot the change in RMSE

#### Code Imports

In [1]:
from esat.data.datahandler import DataHandler
from esat.model.batch_sa import BatchSA
from esat.model.sa import SA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat_eval.simulator import Simulator
from esat.estimator import FactorEstimator

import logging
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import numpy as np
import pandas as pd
import time
import copy
import os

logger = logging.getLogger(__name__)

#### Synthetic Dataset

Generate a synthetic dataset where the factor profiles and contributions are pre-determined for model output analysis.

In [2]:
# Synthetic dataset parameters (each one is passed to the Simulator constructor in the next cell)
seed = 10                      # Random seed for the simulator (note: `seed` is reassigned in the Input Parameters cell)
syn_factors = 6                # Number of factors in the synthetic dataset
syn_features = 40              # Number of features in the synthetic dataset
syn_samples = 2000             # Number of samples in the synthetic dataset
outliers = True                # Add outliers to the dataset
outlier_p = 0.10               # Decimal percent of outliers in the dataset
outlier_mag = 1.25             # Magnitude of outliers
contribution_max = 2           # Maximum value of the contribution matrix (W) (Randomly sampled from a uniform distribution)
noise_mean_min = 0.25          # Min value for the mean of noise added to the synthetic dataset, used to randomly determine the mean decimal percentage of the noise for each feature.
noise_mean_max = 0.5           # Max value for the mean of noise added to the synthetic dataset, used to randomly determine the mean decimal percentage of the noise for each feature.
noise_scale = 0.1              # Scale of the noise added to the synthetic dataset
uncertainty_mean_min = 0.04    # Min value for the mean uncertainty of a data feature, used to randomly determine the mean decimal percentage for each feature in the uncertainty dataset.
uncertainty_mean_max = 0.06    # Max value for the mean uncertainty of a data feature, used to randomly determine the mean decimal percentage for each feature in the uncertainty dataset.
uncertainty_scale = 0.01       # Scale of the uncertainty matrix

In [3]:
# Initialize the simulator with the synthetic dataset parameters defined in the previous cell.
# Every keyword argument is forwarded unchanged from the variable holding that parameter.
simulator = Simulator(seed=seed,
                      factors_n=syn_factors,
                      features_n=syn_features,
                      samples_n=syn_samples,
                      outliers=outliers,
                      outlier_p=outlier_p,
                      outlier_mag=outlier_mag,
                      contribution_max=contribution_max,
                      noise_mean_min=noise_mean_min,
                      noise_mean_max=noise_mean_max,
                      noise_scale=noise_scale,
                      uncertainty_mean_min=uncertainty_mean_min,
                      uncertainty_mean_max=uncertainty_mean_max,
                      uncertainty_scale=uncertainty_scale
                     )

24-Apr-25 15:55:07 - Synthetic profiles generated


In [4]:
# Example command for passing in a custom factor profile matrix, instead of the randomly generated profile matrix.
# my_profile = np.ones(shape=(syn_factors, syn_features))
# simulator.generate_profiles(profiles=my_profile)

In [5]:
# Example of how to customize the factor contributions. Curve_type options: 'uniform', 'decreasing', 'increasing', 'logistic', 'periodic'
# simulator.update_contribution(factor_i=0, curve_type="logistic", scale=0.1, frequency=0.5)
# simulator.update_contribution(factor_i=1, curve_type="periodic", minimum=0.0, maximum=1.0, frequency=0.5, scale=0.1)
# simulator.update_contribution(factor_i=2, curve_type="increasing", minimum=0.0, maximum=1.0, scale=0.1)
# simulator.update_contribution(factor_i=3, curve_type="decreasing", minimum=0.0, maximum=1.0, scale=0.1)
# simulator.plot_synthetic_contributions()

#### Load Data
Assign the processed data and uncertainty datasets to the variables V and U. These steps will be simplified/streamlined in a future version of the code.

In [6]:
# Fetch the synthetic input data and its uncertainty from the simulator; both are
# handed to DataHandler.load_dataframe as dataframes in the next cell.
syn_input_df, syn_uncertainty_df = simulator.get_data()

24-Apr-25 15:55:07 - Synthetic data generated
24-Apr-25 15:55:07 - Synthetic uncertainty data generated
24-Apr-25 15:55:07 - Synthetic dataframes completed
24-Apr-25 15:55:07 - Synthetic source apportionment instance created.


In [7]:
# Wrap the synthetic dataframes in a DataHandler and pull out the data (V) and uncertainty (U).
# NOTE: V and U are reassigned from the Baton Rouge files in the following cell, so when the
# cells are run in order the evaluation below does not use these synthetic values.
data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
V, U = data_handler.get_data()

In [8]:
# Build the paths to the example dataset files, expected in ../data relative to the
# current working directory.
cwd = os.getcwd()
data_dir = os.path.join(cwd, "..", "data")

# Baton Rouge Dataset
br_input_file = os.path.join(data_dir, "Dataset-BatonRouge-con.csv")
br_uncertainty_file = os.path.join(data_dir, "Dataset-BatonRouge-unc.csv")
# Baltimore Dataset (paths are defined but not loaded in this cell)
b_input_file = os.path.join(data_dir, "Dataset-Baltimore_con.txt")
b_uncertainty_file = os.path.join(data_dir, "Dataset-Baltimore_unc.txt")
# Saint Louis Dataset (paths are defined but not loaded in this cell)
sl_input_file = os.path.join(data_dir, "Dataset-StLouis-con.csv")
sl_uncertainty_file = os.path.join(data_dir, "Dataset-StLouis-unc.csv")

# Load the Baton Rouge input/uncertainty files with "Date" as the index column.
# NOTE: this rebinds V and U, replacing the synthetic values assigned in the previous cell.
data_handler2 = DataHandler(
    input_path=br_input_file,
    uncertainty_path=br_uncertainty_file,
    index_col="Date"
)
V, U = data_handler2.get_data()

24-Apr-25 15:55:07 - Input and output configured successfully


#### Input Parameters

In [9]:
# Model training parameters.
# NOTE: the factor sweep cell further down defines its own n_models and max_iter (same values as
# `models` and `max_iterations` here) and passes those to BatchSA instead of these two names.
index_col = "Date"                  # the index of the input/uncertainty datasets
method = "ls-nmf"                   # "ls-nmf", "ws-nmf"
models = 20                         # the number of models to train
init_method = "col_means"           # default is column means "col_means", "kmeans", "cmeans"
init_norm = True                    # if init_method=kmeans or cmeans, normalize the data prior to clustering.
seed = 42                           # random seed for the notebook-level generator `rng` below (rebinds the simulator seed set earlier)
max_iterations = 20000              # the maximum number of iterations for fitting a model
converge_delta = 0.1                # convergence criteria for the change in loss, Q
converge_n = 25                     # convergence criteria for the number of steps where the loss changes by less than converge_delta
verbose = True                      # adds more verbosity to the algorithm workflow on execution.

# Shared generator; the factor sweep draws the model initialization seeds and the
# train/test split seeds from it.
rng = np.random.default_rng(seed)

### Utility Functions

In [10]:
def calculate_W(V, U, H):
    """Compute W directly as the matrix product of V and the transpose of H.

    Non-positive entries of H are replaced by 1e-8 before the product. The
    replacement is applied to a copy, so the array passed in by the caller is
    left unchanged.

    Parameters
    ----------
    V : np.ndarray
        Data matrix; its second dimension must match the second dimension of H.
    U : np.ndarray
        Uncertainty matrix. Accepted for signature compatibility but not used
        by the current calculation (see the commented-out weighted variant).
    H : np.ndarray
        Factor profile matrix.

    Returns
    -------
    np.ndarray
        The product ``V @ H_clamped.T``.
    """
    # Work on a copy: clamping in place would silently alter the caller's H.
    H_clamped = np.array(H, copy=True)
    H_clamped[H_clamped <= 0.0] = 1e-8
    # W = np.matmul(V * np.divide(1, U ** 2), H.T)
    return np.matmul(V, H_clamped.T)

def q_loss(V, U, H, W):
    """Return the sum of squared, uncertainty-scaled residuals of V against W·H.

    Each residual ``V - W·H`` is divided element-wise by U, squared, and the
    squares are summed over the whole matrix.
    """
    reconstruction = np.matmul(W, H)
    scaled_residuals = (V - reconstruction) / U
    return np.sum(np.square(scaled_residuals))

def rmse(_V, _U, _H, _W, use_uncertainty: bool = False):
    """Return the root-mean-square error between _V and the product _W·_H.

    When ``use_uncertainty`` is True each residual is divided element-wise by
    _U before squaring; otherwise the raw residuals are used. The mean is taken
    over all elements of _V.
    """
    error = _V - np.matmul(_W, _H)
    if use_uncertainty:
        error = error / _U
    mean_squared = np.sum(np.square(error)) / _V.size
    return np.sqrt(mean_squared)

def prepare_data(V, U, p, seed):
    """Randomly split the rows of V and U into train and test subsets.

    The row indices are shuffled with a generator seeded by ``seed``; the first
    ``int(n_rows * p)`` shuffled rows form the train subset and the remaining
    rows form the test subset. The same row selection is applied to V and U
    (which are expected to have the same shape), and every column is passed
    through ``pd.to_numeric`` before being returned.

    Returns
    -------
    tuple of np.ndarray
        ``(train_V, train_U, test_V, test_U)``.
    """
    def _numeric_rows(matrix, rows):
        # Indexing with an index array produces a new array, so the caller's
        # matrix is never modified by the conversion below.
        frame = pd.DataFrame(matrix[rows, :])
        for column in frame.columns:
            frame[column] = pd.to_numeric(frame[column])
        return frame.to_numpy()

    total_rows = V.shape[0]
    shuffled = np.random.default_rng(seed).permutation(total_rows)

    cut = int(total_rows * p)
    train_rows, test_rows = shuffled[:cut], shuffled[cut:]

    return (
        _numeric_rows(V, train_rows),
        _numeric_rows(U, train_rows),
        _numeric_rows(V, test_rows),
        _numeric_rows(U, test_rows),
    )
            
def plot_results(train_loss, test_loss, min_k, max_k, base_loss=None, true_k=None):
    """Plot the train, test and (optionally) base loss against the factor count.

    Parameters
    ----------
    train_loss, test_loss : sequence of float
        One value per factor count from ``min_k`` to ``max_k`` inclusive.
    min_k, max_k : int
        First and last factor count shown on the x axis.
    base_loss : sequence of float, optional
        Extra "Base" series to plot. Lists and NumPy arrays are both accepted;
        ``None`` or an empty sequence skips the trace.
    true_k : int, optional
        If given, a vertical green line is drawn at this factor count.
    """
    x = np.arange(min_k, max_k + 1)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=train_loss, name="Train"))
    fig.add_trace(go.Scatter(x=x, y=test_loss, name="Test"))
    # Explicit None checks: truth-testing a multi-element NumPy array raises ValueError.
    if true_k is not None:
        fig.add_vline(x=true_k, line_width=0.5, line_color="green")
    if base_loss is not None and len(base_loss) > 0:
        fig.add_trace(go.Scatter(x=x, y=base_loss, name="Base"))
    fig.update_layout(title_text="RMSE of Test data by Factor(k)", width=800, height=600, hovermode="x unified")
    fig.show()

In [11]:
# Split the dataset
# samples_n = V.shape[0]
# p = 0.75

# sample_index = rng.permutation(samples_n)

# train_n = int(samples_n * p)
# train_index = sample_index[0:train_n]
# test_index = sample_index[train_n:]

# train_V = pd.DataFrame(V.copy()[train_index,:])
# train_U = pd.DataFrame(U.copy()[train_index,:])

# for f in train_V.columns:
#     train_V[f] = pd.to_numeric(train_V[f])
#     train_U[f] = pd.to_numeric(train_U[f])
# train_V = train_V.to_numpy()
# train_U = train_U.to_numpy()

# test_V = pd.DataFrame(V.copy()[test_index,:])
# test_U = pd.DataFrame(U.copy()[test_index,:])

# for f in test_V.columns:
#     test_V[f] = pd.to_numeric(test_V[f])
#     test_U[f] = pd.to_numeric(test_U[f])
# test_V = test_V.to_numpy()
# test_U = test_U.to_numpy()



# _V = (V - np.min(V))/((np.max(V) - np.min(V) + 1e-8))
# _V[_V <= 0.0] = 1e-12

# VU_ratio = V/(U+1e-8)

# _U = _V * VU_ratio
# _U[_U <= 0.0] = 1e-12

# print(f"V: {V.shape}")
# print(f"Number of samples - train: {train_V.shape[0]}, test: {test_V.shape[0]}")

In [None]:
%%capture
# Cross-validated factor-count sweep.
# For every k from min_factors to max_factors (inclusive):
#   * a batch of models is trained on the full V/U ("base"),
#   * for each of `splits` random 50/50 train/test splits, four further batches are trained
#     (train, train with H held, test with H held, test without hold_h),
#   * the unweighted RMSE (rmse() with its default use_uncertainty=False) of every model is
#     collected and averaged per k.
min_factors = 2
max_factors = 10
n_models = 20       # models per batch (same value as `models` in the Input Parameters cell)
splits = 20         # number of random train/test splits evaluated per k
max_iter = 20000    # same value as `max_iterations` in the Input Parameters cell

# One entry per k is appended to each of these lists.
test_rmse = []          # test split, H held at the train-split H
test_full_rmse = []     # test split, train-split H passed but hold_h not set
train_rmse = []         # train split, regular training
train_h_rmse = []       # train split, refit with H held at the train-split H
base_rmse = []          # full dataset, regular training

t0 = time.time()

# V = _V
# U = _U

for i, k in enumerate(range(min_factors, max_factors+1)):
    # Per-k accumulators; each collects one RMSE per model per split (splits * n_models values).
    split_error0 = []
    split_error0b = []
    split_error = []
    split_error2 = []

    # Drawn once per k and reused as the seed of every batch trained for this k.
    # NOTE(review): `high` is given as the float 1e8 -- confirm the installed NumPy accepts a float bound.
    initialization_seed = rng.integers(low=0, high=1e8)

    # Base: train on the full dataset and record the mean RMSE over the batch.
    base_models = BatchSA(V=V, U=U, factors=k, models=n_models, method=method, seed=initialization_seed, max_iter=max_iter,
                        converge_delta=converge_delta, converge_n=converge_n, verbose=False)
    _ = base_models.train()
    base_rmse.append(np.mean([rmse(_V=V, _U=U, _H=sa.H, _W=sa.W) for sa in base_models.results]))
 
    for j in range(splits):       
        # New random 50/50 split for each iteration, seeded from the shared generator.
        train_V, train_U, test_V, test_U = prepare_data(V=V, U=U, p=0.5, seed=rng.integers(low=0, high=1e8))
        sa_models = BatchSA(V=train_V, U=train_U, factors=k, models=n_models, method=method, seed=initialization_seed, max_iter=max_iter,
                            converge_delta=converge_delta, converge_n=converge_n, verbose=False)
        
        _ = sa_models.train()
        for sa in sa_models.results:
            split_error0.append(rmse(_V=train_V, _U=train_U, _H=sa.H, _W=sa.W))
        # Stack the H matrix of every trained model so it can be passed to the batches below.
        batch_H = np.array([sa.H for sa in sa_models.results])

        # Refit on the train split with the trained H supplied and hold_h=True.
        sa_models_b = BatchSA(V=train_V, U=train_U, H=batch_H, factors=k, models=n_models, method=method, seed=initialization_seed, max_iter=max_iter,
                            converge_delta=converge_delta, converge_n=converge_n, hold_h=True, verbose=False)
        _ = sa_models_b.train()
        for sa in sa_models_b.results:
            split_error0b.append(rmse(_V=train_V, _U=train_U, _H=sa.H, _W=sa.W))
        
        # Fit on the test split with the train-split H supplied and hold_h=True (step 4 of the workflow).
        sa_tests = BatchSA(V=test_V, U=test_U, H=batch_H, factors=k, models=n_models, method=method, seed=initialization_seed, max_iter=max_iter,
                            converge_delta=converge_delta, converge_n=converge_n, hold_h=True, verbose=False)
        _ = sa_tests.train()
        for sa in sa_tests.results:
            split_error.append(rmse(_V=test_V, _U=test_U, _H=sa.H, _W=sa.W))

        # Same as above but without hold_h.
        # NOTE(review): presumably H is then only a starting point and is updated during training -- confirm against BatchSA.
        sa_tests2 = BatchSA(V=test_V, U=test_U, H=batch_H, factors=k, models=n_models, method=method, seed=initialization_seed, max_iter=max_iter,
                            converge_delta=converge_delta, converge_n=converge_n, verbose=False)
        _ = sa_tests2.train()
        for sa in sa_tests2.results:
            split_error2.append(rmse(_V=test_V, _U=test_U, _H=sa.H, _W=sa.W))
        
    # Average over every model of every split for this k.
    train_rmse.append(np.mean(split_error0))
    train_h_rmse.append(np.mean(split_error0b))
    test_rmse.append(np.mean(split_error))
    test_full_rmse.append(np.mean(split_error2))
    
    logger.info(f"Factor: {k}, Base RMSE: {base_rmse[i]:.4f}, Train RMSE: {np.mean(split_error0):.4f}, TrainB RMSE: {np.mean(split_error0b):.4f}, Test RMSE: {np.mean(split_error):.4f}, F-Test RMSE: {np.mean(split_error2):.4f}")
t1 = time.time()
logger.info(f"Runtime: {((t1-t0)/60):.2f} min(s)")

In [None]:
# Plot the mean train, test and base RMSE per factor count. true_k=None, so no vertical
# reference line is drawn; train_h_rmse and test_full_rmse are not included in this plot.
plot_results(train_loss=train_rmse, test_loss=test_rmse, base_loss=base_rmse, min_k=min_factors, max_k=max_factors, true_k=None)

In [None]:
# from sklearn.linear_model import LinearRegression
# k_list = np.arange(min_factors, max_factors+1)
# k_list = k_list.reshape(len(k_list), 1)
# k_model = LinearRegression()
# k_model.fit(k_list,test_rmse)
# y_pred = k_model.predict(k_list)

# slope0 = k_model.coef_[0]
# inter0 = k_model.intercept_
# m, c, _, _ = np.linalg.lstsq(k_list, test_rmse)

In [None]:
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=k_list.flatten(), y=train_rmse, name="Train", mode="markers", marker_color="purple"))
# fig.add_trace(go.Scatter(x=k_list.flatten(), y=test_rmse, name="Test", mode="markers", marker_color="blue"))
# fig.add_trace(go.Scatter(x=k_list.flatten(), y=y_pred, name="Reg", mode="lines", marker_color="red"))
# fig.add_trace(go.Scatter(x=k_list.flatten(), y=k_list.flatten()*m, name="LST", mode="lines", marker_color="black"))
# fig.add_vline(x=syn_factors, line_width=0.5, line_color="green")
# fig.add_hline(y=np.mean(test_rmse), line_width=1.0, line_color="darkgreen")
# fig.update_layout(title_text="RMSE of Test data by Factor(k)", width=800, height=600, hovermode="x unified")
# fig.show()

# print(int(min_factors + np.argmin(np.abs(k_list.flatten()*m - y_pred))))
# print(min_factors + np.argmin(np.abs(test_rmse - np.mean(test_rmse)))) 

In [None]:
# Ratio of the backward to the forward difference of the test RMSE curve.
# The original subtracted test_rmse[:n] (length n) from test_rmse[1:] (length n-1), which
# raises a broadcasting ValueError; both slices must have length n-1.
test_rmse_arr = np.asarray(test_rmse, dtype=float)
n = len(test_rmse_arr)
# test_dif_b[j] = rmse[j+1] - rmse[j]: backward difference seen from point j+1.
test_dif_b = test_rmse_arr[1:] - test_rmse_arr[:n-1]
# test_dif_f[j] = rmse[j] - rmse[j+1]: forward difference seen from point j.
test_dif_f = test_rmse_arr[:n-1] - test_rmse_arr[1:]
# NOTE(review): dividing the two arrays index-by-index compares each step with its own negation
# and is identically -1. The ratio is therefore taken at each interior point i (factor count
# min_factors + i, i = 1..n-2): (rmse[i] - rmse[i-1]) / (rmse[i] - rmse[i+1]).
# This alignment is an interpretation of the intent -- confirm. Flat segments divide by zero (inf/nan).
test_dif_ratio = test_dif_b[:-1] / test_dif_f[1:]
test_dif_ratio