## Environmental Source Apportionment Toolkit (ESAT) Cross-Validation

*** TESTING NOTEBOOK ***

This notebook is intended to test implementations of cross-validation of SA models for the purpose of estimating the optimal number of factors for a given dataset.

The starting reference for this approach: http://alexhwilliams.info/itsneuronalblog/2018/02/26/crossval/

Once an automated process has been developed, the approach can be validated using the synthetic data developed from the ESAT simulator.


In [1]:
# Notebook imports
import os
import sys
import json
import copy
import numpy as np
import pandas as pd
import multiprocessing as mp
from tqdm import tqdm
from scipy.cluster.vq import whiten
import plotly.graph_objects as go

#### Code Imports

In [2]:
from esat.data.datahandler import DataHandler
from esat.model.sa import SA
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat.metrics import q_loss
from esat_eval.simulator import Simulator

#### Sample Dataset
The three sample datasets from PMF5 are available for use, but a new dataset can be used in their place.

In [3]:
# Synthetic dataset parameters
seed = 42

# Dataset dimensions
syn_factors = 6      # number of factors in the synthetic dataset
syn_features = 40    # number of features in the synthetic dataset
syn_samples = 300    # number of samples in the synthetic dataset

# Outliers
outliers = True      # add outliers to the dataset
outlier_p = 0.10     # decimal percent of outliers in the dataset
outlier_mag = 1.25   # magnitude of outliers

# Contribution matrix (W): maximum value, randomly sampled from a uniform distribution
contribution_max = 2

# Noise: min/max bound the randomly determined mean decimal percentage of noise
# for each feature; noise_scale is the scale of the noise added to the dataset.
noise_mean_min = 0.03
noise_mean_max = 0.05
noise_scale = 0.02

# Uncertainty: min/max bound the randomly determined mean decimal percentage for
# each feature in the uncertainty dataset; uncertainty_scale is the scale of the
# uncertainty matrix.
uncertainty_mean_min = 0.04
uncertainty_mean_max = 0.06
uncertainty_scale = 0.01

In [4]:
# Build the synthetic-data simulator from the parameters defined in the previous cell.
simulator = Simulator(
    seed=seed,
    # dataset dimensions
    factors_n=syn_factors,
    features_n=syn_features,
    samples_n=syn_samples,
    # outliers
    outliers=outliers,
    outlier_p=outlier_p,
    outlier_mag=outlier_mag,
    # contribution matrix
    contribution_max=contribution_max,
    # noise
    noise_mean_min=noise_mean_min,
    noise_mean_max=noise_mean_max,
    noise_scale=noise_scale,
    # uncertainty
    uncertainty_mean_min=uncertainty_mean_min,
    uncertainty_mean_max=uncertainty_mean_max,
    uncertainty_scale=uncertainty_scale,
)

08-May-24 15:54:24 - Synthetic profiles generated


In [5]:
# Sample data is resolved relative to the directory the notebook is run from.
cwd = os.getcwd()
data_dir = os.path.join(cwd, "..", "data")

# Each dataset: (input file, uncertainty file, output directory)

# Baton Rouge
br_input_file, br_uncertainty_file, br_output_path = (
    os.path.join(data_dir, "Dataset-BatonRouge-con.csv"),
    os.path.join(data_dir, "Dataset-BatonRouge-unc.csv"),
    os.path.join(data_dir, "output", "BatonRouge"),
)

# Baltimore
b_input_file, b_uncertainty_file, b_output_path = (
    os.path.join(data_dir, "Dataset-Baltimore_con.txt"),
    os.path.join(data_dir, "Dataset-Baltimore_unc.txt"),
    os.path.join(data_dir, "output", "Baltimore"),
)

# Saint Louis
sl_input_file, sl_uncertainty_file, sl_output_path = (
    os.path.join(data_dir, "Dataset-StLouis-con.csv"),
    os.path.join(data_dir, "Dataset-StLouis-unc.csv"),
    os.path.join(data_dir, "output", "StLouis"),
)

#### Input Parameters

In [6]:
# Dataset
index_col = "Date"          # the index of the input/uncertainty datasets

# Model
factors = 3                 # the number of factors
method = "ls-nmf"           # "ls-nmf", "ws-nmf"
models = 20                 # the number of models to train
seed = 42                   # random seed for initialization

# Initialization
init_method = "col_means"   # "col_means" (default, column means), "kmeans", "cmeans"
init_norm = True            # if init_method=kmeans or cmeans, normalize the data prior to clustering

# Convergence
max_iterations = 20000      # the maximum number of iterations for fitting a model
converge_delta = 0.1        # convergence criteria for the change in loss, Q
converge_n = 10             # number of steps where the loss changes by less than converge_delta

# Execution
verbose = True              # adds more verbosity to the algorithm workflow on execution
optimized = True            # use the Rust code if possible
parallel = True             # train multiple models at the same time

#### Dataset Selection
One of the three sample datasets can be selected or a new cleaned dataset can be used. Datasets should be cleaned, containing no missing data (either dropping missing/NaNs, or interpolating the missing values).

In [7]:
# Use the synthetic data produced by the simulator as the input and uncertainty datasets.
# To use the Baton Rouge sample files instead, set:
# input_file = br_input_file
# uncertainty_file = br_uncertainty_file
input_data_df, uncertainty_data_df = simulator.get_data()

08-May-24 15:54:24 - Synthetic data generated
08-May-24 15:54:24 - Synthetic uncertainty data generated
08-May-24 15:54:24 - Synthetic dataframes completed
08-May-24 15:54:24 - Synthetic source apportionment instance created.


#### Load Data
Assign the processed data and uncertainty datasets to the variables V and U. These steps will be simplified/streamlined in a future version of the code.

In [8]:
# Wrap the in-memory input and uncertainty dataframes in a DataHandler.
# File-based alternative (requires input_file / uncertainty_file to be set):
# data_handler = DataHandler(
#     input_path=input_file,
#     uncertainty_path=uncertainty_file,
#     index_col=index_col
# )
data_handler = DataHandler.load_dataframe(
    input_df=input_data_df,
    uncertainty_df=uncertainty_data_df,
)

In [9]:
# Get the processed data (V) and uncertainty (U) from the data handler;
# both are passed to FSearch in the cross-validation step below.
V, U = data_handler.get_data()

#### Cross-Validation

For a series of factor counts, e.g. 3 to 12, train one or more models on the standard input data after it has been masked. The mask is a binary matrix that indicates which cells in V and U are used for training and which are held out for validation testing. The selection of these cells is random for each test.

The mask selects some percentage of the data points for training. The MSE is evaluated for both the training data (mask=1) and the testing data (mask=0). The factor count at which the test MSE stops decreasing is the proposed optimal number of factors/components.

Tests will include using this method on a single model and averaging batch solutions.


In [10]:
from esat.helpers import FSearch

In [11]:
# Run the random-sampling factor search on the processed data and uncertainty,
# then display the returned results as the cell output.
factor_search = FSearch(V=V, U=U)
results = factor_search.search(
    samples=200,
    min_factors=2,
    max_factors=12,
)
results

Rapid random sampling for factor estimation:   0%|▌                                                                                                                     | 1/200 [00:05<18:51,  5.69s/it]

(0.19923, -0.22429, 3)


Rapid random sampling for factor estimation:   1%|█▏                                                                                                                    | 2/200 [00:06<08:20,  2.53s/it]

(0.22384, -0.24093, 3)


Rapid random sampling for factor estimation:   2%|█▊                                                                                                                    | 3/200 [00:07<07:06,  2.16s/it]

(0.12898, -0.14075, 4)


Rapid random sampling for factor estimation:   2%|██▎                                                                                                                   | 4/200 [00:08<04:53,  1.50s/it]

(0.02249, -0.02524, 6)


Rapid random sampling for factor estimation:   2%|██▉                                                                                                                   | 5/200 [00:08<03:31,  1.08s/it]

(0.02145, -0.02335, 9)


Rapid random sampling for factor estimation:   3%|███▌                                                                                                                  | 6/200 [00:09<02:57,  1.09it/s]

(0.07686, -0.07594, 5)
(0.02125, -0.02421, 7)


Rapid random sampling for factor estimation:   4%|████▋                                                                                                                 | 8/200 [00:09<01:48,  1.77it/s]

(0.02152, -0.02361, 8)


Rapid random sampling for factor estimation:   4%|█████▎                                                                                                                | 9/200 [00:09<01:32,  2.06it/s]

(0.07316, -0.0743, 5)


Rapid random sampling for factor estimation:   5%|█████▊                                                                                                               | 10/200 [00:10<01:50,  1.72it/s]

(0.11779, -0.13686, 4)


Rapid random sampling for factor estimation:   6%|██████▍                                                                                                              | 11/200 [00:11<02:08,  1.47it/s]

(0.02219, -0.02308, 8)


Rapid random sampling for factor estimation:   6%|███████                                                                                                              | 12/200 [00:11<01:44,  1.80it/s]

(0.21533, -0.24025, 3)


Rapid random sampling for factor estimation:   6%|███████▌                                                                                                             | 13/200 [00:12<01:31,  2.05it/s]

(0.01993, -0.02229, 11)


Rapid random sampling for factor estimation:   7%|████████▏                                                                                                            | 14/200 [00:12<01:24,  2.19it/s]

(0.1196, -0.13685, 4)


Rapid random sampling for factor estimation:   8%|████████▊                                                                                                            | 15/200 [00:13<01:27,  2.13it/s]

(0.0214, -0.02397, 9)


Rapid random sampling for factor estimation:   8%|█████████▎                                                                                                           | 16/200 [00:13<01:23,  2.20it/s]

(0.05994, -0.08276, 5)
(0.1861, -0.21446, 3)


Rapid random sampling for factor estimation:   9%|██████████▌                                                                                                          | 18/200 [00:13<00:57,  3.16it/s]

(0.07259, -0.07603, 5)
(0.0204, -0.02224, 11)


Rapid random sampling for factor estimation:  12%|█████████████▍                                                                                                       | 23/200 [00:14<00:30,  5.82it/s]

(0.02118, -0.02321, 9)
(0.19982, -0.24877, 3)
(0.02203, -0.02318, 9)
(0.02237, -0.02312, 9)


Rapid random sampling for factor estimation:  12%|██████████████                                                                                                       | 24/200 [00:14<00:34,  5.17it/s]

(0.29369, -0.35454, 2)
(0.02183, -0.02503, 7)


Rapid random sampling for factor estimation:  13%|███████████████▏                                                                                                     | 26/200 [00:15<00:45,  3.83it/s]

(0.18193, -0.21395, 3)


Rapid random sampling for factor estimation:  14%|████████████████▍                                                                                                    | 28/200 [00:16<00:54,  3.17it/s]

(0.1188, -0.1365, 4)
(0.02344, -0.0246, 7)


Rapid random sampling for factor estimation:  14%|████████████████▉                                                                                                    | 29/200 [00:16<00:57,  2.97it/s]

(0.02016, -0.02286, 9)


Rapid random sampling for factor estimation:  15%|█████████████████▌                                                                                                   | 30/200 [00:17<01:12,  2.36it/s]

(0.12203, -0.14444, 4)


Rapid random sampling for factor estimation:  16%|██████████████████▏                                                                                                  | 31/200 [00:17<01:03,  2.65it/s]

(0.3533, -0.38232, 2)


Rapid random sampling for factor estimation:  16%|██████████████████▋                                                                                                  | 32/200 [00:18<01:05,  2.55it/s]

(0.1192, -0.13671, 4)
(0.05867, -0.0799, 5)


Rapid random sampling for factor estimation:  17%|███████████████████▉                                                                                                 | 34/200 [00:18<01:07,  2.47it/s]

(0.02339, -0.02396, 8)


Rapid random sampling for factor estimation:  18%|████████████████████▍                                                                                                | 35/200 [00:19<01:03,  2.60it/s]

(0.02286, -0.02414, 7)
(0.02017, -0.02215, 11)


Rapid random sampling for factor estimation:  18%|█████████████████████▋                                                                                               | 37/200 [00:19<00:45,  3.57it/s]

(0.02053, -0.02126, 11)
(0.29368, -0.36779, 2)


Rapid random sampling for factor estimation:  20%|██████████████████████▊                                                                                              | 39/200 [00:20<00:48,  3.30it/s]

(0.01997, -0.02123, 11)


Rapid random sampling for factor estimation:  20%|███████████████████████▍                                                                                             | 40/200 [00:20<00:58,  2.76it/s]

(0.02223, -0.02347, 8)


Rapid random sampling for factor estimation:  20%|███████████████████████▉                                                                                             | 41/200 [00:20<00:54,  2.93it/s]

(0.02383, -0.02554, 6)


Rapid random sampling for factor estimation:  21%|████████████████████████▌                                                                                            | 42/200 [00:21<00:58,  2.69it/s]

(0.02194, -0.02373, 9)


Rapid random sampling for factor estimation:  22%|█████████████████████████▏                                                                                           | 43/200 [00:21<00:59,  2.63it/s]

(0.02295, -0.02411, 8)
(0.11799, -0.14602, 4)


Rapid random sampling for factor estimation:  22%|██████████████████████████▎                                                                                          | 45/200 [00:22<00:49,  3.10it/s]

(0.18767, -0.21427, 3)
(0.02149, -0.02291, 9)


Rapid random sampling for factor estimation:  24%|███████████████████████████▍                                                                                         | 47/200 [00:22<00:36,  4.25it/s]

(0.20142, -0.25134, 3)


Rapid random sampling for factor estimation:  24%|████████████████████████████                                                                                         | 48/200 [00:23<00:46,  3.30it/s]

(0.1976, -0.26144, 3)


Rapid random sampling for factor estimation:  24%|████████████████████████████▋                                                                                        | 49/200 [00:23<01:00,  2.48it/s]

(0.02234, -0.0235, 8)


Rapid random sampling for factor estimation:  25%|█████████████████████████████▎                                                                                       | 50/200 [00:24<00:54,  2.77it/s]

(0.02163, -0.02232, 10)


Rapid random sampling for factor estimation:  26%|█████████████████████████████▊                                                                                       | 51/200 [00:24<01:02,  2.40it/s]

(0.0223, -0.0243, 8)


Rapid random sampling for factor estimation:  26%|██████████████████████████████▍                                                                                      | 52/200 [00:24<00:54,  2.70it/s]

(0.06032, -0.08245, 5)


Rapid random sampling for factor estimation:  26%|███████████████████████████████                                                                                      | 53/200 [00:25<00:51,  2.86it/s]

(0.13209, -0.1422, 4)


Rapid random sampling for factor estimation:  28%|████████████████████████████████▏                                                                                    | 55/200 [00:25<00:46,  3.14it/s]

(0.02033, -0.02247, 9)
(0.02069, -0.02201, 11)


Rapid random sampling for factor estimation:  28%|████████████████████████████████▊                                                                                    | 56/200 [00:26<00:55,  2.59it/s]

(0.31811, -0.37333, 2)


Rapid random sampling for factor estimation:  28%|█████████████████████████████████▎                                                                                   | 57/200 [00:26<00:52,  2.73it/s]

(0.02129, -0.02234, 11)


Rapid random sampling for factor estimation:  29%|█████████████████████████████████▉                                                                                   | 58/200 [00:26<00:47,  2.97it/s]

(0.29996, -0.35619, 2)


Rapid random sampling for factor estimation:  30%|██████████████████████████████████▌                                                                                  | 59/200 [00:27<00:50,  2.81it/s]

(0.02167, -0.02302, 9)


Rapid random sampling for factor estimation:  30%|███████████████████████████████████                                                                                  | 60/200 [00:27<00:53,  2.61it/s]

(0.10022, -0.08769, 5)
(0.02192, -0.02337, 8)


Rapid random sampling for factor estimation:  32%|████████████████████████████████████▊                                                                                | 63/200 [00:28<00:34,  3.92it/s]

(0.29226, -0.36444, 2)
(0.02142, -0.02361, 8)


Rapid random sampling for factor estimation:  32%|█████████████████████████████████████▍                                                                               | 64/200 [00:28<00:29,  4.65it/s]

(0.05858, -0.0808, 5)
(0.09503, -0.08863, 5)


Rapid random sampling for factor estimation:  33%|██████████████████████████████████████▌                                                                              | 66/200 [00:28<00:29,  4.55it/s]

(0.05958, -0.08153, 5)


Rapid random sampling for factor estimation:  34%|███████████████████████████████████████▏                                                                             | 67/200 [00:29<00:31,  4.18it/s]

(0.02171, -0.02251, 11)


Rapid random sampling for factor estimation:  34%|███████████████████████████████████████▊                                                                             | 68/200 [00:29<00:31,  4.20it/s]

(0.29962, -0.35599, 2)
(0.02185, -0.02355, 9)


Rapid random sampling for factor estimation:  35%|████████████████████████████████████████▉                                                                            | 70/200 [00:30<00:34,  3.78it/s]

(0.05964, -0.08001, 5)


Rapid random sampling for factor estimation:  36%|█████████████████████████████████████████▌                                                                           | 71/200 [00:30<00:36,  3.56it/s]

(0.07324, -0.07435, 5)
(0.05935, -0.08101, 5)


Rapid random sampling for factor estimation:  36%|██████████████████████████████████████████▋                                                                          | 73/200 [00:31<00:39,  3.23it/s]

(0.0219, -0.02417, 6)


Rapid random sampling for factor estimation:  37%|███████████████████████████████████████████▎                                                                         | 74/200 [00:31<00:42,  2.98it/s]

(0.1124, -0.13586, 4)


Rapid random sampling for factor estimation:  38%|███████████████████████████████████████████▉                                                                         | 75/200 [00:31<00:38,  3.28it/s]

(0.20599, -0.25446, 3)
(0.02411, -0.02382, 8)


Rapid random sampling for factor estimation:  38%|█████████████████████████████████████████████                                                                        | 77/200 [00:32<00:37,  3.29it/s]

(0.02154, -0.02292, 9)
(0.02117, -0.0233, 8)


Rapid random sampling for factor estimation:  40%|██████████████████████████████████████████████▏                                                                      | 79/200 [00:32<00:30,  3.96it/s]

(0.29706, -0.36787, 2)


Rapid random sampling for factor estimation:  41%|███████████████████████████████████████████████▉                                                                     | 82/200 [00:33<00:25,  4.55it/s]

(0.02259, -0.0251, 6)
(0.02192, -0.02391, 7)
(0.0232, -0.02474, 7)


Rapid random sampling for factor estimation:  42%|█████████████████████████████████████████████████▋                                                                   | 85/200 [00:33<00:19,  5.94it/s]

(0.06016, -0.07953, 5)
(0.0218, -0.02253, 11)
(0.3081, -0.37263, 2)


Rapid random sampling for factor estimation:  43%|██████████████████████████████████████████████████▎                                                                  | 86/200 [00:34<00:24,  4.60it/s]

(0.21836, -0.2394, 3)


Rapid random sampling for factor estimation:  44%|███████████████████████████████████████████████████▍                                                                 | 88/200 [00:34<00:29,  3.85it/s]

(0.02337, -0.02427, 6)
(0.29421, -0.3558, 2)
(0.02364, -0.02512, 6)
(0.02298, -0.02385, 8)


Rapid random sampling for factor estimation:  46%|█████████████████████████████████████████████████████▏                                                               | 91/200 [00:35<00:19,  5.53it/s]

(0.11521, -0.13955, 4)
(0.30254, -0.35863, 2)


Rapid random sampling for factor estimation:  46%|██████████████████████████████████████████████████████▍                                                              | 93/200 [00:35<00:20,  5.33it/s]

(0.05951, -0.07981, 5)


Rapid random sampling for factor estimation:  48%|███████████████████████████████████████████████████████▌                                                             | 95/200 [00:36<00:22,  4.75it/s]

(0.0219, -0.02336, 9)
(0.02153, -0.02394, 7)


Rapid random sampling for factor estimation:  48%|████████████████████████████████████████████████████████▋                                                            | 97/200 [00:36<00:21,  4.82it/s]

(0.02184, -0.02358, 8)
(0.02266, -0.02484, 6)


Rapid random sampling for factor estimation:  49%|█████████████████████████████████████████████████████████▎                                                           | 98/200 [00:36<00:18,  5.53it/s]

(0.02227, -0.02525, 6)


Rapid random sampling for factor estimation:  50%|█████████████████████████████████████████████████████████▉                                                           | 99/200 [00:36<00:19,  5.11it/s]

(0.18525, -0.21453, 3)


Rapid random sampling for factor estimation:  50%|██████████████████████████████████████████████████████████                                                          | 100/200 [00:37<00:26,  3.70it/s]

(0.12389, -0.14019, 4)


Rapid random sampling for factor estimation:  51%|███████████████████████████████████████████████████████████▏                                                        | 102/200 [00:37<00:23,  4.10it/s]

(0.12713, -0.14045, 4)
(0.29109, -0.36443, 2)


Rapid random sampling for factor estimation:  52%|███████████████████████████████████████████████████████████▋                                                        | 103/200 [00:38<00:39,  2.48it/s]

(0.06465, -0.07756, 5)
(0.21619, -0.22597, 3)


Rapid random sampling for factor estimation:  52%|████████████████████████████████████████████████████████████▉                                                       | 105/200 [00:38<00:26,  3.56it/s]

(0.02261, -0.0248, 6)


Rapid random sampling for factor estimation:  54%|██████████████████████████████████████████████████████████████                                                      | 107/200 [00:39<00:21,  4.36it/s]

(0.02301, -0.02348, 8)
(0.02261, -0.02495, 6)


Rapid random sampling for factor estimation:  54%|██████████████████████████████████████████████████████████████▋                                                     | 108/200 [00:39<00:19,  4.84it/s]

(0.02329, -0.02557, 6)
(0.19862, -0.22602, 3)


Rapid random sampling for factor estimation:  55%|███████████████████████████████████████████████████████████████▊                                                    | 110/200 [00:40<00:23,  3.80it/s]

(0.02265, -0.02374, 9)


Rapid random sampling for factor estimation:  57%|██████████████████████████████████████████████████████████████████                                                  | 114/200 [00:40<00:16,  5.27it/s]

(0.1149, -0.13632, 4)
(0.12231, -0.13714, 4)
(0.20252, -0.21727, 3)
(0.11905, -0.13661, 4)


Rapid random sampling for factor estimation:  57%|██████████████████████████████████████████████████████████████████▋                                                 | 115/200 [00:41<00:24,  3.52it/s]

(0.02398, -0.02557, 6)
(0.02079, -0.02265, 9)
(0.02167, -0.02329, 8)
(0.30474, -0.35664, 2)


Rapid random sampling for factor estimation:  60%|█████████████████████████████████████████████████████████████████████▌                                              | 120/200 [00:41<00:12,  6.21it/s]

(0.02109, -0.02243, 10)
(0.18213, -0.21457, 3)


Rapid random sampling for factor estimation:  60%|██████████████████████████████████████████████████████████████████████▏                                             | 121/200 [00:42<00:13,  5.95it/s]

(0.31174, -0.3732, 2)


Rapid random sampling for factor estimation:  62%|███████████████████████████████████████████████████████████████████████▎                                            | 123/200 [00:43<00:25,  3.03it/s]

(0.01962, -0.02167, 11)
(0.02232, -0.02312, 10)


Rapid random sampling for factor estimation:  62%|███████████████████████████████████████████████████████████████████████▉                                            | 124/200 [00:43<00:28,  2.70it/s]

(0.07323, -0.0767, 5)


Rapid random sampling for factor estimation:  63%|█████████████████████████████████████████████████████████████████████████                                           | 126/200 [00:44<00:29,  2.52it/s]

(0.02419, -0.02632, 6)
(0.02148, -0.02349, 8)


Rapid random sampling for factor estimation:  64%|█████████████████████████████████████████████████████████████████████████▋                                          | 127/200 [00:45<00:33,  2.18it/s]

(0.21578, -0.24039, 3)


Rapid random sampling for factor estimation:  64%|██████████████████████████████████████████████████████████████████████████▏                                         | 128/200 [00:46<00:36,  1.95it/s]

(0.02366, -0.02454, 6)


Rapid random sampling for factor estimation:  64%|██████████████████████████████████████████████████████████████████████████▊                                         | 129/200 [00:46<00:35,  2.01it/s]

(0.18146, -0.2224, 3)


Rapid random sampling for factor estimation:  65%|███████████████████████████████████████████████████████████████████████████▍                                        | 130/200 [00:47<00:40,  1.74it/s]

(0.02167, -0.02309, 9)


Rapid random sampling for factor estimation:  66%|███████████████████████████████████████████████████████████████████████████▉                                        | 131/200 [00:47<00:35,  1.92it/s]

(0.02209, -0.02314, 9)
(0.02022, -0.02202, 10)


Rapid random sampling for factor estimation:  67%|█████████████████████████████████████████████████████████████████████████████▋                                      | 134/200 [00:48<00:23,  2.79it/s]

(0.05966, -0.08006, 5)
(0.0228, -0.02501, 6)


Rapid random sampling for factor estimation:  68%|██████████████████████████████████████████████████████████████████████████████▎                                     | 135/200 [00:48<00:20,  3.12it/s]

(0.02137, -0.02253, 10)
(0.02176, -0.02331, 9)


Rapid random sampling for factor estimation:  68%|███████████████████████████████████████████████████████████████████████████████▍                                    | 137/200 [00:49<00:15,  4.15it/s]

(0.19967, -0.23951, 3)
(0.28529, -0.35683, 2)


Rapid random sampling for factor estimation:  70%|████████████████████████████████████████████████████████████████████████████████▌                                   | 139/200 [00:49<00:15,  4.03it/s]

(0.02098, -0.02404, 6)


Rapid random sampling for factor estimation:  70%|█████████████████████████████████████████████████████████████████████████████████▏                                  | 140/200 [00:50<00:20,  2.91it/s]

(0.02194, -0.02321, 9)
(0.17867, -0.2133, 3)


Rapid random sampling for factor estimation:  71%|██████████████████████████████████████████████████████████████████████████████████▎                                 | 142/200 [00:50<00:14,  3.92it/s]

(0.02264, -0.02429, 8)


Rapid random sampling for factor estimation:  72%|██████████████████████████████████████████████████████████████████████████████████▉                                 | 143/200 [00:50<00:15,  3.75it/s]

(0.22234, -0.23897, 3)
(0.0219, -0.02398, 8)


Rapid random sampling for factor estimation:  72%|████████████████████████████████████████████████████████████████████████████████████                                | 145/200 [00:51<00:16,  3.38it/s]

(0.023, -0.02696, 6)
(0.02301, -0.02408, 7)


Rapid random sampling for factor estimation:  74%|█████████████████████████████████████████████████████████████████████████████████████▊                              | 148/200 [00:52<00:12,  4.08it/s]

(0.02189, -0.02354, 8)
(0.02241, -0.02248, 10)


Rapid random sampling for factor estimation:  74%|██████████████████████████████████████████████████████████████████████████████████████▍                             | 149/200 [00:52<00:12,  4.19it/s]

(0.11231, -0.13627, 4)
(0.02123, -0.02336, 9)


Rapid random sampling for factor estimation:  76%|███████████████████████████████████████████████████████████████████████████████████████▌                            | 151/200 [00:52<00:10,  4.54it/s]

(0.02313, -0.02437, 6)
(0.28013, -0.34963, 2)
(0.05709, -0.07948, 5)


Rapid random sampling for factor estimation:  77%|█████████████████████████████████████████████████████████████████████████████████████████▎                          | 154/200 [00:53<00:08,  5.18it/s]

(0.35986, -0.39142, 2)


Rapid random sampling for factor estimation:  78%|█████████████████████████████████████████████████████████████████████████████████████████▉                          | 155/200 [00:53<00:10,  4.43it/s]

(0.02054, -0.02202, 10)


Rapid random sampling for factor estimation:  78%|██████████████████████████████████████████████████████████████████████████████████████████▍                         | 156/200 [00:53<00:11,  3.70it/s]

(0.11965, -0.13672, 4)


Rapid random sampling for factor estimation:  78%|███████████████████████████████████████████████████████████████████████████████████████████                         | 157/200 [00:54<00:14,  2.92it/s]

(0.02158, -0.02225, 11)
(0.02339, -0.02427, 7)


Rapid random sampling for factor estimation:  80%|████████████████████████████████████████████████████████████████████████████████████████████▏                       | 159/200 [00:55<00:14,  2.90it/s]

(0.11785, -0.13662, 4)
(0.19345, -0.22235, 3)


Rapid random sampling for factor estimation:  80%|█████████████████████████████████████████████████████████████████████████████████████████████▍                      | 161/200 [00:55<00:10,  3.86it/s]

(0.11955, -0.13684, 4)
(0.02226, -0.02513, 6)


Rapid random sampling for factor estimation:  82%|██████████████████████████████████████████████████████████████████████████████████████████████▌                     | 163/200 [00:55<00:07,  4.72it/s]

(0.02173, -0.02394, 7)
(0.02157, -0.02279, 10)


Rapid random sampling for factor estimation:  82%|███████████████████████████████████████████████████████████████████████████████████████████████▋                    | 165/200 [00:56<00:08,  4.06it/s]

(0.11984, -0.13956, 4)


Rapid random sampling for factor estimation:  83%|████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 166/200 [00:57<00:13,  2.61it/s]

(0.02111, -0.02292, 10)


Rapid random sampling for factor estimation:  84%|████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 167/200 [00:58<00:19,  1.69it/s]

(0.02224, -0.02402, 8)


Rapid random sampling for factor estimation:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                  | 168/200 [00:58<00:16,  1.90it/s]

(0.02282, -0.02325, 10)


Rapid random sampling for factor estimation:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                 | 170/200 [00:59<00:12,  2.38it/s]

(0.02299, -0.02346, 9)
(0.0226, -0.02399, 8)


Rapid random sampling for factor estimation:  86%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                | 172/200 [00:59<00:08,  3.43it/s]

(0.18222, -0.21427, 3)
(0.02189, -0.02232, 10)


Rapid random sampling for factor estimation:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████▉               | 174/200 [01:00<00:05,  4.84it/s]

(0.02036, -0.02216, 11)
(0.02202, -0.02295, 10)


Rapid random sampling for factor estimation:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 175/200 [01:00<00:04,  5.19it/s]

(0.13218, -0.1411, 4)


Rapid random sampling for factor estimation:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████              | 176/200 [01:01<00:08,  2.95it/s]

(0.02077, -0.02226, 11)


Rapid random sampling for factor estimation:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 177/200 [01:01<00:11,  2.04it/s]

(0.02147, -0.02304, 10)
(0.02221, -0.02424, 9)


Rapid random sampling for factor estimation:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊            | 179/200 [01:02<00:07,  2.66it/s]

(0.02236, -0.02359, 9)
(0.06279, -0.07578, 5)


Rapid random sampling for factor estimation:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 181/200 [01:02<00:05,  3.43it/s]

(0.02283, -0.02543, 6)
(0.31587, -0.36597, 2)


Rapid random sampling for factor estimation:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 183/200 [01:03<00:04,  3.70it/s]

(0.31602, -0.36461, 2)


Rapid random sampling for factor estimation:  92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 184/200 [01:03<00:05,  2.69it/s]

(0.02153, -0.02499, 6)


Rapid random sampling for factor estimation:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 187/200 [01:04<00:03,  4.24it/s]

(0.02067, -0.02153, 11)
(0.02251, -0.02338, 9)
(0.18185, -0.21413, 3)
(0.11943, -0.13667, 4)


Rapid random sampling for factor estimation:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 189/200 [01:04<00:02,  5.38it/s]

(0.02154, -0.02305, 9)


Rapid random sampling for factor estimation:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 191/200 [01:05<00:02,  3.57it/s]

(0.02186, -0.02171, 11)
(0.02262, -0.02544, 6)


Rapid random sampling for factor estimation:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 192/200 [01:06<00:02,  2.87it/s]

(0.02181, -0.0242, 7)
(0.0761, -0.07458, 5)


Rapid random sampling for factor estimation:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 194/200 [01:06<00:01,  3.43it/s]

(0.02293, -0.02438, 8)


Rapid random sampling for factor estimation:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████   | 195/200 [01:06<00:01,  3.36it/s]

(0.02307, -0.02391, 7)
(0.02086, -0.02283, 8)


Rapid random sampling for factor estimation:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 197/200 [01:07<00:00,  4.12it/s]

(0.0218, -0.02402, 7)
(0.02269, -0.02354, 8)


Rapid random sampling for factor estimation: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 199/200 [01:07<00:00,  4.99it/s]

(0.02226, -0.02404, 8)


Rapid random sampling for factor estimation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:10<00:00,  1.20it/s]

(0.02103, -0.02255, 10)


Rapid random sampling for factor estimation: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:11<00:00,  2.81it/s]
08-May-24 15:55:35 - Estimated factor count: 5


Unnamed: 0,Test MSE,Train MSE,Delta MSE,Delta Ratio
0,-0.364856,0.306172,,
1,-0.229468,0.198227,-0.135387,
2,-0.138647,0.120554,-0.090822,1.490629
3,-0.079471,0.067637,-0.059175,1.534676
4,-0.02512,0.022829,-0.054351,1.088679
5,-0.02423,0.022372,-0.00089,60.749164
6,-0.023697,0.022261,-0.000533,1.658128
7,-0.023275,0.021722,-0.000422,1.248875
8,-0.022624,0.021535,-0.000651,0.645143
9,-0.022009,0.020757,-0.000615,1.050612


In [13]:
# Plot the factor-search results. actual_count is set to syn_factors, the number of factors used to
# generate the synthetic dataset (presumably drawn as a reference against the estimate -- confirm in plot()).
factor_search.plot(actual_count=syn_factors)

Unnamed: 0,Test MSE,Train MSE,Delta MSE,Delta Ratio
0,-0.364856,0.306172,,
1,-0.229468,0.198227,-0.135387,
2,-0.138647,0.120554,-0.090822,1.490629
3,-0.079471,0.067637,-0.059175,1.534676
4,-0.02512,0.022829,-0.054351,1.088679
5,-0.02423,0.022372,-0.00089,60.749164
6,-0.023697,0.022261,-0.000533,1.658128
7,-0.023275,0.021722,-0.000422,1.248875
8,-0.022624,0.021535,-0.000651,0.645143
9,-0.022009,0.020757,-0.000615,1.050612


In [None]:
# rng = np.random.default_rng(seed=seed)

# hold_percentage = 0.10               # 10%
# mask = rng.random(size=V.shape) > hold_percentage
# rng_V = rng.random(size=V.shape)
# rng_U = rng.random(size=U.shape)

# V_train = copy.copy(V)
# V_train[~mask] = rng_V[~mask]
# U_train = copy.copy(U)
# U_train[~mask] = rng_U[~mask]

# m_train = np.count_nonzero(mask)
# m_test = np.count_nonzero(~mask)

# pbar = None

# def get_mask(V, threshold=0.1):
#     _mask = np.zeros(shape=V.shape)
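#     NOTE(review): the loop bound is V.shape[1], but `V[feature]` / `_mask[feature]` index axis 0. Unless V is
#     square, either some rows are never assigned (left as 0) or the index runs out of range -- confirm the axis.
#     for feature in range(V.shape[1]):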
#         feature_i = rng.random(size=V[feature].shape) > threshold
#         _mask[feature] = feature_i
#     return _mask.astype(int)

# def random_sample(seed, factor_n):
#     mask = get_mask(V=V, threshold=hold_percentage)
#     m_train = np.count_nonzero(mask)
#     m_test = np.count_nonzero(~mask)
#     _sa = SA(V=V, U=U, factors=factor_n, method=method, seed=seed, optimized=True, verbose=False)
#     _sa.initialize()
#     _sa.train(max_iter=10000, converge_delta=1.0, converge_n=10)
#     residuals = V - _sa.WH
#     train_residuals = np.multiply(mask, residuals**2)
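#     NOTE(review): get_mask() returns an int array, so `~mask` is a bitwise NOT (0 -> -1, 1 -> -2), not a logical
#     complement. Every entry is then nonzero (m_test counts all cells) and the weights are negative, which yields
#     a negative test MSE. Use `1 - mask` (or keep the mask boolean) if this prototype is revived.
#     test_residuals = np.multiply(~mask, residuals**2)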
#     train_mse = np.round(train_residuals.sum()/m_train, 5)
#     test_mse = np.round(test_residuals.sum()/m_test, 5)
#     return train_mse, test_mse, factor_n

In [None]:
# samples = 100

# min_factor = 3
# max_factor = 15
# train_mse = [[] for i in range(max_factor-min_factor)]
# test_mse = [[] for i in range(max_factor-min_factor)]

# # mp_pool = mp.Pool()
# # pool_parameters = []
# # for i in range(samples):
# #     seed_i = rng.integers(low=100, high=1e6, size=1)[0]
# #     factor_i = rng.integers(low=min_factor, high=max_factor, size=1)[0]
# #     pool_parameters.append((seed_i, factor_i))

# # pbar = tqdm(total=samples, desc="Rapid random sampling for factor estimation")

# # with mp.Pool() as pool:
# #     results = pool.starmap(random_sample, pool_parameters)
# # for result in results:
# #     list_i = result[2]-min_factor
# #     train_mse[list_i].append(result[0])
# #     test_mse[list_i].append(result[1])



# for i in tqdm(range(samples), desc="Rapid random sampling for factor estimation"):
#     seed_i = rng.integers(low=100, high=1e6, size=1)[0]
#     factor_i = rng.integers(low=min_factor, high=max_factor, size=1)[0]
#     train_mse_i, test_mse_i, _ = random_sample(seed=seed_i, factor_n=factor_i)
#     list_i = factor_i-min_factor
#     train_mse[list_i].append(train_mse_i)
#     test_mse[list_i].append(test_mse_i)

# # mask_n = 3
# # train_mse = []
# # test_mse = []

# # min_factor = 1
# # max_factor = 15
# # check_i = 0
# # total_n = int(mask_n)
# # print(f"Total: {total_n}")




# # mask_pbar = tqdm(range(mask_n), total=total_n, desc=f"Mask: NA - Searching factors {min_factor} -> {max_factor}") 
# # for mask_i in mask_pbar:
# #     mask_pbar.set_description(f"Mask: {mask_i+1} - Searching factors {min_factor} -> {max_factor}")
# #     # mask = rng.random(size=V.shape) > hold_percentage
# #     mask = get_mask(V=V, threshold=hold_percentage)
# #     m_train = np.count_nonzero(mask)
# #     m_test = np.count_nonzero(~mask)
# #     # mask = mask.astype(int)

# #     train_mse_n = []
# #     test_mse_n = []
# #     # for factor_n in tqdm(list(range(min_factor, max_factor+1)), desc=f"Mask: {mask_i} - Search factors {min_factor} -> {max_factor}", position=0, leave=True):
# #     for factor_n in range(min_factor, max_factor+1):
# #         # mask = get_mask(V=nV, threshold=hold_percentage)
# #         # m_train = np.count_nonzero(mask)
# #         # m_test = np.count_nonzero(~mask)
# #         # mask = mask.astype(int)
# #         # _batch_sa = BatchSA(V=V, U=U, factors=factor_n, method=method, seed=seed, optimized=True, verbose=False)
# #         _sa = SA(V=V, U=U, factors=factor_n, method=method, seed=seed, optimized=True, verbose=False)
# #         _sa.initialize()
# #         _sa.train()
# #         # _sa = _batch_sa.results[_batch_sa.best_model]
# #         residuals = V - _sa.WH
# #         train_residuals = np.multiply(mask, residuals**2)
# #         test_residuals = np.multiply(~mask, residuals**2)
# #         train_mse_i = np.round(train_residuals.sum()/m_train, 5)
# #         test_mse_i = np.round(test_residuals.sum()/m_test, 5)
# #         train_mse_n.append(train_mse_i)
# #         test_mse_n.append(test_mse_i)
# #     train_mse.append(train_mse_n)
# #     test_mse.append(test_mse_n)

In [None]:
# train_mse_mean = [np.mean(i) for i in train_mse]
# test_mse_mean = [np.mean(i) for i in test_mse]

# delta_mse_r = []
# for factor_n in range(0, len(test_mse_mean)-1):
#     delta_i = test_mse_mean[factor_n] - test_mse_mean[factor_n+1]
#     delta_mse_r.append(delta_i)
# c = np.max(delta_mse_r)*0.01
# ratio_delta = [np.nan]*2
# for factor_n in range(0, len(test_mse_mean)-2):
#     rd = delta_mse_r[factor_n]/(delta_mse_r[factor_n+1]+c)
#     ratio_delta.append(rd)
# delta_mse = [np.nan]
# for factor_n in range(0, len(test_mse_mean)-1):
#     delta_i = test_mse_mean[factor_n] - test_mse_mean[factor_n+1]
#     delta_mse.append(delta_i)

# mse_fig = go.Figure()
# x = list(range(min_factor, max_factor))
# mse_fig.add_trace(go.Scatter(x=x, y=train_mse_mean, name="Train MSE", mode='lines+markers'))
# mse_fig.add_trace(go.Scatter(x=x, y=test_mse_mean, name="Test MSE", mode='lines+markers'))
# mse_fig.add_trace(go.Scatter(x=x, y=delta_mse, name="Delta MSE", mode='lines+markers'))
# mse_fig.add_trace(go.Scatter(x=x, y=ratio_delta, name="Ratio MSE", mode='lines+markers'))
# mse_fig.add_vline(x=syn_factors, line_width=1, line_dash="dash", line_color="black", name="Actual Factors")
# mse_fig.update_layout(width=800, height=800, title_text="Factor Search", hovermode='x')
# mse_fig.update_yaxes(title_text="Mean Squared Error")
# mse_fig.update_xaxes(title_text="Number of Factors")
# mse_fig.show()

In [None]:
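# NOTE(review): ratio_delta[i] compares the test-MSE change ending at list index i-1 with the change ending at
# index i, so a peak at i suggests index i-1 is the last factor count with a large gain. `+ min_factor` may
# therefore overshoot by one -- verify against a dataset with a known factor count.
# optimal_factor_n = np.nanargmax(ratio_delta) + min_factor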
# print(f"Optimal Factor n: {optimal_factor_n}, Actual Factor n: {syn_factors}")