## Environmental Source Apportionment Toolkit (ESAT) Simulator


In [None]:
# When running from Google Colab or another Jupyter notebook cloud environment, the ESAT python package may need to be installed.
# If the python package file is available locally, run a pip install for the specific wheel for your current OS/Arch

#installation of wheel for the first time
#!pip install "path to wheel file"

#installation of an updated wheel over an already existing wheel
#!pip install --force-reinstall "path to wheel file"

#### Code Imports

In [None]:
from esat.data.datahandler import DataHandler
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat_eval.simulator import Simulator
from esat.estimator import FactorEstimator

#### Synthetic Dataset

Generate synthetic input (V) and uncertainty (U) datasets for model analysis. V and U are generated in the following sequence:

1.	Feature profiles are defined and/or randomly generated (H); if the latter, for each feature, a random number of factors between 1 and K are chosen as sources for that feature. For each contributing factor, a random contribution (uniform value between 0 and 1) is assigned. If one or more predefined factor profiles (a row of H) are provided by the user, they are assigned to H in order of occurrence and overwrite the corresponding randomly generated row of H.
2.	Sample concentrations are defined and/or randomly generated (W); if the latter, each cell of W is set to a random uniform number between 0 and contribution_max
3.	V1 is calculated as the product W x H
4.	A noise matrix (N) is created by selecting values from a normal distribution with a randomly selected mean noise (uniform distribution between noise_mean_min and noise_mean_max) for each feature, and standard deviation = noise_scale. The randomly selected mean noise for a feature has a 50% chance to be multiplied by -1 to allow for the reduction of values in V1. Then the Hadamard product (element-wise matrix multiplication) of V1 and N is used to calculate V: V1 + V1◦N -> V
5.  Outliers are added to V if outliers=True. A number of elements in V (a proportion = outlier_p) are randomly selected and each one has a 50% chance to become V*outlier_mag, and a 50% chance to become V/outlier_mag

6.	An uncertainty matrix (U1) is created by selecting values from a normal distribution with a randomly selected mean uncertainty (uniform distribution between uncertainty_mean_min and uncertainty_mean_max) for each feature, and standard deviation = uncertainty_scale. Then the Hadamard product of V and U1 is used to calculate U: V◦U1 -> U

In [None]:
# Synthetic dataset parameters (all are passed to the Simulator constructor in the next cell)
seed = 85                     # Seed passed to the Simulator; NOTE: the name `seed` is reassigned later in the model training parameters cell
syn_factors = 4              # Number of factors in the synthetic dataset
syn_features = 40             # Number of features in the synthetic dataset
syn_samples = 850            # Number of samples in the synthetic dataset
outliers = True               # Add outliers to the dataset
outlier_p = 0.05               # Proportion of outliers
outlier_mag = 1.5             # Magnitude of outliers; a multiplier of the data value
contribution_max = 20         # Each cell of W (dimension of samples x factors) is a random uniform number between 0 and contribution_max
noise_mean_min = 0.1          # Min value for the mean noise (as a proportion) applied to a specific feature (column of V)
noise_mean_max = 0.2        # Max value for the mean noise (as a proportion) applied to a specific feature (column of V)
noise_scale = 0.02            # Standard deviation of the randomly assigned mean noise for a specific feature; should be 1/3 or less of the minimum noise value to avoid negative noise
uncertainty_mean_min = 0.1   # Min value for the mean uncertainty (as a proportion) applied to a specific feature (column of V)
uncertainty_mean_max = 0.4    # Max value for the mean uncertainty (as a proportion) applied to a specific feature (column of V)
uncertainty_scale = 0.01      # Standard deviation of the randomly assigned mean uncertainty for a specific feature; should be 1/3 or less of the minimum uncertainty value to avoid negative uncertainty

In [None]:
# Build the synthetic-data Simulator from the parameters defined in the previous cell.
# Every argument is passed by keyword so the mapping from notebook variable to
# Simulator parameter stays explicit.
simulator = Simulator(
    seed=seed,
    factors_n=syn_factors,
    features_n=syn_features,
    samples_n=syn_samples,
    outliers=outliers,
    outlier_p=outlier_p,
    outlier_mag=outlier_mag,
    contribution_max=contribution_max,
    noise_mean_min=noise_mean_min,
    noise_mean_max=noise_mean_max,
    noise_scale=noise_scale,
    uncertainty_mean_min=uncertainty_mean_min,
    uncertainty_mean_max=uncertainty_mean_max,
    uncertainty_scale=uncertainty_scale,
)

In [None]:
# Example command for passing in a custom factor profile matrix instead of a randomly generated profile
# (requires `import numpy as np`, which is not among this notebook's imports)
# my_profile = np.ones(shape=(syn_factors, syn_features))
# simulator.generate_profiles(profiles=my_profile)

In [None]:
# Example of how to customize the factor contributions. Curve_type options: 'uniform', 'decreasing', 'increasing', 'logistic', 'periodic' ; uniform by default
# simulator.update_contribution(factor_i=0, curve_type="logistic", scale=0.1, frequency=0.5)
# simulator.update_contribution(factor_i=1, curve_type="periodic", minimum=0.0, maximum=1.0, frequency=0.5, scale=0.1)
# simulator.update_contribution(factor_i=2, curve_type="increasing", minimum=0.0, maximum=1.0, scale=0.1)
# simulator.update_contribution(factor_i=3, curve_type="decreasing", minimum=0.0, maximum=1.0, scale=0.1)
simulator.plot_synthetic_contributions()

#### Load Data
Assign the processed data and uncertainty datasets to the variables V and U. These steps will be simplified/streamlined in a future version of the code.

In [None]:
syn_input_df, syn_uncertainty_df = simulator.get_data()

In [None]:
data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
V, U = data_handler.get_data()

#### Input/Uncertainty Data Metrics and Visualizations

In [None]:
# Show the input data metrics, including signal to noise ratio of the data and variability of feature concentrations
data_handler.metrics

In [None]:
# Concentration/Uncertainty scatterplot for a specific feature; feature indexing starts at 0
data_handler.plot_data_uncertainty(feature_idx=0)

In [None]:
# Concentration plot comparing features; feature indexing starts at 0
data_handler.plot_feature_data(x_idx=0, y_idx=1)

In [None]:
# Feature timeseries; a list [] of features separated by commas; feature indexing starts at 0
data_handler.plot_feature_timeseries(feature_selection=[0, 1, 2, 3])

#### Factor Estimator
With real datasets, the actual number of factors/sources is typically unknown. The researcher can make a best guess, or run multiple batches using different numbers of factors. To assist with this process, the Factor Estimator algorithm generates an array of models (samples parameter; suggested value ~ 1000) using various numbers of factors (between min_factors and max_factors; suggest 2-10) and random W and H initializations. Cross-validation is used, where 10% of the input data are randomly excluded from fitting the model solutions (a different 10% for each sample). "Test MSE" is calculated using each solution to predict the 10% excluded values, and is then used to calculate other metrics that are useful for estimating the correct factor count. We suggest examining the largest "Delta Ratio" and "K estimate" values as pointers to what factor counts appear to fit the data best.

Delta MSE:

     (Test MSE at k-1) - (Test MSE at k)

Delta Ratio:

     1.01*(Maximum Delta MSE)*(Delta MSE at k)/(Delta MSE at k+1)


K Estimate:

     (Minimum Test MSE)/[(Test MSE at k)*k^1.3]

In [None]:
# Initialize and run factor estimator
samples = 100        # number of models generated by the estimator; NOTE: the text above suggests ~1000 — 100 is presumably used to keep runtime short
min_factors = 2      # smallest factor count evaluated
max_factors = 10     # largest factor count evaluated
factor_est = FactorEstimator(V=V, U=U)
results = factor_est.run(samples=samples, min_factors=min_factors, max_factors=max_factors)
results              # last expression of the cell, so the notebook displays the returned results

In [None]:
# Plot the results of the factor search; the red vertical dashed line is the factor count with the maximum Delta Ratio,
# the black vertical dashed line is the true number of factors (only plotted when synthetic data have been generated).
factor_est.plot(actual_count=syn_factors)
estimated_factors = factor_est.estimated_factor

#### Model Training Input Parameters

In [None]:
index_col = "Date"                  # the index of the input/uncertainty datasets; NOTE(review): not referenced by any other cell visible in this notebook
# factors = syn_factors             # the number of factors set by the initial synthetic parameters above
factors = 5                         # the number of factors in the model; NOTE(review): differs from syn_factors (4) — presumably intentional, confirm
method = "ls-nmf"                   # "ls-nmf", "ws-nmf"
models = 20                         # the number of models to train
init_method = "col_means"           # default is column means "col_means", "kmeans", "cmeans"
init_norm = True                    # if init_method=kmeans or cmeans, normalize the data prior to clustering.
seed = 332                           # random seed for initialization; NOTE: rebinds the `seed` used to build the Simulator, so re-running the Simulator cell after this one would use 332
max_iterations = 20000              # the maximum number of iterations for fitting a model
converge_delta = 0.1                # convergence criteria for the change in loss, Q
converge_n = 25                     # convergence criteria for the number of steps where the loss changes by less than converge_delta
verbose = True                      # adds more verbosity to the algorithm workflow on execution.
optimized = True                    # use the Rust code if possible
parallel = True                     # execute the model training in parallel, multiple models at the same time

#### Train Model

In [None]:
%%time
# Train a batch of models using the values set in the "Model Training Input Parameters" cell.
# The model with the lowest QTrue is shown at the end (using zero indexing for the model number).
sa_models = BatchSA(V=V, U=U, factors=factors, models=models, method=method, seed=seed, max_iter=max_iterations,
                    init_method=init_method, init_norm=init_norm,
                    converge_delta=converge_delta, converge_n=converge_n,
                    parallel=parallel, optimized=optimized,
                    verbose=verbose     # honour the notebook's `verbose` parameter rather than a hard-coded True
                   )
# train() returns a value that is not needed here; assign it to `_` so the cell does not display it
_ = sa_models.train()

# Index of the best model in the batch, displayed as the cell output
lowest_Q_model = sa_models.best_model
lowest_Q_model

#### Batch Analysis

These methods allow plotting/reviewing results of models fit via the BatchSA training.

In [None]:
# Perform batch model analysis
batch_analysis = BatchAnalysis(batch_sa=sa_models, data_handler=data_handler)
# Plot the loss of the models over iterations
batch_analysis.plot_loss()

In [None]:
# Plot the loss distribution for the batch models
batch_analysis.plot_loss_distribution()

In [None]:
# Plot the concentration and residuals for each model for a specified feature with starting index of 0
batch_analysis.plot_temporal_residuals(feature_idx=0)

### Compare to Synthetic Data

Compare the set of batch models to the original synthetic factor data.


In [None]:
simulator.compare(batch_sa=sa_models)

In [None]:
# Plot the factor/feature mapping of the best model by lowest QTrue
simulator.plot_comparison(model_i=lowest_Q_model)

In [None]:
# Save the Simulator instance, saves the instance as a pickle file and saves the synthetic profiles, contributions, data and uncertainty as csv files.
# sim_name = "synthetic"
# sim_output_dir = "D:/git/esat/notebooks/"
# simulator.save(sim_name=sim_name, output_directory=sim_output_dir)

In [None]:
# Load a previously saved Simulator instance
# simulator_file = "D:/git/esat/notebooks/esat_simulator.pkl"
# simulator_2 = Simulator.load(file_path=simulator_file)
# simulator_2.factor_compare.print_results()

In [None]:
# Select the model with the highest correlations to the synthetic profiles (using zero indexing for the model number)
highest_corr_model = simulator.factor_compare.best_model
highest_corr_model

In [None]:
# Plot mapping results for best model by highest average correlation with synthetic data
simulator.plot_comparison(model_i=highest_corr_model)

In [None]:
# Initialize the Model Analysis module for the model with lowest QTrue or the model with highest correlation to synthetic data

#sa_model = sa_models.results[lowest_Q_model]
#model_analysis = ModelAnalysis(datahandler=data_handler, model=sa_model, selected_model=lowest_Q_model)

sa_model = sa_models.results[highest_corr_model]
model_analysis = ModelAnalysis(datahandler=data_handler, model=sa_model, selected_model=highest_corr_model)

In [None]:
# Residual Analysis shows the scaled residual histogram for a specified feature (with zero indexing), along with metrics and distribution curves. The abs_threshold parameter specifies the condition for the returned values of the function call as those residuals which exceed the absolute value of that threshold.
abs_threshold = 3.0
threshold_residuals = model_analysis.plot_residual_histogram(feature_idx=5, abs_threshold=abs_threshold)

In [None]:
# Report how many scaled residuals were returned for the chosen absolute threshold, then display them.
print(f"List of Absolute Scaled Residual Greater than: {abs_threshold}. Count: {threshold_residuals.shape[0]}")
threshold_residuals

In [None]:
# The model output statistics for the estimated V, including SE: Standard Error metrics, and 3 normal distribution tests of the residuals (KS Normal is used in PMF5)
model_analysis.calculate_statistics()
model_analysis.statistics

In [None]:
# Model feature observed vs predicted plot with regression and one-to-one lines. Feature/Column specified by index (using zero indexing).
model_analysis.plot_estimated_observed(feature_idx=4)

In [None]:
# Model feature timeseries analysis plot showing the observed vs predicted values of the feature, along with the residuals shown below. Feature/column specified by index (using zero indexing).
model_analysis.plot_estimated_timeseries(feature_idx=4)

In [None]:
# Top Plot: factor profile showing the sum of concentrations over all samples for each feature (blue bars) and the percentage of the feature for this factor (red dot);
# summing this percent for a specific feature over all model factors would = 100; bottom plot shows the normalized contributions by date (values are resampled at a daily timestep for timeseries consistency)
# Factor specified by index, starting at 1
model_analysis.plot_factor_profile(factor_idx=1)

In [None]:
# Model factor fingerprint specifies the feature percentage of each factor.
model_analysis.plot_factor_fingerprints(grouped=False)

In [None]:
# Factor G-Space plot shows the normalized contributions of one factor vs another factor. Factor specified by index (starting at 1).
model_analysis.plot_g_space(factor_1=3, factor_2=1)

In [None]:
# Factor contribution pie chart shows the percentage of factor contributions for the specified feature, and the corresponding normalized contribution of each factor for that feature (bottom plot). Feature specified by index (starting at 0).
model_analysis.plot_factor_contributions(feature_idx=0)