## NMF-PY Workflow

The steps in this notebook are intended to replicate the preprocessing, base model building, and base model post-processing steps of PMF5.

The error estimation functionality has not yet been implemented in the new code base.

In [1]:
# Notebook imports
import os
import sys
import json

# Make the extracted project (and its src/ folder) importable.
# Entries are added in the same order as a plain pair of insert(0, ...) calls, so the
# src/ folder ends up first. The membership check keeps the cell idempotent:
# re-running it does not keep prepending duplicate entries to sys.path.
for _project_path in ("/content/nmf_py-main", "/content/nmf_py-main/src"):
    if not os.path.isdir(_project_path):
        # Surface the problem here rather than as a later import error.
        print(
            f"Warning: {_project_path} does not exist; the project imports will likely fail "
            "until the project archive has been extracted.",
            file=sys.stderr,
        )
    if _project_path not in sys.path:
        sys.path.insert(0, _project_path)

#### Sample Dataset
The three sample datasets from PMF5 are available for use, but a new dataset can be used in their place.

In [2]:
# Baton Rouge Dataset
# All three paths are built from the same data directory so they cannot drift apart
# (the output path previously pointed at /content/nmf_py/data-main/..., which is not
# inside the project folder used for the input files).
data_dir = os.path.join("/content/nmf_py-main", "data")
br_input_file = os.path.join(data_dir, "Dataset-BatonRouge-con.csv")
br_uncertainty_file = os.path.join(data_dir, "Dataset-BatonRouge-unc.csv")
br_output_path = os.path.join(data_dir, "output", "BatonRouge")
# Baltimore Dataset
# b_input_file = os.path.join("data", "Dataset-Baltimore_con.txt")
# b_uncertainty_file = os.path.join("data", "Dataset-Baltimore_unc.txt")
# b_output_path = os.path.join("data", "output", "Baltimore")
# Saint Louis Dataset
# sl_input_file = os.path.join("data", "Dataset-StLouis-con.csv")
# sl_uncertainty_file = os.path.join("data", "Dataset-StLouis-unc.csv")
# sl_output_path = os.path.join("data", "output", "StLouis")

In [3]:
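# Uncomment and run once to extract the uploaded project archive. The paths used in this
# notebook assume the project is available at /content/nmf_py-main.
# !unzip /content/nmf_py-main-20231012.zip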

In [4]:
!pip install fuzzy-c-means



#### Code Imports

In [5]:
from src.data.datahandler import DataHandler
from src.model.nmf import NMF
from src.model.batch_nmf import BatchNMF
from src.data.analysis import ModelAnalysis

ModuleNotFoundError: No module named 'src'

#### Input Parameters

In [None]:
# Base-model configuration used by the cells below.
# NOTE(review): in the Train Model cell the init_method, init_norm and verbose keyword
# arguments are commented out and optimized is passed as a literal False, so those four
# values are not forwarded to BatchNMF as the notebook stands - confirm this is intended.
index_col = "Date"                  # the index column of the input/uncertainty datasets
factors = 6                         # the number of factors
method = "ls-nmf"                   # one of "ls-nmf", "ws-nmf"
models = 20                         # the number of models to train
init_method = "col_means"           # default is column means "col_means"; alternatives are "kmeans", "cmeans"
init_norm = True                    # if init_method=kmeans or cmeans, normalize the data prior to clustering.
seed = 42                           # random seed for initialization
max_iterations = 20000              # the maximum number of iterations for fitting a model
converge_delta = 0.1                # convergence criterion for the change in loss, Q
converge_n = 10                     # convergence criterion: the number of steps where the loss changes by less than converge_delta
verbose = True                      # adds more verbosity to the algorithm workflow on execution.
optimized = True                    # use the Rust code if possible
parallel = True                     # execute the model training in parallel, multiple models at the same time

#### Dataset Selection
One of the three sample datasets can be selected or a new cleaned dataset can be used. Datasets should be cleaned, containing no missing data (either dropping missing/NaNs, or interpolating the missing values).

In [None]:
# Select the Baton Rouge dataset as the active input, uncertainty and output locations.
input_file, uncertainty_file, output_path = (
    br_input_file,
    br_uncertainty_file,
    br_output_path,
)

#### Load Data
Assign the processed data and uncertainty datasets to the variables V and U. These steps will be simplified/streamlined in a future version of the code.

In [None]:
# Build the data handler from the selected input/uncertainty files, indexed by index_col.
data_handler = DataHandler(input_path=input_file, uncertainty_path=uncertainty_file, index_col=index_col)

# V holds the processed data and U the processed uncertainty returned by the handler.
V, U = data_handler.get_data()

#### Input/Uncertainty Data Metrics and Visualizations

In [None]:
# Display the input data metrics (described by the notebook as including the
# signal-to-noise ratio of the data and uncertainty). Being the last expression in the
# cell, the value is rendered as the cell output.
data_handler.metrics

In [None]:
# Concentration vs. uncertainty scatter plot for a single feature.
# feature_idx selects the feature/column by index (here index 2).
data_handler.data_uncertainty_plot(feature_idx=2)

In [None]:
# Species concentration plot comparing two features against each other.
# x_idx and y_idx select the two features/columns by index.
data_handler.feature_data_plot(x_idx=0, y_idx=1)

In [None]:
# Species timeseries plot. feature_selection takes a single feature index or a list of
# feature/column indices (here indices 0 through 3).
data_handler.feature_timeseries_plot(feature_selection=[0, 1, 2, 3])

#### Train Model

In [None]:
%%time
# Training multiple models, optional parameters are commented out.
nmf_models = BatchNMF(V=V, U=U, factors=factors, models=models, method=method, seed=seed, max_iter=max_iterations,
                    # init_method=init_method, init_norm=init_norm,
                    converge_delta=converge_delta, converge_n=converge_n,
                    parallel=parallel, optimized=False,
                    # verbose=verbose
                   )
nmf_models.train()

In [None]:
# Select the model to review: here the best-performing model of the batch is used; a
# different index into nmf_models.results can be assigned to best_model instead.
best_model = nmf_models.best_model
nmf_model = nmf_models.results[best_model]
# Display the selected model's index as the cell output.
best_model

In [None]:
# Initialize the Model Analysis module for the selected model. It receives the data
# handler, the selected model object, and that model's index within the batch.
model_analysis = ModelAnalysis(datahandler=data_handler, model=nmf_model, selected_model=best_model)

In [None]:
# Residual analysis: shows the scaled residual histogram for one feature, along with
# metrics and distribution curves. The call returns the residuals whose absolute scaled
# value exceeds abs_threshold.
abs_threshold = 3.0
threshold_residuals = model_analysis.plot_residual_histogram(feature_idx=0, abs_threshold=abs_threshold)
# Report how many residuals exceeded the threshold (message spelling corrected: "Greater").
print(f"List of Absolute Scaled Residual Greater than: {abs_threshold}. Count: {threshold_residuals.shape[0]}")
# Display the returned residuals as the cell output.
threshold_residuals

In [None]:
# Compute, then display, the model output statistics for the estimated V. Per the
# notebook these include SE (standard error) metrics and three normal-distribution tests
# of the residuals (KS Normal is the one used in PMF5).
model_analysis.calculate_statistics()
model_analysis.statistics

In [None]:
# Observed vs. predicted plot for one feature, with regression and one-to-one lines.
# feature_idx selects the feature/column by index.
model_analysis.plot_estimated_observed(feature_idx=2)

In [None]:
# Timeseries analysis plot for one feature: observed vs. predicted values, with the
# residuals shown below. feature_idx selects the feature/column by index.
model_analysis.plot_estimated_timeseries(feature_idx=1)

In [None]:
# Factor profile plot for one factor: the factor's sum of concentrations by feature
# (blue bars), the percentage of each feature (red dots), and, in the bottom plot, the
# normalized contributions by date (values are resampled at a daily timestep for
# timeseries consistency).
# factor_idx selects the factor by index.
model_analysis.plot_factor_profile(factor_idx=1)

In [None]:
# Factor fingerprints plot: shows the feature percentages of each factor.
model_analysis.plot_factor_fingerprints()

In [None]:
# Factor G-Space plot: the normalized contributions of one factor against another.
# factor_1 and factor_2 select the two factors by index.
model_analysis.plot_g_space(factor_1=2, factor_2=1)

In [None]:
# Factor contribution pie chart: the percentage of factor contributions for the
# specified feature, with the corresponding normalized contribution of each factor for
# that feature in the bottom plot. feature_idx selects the feature by index.
model_analysis.plot_factor_contributions(feature_idx=1)

In [None]:
# New graphic: factor profile composition radar graph.
model_analysis.plot_factor_composition()

In [None]:
# New graphic: factor contribution surface plot for the factor selected by factor_idx.
factor_idx = 1
model_analysis.plot_factor_surface(factor_idx=factor_idx)

### Error Estimation - Displacement

The displacement method for error estimation works by making slight adjustments to the factor profile values, one at a time, until a specific change in the loss value (dQ) is reached. Four dQ values are targeted: dQ = 4, 8, 16, and 32.

Each target dQ value is run for both an increase and a decrease in the factor profile value, a single value in the factor profile matrix at a time. The change in the factor profile value is found by running a modified binary search that identifies the value change producing a dQ within a small threshold, 0.1, of the target dQ. The search is stopped if the change in the factor profile value is less than 1e-8, which can occur when decreasing a factor profile value that is already near zero.

Once the change in the factor profile value that produces the target dQ has been found, for both increasing and decreasing changes, the modified H matrix is used as an initial guess for retraining an NMF model. The W matrix is reinitialized, using the original base model seed, and the model is trained to convergence.

The resulting model factor profile is checked to see whether any factors swapped, based upon the highest factor correlation with the base model factors. The output shows the swap %, based upon the number of retrained models where that factor was modified and the number of times a swap was detected.

The factor profile plot shows the variability in the factor profile feature values that correspond to the dQ target values; by default the plots show the results for dQ=4. The last plot shows the factor feature contribution variability based upon the same changes to the H matrix.

In [None]:
# Import Error Estimation Displacement Method
from src.error.displacement import Displacement

In [None]:
# Initialize the Displacement method with the selected NMF model, the feature labels
# from the data handler, and the index of the selected model.
disp = Displacement(nmf=nmf_model, feature_labels=data_handler.features, model_selected=best_model)

In [None]:
%%time
# Execute the displacement method. Per the notebook, this tests both increasing and
# decreasing changes to the individual values of H for all dQ targets, then compiles and
# prepares the results. The %%time magic reports how long the cell takes.
disp.run()

In [None]:
# Show the displacement summary.
# The swap table shows the percentage of times a factor was found to be more highly
# correlated with a different base factor as a result of the change in a factor feature
# value, i.e. how often that factor swapped after a model was retrained.
# The largest change in dQ is the largest difference in loss (over both increasing and
# decreasing changes to the factor feature values) between a retrained model and the
# base model.
disp.summary()

In [None]:
# Plot the displacement results for one factor, showing the variability in both the
# profile (%) and the contribution for a given dQ value. factor_i selects the factor.
factor_i = 1
disp.plot_results(factor=factor_i)

### Error Estimation - Bootstrap

The bootstrap method used is the block bootstrap method for time-series data. Here the initial dataset is broken into chunks of a specified size, containing sequential samples, and randomly added to a bootstrap dataset until the bootstrap dataset is the same size as the initial dataset.

The recommended block size calculation feature has not yet been implemented. 

The aim of the bootstrap method is to quantify the variability in the factor profiles and contributions when the dataset has been resampled, i.e. the order of its samples shuffled. The block bootstrap method is the default, while the full bootstrap method can be used by setting the block parameter of the run function to false, i.e. bs.run(block=False). The blocks are randomly selected with replacement, allowing the same block to be added to the bs dataset more than once. The final selected block is reduced in size so that the bs dataset is exactly the same size as the initial dataset.

The resampling is completed a specified number of times, set by bootstrap_n. During each bootstrap run, the initial datasets are resampled (both the data and uncertainty datasets are resampled using the same indices) and used to retrain an NMF model. The NMF model uses the resampled data and uncertainty data, the base model H matrix and the base model random seed. W is reinitialized and the model is trained to convergence. The resulting factor contributions, $V'_{bs}$, are mapped to the base model $V'_{base}$ by checking the correlation between all combinations of the factor contributions. The mapping with the highest correlation is then noted, if the correlation is above the user-specified threshold (default=0.6), and the results are shown in the summary statistics. The summary statistics also include metrics for the distribution of the bootstrap model results to compare against the base model results; these are shown as tables for each factor.

The results of all the bootstrap runs are compiled to provide a distribution for each factor/feature percentage and contribution, these are shown in the plotted results.

In [None]:
# Import Bootstrap Error Estimation Method
from src.error.bootstrap import Bootstrap

In [None]:
# Bootstrap input parameters
# Some parameters are simply reassigned for clarity.
model_selected = nmf_models.best_model              # the model selected to be the base model for the Bootstrap method, here chosen as the best performing model from the batch nmf run.
nmf_model = nmf_models.results[model_selected]      # The selected model NMF object.
feature_labels = data_handler.features              # The list of feature names/labels

bootstrap_n = 20                                    # The number of bootstrap runs to complete
block_size = data_handler.optimal_block             # Optimal block size; per the notebook, from the Politis and White 2004 algorithm (used in PMF5)
threshold = 0.6                                     # The r-correlation threshold
seed = seed                                         # Self-assignment: keeps the seed already defined in the notebook; used for random selection of the bootstrap blocks.
# Report the value that is actually handed to Bootstrap instead of reading
# data_handler.optimal_block a second time.
print(f"Optimal BS block size: {block_size}")

In [None]:
# Initialize the bootstrap object from the selected base model and the bootstrap
# parameters defined in the previous cell.
bs = Bootstrap(nmf=nmf_model, feature_labels=feature_labels, model_selected=model_selected, bootstrap_n=bootstrap_n, block_size=block_size, threshold=threshold, seed=seed)

In [None]:
%%time
# Execute the bootstrap runs with the default run() arguments.
# The %%time magic reports how long the cell takes.
bs.run()

In [None]:
# Show the output summary of all the bootstrap runs.
bs.summary()

In [None]:
# Plot the bootstrap results for one factor, showing the variability in percentage and
# concentration for each feature of that factor. factor_i selects the factor.
factor_i = 1
bs.plot_results(factor=factor_i)

### Error Estimation - Bootstrap-Displacement

The bootstrap-displacement (BS-DISP) method is a combination of the bootstrap and displacement methods. An existing BS instance can be used, or a new BS instance will be created.

The BS-DISP method runs a BS instance and, for each model in the BS run, DISP is run for each of the specified features (all features by default). The results are similar to the DISP results but are aggregated across all bootstrap runs.

In [None]:
from src.error.bs_disp import BSDISP

In [None]:
# Bootstrap input parameters (Shared with BS-DISP)
# These reassign the same names used by the Bootstrap section (note bootstrap_n is 10 here).
model_selected = nmf_models.best_model              # the model selected to be the base model for the Bootstrap method, here chosen as the best performing model from the batch nmf run.
nmf_model = nmf_models.results[model_selected]      # The selected model NMF object.
feature_labels = data_handler.features              # The list of feature names/labels

bootstrap_n = 10                                    # The number of bootstrap runs to complete
block_size = data_handler.optimal_block             # Optimal block size; per the notebook, from the Politis and White 2004 algorithm (used in PMF5)
threshold = 0.6                                     # The r-correlation threshold
seed = seed                                         # Self-assignment: keeps the seed already defined in the notebook; used for random selection of the bootstrap blocks.

# Displacement input parameters (Shared with BS-DISP)
# NOTE(review): the meanings below are inferred from the parameter names and the
# Displacement description in this notebook - confirm against BSDISP.
threshold_dQ = 0.1                                  # presumably the tolerance around the target dQ used by the displacement search
max_search = 50                                     # presumably the maximum number of search steps
features = [0,1]                                    # indices of the features DISP is run for

In [None]:
# Initialize BS-DISP with the BS and DISP parameters defined in the previous cell.
bsdisp = BSDISP(nmf=nmf_model, feature_labels=feature_labels, model_selected=model_selected, bootstrap_n=bootstrap_n, block_size=block_size, threshold=threshold, max_search=max_search, threshold_dQ=threshold_dQ, features=features, seed=seed)

In [None]:
%%time
# Execute the BS-DISP instance, which will first run BS (if required) and then run DISP for each BS model.
# Timing recorded by the notebook author for a previous run with
# parallel = False (CPU times: total: 1h 21min 25s, Wall time: 10min 13s)
bsdisp.run(parallel=False)

In [None]:
# Show the BS-DISP summary table and general metrics.
bsdisp.summary()

In [None]:
# Plot the BS-DISP results: the profile and contribution boxplots for one factor.
# factor_i selects the factor.
factor_i = 1
bsdisp.plot_results(factor=factor_i)

In [None]:
# The overall error summary can be shown with the following class, which takes an existing bootstrap and displacement object and plots the error estimation for a given factor.
from src.error.error import Error

In [None]:
# Build the combined error summary from the previously completed bootstrap (bs) and
# displacement (disp) instances.
error = Error(bs=bs, disp=disp)

In [None]:
# Plot the error estimation of the concentration for one factor.
# factor_i selects the factor.
factor_i = 1
error.plot_summary(factor=factor_i)

### Rotational Tools - Fpeak
Placeholder for fpeak summary and algorithm.

In [None]:
# import the module
from src.rotational.fpeak import Fpeak

In [None]:
# Initialize the Fpeak instance and set the fpeak values to run.
fpeak_list = [1.0, -1.0, 1.5, -1.5, 2.5, -2.5, 5.0, -5.0]
s = 0.1       # The softness parameter; setting lowercase s sets all values of the S array to s.
S = None      # An array of size N holding the softness value for each nth auxiliary equation (sample in the dataset). Used when the values of S are not all the same.

# The Fpeak runs start from the selected base model and use the same data handler.
fp = Fpeak(base_model=nmf_model, data_handler=data_handler, fpeaks=fpeak_list, s=s, S=S)

In [None]:
# Run the Fpeak instance.
# The Fpeak settings use their own names so that this cell does not overwrite the
# base-model max_iterations, converge_delta and converge_n input parameters; re-running
# the Train Model cell afterwards therefore still uses the original values.
fpeak_max_iterations = 10000
fpeak_converge_delta = 1e-4
fpeak_converge_n = 20

fp.run(max_iter=fpeak_max_iterations, converge_delta=fpeak_converge_delta, converge_n=fpeak_converge_n)

In [None]:
# View the tabulated results of the fpeak runs (as shown in PMF5 - Fpeak Run Summary).
fp.results_df

In [None]:
# Plot the profiles/contributions for a specific Fpeak value and factor.
# fpeak is one of the values in fpeak_list; factor_idx selects the factor by index.
factor_idx = 1
fpeak = 1.0

fp.plot_profile_contributions(factor_idx=factor_idx, fpeak=fpeak)

In [None]:
# Plot the factor profiles/fingerprints for the selected Fpeak value.
fpeak = 1.0

fp.plot_factor_fingerprints(fpeak=fpeak)

In [None]:
# Plot the factor G-Space graph for the selected Fpeak value and pair of factors.
fpeak = 1.0
factor_idx1 = 1
factor_idx2 = 2
# NOTE(review): presumably these toggle display of the base-model points and of the
# change relative to the base model - confirm against Fpeak.plot_g_space.
show_base = True
show_delta = True

fp.plot_g_space(fpeak=fpeak, factor_idx1=factor_idx1, factor_idx2=factor_idx2, show_base=show_base, show_delta=show_delta)

In [None]:
# Plot the factor contributions for the selected Fpeak value and feature.
fpeak = 1.0
feature_idx = 2
# Stored under its own name so this cell does not overwrite the bootstrap r-correlation
# `threshold` (0.6) used when constructing the Bootstrap / BS-DISP instances.
fpeak_threshold = 0.06

fp.plot_factor_contributions(fpeak=fpeak, feature_idx=feature_idx, threshold=fpeak_threshold)