## Environmental Source Apportionment Toolkit (ESAT) Solution Evaluation


In [None]:
# Notebook imports
import os
import sys
import json
import numpy as np
import pandas as pd

# Put the parent of the current working directory on the import path (once),
# so packages that live one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

#### Code Imports

In [None]:
from esat.data.datahandler import DataHandler
from esat.model.sa import SA
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat.metrics import q_loss, qr_loss

#### Synthetic Dataset

Generate a synthetic dataset where the factor profiles and contributions are pre-determined for model output analysis.

In [None]:
seed = 42

syn_factors = 6
syn_features = 40
syn_samples = 200

# A single seeded Generator drives every random draw in this cell, so the whole
# synthetic dataset (profiles, contributions, noise and uncertainty) is
# reproducible from `seed`.
rng = np.random.default_rng(seed)
syn_factor_profiles = np.zeros(shape=(syn_factors, syn_features))
syn_factor_contributions = rng.random(size=(syn_samples, syn_factors)) * 10
factor_list = list(range(syn_factors))
for i in range(syn_features):
    # Number of factors which will have a non-zero value in the profile for this feature.
    # The upper bound is exclusive, so this is between 1 and syn_factors - 1.
    factor_features_n = int(rng.integers(1, syn_factors, 1)[0])
    # The specific factors which have a non-zero value in the profile for this feature.
    factor_feature_selected = rng.choice(factor_list, size=factor_features_n, replace=False)
    for j in factor_feature_selected:
        # Draw a scalar: assigning a 1-element array into a single cell relies on an
        # array-to-scalar conversion that NumPy has deprecated.
        ij_value = rng.random()
        syn_factor_profiles[j, i] = ij_value

# Replace exact zeros with a tiny positive value so the profiles stay strictly positive.
syn_factor_profiles[syn_factor_profiles == 0.0] = 1e-12

# Noise-free data is W x H; the noise is multiplicative, drawn around +10% of each value.
syn_data = np.matmul(syn_factor_contributions, syn_factor_profiles)
noise = syn_data * rng.normal(loc=0.1, scale=0.05, size=syn_data.shape)
syn_data = np.add(syn_data, noise)

# Uncertainty is a per-cell fraction (around 5%) of the noisy data, floored at a tiny
# positive value so it is never zero or negative.
syn_unc_p = rng.normal(loc=0.05, scale=0.01, size=syn_data.shape)
syn_uncertainty = syn_data * syn_unc_p
syn_uncertainty[syn_uncertainty <= 0.0] = 1e-12

In [None]:
# Name the columns "Feature 1" .. "Feature <syn_features>" and wrap the synthetic
# data and uncertainty matrices as DataFrames sharing those column names.
syn_columns = [f"Feature {i}" for i in range(1, syn_features+1)]
syn_input_df, syn_uncertainty_df = (
    pd.DataFrame(matrix, columns=syn_columns)
    for matrix in (syn_data, syn_uncertainty)
)

#### Input Parameters

In [None]:
# --- Dataset ---
# the index of the input/uncertainty datasets
index_col = "Date"

# --- Model definition ---
# the number of factors
factors = 5
# "ls-nmf", "ws-nmf"
method = "ls-nmf"
# the number of models to train
models = 20

# --- Initialization ---
# default is column means "col_means", "kmeans", "cmeans"
init_method = "col_means"
# if init_method=kmeans or cmeans, normalize the data prior to clustering.
init_norm = True
# random seed for initialization
seed = 42

# --- Convergence ---
# the maximum number of iterations for fitting a model
max_iterations = 20000
# convergence criteria for the change in loss, Q
converge_delta = 0.1
# convergence criteria for the number of steps where the loss changes by less than converge_delta
converge_n = 10

# --- Execution ---
# adds more verbosity to the algorithm workflow on execution.
verbose = True
# use the Rust code if possible
optimized = True
# execute the model training in parallel, multiple models at the same time
parallel = True

In [None]:
# Build an SA instance that holds the known synthetic solution. The factor count
# passed here must describe the synthetic W/H assigned below (syn_factors), not the
# `factors` setting used for the models trained later in this notebook.
syn_sa = SA(
    V=syn_input_df.to_numpy(),
    U=syn_uncertainty_df.to_numpy(),
    factors=syn_factors,
    seed=seed,
    optimized=optimized,
    parallelized=parallel,
    verbose=verbose,
)
syn_sa.H = syn_factor_profiles
syn_sa.W = syn_factor_contributions
# NOTE(review): syn_data already includes the multiplicative noise, so this stores the
# noisy input rather than the product of W and H -- confirm that is what is wanted for WH.
syn_sa.WH = syn_data
# Both loss values are computed from the synthetic W and H against the SA's own V and U.
syn_sa.Qrobust = qr_loss(V=syn_sa.V, U=syn_sa.U, W=syn_sa.W, H=syn_sa.H)
syn_sa.Qtrue = q_loss(V=syn_sa.V, U=syn_sa.U, W=syn_sa.W, H=syn_sa.H)

# DataFrame views of the synthetic solution with "Factor 1" .. "Factor <syn_factors>"
# columns; the profiles are transposed so that each factor is a column.
syn_factor_columns = [f"Factor {i}" for i in range(1, syn_factors+1)]
syn_profile_df = pd.DataFrame(syn_factor_profiles.T, columns=syn_factor_columns)
syn_contribution_df = pd.DataFrame(syn_factor_contributions, columns=syn_factor_columns)

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Scratch expression (evaluates to 0); its value is not assigned to anything.
2%2

In [None]:
# Build a repeating sine curve of `samples_n` points whose minimum is shifted to `min_y`
# and whose amplitude is half of (max_y - min_y).
# curve_x = np.arange(syn_samples)
min_y, max_y = 1.0, 10.0
frequency = 0.3
samples_n = 200

n_periods = int(1.0 / frequency)
x_periods = []

# One period sampled on [-pi, pi]; its final point is dropped before tiling, then the
# tiled sequence is cut back to exactly `samples_n` values.
curve_a = np.linspace(-np.pi, np.pi, samples_n // n_periods)
curve_x = np.tile(curve_a[:-1], n_periods + 1)[:samples_n]
curve_y = ((max_y - min_y) / 2) * np.sin(curve_x)
curve_y = curve_y + np.abs(curve_y.min()) + min_y
print(curve_y.shape)

# Earlier experiments, kept for reference:
# curve_x = np.concatenate((np.linspace(-100, 100, int(syn_samples/3)), np.linspace(100, -100, int(syn_samples/3)), np.linspace(-100, 100, int(syn_samples - syn_current_n))), axis=None)
# curve_y = min_y + (max_y-min_y)/(1.0 + np.exp(-curve_x))
# # curve_x = np.linspace(-np.pi, np.pi, syn_samples)
# # curve_y = (np.sin(curve_x) + min_y) * (max_y/2)
# curve_y = rng.normal(curve_y, scale=0.5)
# syn_factor_contributions[:,0] = curve_y

# curve_fig = go.Figure()
# for i in range(syn_factors):
# contributions_i = syn_factor_contributions[:,i]
# curve_fig.add_trace(go.Scatter(x=np.arange(syn_samples), y=curve_y, name=syn_factor_columns[i]))
# curve_fig.update_layout(width=1200, height=600)
# curve_fig.show()

#### Load Data
Assign the processed data and uncertainty datasets to the variables V and U. These steps will be simplified/streamlined in a future version of the code.

In [None]:
# Wrap the synthetic input and uncertainty DataFrames in a DataHandler, then unpack the
# pair it returns into V and U, which are passed to BatchSA for training further down.
data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
V, U = data_handler.get_data()

#### Input/Uncertainty Data Metrics and Visualizations

In [None]:
# Display the input data metrics held by the DataHandler, including the
# signal-to-noise ratio of the data and uncertainty.
data_handler.metrics

In [None]:
# Concentration vs. uncertainty scatter plot for a single feature.
# The feature (column) is selected by its index.
data_handler.plot_data_uncertainty(feature_idx=2)

In [None]:
# Species concentration plot comparing two features against each other.
# Both features (columns) are selected by index.
data_handler.plot_feature_data(x_idx=0, y_idx=1)

In [None]:
# Species timeseries plot for a single feature or a list of features.
# Features (columns) are selected by index.
data_handler.plot_feature_timeseries(feature_selection=[0, 1, 2, 3])

In [None]:
%%time
# Train a batch of models on V and U. Every setting is taken from the
# "Input Parameters" cell; the return value of train() is discarded.
sa_models = BatchSA(
    V=V,
    U=U,
    factors=factors,
    models=models,
    method=method,
    seed=seed,
    max_iter=max_iterations,
    init_method=init_method,
    init_norm=init_norm,
    converge_delta=converge_delta,
    converge_n=converge_n,
    parallel=parallel,
    optimized=optimized,
    verbose=verbose,
)
_ = sa_models.train()

#### Train Model

In [None]:
# Select the best performing model to review. `best_model` is the key/index of that
# model in `sa_models.results`; the trailing expression displays it.
best_model = sa_models.best_model
sa_model = sa_models.results[best_model]
best_model

In [None]:
# Set up the batch analysis over the full set of trained models
batch_analysis = BatchAnalysis(batch_sa=sa_models)
# Plot the loss of each model over its training iterations
batch_analysis.plot_loss()

In [None]:
# Plot the distribution of the loss across the batch of models
batch_analysis.plot_loss_distribution()

In [None]:
# Initialize the Model Analysis module for the selected (best) model, giving it the
# data handler, the model object and the model's key/index within the batch.
model_analysis = ModelAnalysis(datahandler=data_handler, model=sa_model, selected_model=best_model)

In [None]:
# Residual analysis: shows the scaled residual histogram for one feature, along with
# metrics and distribution curves. The feature (column) is selected by index.
# `abs_threshold` sets the condition for what the call returns: those residuals whose
# absolute value exceeds the threshold.
abs_threshold = 3.0
threshold_residuals = model_analysis.plot_residual_histogram(feature_idx=5, abs_threshold=abs_threshold)

In [None]:
# Report how many scaled residuals exceeded the absolute threshold (row count of the
# returned object), then display them. Fixes the "Greather" typo in the printed message.
print(f"List of Absolute Scaled Residual Greater than: {abs_threshold}. Count: {threshold_residuals.shape[0]}")
threshold_residuals

In [None]:
# The model output statistics for the estimated V, including SE (standard error)
# metrics and 3 normal distribution tests of the residuals (KS Normal is used in PMF5).
# calculate_statistics() is called first; the `statistics` attribute is then displayed.
model_analysis.calculate_statistics()
model_analysis.statistics

In [None]:
# Observed vs. predicted plot for one feature, with regression and one-to-one lines.
# The feature (column) is selected by index.
model_analysis.plot_estimated_observed(feature_idx=2)

In [None]:
# Timeseries analysis plot for one feature: observed vs. predicted values, with the
# residuals shown below. The feature (column) is selected by index.
model_analysis.plot_estimated_timeseries(feature_idx=1)

In [None]:
# Factor profile plot: the factor's sum of concentrations by feature (blue bars) and the
# percentage of the feature (red dot). The bottom plot shows the normalized contributions
# by date (values are resampled at a daily timestep for timeseries consistency).
# The factor is selected by index.
model_analysis.plot_factor_profile(factor_idx=1)

In [None]:
# Factor fingerprints plot: shows the feature percentage of each factor.
model_analysis.plot_factor_fingerprints()

In [None]:
# Factor G-Space plot: the normalized contributions of one factor against another.
# Both factors are selected by index.
model_analysis.plot_g_space(factor_1=2, factor_2=1)

In [None]:
# Factor contribution pie chart: the percentage of factor contributions for the selected
# feature, with the corresponding normalized contribution of each factor for that feature
# in the bottom plot. The feature is selected by index.
model_analysis.plot_factor_contributions(feature_idx=1)

### Compare to Synthetic Data

Compare the set of batch models to the original synthetic factor data.


In [None]:
from eval.factor_comparison import FactorCompare

In [None]:
# Set up the comparison between the known synthetic solution (the base profile and
# contribution DataFrames) and the batch of trained models.
factor_comp = FactorCompare(
    input_df=data_handler.input_data,
    uncertainty_df=data_handler.uncertainty_data,
    base_profile_df=syn_profile_df,
    base_contribution_df=syn_contribution_df,
    batch_sa=sa_models,
)

In [None]:
# Run the factor comparison. Presumably this populates the factor_comp attributes
# (best_model, factor_map, best_factor_r, ...) read in the cells below -- TODO confirm.
factor_comp.compare()

In [None]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Display the per-factor profile correlation values (printed as "Profile r2" further down).
factor_comp.best_factor_r

In [None]:
# Colour ramps: one "plasma" sample per modelled factor and a fixed 100-step "jet" ramp.
color_map = px.colors.sample_colorscale("plasma", [n / (factors - 1) for n in range(factors)])
r_color_map = px.colors.sample_colorscale("jet", [n / 99 for n in range(100)])

# The model stored under factor_comp.best_model, and the synthetic/modelled H and W matrices.
c_model = sa_models.results[factor_comp.best_model]
syn_H, _H = syn_factor_profiles, c_model.H
syn_W, _W = syn_factor_contributions, c_model.W

# Rescale each matrix so that every column sums to 100.
norm_syn_H = 100 * (syn_H / syn_H.sum(axis=0))
norm_H = 100 * (_H / _H.sum(axis=0))
norm_syn_W = 100 * (syn_W / syn_W.sum(axis=0))
norm_W = 100 * (_W / _W.sum(axis=0))

# Only as many factors as both sides have can be paired up.
factor_n = min(len(factor_comp.sa_factors), len(factor_comp.base_factors))
# print(f"Base factors: {len(factor_comp.base_factors)}, SA factors: {len(factor_comp.sa_factors)}. Factor N: {factor_n}")
# The direction of the mapping (and so the wording of the titles) depends on base_k.
if factor_comp.base_k:
    subplot_titles = [f"Modelled Factor {i} : Synthetic {factor_comp.factor_map[i-1]}" for i in range(1, factor_n+1)]
else:
    subplot_titles = [f"Synthetic Factor {i} : Modelled {factor_comp.factor_map[i-1]}" for i in range(1, factor_n+1)]
print(f"Factor Map: {factor_comp.factor_map}")
print(f"Profile r2: {factor_comp.best_factor_r}")
print(f"Contribution r2: {factor_comp.best_contribution_r}")

# X axes shared by every figure: feature names for the profiles, sample index for contributions.
feature_axis = data_handler.features
sample_axis = data_handler.input_data_df.index

for i in range(1, factor_n+1):
    # The mapped factor label ends in a 1-based number; turn both sides into 0-based rows/columns.
    map_i = int(factor_comp.factor_map[i-1].split(" ")[1])
    syn_i, mod_i = (map_i - 1, i - 1) if factor_comp.base_k else (i - 1, map_i - 1)
    i_r2 = factor_comp.best_factor_r[i-1]
    i_r2_con = factor_comp.best_contribution_r[i-1]
    print(f"i: {i}, syn_i: {syn_i}, mod_i: {mod_i}, i_r2: {i_r2}, i_r2_con: {i_r2_con}")

    title_i = subplot_titles[i-1]
    label = (title_i + " - R2: " + str(round(i_r2,4)), title_i + " - R2: " + str(round(i_r2_con,4)), "", "",)
    h_fig = make_subplots(rows=2, cols=2, subplot_titles=label, vertical_spacing=0.01, row_heights=[0.6, 0.4])

    # Left column: profiles (top) and their difference (bottom).
    # Right column: contributions (top) and their difference (bottom).
    panels = (
        (go.Bar(name=f"Synthetic Profile f{syn_i+1}", x=feature_axis, y=norm_syn_H[syn_i], marker_color="black"), 1, 1),
        (go.Bar(name=f"Modelled Profile f{mod_i+1}", x=feature_axis, y=norm_H[mod_i], marker_color="green"), 1, 1),
        (go.Bar(name="", x=feature_axis, y=norm_syn_H[syn_i]- norm_H[mod_i], marker_color="blue", showlegend=False), 2, 1),
        (go.Scatter(name=f"Synthetic Contribution f{syn_i+1}", x=sample_axis, y=norm_syn_W[:,syn_i], line_color="black"), 1, 2),
        (go.Scatter(name=f"Model Contribution f{mod_i+1}", x=sample_axis, y=norm_W[:,mod_i], line_color="green"), 1, 2),
        (go.Scatter(name="", x=sample_axis, y=norm_syn_W[:,syn_i]- norm_W[:,mod_i], marker_color="blue", showlegend=False), 2, 2),
    )
    for trace, trace_row, trace_col in panels:
        h_fig.add_trace(trace, row=trace_row, col=trace_col)

    h_fig.update_yaxes(title_text="Synthetic Profile", row=1, col=1, title_standoff=3)
    h_fig.update_yaxes(title_text="Difference", row=2, col=1)
    h_fig.update_yaxes(title_text="Scaled Concentrations", row=1, col=2)
    h_fig.update_xaxes(row=1, showticklabels=False)
    h_fig.update_yaxes(row=2, col=2, title_text="Residuals")
    h_fig.update_yaxes(row=2, col=1, range=[-50, 50])
    h_fig.update_layout(title_text=f"Mapped Factor Comparison - Model: {factor_comp.best_model+1}", width=1600, height=800, hovermode='x', showlegend=True)
    h_fig.show()


In [None]:
# Synthetic and modelled factor matrices; the model is the one under factor_comp.best_model.
syn_H, syn_W = syn_factor_profiles, syn_factor_contributions

_selected = sa_models.results[factor_comp.best_model]
_H, _W = _selected.H, _selected.W

# Per-factor reconstruction: the outer product of a factor's contribution column and its
# profile row gives that factor's share of the samples-by-features data matrix.
# NOTE(review): factors are paired here by raw index f, without going through
# factor_comp.factor_map -- confirm that is intended before comparing the two lists.
syn_matrices = [np.outer(syn_W[:, f], syn_H[f]) for f in range(factors)]
pred_matrices = [np.outer(_W[:, f], _H[f]) for f in range(factors)]

In [None]:
# Plain-list copies of the input DataFrame's row index (x) and column names (y).
x = [*syn_input_df.index]
y = [*syn_input_df.columns]

In [None]:
# Pick one factor and one feature (both 0-based) to inspect.
factor_i = 3
feature_i = 1

# That feature's column from the chosen factor's synthetic and predicted matrices.
syn_column = syn_matrices[factor_i][:, feature_i]
pred_column = pred_matrices[factor_i][:, feature_i]

# Each series is divided by its own sum, so both sum to 1 and can be compared in shape.
syn_i = syn_column / syn_column.sum()
pred_i = pred_column / pred_column.sum()
residual_i = syn_i - pred_i

# Largest raw value across both full factor matrices.
y_max = max(np.max(syn_matrices[factor_i]), np.max(pred_matrices[factor_i]))

In [None]:
# Overlay the synthetic and predicted series for the selected factor/feature.
conc_fig = go.Figure()
for series, series_name in ((syn_i, "Synthetic Data"), (pred_i, "Predicted Data")):
    conc_fig.add_trace(go.Scatter(x=x, y=series, name=series_name))
# conc_fig.add_trace(go.Scatter(x=x, y=residual_i, name="Residuals"))
conc_fig.update_layout(width=1200, height=800, title_text=f"Factor: {factor_i+1}", hovermode='x unified')
conc_fig.show()

In [None]:
from itertools import product, permutations, combinations

In [None]:
# Example factor label lists ("Factor 1", "Factor 2", ...): 6 base factors and 7 model factors.
base_factors, model_factors = (
    [f"Factor {i+1}" for i in range(count)] for count in (6, 7)
)

In [None]:
# Enumerate every ordered selection from the larger factor list whose length equals the
# size of the smaller list, then show how many there are. The count grows factorially
# with the list sizes.
if len(model_factors) > len(base_factors):
    all_permutations = list(permutations(model_factors, len(base_factors)))
    print("ESAT model has more factors")
else:
    all_permutations = list(permutations(base_factors, len(model_factors)))
    print("Base model has the same or more factors")
len(all_permutations)

In [None]:
# Display the full list of permutations held in `all_permutations`.
all_permutations