## ESAT K Estimation Workflow 3 - Input Perturbation Model Evaluation

This notebook implements an input perturbation workflow for model evaluation and testing for k-estimate metrics.

#### Code Imports

In [1]:
import time
import copy
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from esat.data.datahandler import DataHandler
from esat.model.sa import SA
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat.error.bootstrap import Bootstrap
from esat_eval.simulator import Simulator
from esat_eval.factor_comparison import FactorCompare

In [2]:
# Synethic dataset parameter value ranges
syn_factors_min = 3
syn_factors_max = 8

syn_features_min = 15
syn_features_max = 45

syn_samples_min = 200
syn_samples_max = 1000

outliers = True
outliers_p_min = 0.05
outliers_p_max = 0.1
outliers_mag_min = 1.1
outliers_mag_max = 2

noise_mean_min = 0.05
noise_mean_max = 0.15
noise_scale = 0.01

uncertainty_mean_min = 0.05
uncertainty_mean_max = 0.15
uncertainty_scale = 0.01

contr_curve_min_range = [0.0, 1.0]
contr_curve_max_range = [2.0, 5.0]
contr_curve_scale_range = [0.1, 0.5]

random_seed = 337
k_coef = 0.75

In [3]:
rng = np.random.default_rng(seed=random_seed)

In [4]:
# Initialize the simulator with the above parameters
def generate_synthetic_data(true_factor):
    n_features = rng.integers(low=syn_features_min, high=syn_features_max, size=1)[0]
    n_samples = rng.integers(low=syn_samples_min, high=syn_samples_max, size=1)[0]
    i_outlier_p = round(rng.uniform(low=outliers_p_min, high=outliers_p_max, size=1)[0], 2)
    i_outlier_mag = round(rng.uniform(low=outliers_mag_min, high=outliers_mag_max, size=1)[0], 2)
    contribution_max = round(rng.uniform(low=1.0, high=10.0, size=1)[0], 2)
    print(f"True Factors: {true_factor}, Features: {n_features}, Samples: {n_samples}, Outliers %: {i_outlier_p}, Outliers Magnitude: {i_outlier_mag}, Contribution Max: {contribution_max}")
    simulator = Simulator(seed=rng.integers(low=0, high=10, size=1)[0],
                          factors_n=true_factor,
                          features_n=n_features,
                          samples_n=n_samples,
                          outliers=outliers,
                          outlier_p=i_outlier_p,
                          outlier_mag=i_outlier_mag,
                          contribution_max=contribution_max,
                          noise_mean_min=noise_mean_min,
                          noise_mean_max=noise_mean_max,
                          noise_scale=noise_scale,
                          uncertainty_mean_min=uncertainty_mean_min,
                          uncertainty_mean_max=uncertainty_mean_max,
                          uncertainty_scale=uncertainty_scale,
                          verbose=False
                         )
    curved_factors_count = rng.integers(low=0, high=true_factor, size=1)[0]
    curved_factor_list = rng.choice(list(range(true_factor)), size=curved_factors_count, replace=False)
    for c_i in curved_factor_list:
        # parameters not used by the curve type are ignored
        i_curve_type = rng.choice(['uniform', 'decreasing', 'increasing', 'logistic', 'periodic'], size=1)[0]
        i_curve_min = rng.uniform(low=contr_curve_min_range[0], high=contr_curve_min_range[1], size=1)[0]
        i_curve_max = rng.uniform(low=contr_curve_max_range[0], high=contr_curve_max_range[1], size=1)[0]
        i_curve_scale = rng.uniform(low=contr_curve_scale_range[0], high=contr_curve_scale_range[1], size=1)[0]
        i_curve_frequency = rng.uniform(low=0.1, high=0.9, size=1)[0]
        
        # To keep all as uniform comment out the line below
        # simulator.update_contribution(factor_i=c_i, curve_type=i_curve_type, scale=i_curve_scale, frequency=i_curve_frequency, minimum=i_curve_min, maximum=i_curve_max)
    
    syn_input_df, syn_uncertainty_df = simulator.get_data()
    data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
    data_handler.metrics
    V, U = data_handler.get_data()
    return V, U


def run_perturbed(v, u, ifactors, random_seed, perturb_p = 0.25, perturb_v = 0.05, sa_model=None, models=10, max_iter=10000, converge_n=20, converge_delta=0.1, threshold: float=0.9):
    # Runs a perturbed input batch instance
    # Steps:
    # 1. Create a SA instance using the provided iV, iU and true_k for the data and factor count (if one is not provided).
    # 2. Using a general predefined % value (perturb_v) change and % instance change (perturb_p) for the dataset, or by feature:
    #    a. Select perturb_p number of indecies from the input data matrix and change those values +/- up to perturb_v
    # 3. With the perturbed dataset rerun the model using the base model H matrix.
    # 4. Evaluate how much the profiles changes, mapping the profiles and calculating r2. A perturbed result will have % of profiles that didn't map, and the range of values for those profiles that did.
    #    a. The metrics will be used to determine how much the solution profiles change with the perturbed datasets.
    #    b. For each perturbed solution, a single metric of average r2 for the mapped profiles and the number of profiles that mapped.
    # 5. Repeat for n number of models

    # step 1
    if sa_model is None:
        sa_model = SA(V=v, U=u, factors=ifactors, seed=random_seed, verbose=False)
        sa_model.initialize()
        sa_model.train(max_iter=max_iter, converge_delta=converge_delta, converge_n=converge_n)

    factor_mapping_p = []
    factor_coef_mean = []
    for i in range(models):
        i_mask = rng.random(size=v.shape) > perturb_p
        v_matrix = rng.uniform(low=0, high=perturb_v, size=v.shape)
        pn_matrix = rng.random(size=v.shape)
        pn_matrix[pn_matrix > 0.5] = 1.0
        pn_matrix[pn_matrix <= 0.5] = -1.0
        
        i_V = copy.copy(v)
        # for the cells where i_mask is True, the value of i_V is equal to i_V +/- v_matrix * i_V
        i_V[i_mask] =  i_V[i_mask] + (pn_matrix[i_mask] * v_matrix[i_mask] * i_V[i_mask])
        i_V[i_V < 0.0] = 1e-12

        i_sa_model = SA(i_V, U=u, factors=ifactors, seed=random_seed, verbose=False)
        i_sa_model.initialize(H=sa_model.H, W=sa_model.W)
        i_sa_model.train(max_iter=max_iter, converge_delta=converge_delta, converge_n=converge_n)

        i_mapping = []
        coef_mapping = []
        for k in range(ifactors):
            base_factor = sa_model.H[k]
            k_factor = i_sa_model.H[k]
            base_k_correlation = FactorCompare.calculate_correlation(factor1=base_factor, factor2=k_factor)
            coef_mapping.append(base_k_correlation)
            if base_k_correlation >= threshold:
                i_mapping.append(1)
            else:
                i_mapping.append(0)
        factor_mapping_p.append(np.sum(i_mapping)/ifactors)
        factor_coef_mean.append(np.round(np.mean(coef_mapping), 4))
    results = {
        "k": ifactors,
        "seed": random_seed,
        "perturb %": perturb_p,
        "perturb v": perturb_v,
        "factor_mapping": factor_mapping_p,
        "factor_coef_mean": factor_coef_mean
    }
    print(f"Factor n: {ifactors}, Mapping %: {np.round(np.mean(factor_mapping_p) * 100,2)} %, Mean R2: {np.round(np.mean(factor_coef_mean), 4)}")
    return results


def run_batch_perturb(k, iV, iU, min_k: int = 2, max_k: int = 10, perturb_p=0.25, perturb_v=0.05, models: int = 100, max_iter=10000, converge_n=20, converge_delta=0.1, threshold: float = 0.9):
    results = {
        "true_k": k,
        "models": models,
        "min_k": min_k,
        "max_k": max_k,
        "k_results": []
    }
    for ifactors in range(min_k, max_k+1):
        i_seed = rng.integers(low=0, high=1e10, endpoint=True, size=1)[0]
        i_result = run_perturbed(v=iV, u=iU, ifactors=ifactors, random_seed=i_seed, perturb_p=perturb_p, perturb_v=perturb_v, models=models, max_iter=max_iter, converge_n=converge_n, converge_delta=converge_delta, threshold=threshold)
        results["k_results"].append(i_result)
    return results
    

In [8]:
%%time
true_k = 3
iV, iU = generate_synthetic_data(true_factor=true_k)

n_models = 20
threshold = 0.9
min_factors = 2
max_factors = 14

perturb_p = 1.0
perturb_v = 0.25

batch_estimate = run_batch_perturb(k=true_k, iV=iV, iU=iU, perturb_p=perturb_p, perturb_v=perturb_v, min_k=min_factors, max_k=max_factors, models=n_models, max_iter=10000, converge_n=20, converge_delta=0.1, threshold=threshold)

True Factors: 3, Features: 42, Samples: 493, Outliers %: 0.07, Outliers Magnitude: 1.47, Contribution Max: 5.48
Factor n: 2, Mapping %: 100.0 %, Mean R2: 1.0
Factor n: 3, Mapping %: 100.0 %, Mean R2: 1.0
Factor n: 4, Mapping %: 100.0 %, Mean R2: 0.9777
Factor n: 5, Mapping %: 80.0 %, Mean R2: 0.8971
Factor n: 6, Mapping %: 100.0 %, Mean R2: 0.9867
Factor n: 7, Mapping %: 100.0 %, Mean R2: 0.9857
Factor n: 8, Mapping %: 100.0 %, Mean R2: 0.9802
Factor n: 9, Mapping %: 100.0 %, Mean R2: 0.9778
Factor n: 10, Mapping %: 100.0 %, Mean R2: 0.9881
Factor n: 11, Mapping %: 63.64 %, Mean R2: 0.8757
Factor n: 12, Mapping %: 75.0 %, Mean R2: 0.922
Factor n: 13, Mapping %: 76.92 %, Mean R2: 0.9515
Factor n: 14, Mapping %: 85.71 %, Mean R2: 0.9613
CPU times: total: 32min 47s
Wall time: 38min 23s


In [9]:
factor_labels = [f"Factor {i}" for i in range(min_factors, max_factors+1)]
mapping_mean = []
coef_mean = []
for i in range(max_factors-min_factors+1):
    i_fm = batch_estimate["k_results"][i]["factor_mapping"]
    i_cm = batch_estimate["k_results"][i]["factor_coef_mean"]
    mapping_mean.append(i_fm)
    coef_mean.append(i_cm)

In [10]:
batch_fig = make_subplots(specs=[[{"secondary_y": True}]])
for i in range(max_factors-min_factors+1):
    i_results = coef_mean[i]
    i_label = factor_labels[i]
    batch_fig.add_trace(go.Box(y=i_results, name=i_label), secondary_y=False)
batch_fig.add_trace(go.Scatter(x=factor_labels, y=np.mean(mapping_mean, axis=1)* 100, name="Average % Mapped"), secondary_y=True)
batch_fig.update_layout(title="Input Perturbation Results", width=1200, height=800)
batch_fig.update_yaxes(secondary_y=False, title_text="R2")
batch_fig.update_yaxes(secondary_y=True, title_text="%")
batch_fig.show()