## ESAT K Estimation Workflow 1 - Batch Model Evaluation

This notebook implements a batch model evaluation approach that uses solution profile variability to estimate the optimal number of factors in a dataset.

#### Code Imports

In [1]:
import time
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from esat.data.datahandler import DataHandler
from esat.model.sa import SA
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat.error.bootstrap import Bootstrap
from esat_eval.simulator import Simulator
from esat_eval.factor_comparison import FactorCompare

In [2]:
# Synthetic dataset parameter value ranges
# NOTE(review): syn_factors_min / syn_factors_max are not referenced by the cells shown in this notebook.
syn_factors_min = 3
syn_factors_max = 8

# Bounds for the randomly drawn feature count (passed to rng.integers in generate_synthetic_data).
syn_features_min = 15
syn_features_max = 45

# Bounds for the randomly drawn sample count (passed to rng.integers in generate_synthetic_data).
syn_samples_min = 200
syn_samples_max = 1000

# Outlier settings: the flag is passed to the Simulator as-is; the fraction and magnitude
# are drawn uniformly between these bounds for each generated dataset.
outliers = True
outliers_p_min = 0.05
outliers_p_max = 0.1
outliers_mag_min = 1.1
outliers_mag_max = 2

# Noise settings, passed to the Simulator unchanged.
noise_mean_min = 0.05
noise_mean_max = 0.15
noise_scale = 0.01

# Uncertainty settings, passed to the Simulator unchanged.
uncertainty_mean_min = 0.05
uncertainty_mean_max = 0.15
uncertainty_scale = 0.01

# [low, high] bounds for the uniformly drawn contribution-curve minimum, maximum and scale.
contr_curve_min_range = [0.0, 1.0]
contr_curve_max_range = [2.0, 5.0]
contr_curve_scale_range = [0.1, 0.5]

# Seed for the notebook-wide random generator.
random_seed = 337
# NOTE(review): k_coef is not referenced by the cells shown in this notebook.
k_coef = 0.75

In [3]:
# Shared generator used by generate_synthetic_data and run_batch_k_estimate. Every draw advances
# its state, so results depend on the order in which cells and functions are executed.
rng = np.random.default_rng(seed=random_seed)

In [9]:
# Initialize the simulator with the above parameters
def generate_synthetic_data(true_factor):
    """
    Build a synthetic dataset with ``true_factor`` factors and return its data and uncertainty.

    The feature count, sample count, outlier fraction, outlier magnitude and contribution
    maximum are drawn from the notebook-level ``rng``; the remaining Simulator settings come
    from the notebook-level configuration values. The drawn parameters are printed.

    Parameters
    ----------
    true_factor : int
        Number of factors passed to the Simulator as ``factors_n``.

    Returns
    -------
    tuple
        ``(V, U)`` as returned by ``DataHandler.get_data()`` for the simulated input and
        uncertainty dataframes.
    """
    # rng.integers excludes `high` by default, so the *_max bounds themselves are never drawn.
    n_features = rng.integers(low=syn_features_min, high=syn_features_max, size=1)[0]
    n_samples = rng.integers(low=syn_samples_min, high=syn_samples_max, size=1)[0]
    i_outlier_p = round(rng.uniform(low=outliers_p_min, high=outliers_p_max, size=1)[0], 2)
    i_outlier_mag = round(rng.uniform(low=outliers_mag_min, high=outliers_mag_max, size=1)[0], 2)
    contribution_max = round(rng.uniform(low=1.0, high=10.0, size=1)[0], 2)
    print(f"True Factors: {true_factor}, Features: {n_features}, Samples: {n_samples}, Outliers %: {i_outlier_p}, Outliers Magnitude: {i_outlier_mag}, Contribution Max: {contribution_max}")
    # NOTE(review): the simulator seed is drawn from only ten values (0-9) - confirm this is intended.
    simulator = Simulator(seed=rng.integers(low=0, high=10, size=1)[0],
                          factors_n=true_factor,
                          features_n=n_features,
                          samples_n=n_samples,
                          outliers=outliers,
                          outlier_p=i_outlier_p,
                          outlier_mag=i_outlier_mag,
                          contribution_max=contribution_max,
                          noise_mean_min=noise_mean_min,
                          noise_mean_max=noise_mean_max,
                          noise_scale=noise_scale,
                          uncertainty_mean_min=uncertainty_mean_min,
                          uncertainty_mean_max=uncertainty_mean_max,
                          uncertainty_scale=uncertainty_scale,
                          verbose=False
                         )
    # Pick a random subset of factors and draw curve parameters for each of them.
    # While the update_contribution call below stays commented out, these draws change nothing
    # in the simulator; they only advance the state of the shared rng.
    curved_factors_count = rng.integers(low=0, high=true_factor, size=1)[0]
    curved_factor_list = rng.choice(list(range(true_factor)), size=curved_factors_count, replace=False)
    for c_i in curved_factor_list:
        # parameters not used by the curve type are ignored
        i_curve_type = rng.choice(['uniform', 'decreasing', 'increasing', 'logistic', 'periodic'], size=1)[0]
        i_curve_min = rng.uniform(low=contr_curve_min_range[0], high=contr_curve_min_range[1], size=1)[0]
        i_curve_max = rng.uniform(low=contr_curve_max_range[0], high=contr_curve_max_range[1], size=1)[0]
        i_curve_scale = rng.uniform(low=contr_curve_scale_range[0], high=contr_curve_scale_range[1], size=1)[0]
        i_curve_frequency = rng.uniform(low=0.1, high=0.9, size=1)[0]
        
        # To keep all as uniform comment out the line below
        # simulator.update_contribution(factor_i=c_i, curve_type=i_curve_type, scale=i_curve_scale, frequency=i_curve_frequency, minimum=i_curve_min, maximum=i_curve_max)
    
    syn_input_df, syn_uncertainty_df = simulator.get_data()
    data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
    # NOTE(review): the value of this expression is discarded; inside a function it displays
    # nothing - confirm whether the attribute access is needed for a side effect.
    data_handler.metrics
    V, U = data_handler.get_data()
    return V, U


def run_batch_estimate(iV, iU, ifactors, batch_seed, models=100, max_iter=10000, converge_n=20, converge_delta=0.1, threshold: float = 0.9):
    """
    Train one batch of SA models with ``ifactors`` factors and measure how consistently the
    factor profiles reappear across the models of that batch.

    Procedure:
      1. Train a BatchSA instance on ``iV``/``iU`` with ``ifactors`` factors.
      2. For every profile of every model, compare it (FactorCompare.calculate_correlation)
         against all profiles of each other model, recording
           a. how many other models contain at least one profile scoring >= ``threshold``;
           b. the best score found in each other model (never below the starting value 0).
      3. For every model, average the scores between each pair of its own profiles.

    Returns a dict with the keys "k", "seed", "correlation count" (per model, per factor),
    "model correlation avg" (per model, rounded to 4 decimals) and "profile max r2"
    (per model, per factor, per other model).
    """
    def _score(profile_a, profile_b):
        # Thin wrapper so every comparison is made with the same keyword arguments.
        return FactorCompare.calculate_correlation(factor1=profile_a, factor2=profile_b)

    # Step 1: build and train the batch.
    batch_sa = BatchSA(V=iV, U=iU, factors=ifactors, models=models, seed=batch_seed, max_iter=max_iter,
                       converge_delta=converge_delta, converge_n=converge_n, optimized=True,
                       verbose=False)
    batch_sa.train()

    model_profile_count, model_profile_avg, model_max_cor = [], [], []
    for model_idx, model_result in enumerate(batch_sa.results):
        # Step 2: compare each profile of this model with the profiles of every other model.
        matched_counts = []
        best_scores = []
        for factor_idx in range(batch_sa.factors):
            profile = model_result.H[factor_idx]
            matched_models = 0
            best_per_model = []
            for other_idx in range(batch_sa.models):
                if other_idx == model_idx:
                    continue
                scores = [
                    _score(profile, batch_sa.results[other_idx].H[other_factor])
                    for other_factor in range(batch_sa.factors)
                ]
                # A model counts once, however many of its profiles reach the threshold.
                if any(score >= threshold for score in scores):
                    matched_models += 1
                # Seeding with 0 keeps the recorded best at 0 when no score is greater than 0.
                best_per_model.append(max([0] + scores))
            matched_counts.append(matched_models)
            best_scores.append(best_per_model)
        model_profile_count.append(matched_counts)
        model_max_cor.append(best_scores)

        # Step 3: average score over each unordered pair of profiles within this model.
        pair_scores = [
            _score(model_result.H[upper], model_result.H[lower])
            for upper in range(batch_sa.factors)
            for lower in range(upper)
        ]
        model_profile_avg.append(np.round(np.mean(pair_scores), 4))

    return {
        "k": ifactors,
        "seed": batch_seed,
        "correlation count": model_profile_count,
        "model correlation avg": model_profile_avg,
        "profile max r2": model_max_cor
    }

def run_batch_k_estimate(k, iV, iU, min_k: int = 2, max_k: int = 10, models: int = 100, max_iter=10000, converge_n=20, converge_delta=0.1, threshold: float = 0.9):
    """
    Run ``run_batch_estimate`` once for every factor count from ``min_k`` to ``max_k`` inclusive.

    A fresh batch seed is drawn from the notebook-level ``rng`` immediately before each run.
    ``k`` is only recorded in the output under "true_k"; it does not influence the runs.

    Returns a dict with the keys "true_k", "models", "min_k", "max_k" and "k_results", the
    last being the list of per-factor-count results in increasing factor-count order.
    """
    def _estimate_for(n_factors):
        # Draw the seed right before the run so seed draws and runs stay interleaved.
        run_seed = rng.integers(low=0, high=1e10, endpoint=True, size=1)[0]
        return run_batch_estimate(iV=iV, iU=iU, ifactors=n_factors, batch_seed=run_seed, models=models,
                                  max_iter=max_iter, converge_n=converge_n, converge_delta=converge_delta,
                                  threshold=threshold)

    return {
        "true_k": k,
        "models": models,
        "min_k": min_k,
        "max_k": max_k,
        "k_results": [_estimate_for(n_factors) for n_factors in range(min_k, max_k + 1)]
    }
    

In [14]:
%%time
# Factor count used to generate the synthetic dataset; stored in the results under "true_k".
true_k = 6
iV, iU = generate_synthetic_data(true_factor=true_k)

# Batch settings: models trained per factor count, the correlation threshold a profile must
# reach to count as matched, and the inclusive range of factor counts to evaluate.
n_models = 50
threshold = 0.98
min_factors = 2
max_factors = 10
# Long-running step: the recorded output of this cell reports a wall time of about 14 minutes.
batch_estimate = run_batch_k_estimate(k=true_k, iV=iV, iU=iU, min_k=min_factors, max_k=max_factors, models=n_models, max_iter=10000, converge_n=20, converge_delta=0.1, threshold=threshold)

16-Oct-24 16:44:59 - Running batch SA models in parallel using 15 processes.


True Factors: 6, Features: 37, Samples: 579, Outliers %: 0.09, Outliers Magnitude: 1.56, Contribution Max: 4.06


16-Oct-24 16:45:14 - Running batch SA models in parallel using 15 processes.
16-Oct-24 16:45:39 - Running batch SA models in parallel using 15 processes.
16-Oct-24 16:46:15 - Running batch SA models in parallel using 15 processes.
16-Oct-24 16:47:19 - Running batch SA models in parallel using 15 processes.
16-Oct-24 16:48:43 - Running batch SA models in parallel using 15 processes.
16-Oct-24 16:50:54 - Running batch SA models in parallel using 15 processes.
16-Oct-24 16:53:29 - Running batch SA models in parallel using 15 processes.
16-Oct-24 16:56:11 - Running batch SA models in parallel using 15 processes.


CPU times: total: 2min
Wall time: 14min 10s


In [15]:
# Summarise the batch results for each evaluated factor count:
#   count_total - per-model match counts summed over that model's factors
#   count_mean / count_std - mean and standard deviation of those per-model totals
#   model_r2 - mean of the per-model average within-model profile correlation
#   max_r2 - "profile max r2" averaged over its first two axes
factor_labels = [f"Factor {i}" for i in range(min_factors, max_factors+1)]
count_mean, count_std, count_total = [], [], []
model_r2, max_r2 = [], []
for result_idx in range(max_factors - min_factors + 1):
    k_result = batch_estimate["k_results"][result_idx]
    model_totals = np.sum(k_result["correlation count"], axis=1)
    model_r2.append(np.round(np.mean(k_result["model correlation avg"]), 4))
    max_over_models = np.mean(k_result["profile max r2"], axis=0)
    max_r2.append(np.round(np.mean(max_over_models, axis=0), 4))
    count_mean.append(np.round(np.mean(model_totals), 4))
    count_std.append(np.round(np.std(model_totals), 4))
    count_total.append(model_totals)

In [16]:
# Box plot of the per-model match totals for each evaluated factor count (primary axis),
# with the mean within-model profile correlation overlaid as a line (secondary axis).
batch_fig = make_subplots(specs=[[{"secondary_y": True}]])
# Pair each label with its data directly so the figure does not depend on
# min_factors / max_factors still holding the values used to build these lists.
for i_label, i_results in zip(factor_labels, count_total):
    batch_fig.add_trace(go.Box(y=i_results, name=i_label), secondary_y=False)
batch_fig.add_trace(go.Scatter(x=factor_labels, y=model_r2, name="Average Model Profile R2"), secondary_y=True)
batch_fig.update_layout(title="Batch K-Estimate Results", width=1200, height=800)
# The x-axis was previously untitled; label it so the figure can be read on its own.
batch_fig.update_xaxes(title_text="Number of factors (k)")
batch_fig.update_yaxes(secondary_y=False, title_text="Count")
batch_fig.update_yaxes(secondary_y=True, title_text="R2")
batch_fig.show()

In [17]:
# Box plot of the averaged maximum profile correlations for each evaluated factor count.
max_r_fig = go.Figure()
# Pair each label with its data directly so the figure does not depend on
# min_factors / max_factors still holding the values used to build these lists.
for i_label, i_results in zip(factor_labels, max_r2):
    max_r_fig.add_trace(go.Box(y=i_results, name=i_label))
max_r_fig.update_layout(title="Batch K-Estimate Max Correlation", width=1200, height=800)
# The x-axis was previously untitled; label it so the figure can be read on its own.
max_r_fig.update_xaxes(title_text="Number of factors (k)")
max_r_fig.update_yaxes(title_text="Max R2")
max_r_fig.show()