## ESAT K Estimation Workflow 1 - Batch Model Evaluation

This notebook implements a batch model evaluation approach that uses solution profile variability to estimate the optimal number of factors in a dataset.

#### Code Imports

In [1]:
import time
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from esat.data.datahandler import DataHandler
from esat.model.sa import SA
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat.error.bootstrap import Bootstrap
from esat_eval.simulator import Simulator
from esat.estimator import FactorEstimator

In [2]:
# Synthetic dataset parameter value ranges
# Bounds on the true factor count; in this notebook they are only read by the
# commented-out randomized sampling loop.
syn_factors_min = 3
syn_factors_max = 8

# Bounds for the feature count drawn with rng.integers in generate_synthetic_data.
syn_features_min = 15
syn_features_max = 45

# Bounds for the sample count drawn with rng.integers in generate_synthetic_data.
syn_samples_min = 200
syn_samples_max = 1000

# Outlier settings: the flag is passed straight to Simulator; the p/mag bounds are
# the ranges for the per-dataset uniform draws in generate_synthetic_data.
outliers = True
outliers_p_min = 0.05
outliers_p_max = 0.1
outliers_mag_min = 1.1
outliers_mag_max = 2

# Noise settings, passed unchanged to Simulator.
noise_mean_min = 0.05
noise_mean_max = 0.15
noise_scale = 0.01

# Uncertainty settings, passed unchanged to Simulator.
uncertainty_mean_min = 0.05
uncertainty_mean_max = 0.15
uncertainty_scale = 0.01

# [low, high] ranges for the contribution-curve draws in generate_synthetic_data.
# NOTE(review): the drawn values are currently unused because the
# simulator.update_contribution call there is commented out.
contr_curve_min_range = [0.0, 1.0]
contr_curve_max_range = [2.0, 5.0]
contr_curve_scale_range = [0.1, 0.5]

# Seed for the shared numpy random generator created in the next cell.
random_seed = 337
# NOTE(review): k_coef is not referenced by any visible cell - confirm it is still needed.
k_coef = 0.75

In [3]:
# Shared random generator seeded with random_seed; generate_synthetic_data,
# run_bs_batch and the driver cells take their random draws from it.
rng = np.random.default_rng(seed=random_seed)

In [4]:
# Build one randomized synthetic dataset from the parameter ranges defined above
def generate_synthetic_data(true_factor):
    """
    Generate a synthetic dataset with ``true_factor`` factors.

    The feature count, sample count, outlier proportion, outlier magnitude and
    contribution maximum are drawn from the module-level ``rng`` and printed.
    A ``Simulator`` is then built from those values plus the module-level
    noise/uncertainty settings, and its output is loaded through ``DataHandler``.

    Parameters
    ----------
    true_factor : int
        Number of factors the simulator is asked to generate.

    Returns
    -------
    tuple
        The ``(V, U)`` pair returned by ``DataHandler.get_data()``.
    """
    def _draw_int(lo, hi):
        # Single integer draw; rng.integers excludes the upper bound by default.
        return rng.integers(low=lo, high=hi, size=1)[0]

    def _draw_float(lo, hi):
        # Single uniform draw from [lo, hi).
        return rng.uniform(low=lo, high=hi, size=1)[0]

    # The order of the draws below fixes the random stream; do not reorder.
    n_features = _draw_int(syn_features_min, syn_features_max)
    n_samples = _draw_int(syn_samples_min, syn_samples_max)
    i_outlier_p = round(_draw_float(outliers_p_min, outliers_p_max), 2)
    i_outlier_mag = round(_draw_float(outliers_mag_min, outliers_mag_max), 2)
    contribution_max = round(_draw_float(1.0, 10.0), 2)
    print(f"True Factors: {true_factor}, Features: {n_features}, Samples: {n_samples}, Outliers %: {i_outlier_p}, Outliers Magnitude: {i_outlier_mag}, Contribution Max: {contribution_max}")

    simulator_kwargs = {
        "seed": _draw_int(0, 10),
        "factors_n": true_factor,
        "features_n": n_features,
        "samples_n": n_samples,
        "outliers": outliers,
        "outlier_p": i_outlier_p,
        "outlier_mag": i_outlier_mag,
        "contribution_max": contribution_max,
        "noise_mean_min": noise_mean_min,
        "noise_mean_max": noise_mean_max,
        "noise_scale": noise_scale,
        "uncertainty_mean_min": uncertainty_mean_min,
        "uncertainty_mean_max": uncertainty_mean_max,
        "uncertainty_scale": uncertainty_scale,
        "verbose": False,
    }
    simulator = Simulator(**simulator_kwargs)

    # Choose which factors would receive a non-uniform contribution curve.
    curved_factors_count = _draw_int(0, true_factor)
    factor_indices = list(range(true_factor))
    curved_factor_list = rng.choice(factor_indices, size=curved_factors_count, replace=False)
    curve_types = ['uniform', 'decreasing', 'increasing', 'logistic', 'periodic']
    for c_i in curved_factor_list:
        # The curve settings are still drawn (this advances rng) even though the
        # update call below is disabled; parameters not used by a curve type are ignored.
        i_curve_type = rng.choice(curve_types, size=1)[0]
        i_curve_min = _draw_float(contr_curve_min_range[0], contr_curve_min_range[1])
        i_curve_max = _draw_float(contr_curve_max_range[0], contr_curve_max_range[1])
        i_curve_scale = _draw_float(contr_curve_scale_range[0], contr_curve_scale_range[1])
        i_curve_frequency = _draw_float(0.1, 0.9)

        # To keep all as uniform comment out the line below
        # simulator.update_contribution(factor_i=c_i, curve_type=i_curve_type, scale=i_curve_scale, frequency=i_curve_frequency, minimum=i_curve_min, maximum=i_curve_max)

    syn_input_df, syn_uncertainty_df = simulator.get_data()
    data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
    # Bare attribute access kept from the original.
    # NOTE(review): presumably evaluated for a side effect or leftover from
    # interactive display - confirm before removing.
    data_handler.metrics
    V, U = data_handler.get_data()
    return V, U


def _summarize_bs_mappings(bs_results):
    """
    Summarize the factor-mapping correlations of a set of bootstrap results.

    Parameters
    ----------
    bs_results : dict
        Bootstrap results keyed by run. Each value must contain a "mapping" dict
        whose values are dicts with a "mapped" flag and an "r2" value.

    Returns
    -------
    tuple
        ``(percent_mapped, mean_mapped_r2, mean_r2)``, unrounded. An entry is NaN
        when it has nothing to average (no comparisons at all, or no mapped
        factors), rather than raising ZeroDivisionError or emitting numpy's
        "mean of empty slice" RuntimeWarning.
    """
    mapped_correlations = []
    notmapped_correlations = []
    for i_result in bs_results.values():
        for j_factor in i_result["mapping"].values():
            if j_factor["mapped"]:
                mapped_correlations.append(j_factor["r2"])
            else:
                notmapped_correlations.append(j_factor["r2"])

    # Every compared factor lands in exactly one of the two lists.
    compare_count = len(mapped_correlations) + len(notmapped_correlations)
    if compare_count == 0:
        return np.nan, np.nan, np.nan

    percent_mapped = (len(mapped_correlations) / compare_count) * 100
    mean_mapped_r2 = np.mean(mapped_correlations) if mapped_correlations else np.nan
    mean_r2 = (np.sum(mapped_correlations) + np.sum(notmapped_correlations)) / compare_count
    return percent_mapped, mean_mapped_r2, mean_r2


def run_bs(k, bV, bU, bseed, bs_instances: int = 20, block_size: int = 4, threshold: float = 0.9):
    """
    Train a base model with ``k`` factors, bootstrap it, and summarize the run.

    Steps:
    1. Train a base SA model with the specified number of factors.
    2. Run a Bootstrap instance on it with ``bs_instances`` bootstrap datasets.
       (Per the original notes, each bootstrap model is initialized from the base
       model's H profiles and run to convergence.)
    3. Collect the "r2" of every factor mapping in ``bs.bs_results``, split by
       its "mapped" flag.
    4. Report the summary metrics.

    Parameters
    ----------
    k : int
        Number of factors for the base model.
    bV, bU
        Data and uncertainty inputs passed to ``SA`` as ``V`` and ``U``.
    bseed
        Seed passed to both ``SA`` and ``Bootstrap``.
    bs_instances : int
        Passed to ``Bootstrap`` as ``bootstrap_n``.
    block_size : int
        Passed to ``Bootstrap`` as ``block_size``.
    threshold : float
        Passed to ``Bootstrap`` as ``threshold``.

    Returns
    -------
    dict
        Keys "k", "seed", "% mapped" (rounded to 2 places), "mean mapped r2",
        "mean r2" and "mean QRobust" (mean of ``bs.q_results``), the last three
        rounded to 4 places. Correlation metrics are NaN when there is nothing
        to average.
    """
    # Base model, step 1
    base_sa = SA(V=bV, U=bU, factors=k, seed=bseed, verbose=False)
    base_sa.initialize()
    base_sa.train(max_iter=10000, converge_delta=0.1, converge_n=20)

    # Bootstrap instance, step 2
    bs = Bootstrap(sa=base_sa, model_selected=-1, bootstrap_n=bs_instances, block_size=block_size, threshold=threshold, seed=bseed, parallel=True)
    bs.run()

    # Evaluate correlations, step 3
    percent_mapped, mean_mapped_r2, mean_r2 = _summarize_bs_mappings(bs.bs_results)

    # Return results, step 4
    bs_results = {
        "k": k,
        "seed": bseed,
        "% mapped": round(percent_mapped, 2),
        "mean mapped r2": round(mean_mapped_r2, 4),
        "mean r2": round(mean_r2, 4),
        "mean QRobust": round(np.mean(bs.q_results), 4)
    }
    return bs_results

def run_bs_batch(k, n_batches, bV, bU, bseed, bs_instances: int = 20, block_size: int = 4, threshold: float = 0.9):
    """
    Run ``run_bs`` ``n_batches`` times for one factor count and average the metrics.

    Each batch draws its own seed from the module-level ``rng``; ``bseed`` is only
    recorded in the returned dict and is not used to seed the individual runs.

    Parameters
    ----------
    k : int
        Number of factors evaluated in every batch.
    n_batches : int
        Number of independent ``run_bs`` runs to average. With 0 batches the
        averaged metrics are NaN (numpy mean of an empty list).
    bV, bU
        Data and uncertainty inputs forwarded to ``run_bs``.
    bseed
        Seed value stored under "seed" in the result.
    bs_instances : int
        Forwarded to ``run_bs``.
    block_size : int
        Forwarded to ``run_bs``.
    threshold : float
        Forwarded to ``run_bs``.

    Returns
    -------
    dict
        Keys "k", "seed", and the batch means of "% mapped", "mean mapped r2",
        "mean r2" and "mean QRobust".
    """
    metric_keys = ("% mapped", "mean mapped r2", "mean r2", "mean QRobust")
    collected = {key: [] for key in metric_keys}
    for _ in range(n_batches):
        i_seed = rng.integers(low=0, high=1e10, endpoint=True, size=1)[0]
        # Use this function's own k (not a notebook global) and forward the
        # bootstrap settings so non-default block_size/threshold take effect.
        i_result = run_bs(k=k, bV=bV, bU=bU, bseed=i_seed, bs_instances=bs_instances,
                          block_size=block_size, threshold=threshold)
        for key in metric_keys:
            collected[key].append(i_result[key])

    results = {"k": k, "seed": bseed}
    for key in metric_keys:
        results[key] = np.mean(collected[key])
    return results
    

In [None]:
%%time
# Sweep candidate factor counts over one synthetic dataset with a known true k.
true_k = 6
# The same dataset is reused for every candidate factor count below.
i_V, i_U = generate_synthetic_data(true_factor=true_k)

n_batches = 10
bs_instances = 20
min_factors = 2
max_factors = 10

# One aggregated result dict per candidate factor count, in increasing order.
results_list0 = []
for i_factor in range(min_factors, max_factors+1):
    t0 = time.time()
    # endpoint=True makes the upper bound inclusive; the value is recorded under
    # "seed" in the result dict returned by run_bs_batch.
    bseed = rng.integers(low=0, high=1e10, endpoint=True, size=1)[0]
    i_results = run_bs_batch(k=i_factor, n_batches=n_batches, bV=i_V, bU=i_U, bseed=bseed, bs_instances=bs_instances)
    results_list0.append(i_results)
    t1 = time.time()
    print(f"Results: {i_results}, Runtime {round(t1-t0, 2)} sec(s)")
# results_list0

True Factors: 6, Features: 24, Samples: 222, Outliers %: 0.06, Outliers Magnitude: 1.8, Contribution Max: 9.26
{'k': 2, 'seed': 6827920442, '% mapped': 67.75, 'mean mapped r2': 0.9704900000000001, 'mean r2': 0.9006700000000001, 'mean QRobust': 54494.49254000001}
{'k': 3, 'seed': 2857205905, '% mapped': 46.334, 'mean mapped r2': 0.95902, 'mean r2': 0.76839, 'mean QRobust': 37212.303490000006}
{'k': 4, 'seed': 7708876169, '% mapped': 90.375, 'mean mapped r2': 0.9725400000000001, 'mean r2': 0.9556600000000002, 'mean QRobust': 22614.765160000003}
{'k': 5, 'seed': 3396816599, '% mapped': 89.6, 'mean mapped r2': 0.97094, 'mean r2': 0.9497, 'mean QRobust': 15767.494090000002}


CPU times: total: 0 ns
Wall time: 0 ns


In [6]:
%%time
# Sampling parameters for the randomized evaluation loop below (currently commented out)
n_batches = 5
bs_instances = 10
# Inclusive range of candidate factor counts evaluated per dataset.
min_factors = 2
max_factors = 10
# Collects one results list per synthetic dataset when the loop is enabled.
all_results = []

# for j in range(100):
#     t0 = time.time()
#     true_k = rng.integers(low=syn_factors_min, high=syn_factors_max, endpoint=True, size=1)[0]
#     i_V, i_U = generate_synthetic_data(true_factor=true_k)
    
#     results_list = []
#     predicted_k = -1
#     best_r2 = 0
#     for i_factor in range(min_factors, max_factors+1):
#         bseed = rng.integers(low=0, high=1e10, endpoint=True, size=1)[0]
#         i_results = run_bs_batch(k=i_factor, n_batches=n_batches, bV=i_V, bU=i_U, bseed=bseed, bs_instances=bs_instances)
#         if i_results['mean r2'] > best_r2:
#             best_r2 = i_results['mean r2']
#             predicted_k = i_results['k']
#         results_list.append(i_results)
#     t1 = time.time()
#     print(f"Predicted K: {predicted_k}, R2: {best_r2}. Runtime: {round(t1-t0, 4)} sec")
#     all_results.append(results_list)

True Factors: 4, Features: 15, Samples: 398, Outliers %: 0.09, Outliers Magnitude: 1.93, Contribution Max: 6.59
Predicted K: 4, R2: 0.9912600000000001
True Factors: 6, Features: 16, Samples: 922, Outliers %: 0.07, Outliers Magnitude: 1.64, Contribution Max: 1.84
Predicted K: 6, R2: 0.99318
True Factors: 3, Features: 22, Samples: 535, Outliers %: 0.07, Outliers Magnitude: 1.5, Contribution Max: 1.59
Predicted K: 3, R2: 0.9806000000000001
True Factors: 4, Features: 35, Samples: 795, Outliers %: 0.05, Outliers Magnitude: 1.93, Contribution Max: 2.2
Predicted K: 4, R2: 0.9872
True Factors: 7, Features: 22, Samples: 817, Outliers %: 0.09, Outliers Magnitude: 1.61, Contribution Max: 3.45
Predicted K: 6, R2: 0.992
True Factors: 6, Features: 21, Samples: 338, Outliers %: 0.07, Outliers Magnitude: 1.39, Contribution Max: 7.65
Predicted K: 6, R2: 0.9901399999999999
True Factors: 3, Features: 40, Samples: 284, Outliers %: 0.05, Outliers Magnitude: 1.71, Contribution Max: 1.96
Predicted K: 3, R2: 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Predicted K: 6, R2: 0.98924
True Factors: 7, Features: 15, Samples: 329, Outliers %: 0.06, Outliers Magnitude: 1.43, Contribution Max: 2.05
Predicted K: 6, R2: 0.95954
True Factors: 5, Features: 25, Samples: 698, Outliers %: 0.05, Outliers Magnitude: 1.61, Contribution Max: 1.15
Predicted K: 5, R2: 0.9984
True Factors: 7, Features: 28, Samples: 917, Outliers %: 0.06, Outliers Magnitude: 1.73, Contribution Max: 7.84
Predicted K: 6, R2: 0.9943200000000001
True Factors: 5, Features: 43, Samples: 208, Outliers %: 0.07, Outliers Magnitude: 1.59, Contribution Max: 3.26
Predicted K: 5, R2: 0.9859399999999999
True Factors: 8, Features: 32, Samples: 227, Outliers %: 0.06, Outliers Magnitude: 1.62, Contribution Max: 2.63
Predicted K: 8, R2: 0.9487
True Factors: 5, Features: 38, Samples: 913, Outliers %: 0.1, Outliers Magnitude: 1.43, Contribution Max: 8.51
Predicted K: 5, R2: 0.999
True Factors: 4, Features: 16, Samples: 218, Outliers %: 0.07, Outliers Magnitude: 1.91, Contribution Max: 9.72
Pre

KeyboardInterrupt: 

In [7]:
# def accuracy(occurences, results):
#     accuracy_list = [0]*len(occurences)
#     for i in range(len(occurences)):
#         if occurences[i] == 0:
#             accuracy_list[i] = 0
#         else:
#             accuracy_list[i] = round(results[i]/occurences[i],2)
#     return np.multiply(100, accuracy_list)

# elements = max_factors-min_factors - 1

# dr_results = [0]*elements
# kest_results = [0]*elements
# qtrue_results = [0]*elements
# qrobust_results = [0]*elements
# k_runs = [0]*elements

# for _result in est_list:
#     idx_k = _result['true K'] - min_factors - 1
#     true_k = _result['true K']
#     k_runs[idx_k] += 1
#     if _result['delta ratio'] == true_k:
#         dr_results[idx_k] += 1
#     if _result['K estimate'] == true_k:
#         kest_results[idx_k] += 1
#     if _result['QTrue Overall'] == true_k:
#         qtrue_results[idx_k] += 1
#     if _result['QRobust Overall'] == true_k:
#         qrobust_results[idx_k] += 1

# labels = [f"{i + min_factors} Factor(s)" for i in range(min_factors-1,max_factors)]

# result_fig = make_subplots(specs=[[{"secondary_y": True}]])
# result_fig.add_trace(go.Bar(name="Delta Ratio", x=labels, y=accuracy(k_runs, dr_results)), secondary_y=False)
# result_fig.add_trace(go.Bar(name="K Estimate", x=labels, y=accuracy(k_runs, kest_results)), secondary_y=False)
# result_fig.add_trace(go.Bar(name="QTrue Overall", x=labels, y=accuracy(k_runs, qtrue_results)), secondary_y=False)
# result_fig.add_trace(go.Bar(name="QRobust Overall", x=labels, y=accuracy(k_runs, qrobust_results)), secondary_y=False)
# result_fig.add_trace(go.Scatter(name="K Runs", x=labels, y=k_runs, mode='markers'), secondary_y=True)

# result_fig.update_layout(title="K Estimation Randomized Results", barmode='group', height=600, width=1200)
# result_fig.update_yaxes(title_text="Accuracy (%)", range=[0, 100.0], secondary_y=False)
# result_fig.update_yaxes(title_text="Run Count", secondary_y=True)
# result_fig.show()

In [8]:
# Overall accuracy (in percent) of each estimation metric across all runs.
# NOTE(review): k_runs, dr_results, kest_results, qtrue_results and qrobust_results
# are only assigned in the commented-out tally cell above, so this cell raises
# NameError until that cell is restored and executed.
print(f"Total Runs: {np.sum(k_runs)}")
print(f"Delta Ratio Accuracy: {100*np.sum(dr_results)/np.sum(k_runs)}%")
print(f"K Estimate Accuracy: {100*np.sum(kest_results)/np.sum(k_runs)}%")
print(f"QTrue Overall Accuracy: {100*np.sum(qtrue_results)/np.sum(k_runs)}%")
print(f"QRobust Overall Accuracy: {100*np.sum(qrobust_results)/np.sum(k_runs)}%")

NameError: name 'k_runs' is not defined

In [None]:
# est_list

In [None]:
# results_df = pd.DataFrame(est_list)
# results_df.to_csv("Run_Results.csv", index=False)

In [None]:
# Average rank of each metric over all runs in est_list.
# NOTE(review): est_list is not assigned in any visible cell - it must already
# exist in the notebook session.
rank_columns = {"dr": "DR Rank", "k": "K Rank", "qt": "QT Rank", "qr": "QR Rank"}
rank_results = dict.fromkeys(rank_columns, 0)
n_runs = len(est_list)
for i_run in est_list:
    # Accumulate each run's share of the mean directly.
    for short_name, column in rank_columns.items():
        rank_results[short_name] += i_run[column] / n_runs
print(f"Average Ranks - DR: {round(rank_results['dr'], 2)}, K: {round(rank_results['k'],2)}, QTrue: {round(rank_results['qt'],2)}, QRobust: {round(rank_results['qr'],2)}")

In [None]:
# Combine the per-run metric tables in results_list into one DataFrame, adding the
# run number, the true k, and two weighted scores (QTW / QRW) with their ranks.
# NOTE(review): results_list and est_list are not assigned in any visible cell;
# each results_list entry is assumed to be a DataFrame with "Factors", "K Estimate",
# "Delta Ratio", "Q(True)" and "Q(Robust)" columns - confirm against the producer.

# Score weights: K Estimate, Delta Ratio, and the Q ratio respectively.
w1 = 0.25
w2 = 0.50
w3 = 0.25

run_frames = []
for j, k in enumerate(results_list):
    # drop() returns a new frame, so the entry in results_list is not modified.
    k = k.drop("Iteration Number", errors="ignore", axis=1)
    actual_k = est_list[j]["true K"]
    k_true = [actual_k]*k.shape[0]
    k_rows = [j+1]*k.shape[0]
    k.insert(0, "Iteration Number", k_rows)
    k.insert(1, "True k", k_true)

    # K Estimate and Delta Ratio are scaled by their column maximum; the Q term is
    # the column minimum divided by each row's Q, so smaller Q scores higher.
    scaled_k_estimate = k["K Estimate"]/k["K Estimate"].max()
    scaled_delta_ratio = k["Delta Ratio"]/k["Delta Ratio"].max()
    k["QTW"] = w1*scaled_k_estimate + w2*scaled_delta_ratio + w3*k["Q(True)"].min()/k["Q(True)"]
    k["QRW"] = w1*scaled_k_estimate + w2*scaled_delta_ratio + w3*k["Q(Robust)"].min()/k["Q(Robust)"]

    # Rank only the interior rows; the first and last rows get no rank.
    # NOTE(review): this requires at least two rows per table, otherwise the
    # padded list is longer than the frame.
    k_chopped = k.iloc[1:-1]
    k["QTW Rank"] = [None]+list(k_chopped["QTW"].rank(ascending=False))+[None]
    k["QRW Rank"] = [None]+list(k_chopped["QRW"].rank(ascending=False))+[None]

    #qtr_ordered_list = k_chopped.sort_values("Weighted QRobust Overall", ascending=False).reset_index()
    #k["QTR Rank"] = [None]+list(qtr_ordered_list.index)+[None]

    run_frames.append(k)

if not run_frames:
    raise ValueError("results_list is empty; there are no run results to combine")

# A single concat replaces the repeated outer merge: each table has a distinct
# "Iteration Number", so the merge never matched rows and only appended them.
results_list_df = pd.concat(run_frames, ignore_index=True)
results_list_df = results_list_df.sort_values(by=["Iteration Number","Factors"]).reset_index(drop=True).round(decimals=4)

# results_list_df.to_csv("Complete_Run_Results.csv",index=False)

# Display the rows of the first two runs.
results_list_df.loc[(results_list_df["Iteration Number"]==1) | (results_list_df["Iteration Number"]==2)]