## ESAT K Estimation Metrics


#### Code Imports

In [1]:
import numpy as np
import pandas as pd
from esat.data.datahandler import DataHandler
from esat.model.batch_sa import BatchSA
from esat.data.analysis import ModelAnalysis, BatchAnalysis
from esat_eval.simulator import Simulator
from esat.estimator import FactorEstimator

#### Synthetic Dataset

Generate synthetic input (V) and uncertainty (U) datasets for model analysis. V and U are generated in the following sequence:

1.	Feature profiles are defined and/or randomly generated (H); if the latter, for each feature, a random number of factors between 1 and K are chosen as sources for that feature. For each contributing factor, a random contribution (uniform value between 0 and 1) is assigned. If one or more predefined factor profiles (a row of H) are provided by the user, they are assigned to H in order of occurrence and overwrite the corresponding randomly generated row of H.
2.	Sample concentrations are defined and/or randomly generated (W); if the latter, each cell of W is set to a random uniform number between 0 and contribution_max
3.	V1 is calculated as the product W x H
4.	A noise matrix (N) is created by selecting values from a normal distribution with a randomly selected mean noise (uniform distribution between noise_mean_min and noise_mean_max) for each feature, and standard deviation = noise_scale. The randomly selected mean noise for a feature has a 50% chance to be multiplied by -1 to allow for the reduction of values in V1. Then the Hadamard product (element-wise matrix multiplication) of V1 and N is used to calculate V: V1 + V1◦N -> V
5.  Outliers are added to V if outliers=True. A number of elements in V (a proportion = outlier_p) are randomly selected and each one has a 50% chance to become V*outlier_mag, and a 50% chance to become V/outlier_mag

6.	An uncertainty matrix (U1) is created by selecting values from a normal distribution with a randomly selected mean uncertainty (uniform distribution between uncertainty_mean_min and uncertainty_mean_max) for each feature, and standard deviation = uncertainty_scale. Then the Hadamard product of V and U1 is used to calculate U: V◦U1 -> U

In [2]:
# Synethic dataset parameter value ranges
syn_factors_min = 3
syn_factors_max = 8

syn_features_min = 5
syn_features_max = 100

syn_samples_min = 20
syn_samples_max = 2000

outliers = True
outliers_p_min = 0.05
outliers_p_max = 0.1
outliers_mag_min = 1.1
outliers_mag_max = 2

noise_mean_min = 0.1
noise_mean_max = 0.2
noise_scale = 0.05

uncertainty_mean_min = 0.1
uncertainty_mean_max = 0.2
uncertainty_scale = 0.05

contr_curve_min_range = [0.0, 1.0]
contr_curve_max_range = [2.0, 5.0]
contr_curve_scale_range = [0.1, 0.5]

# Sampling parameters
random_seed = 42
test_n = 1000

# Each iteration
samples = 100
min_factors = 2
max_factors = 12

In [3]:
rng = np.random.default_rng(seed=random_seed)

In [9]:
# Initialize the simulator with the above parameters
def generate_synthetic_data(true_factor):
    n_features = rng.integers(low=syn_features_min, high=syn_features_max, size=1)[0]
    n_samples = rng.integers(low=syn_samples_min, high=syn_samples_max, size=1)[0]
    i_outlier_p = round(rng.uniform(low=outliers_p_min, high=outliers_p_max, size=1)[0], 2)
    i_outlier_mag = round(rng.uniform(low=outliers_mag_min, high=outliers_mag_min, size=1)[0], 2)
    contribution_max = round(rng.uniform(low=1.0, high=10.0, size=1)[0], 2)
    print(f"True Factors: {true_factor}, Features: {n_features}, Samples: {n_samples}, Outliers %: {i_outlier_p}, Outliers Magnitude: {i_outlier_mag}, Contribution Max: {contribution_max}")
    simulator = Simulator(seed=rng.integers(low=0, high=1e10, size=1)[0],
                          factors_n=true_factor,
                          features_n=n_features,
                          samples_n=n_samples,
                          outliers=outliers,
                          outlier_p=i_outlier_p,
                          outlier_mag=i_outlier_mag,
                          contribution_max=contribution_max,
                          noise_mean_min=noise_mean_min,
                          noise_mean_max=noise_mean_max,
                          noise_scale=noise_scale,
                          uncertainty_mean_min=uncertainty_mean_min,
                          uncertainty_mean_max=uncertainty_mean_max,
                          uncertainty_scale=uncertainty_scale
                         )
    curved_factors_count = rng.integers(low=0, high=true_factor, size=1)[0]
    print(f"Factors Curve Update Count: {curved_factors_count}")
    curved_factor_list = rng.choice(list(range(true_factor)), size=curved_factors_count, replace=False)
    print(f"Updating factors: {curved_factor_list} curve type")
    for c_i in curved_factor_list:
        # parameters not used by the curve type are ignored
        i_curve_type = rng.choice(['uniform', 'decreasing', 'increasing', 'logistic', 'periodic'], size=1)[0]
        print(f"New curve type: {i_curve_type}")
        i_curve_min = rng.uniform(low=contr_curve_min_range[0], high=contr_curve_min_range[1], size=1)[0]
        i_curve_max = rng.uniform(low=contr_curve_max_range[0], high=contr_curve_max_range[1], size=1)[0]
        i_curve_scale = rng.uniform(low=contr_curve_scale_range[0], high=contr_curve_scale_range[1], size=1)[0]
        i_curve_frequency = rng.uniform(low=0.1, high=0.9, size=1)[0]
        simulator.update_contribution(factor_i=c_i, curve_type=i_curve_type, scale=i_curve_scale, frequency=i_curve_frequency, minimum=i_curve_min, maximum=i_curve_max)
    
    syn_input_df, syn_uncertainty_df = simulator.get_data()
    data_handler = DataHandler.load_dataframe(input_df=syn_input_df, uncertainty_df=syn_uncertainty_df)
    data_handler.metrics
    V, U = data_handler.get_data()
    return V, U

def run_estimation(k, eV, eU, e_samples: int = 100, min_factors: int = 2, max_factors: int = 12):
    factor_est = FactorEstimator(V=eV, U=eU)
    results = factor_est.run(samples=e_samples, min_factors=min_factors, max_factors=max_factors)
    results["Overall Score"] = (results["Factors"] * results["Delta Ratio"] * results["Q(True)"].min()) / results["Q(True)"]

    estimation = {
        "true K": k,
        "delta ratio": np.nanargmax(results["Delta Ratio"].values) + min_factors,
        "K estimate": np.nanargmax(results["K Estimate"].values) + min_factors,
        "Overall Score": np.nanargmax(results["Overall Score"].values) + min_factors,
        "features": eV.shape[1],
        "samples": eV.shape[0]
    }
    return estimation, results
    

In [5]:
# Random test
true_factor = rng.integers(low=syn_factors_min, high=syn_factors_max, size=1)[0]
print(f"True Factors: {true_factor}")
i_V, i_U = generate_synthetic_data(true_factor=true_factor)
i_V.shape

29-Aug-24 16:18:02 - Synthetic profiles generated
29-Aug-24 16:18:02 - Synthetic factor 3 contribution updated as a random sampling from a normal distribution along a logistic curve.
29-Aug-24 16:18:02 - Synthetic data generated
29-Aug-24 16:18:02 - Synthetic uncertainty data generated
29-Aug-24 16:18:02 - Synthetic dataframes completed
29-Aug-24 16:18:02 - Synthetic source apportionment instance created.


True Factors: 3
True Factors: 3, Features: 78, Samples: 1316, Outliers %: 0.09, Outliers Magnitude: 1.1, Contribution Max: 1.85
Factors Curve Update Count: 1
Updating factors: [2] curve type
New curve type: logistic


(1316, 78)

In [6]:
%%time
# factor_est = FactorEstimator(V=i_V, U=i_U)
# results = factor_est.run(samples=50, min_factors=2, max_factors=10)
# results

CPU times: total: 0 ns
Wall time: 0 ns


In [10]:
%%time
estimation, results = run_estimation(k=true_factor, eV=i_V, eU=i_U, e_samples=samples, min_factors=min_factors, max_factors=max_factors)
estimation

Rapid random sampling for factor estimation: 100%|███████████████████████████████████| 100/100 [20:20<00:00, 12.20s/it]
29-Aug-24 17:02:23 - Estimated factor count: 2


CPU times: total: 3.72 s
Wall time: 20min 20s


{'true K': 3,
 'delta ratio': 8,
 'K estimate': 2,
 'Overall Score': 8,
 'features': 78,
 'samples': 1316}

In [11]:
results

Unnamed: 0,Factors,Test MSE,Train MSE,Delta MSE,Delta Ratio,K Estimate,Q(True),Overall Score
0,2,1.106017,1.095301,,,0.095388,3037098000.0,
1,3,0.742907,0.750106,0.36311,4.512463,0.08383,1818244000.0,0.025296
2,4,0.63215,0.622229,0.110757,0.719913,0.067779,3483862000.0,0.002527
3,5,0.420393,0.420707,0.211757,8.506746,0.076257,826640600.0,0.146846
4,6,0.38613,0.387314,0.034263,-0.035778,0.065503,4233725.0,-0.137815
5,7,1.704258,1.72891,-1.318128,-1.331309,0.012146,145053600.0,-0.168388
6,8,0.341478,0.343755,1.36278,43.360088,0.050958,3073054.0,287.631991
7,9,0.298219,0.298927,0.04326,6.163705,0.050066,2510543.0,55.053379
8,10,0.288558,0.29014,0.00966,0.461914,0.045119,2223052.0,5.082886
9,11,0.259773,0.25756,0.028785,-3.829129,0.044278,2038530.0,-49.778682
