In [29]:
import argparse
import yaml
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from itertools import product
from datetime import datetime
import json
import hashlib
import os
import matplotlib.pyplot as plt
import plotly.express as px

from algorithms import BMP
from data_generation import GaussianDataGenerator

In [3]:
# Bagging fractions swept by the experiment.
alpha_lst = [0.3, 0.5, 0.7, 0.9]  # forwarded to BMP as atom_bag_percent
beta_lst = [0.3, 0.5, 0.7, 0.9]  # forwarded to BMP as signal_bag_percent

# Synthetic-data settings, passed to GaussianDataGenerator(p, n, m, noise_level, data_seed).
# NOTE(review): the exact meaning of p, n and m is defined by that class -- confirm there.
p, n, m = 300, 600, 20
noise_level, data_seed = 0.2, 1

# Values of the BMP parameter K evaluated for every (alpha, beta) pair.
K_lst = [10, 20, 40, 80, 160, 200]

In [4]:
def cal_bias_variance_corr(n, p, m, data_seed, noise_level, model_params, test_num):
    """Estimate per-sample bias and variance of BMP predictions, plus cross-run correlation.

    A single test set is drawn once with ``GaussianDataGenerator.shuffle()``. One
    ``BMP`` instance is then refit ``test_num`` times, each time on a training set
    produced by ``keep_coef_reshuffle(seed=test_i + 100)``, and evaluated on the
    fixed test dictionary.

    Parameters
    ----------
    n, p, m, data_seed, noise_level
        Forwarded as ``GaussianDataGenerator(p, n, m, noise_level, data_seed)``;
        their meaning is defined by that class.
    model_params : dict
        Keyword arguments used to construct ``BMP``.
    test_num : int
        Number of independent refits; must be at least 1.

    Returns
    -------
    y_pred_bias : numpy.ndarray
        Mean prediction across runs minus ``test_true_signal`` (presumably the
        noise-free test signal -- confirm against the generator), per test sample.
    y_pred_var : numpy.ndarray
        Variance (``np.var`` default, ddof=0) of the predictions across runs,
        per test sample.
    y_pred_corr : numpy.ndarray
        ``(test_num, test_num)`` correlation matrix between the prediction
        vectors of each pair of runs. Always 2-D, also when ``test_num == 1``.
    mse_lst : list
        ``bmp_model.score(test_dictionary, test_perturbed_signal)`` for each run
        (presumably a mean squared error -- confirm against ``BMP.score``).

    Raises
    ------
    ValueError
        If ``test_num`` is smaller than 1.
    """
    # Fail early with a clear message instead of np.concatenate's
    # "need at least one array to concatenate" (also a ValueError).
    if test_num < 1:
        raise ValueError(f"test_num must be at least 1, got {test_num}")

    data_generator = GaussianDataGenerator(p, n, m, noise_level, data_seed)
    test_true_signal, test_dictionary, _, _, test_perturbed_signal = data_generator.shuffle()
    bmp_model = BMP(**model_params)

    y_pred_lst = []
    mse_lst = []
    for test_i in range(test_num):
        # Seeds start at 100 so that every run gets its own reshuffled training set.
        _, train_dictionary, _, _, train_perturbed_signal = data_generator.keep_coef_reshuffle(seed=test_i + 100)
        bmp_model.fit(train_dictionary, train_perturbed_signal)
        y_pred_lst.append(bmp_model.predict(test_dictionary))
        mse_lst.append(bmp_model.score(test_dictionary, test_perturbed_signal))

    # Stacking along axis 1 requires 2-D predictions; the result has one column per run
    # (assumes predict() returns an (n_test, 1) column -- TODO confirm).
    y_pred_mat = np.concatenate(y_pred_lst, axis=1)
    y_pred_mean = np.mean(y_pred_mat, axis=1)
    y_pred_var = np.var(y_pred_mat, axis=1)
    y_pred_bias = y_pred_mean - test_true_signal.ravel()

    # TODO (from the original author): the correlation estimate still needs to be debugged.
    # rowvar=False treats each column (one run) as a variable. np.corrcoef collapses
    # to a scalar for a single run, so force a 2-D result to keep len(corr) usable.
    y_pred_corr = np.atleast_2d(np.corrcoef(y_pred_mat, rowvar=False))
    return y_pred_bias, y_pred_var, y_pred_corr, mse_lst

In [28]:
# Sweep every (alpha, beta) bagging setting over all K values. For each setting,
# res_log stores one list per metric, aligned with K_lst:
#   bias_log -> mean squared bias over the test samples
#   var_log  -> mean prediction variance over the test samples
#   corr_log -> mean of the off-diagonal (upper-triangle) run-to-run correlations
res_log = {key: [] for key in ('params', 'bias_log', 'var_log', 'corr_log')}

for alpha, beta in product(alpha_lst, beta_lst):
    res_log['params'].append(f'alpha: {alpha} beta: {beta}')

    bias_by_K, var_by_K, corr_by_K = [], [], []
    for K in K_lst:
        model_params = {
            'N_bag': 1,
            'K': K,
            'signal_bag_percent': beta,
            'atom_bag_percent': alpha,
            'agg_func': 'avg',
            'replace_flag': False
        }
        # 10 refits per configuration; the per-run scores (mse_lst) are not used here.
        y_pred_bias, y_pred_var, corr, mse_lst = cal_bias_variance_corr(
            n, p, m, data_seed, noise_level, model_params, 10
        )

        # k=1 skips the diagonal, so only distinct pairs of runs are averaged.
        pairwise_corr = corr[np.triu_indices(len(corr), k=1)]

        bias_by_K.append(np.mean(y_pred_bias ** 2))
        var_by_K.append(np.mean(y_pred_var))
        corr_by_K.append(np.mean(pairwise_corr))

    res_log['bias_log'].append(bias_by_K)
    res_log['var_log'].append(var_by_K)
    res_log['corr_log'].append(corr_by_K)


In [32]:
# Flatten res_log into a long-format table: one row per (parameter setting, K) pair,
# ordered by setting first and K second.
records = [
    {
        'params': res_log['params'][setting_idx],
        'K': K_lst[k_idx],
        'bias': res_log['bias_log'][setting_idx][k_idx],
        'var': res_log['var_log'][setting_idx][k_idx],
        'corr': res_log['corr_log'][setting_idx][k_idx]
    }
    for setting_idx in range(len(alpha_lst) * len(beta_lst))
    for k_idx in range(len(K_lst))
]
df = pd.DataFrame(records)

In [33]:
# Show the summary table: one row per (alpha, beta) setting and K value.
df

Unnamed: 0,params,K,bias,var,corr
0,alpha: 0.3 beta: 0.3,10,0.010244,0.014656,0.449917
1,alpha: 0.3 beta: 0.3,20,0.005648,0.021407,0.395963
2,alpha: 0.3 beta: 0.3,40,0.005500,0.030710,0.316691
3,alpha: 0.3 beta: 0.3,80,0.006030,0.039197,0.266683
4,alpha: 0.3 beta: 0.3,160,0.006953,0.047169,0.232363
...,...,...,...,...,...
91,alpha: 0.9 beta: 0.9,20,0.001415,0.005685,0.803463
92,alpha: 0.9 beta: 0.9,40,0.001578,0.009493,0.716195
93,alpha: 0.9 beta: 0.9,80,0.001873,0.014269,0.624501
94,alpha: 0.9 beta: 0.9,160,0.002340,0.018334,0.565801


In [36]:
# The "bias" column holds the mean squared bias; draw one line per (alpha, beta) setting against K.
fig = px.line(
    df,
    x="K",
    y="bias",
    color="params",
    title="Mean bias square of testing set",
).update_traces(mode="markers+lines")  # update_traces returns the same figure
fig.show()

In [38]:
# The "var" column holds the mean prediction variance; draw one line per (alpha, beta) setting against K.
fig = px.line(
    df,
    x="K",
    y="var",
    color="params",
    title="Mean variance of testing set",
).update_traces(mode="markers+lines")  # update_traces returns the same figure
fig.show()

In [39]:
# The "corr" column holds the mean pairwise correlation between runs; draw one line per (alpha, beta) setting against K.
fig = px.line(
    df,
    x="K",
    y="corr",
    color="params",
    title="Mean correlation of testing set",
).update_traces(mode="markers+lines")  # update_traces returns the same figure
fig.show()