In [1]:
import argparse
import yaml
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from itertools import product
from datetime import datetime
import json
import hashlib
import os
import matplotlib.pyplot as plt
import plotly.express as px

from algorithms import BMP
from data_generation import GaussianDataGenerator

In [10]:
# --- Synthetic-data settings -------------------------------------------------
# These five values are handed to GaussianDataGenerator(p, n, m, noise_level,
# data_seed) inside cal_bias_variance_corr.
# NOTE(review): presumably p/n/m are the dictionary dimensions and the number
# of active atoms -- confirm against data_generation.GaussianDataGenerator.
p, n, m = 300, 600, 20
noise_level = 0.2
data_seed = 1

# --- BMP hyper-parameter grids ------------------------------------------------
# Every (alpha, beta) pair is crossed with every K in the sweep cell.
alpha_lst = [0.3, 0.5, 0.7, 0.9]  # passed to BMP as atom_bag_percent
beta_lst = [0.3, 0.5, 0.7, 0.9]  # passed to BMP as signal_bag_percent
K_lst = [2, 4, 6, 8, 10, 20, 40, 80, 160, 200]  # passed to BMP as K

In [3]:
def cal_bias_variance_corr(n, p, m, data_seed, noise_level, model_params, N_bag_for_corr, test_num,
                           train_seed_offset=100):
    """Estimate bias, variance and bag correlation of BMP predictions on one test set.

    A single test set is drawn once with ``shuffle()``. Then ``test_num`` training
    sets are drawn with ``keep_coef_reshuffle`` and two BMP models are refit on
    each of them:

    * a model built from ``model_params`` exactly as given, used for the
      predictions and the per-trial score;
    * a model built from the same parameters with ``N_bag`` overridden to
      ``N_bag_for_corr``, used only for ``pred_corr``.

    Parameters
    ----------
    n, p, m, data_seed, noise_level
        Forwarded as ``GaussianDataGenerator(p, n, m, noise_level, data_seed)``.
        Note that the generator receives ``p`` before ``n``, the reverse of
        this function's own parameter order.
    model_params : dict
        Keyword arguments for ``BMP``. The dict is not modified.
    N_bag_for_corr
        Value of ``N_bag`` used for the correlation model.
    test_num : int
        Number of training-set redraws; must be at least 1.
    train_seed_offset : int, optional
        Trial ``i`` reshuffles the training data with seed
        ``i + train_seed_offset``. The default (100) is the value that was
        previously hard-coded.

    Returns
    -------
    y_pred_bias : numpy.ndarray
        Mean prediction over trials minus the flattened ``test_true_signal``.
    y_pred_var : numpy.ndarray
        Variance of the predictions over trials (``np.var`` with its defaults).
    y_pred_corr : numpy.ndarray
        ``np.vstack`` of the per-trial ``pred_corr`` outputs, in trial order.
    mse_lst : list
        Per-trial ``bmp_model.score(test_dictionary, test_perturbed_signal)``.
        NOTE(review): the name suggests an MSE -- confirm against ``BMP.score``.

    Raises
    ------
    ValueError
        If ``test_num`` is smaller than 1.
    """
    # Fail fast with a clear message; otherwise the empty trial loop only
    # surfaces later as numpy's "need at least one array to concatenate".
    if test_num < 1:
        raise ValueError(f"test_num must be at least 1, got {test_num!r}")

    data_generator = GaussianDataGenerator(p, n, m, noise_level, data_seed)
    # shuffle() returns five values; the two in the middle (true indices and
    # true coefficients in the original unpacking) are not needed here.
    test_true_signal, test_dictionary, _, _, test_perturbed_signal = data_generator.shuffle()

    bmp_model = BMP(**model_params)
    # Same configuration with only N_bag replaced; building a new dict leaves
    # the caller's model_params untouched.
    bmp_model_multibag = BMP(**{**model_params, 'N_bag': N_bag_for_corr})

    y_pred_lst = []
    mse_lst = []
    corr_lst = []
    for test_i in range(test_num):
        _, train_dictionary, _, _, train_perturbed_signal = data_generator.keep_coef_reshuffle(
            seed=test_i + train_seed_offset
        )
        # Both models are refit on the same training draw.
        bmp_model.fit(train_dictionary, train_perturbed_signal)
        bmp_model_multibag.fit(train_dictionary, train_perturbed_signal)

        y_pred_lst.append(bmp_model.predict(test_dictionary))
        mse_lst.append(bmp_model.score(test_dictionary, test_perturbed_signal))
        corr_lst.append(bmp_model_multibag.pred_corr(test_dictionary))

    # Concatenating along axis 1 requires 2-D predictions; presumably each one
    # is an (n_test, 1) column so the result is (n_test, test_num) -- TODO confirm.
    y_pred_mat = np.concatenate(y_pred_lst, axis=1)
    y_pred_mean = np.mean(y_pred_mat, axis=1)
    y_pred_var = np.var(y_pred_mat, axis=1)
    y_pred_bias = y_pred_mean - test_true_signal.ravel()

    y_pred_corr = np.vstack(corr_lst)
    return y_pred_bias, y_pred_var, y_pred_corr, mse_lst

In [11]:
# Sweep every (alpha, beta) pair over all K values and record, per configuration,
# the mean squared bias, mean variance and mean prediction correlation.

# Trial settings shared by every configuration (previously inline magic numbers).
N_bag_for_corr = 100  # N_bag of the auxiliary model used only for pred_corr
test_num = 50         # number of training-set redraws per configuration

# res_log['bias_log'][i][j] (likewise 'var_log' / 'corr_log') belongs to the
# i-th (alpha, beta) pair in res_log['params'] and the j-th entry of K_lst.
res_log = {
    'params': [],
    'bias_log': [],
    'var_log': [],
    'corr_log': []
}
for alpha, beta in product(alpha_lst, beta_lst):
    param = f'alpha: {alpha} beta: {beta}'
    res_log['params'].append(param)
    bias_tmp = []
    var_tmp = []
    corr_tmp = []
    for K in K_lst:
        # Base model uses a single bag; cal_bias_variance_corr builds a second
        # model with N_bag = N_bag_for_corr for the correlation estimate.
        model_params = {
            'N_bag': 1,
            'K': K,
            'signal_bag_percent': beta,
            'atom_bag_percent': alpha,
            'agg_func': 'avg',
            'replace_flag': False
        }
        # mse_lst is returned but not logged by this sweep.
        y_pred_bias, y_pred_var, y_pred_corr, mse_lst = cal_bias_variance_corr(
            n, p, m, data_seed, noise_level, model_params, N_bag_for_corr, test_num
        )
        # Average the correlations within each trial first, then across trials.
        y_pred_corr_trials_avg = np.mean(y_pred_corr, axis=1)
        bias_tmp.append(np.mean(y_pred_bias**2))
        var_tmp.append(np.mean(y_pred_var))
        corr_tmp.append(np.mean(y_pred_corr_trials_avg))
    res_log['bias_log'].append(bias_tmp)
    res_log['var_log'].append(var_tmp)
    res_log['corr_log'].append(corr_tmp)


In [12]:
# Flatten res_log into a long-format table: one row per (alpha, beta) label and K.
# Rows are driven by what res_log actually contains rather than by index
# arithmetic on alpha_lst / beta_lst, so a grid edited after the sweep ran
# cannot silently mislabel rows.
result_rows = []
for param_label, bias_row, var_row, corr_row in zip(
    res_log['params'], res_log['bias_log'], res_log['var_log'], res_log['corr_log']
):
    # Each logged row must line up one-to-one with the current K_lst.
    if not (len(bias_row) == len(var_row) == len(corr_row) == len(K_lst)):
        raise ValueError(
            f"res_log entry for {param_label!r} does not match K_lst "
            f"({len(bias_row)}/{len(var_row)}/{len(corr_row)} values vs {len(K_lst)} K values); "
            "re-run the sweep cell"
        )
    for k_value, bias_value, var_value, corr_value in zip(K_lst, bias_row, var_row, corr_row):
        result_rows.append({
            'params': param_label,
            'K': k_value,
            'bias': bias_value,
            'var': var_value,
            'corr': corr_value
        })
# Explicit columns keep the schema stable even when there are no rows.
df = pd.DataFrame(result_rows, columns=['params', 'K', 'bias', 'var', 'corr'])

In [13]:
# Display the long-format results table: one row per (alpha, beta) label and K,
# with the mean squared bias, mean variance and mean correlation columns.
df

Unnamed: 0,params,K,bias,var,corr
0,alpha: 0.3 beta: 0.3,2,0.020618,0.005651,0.285306
1,alpha: 0.3 beta: 0.3,4,0.010674,0.007959,0.406063
2,alpha: 0.3 beta: 0.3,6,0.009166,0.010736,0.488647
3,alpha: 0.3 beta: 0.3,8,0.008227,0.012927,0.517591
4,alpha: 0.3 beta: 0.3,10,0.007949,0.015077,0.539697
...,...,...,...,...,...
155,alpha: 0.9 beta: 0.9,20,0.000608,0.006280,0.925363
156,alpha: 0.9 beta: 0.9,40,0.000448,0.010473,0.897946
157,alpha: 0.9 beta: 0.9,80,0.000443,0.015101,0.890401
158,alpha: 0.9 beta: 0.9,160,0.000437,0.019948,0.903541


In [14]:
# Mean squared bias versus K, one line (with markers) per (alpha, beta) setting.
fig = px.line(
    df,
    x="K",
    y="bias",
    color="params",
    title="Mean bias square of testing set",
).update_traces(mode="markers+lines")  # update_traces returns the same figure
fig.show()

In [15]:
# Mean prediction variance versus K, one line (with markers) per (alpha, beta) setting.
fig = px.line(
    df,
    x="K",
    y="var",
    color="params",
    title="Mean variance of testing set",
).update_traces(mode="markers+lines")  # update_traces returns the same figure
fig.show()

In [16]:
# Mean prediction correlation versus K, one line (with markers) per (alpha, beta) setting.
fig = px.line(
    df,
    x="K",
    y="corr",
    color="params",
    title="Mean correlation of testing set",
).update_traces(mode="markers+lines")  # update_traces returns the same figure
fig.show()