## Task 10: Plotting Bias-Variance Tradeoff and Correlation

In [1]:
import argparse
import yaml
import numpy as np
import pickle as pkl
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from itertools import product
from datetime import datetime
import json
import hashlib
import os
import matplotlib.pyplot as plt

from algorithms import BMP
from data_generation import GaussianDataGenerator

In [44]:
p = 300
n = 600
m = 20
noise_level = 0.2

Data_Geneartor = GaussianDataGenerator(p, n, m, noise_level, random_seed=1)
_ = Data_Geneartor.shuffle()

In [47]:
(
    test_true_signal,
    test_dictionary,
    true_indices,
    true_coefficients,
    test_perturbed_signal,
) = Data_Geneartor.get_current_shuffle()
print(test_perturbed_signal[:5])
print(true_coefficients[:5])

[[ 0.05826719]
 [-0.329659  ]
 [-0.62405756]
 [ 0.19549826]
 [ 0.29230804]]
[[ 0.64427199]
 [-0.96876819]
 [ 0.21446787]
 [ 0.18492569]
 [ 1.33144795]]


In [20]:
train_seed = 33
(
    train_true_signal,
    train_dictionary,
    true_indices,
    true_coefficients,
    train_perturbed_signal,
) = Data_Geneartor.keep_coef_reshuffle(seed = train_seed)

In [48]:
model_params = {
    'N_bag': 100,
    'K': 20,
    'signal_bag_percent': 0.3,
    'atom_bag_percent': 0.3,
    'agg_func': 'avg',
    'replace_flag': False
}

bmp_model = BMP(**model_params)

In [49]:
bmp_model.fit(train_dictionary, train_perturbed_signal)
y_pred = bmp_model.predict(test_dictionary)
bmp_model.score(test_dictionary, test_perturbed_signal)

0.041208068908968165

Now we are ready to calculate bias variance and correlation of y_pred

In [51]:
def cal_bias_variance_corr(n, p, m, data_seed, noise_level, model_params, test_num):
    Data_Geneartor = GaussianDataGenerator(p, n, m, noise_level, data_seed)
    test_true_signal, test_dictionary, true_indices, true_coefficients, test_perturbed_signal = Data_Geneartor.shuffle()
    bmp_model = BMP(**model_params)
    y_pred_lst = []
    mse_lst = []
    for test_i in range(test_num):
        _, train_dictionary, _, _, train_perturbed_signal = Data_Geneartor.keep_coef_reshuffle(seed = test_i + 100)
        bmp_model.fit(train_dictionary, train_perturbed_signal)
        y_pred = bmp_model.predict(test_dictionary)
        y_pred_lst.append(y_pred)
        mse_lst.append(bmp_model.score(test_dictionary, test_perturbed_signal))
    y_pred_mat = np.concatenate(y_pred_lst, axis = 1)
    y_pred_mean = np.mean(y_pred_mat, axis = 1)
    y_pred_var = np.var(y_pred_mat, axis = 1)
    y_pred_bias = y_pred_mean - test_true_signal
    corr = np.corrcoef(y_pred_mat, rowvar=False)
    return y_pred_bias, y_pred_var, corr, mse_lst

In [53]:
model_params = {
    'N_bag': 50,
    'K': 20,
    'signal_bag_percent': 0.3,
    'atom_bag_percent': 0.3,
    'agg_func': 'avg',
    'replace_flag': False
}

p = 300
n = 600
m = 20
noise_level = 0.2
data_seed = 1

cal_bias_variance_corr(n, p, m, data_seed, noise_level, model_params, 20)

(array([[-0.00940016, -0.21376644, -0.25220852, ..., -0.38345634,
          0.00660752, -0.12997846],
        [ 0.26552805,  0.06116178,  0.02271969, ..., -0.10852812,
          0.28153574,  0.14494976],
        [ 0.3504726 ,  0.14610633,  0.10766424, ..., -0.02358357,
          0.36648029,  0.22989431],
        ...,
        [ 0.43243457,  0.2280683 ,  0.18962621, ...,  0.0583784 ,
          0.44844226,  0.31185628],
        [-0.02991918, -0.23428545, -0.27272754, ..., -0.40397535,
         -0.0139115 , -0.15049748],
        [ 0.16477845, -0.03958783, -0.07802991, ..., -0.20927773,
          0.18078613,  0.04420015]]),
 array([0.0012957 , 0.00321258, 0.00165476, 0.00582023, 0.00220626,
        0.00161097, 0.00213001, 0.00293098, 0.00235027, 0.0025455 ,
        0.00243013, 0.00092671, 0.00264055, 0.0019549 , 0.00300793,
        0.00365107, 0.00327333, 0.00107854, 0.00485927, 0.00374018,
        0.00104405, 0.00240851, 0.00307885, 0.00221724, 0.00217717,
        0.00196619, 0.00212995, 0