In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.preprocessing import StandardScaler
from functools import reduce  
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from mlxtend.evaluate import bias_variance_decomp
from sklearn import metrics
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
from auxiliary_files.auxiliary_plots import * 
from auxiliary_files.auxiliary_analysis import * 
from auxiliary_files.auxiliary_tables import *

In [6]:
# Active seed for this experiment (other seeds tried: 123, 190, 200, 66).
np.random.seed(210)

# Simulation design: 20 observations, 8 predictors, 3 non-zero coefficients.
n = 20
p = 8
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = [0, 0.1, 0.3, 0.5, 0.7, 0.8]
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced values from 1e-4 to 1e1

store_X_test = []
store_y_test = []

# One simulated data set per correlation level; its X and y are stored and
# reused below as the fixed inputs/targets for that correlation level.
for rho in cor_factor:
    y_test, X_test, df_test = get_sim_data(n, p, rho, true_betas)
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Predictions for every correlation level, in the order given by cor_factor.
df_predictions_all = [
    get_predictions(n, p, true_betas, rho, iterations, alphas, X_fixed)
    for rho, X_fixed in zip(cor_factor, store_X_test)
]
(
    df_predictions_1,
    df_predictions_2,
    df_predictions_3,
    df_predictions_4,
    df_predictions_5,
    df_predictions_6,
) = df_predictions_all

# MSE, variance and squared bias for every correlation level.
mse_results = [
    compute_mse(predictions, y_fixed)
    for predictions, y_fixed in zip(df_predictions_all, store_y_test)
]
store_mse_1, store_variance_1, store_bias_sq_1 = mse_results[0]
store_mse_2, store_variance_2, store_bias_sq_2 = mse_results[1]
store_mse_3, store_variance_3, store_bias_sq_3 = mse_results[2]
store_mse_4, store_variance_4, store_bias_sq_4 = mse_results[3]
store_mse_5, store_variance_5, store_bias_sq_5 = mse_results[4]
store_mse_6, store_variance_6, store_bias_sq_6 = mse_results[5]



In [7]:
# Smallest MSE and the position where it occurs, for each series in store_mse_1.
for mse_series in store_mse_1:
    print(min(mse_series), np.argmin(mse_series))

1.294973556133842 0
1.2951180617502536 0
1.2962359060542044 0
1.2958158266056012 0
1.2951455563509326 0


In [8]:
# Smallest MSE and the position where it occurs, for each series in store_mse_2.
for mse_series in store_mse_2:
    print(min(mse_series), np.argmin(mse_series))

1.2380474762671343 151
0.5989540600341798 125
1.160345900187367 106
0.9793690331635143 116
0.6130707112546397 125


In [9]:
# Smallest MSE and the position where it occurs, for each series in store_mse_3.
for mse_series in store_mse_3:
    print(min(mse_series), np.argmin(mse_series))

2.0706833810097005 128
1.3398163289517586 132
2.0337815376364103 91
1.8967791340267053 107
1.3696791518626947 132


In [10]:
# Only setting in which elastic net outperforms both lasso and ridge (for l1 = 0.5 and 0.7).

# Smallest MSE and the position where it occurs, for each series in store_mse_4.
for mse_series in store_mse_4:
    print(min(mse_series), np.argmin(mse_series))

0.6754013861143037 170
0.6269705722939336 135
0.631904903217388 120
0.5794044196240361 125
0.6188576862400641 134


In [11]:
# Smallest MSE and the position where it occurs, for each series in store_mse_5.
for mse_series in store_mse_5:
    print(min(mse_series), np.argmin(mse_series))

0.9047012166206374 151
0.8286202879153189 111
0.8893109032297652 101
0.8644077122487904 105
0.8295943841382927 111


In [12]:
# Smallest MSE and the position where it occurs, for each series in store_mse_6.
for mse_series in store_mse_6:
    print(min(mse_series), np.argmin(mse_series))

0.6731623057226193 164
0.521129891650058 125
0.6376483053664417 114
0.5839160663984087 117
0.521873694062772 125


In [13]:
"""ridge should do best here in all cases"""

# Active seed for this experiment (other seeds tried: 190, 200, 210, 66).
np.random.seed(123)

# Simulation design: 20 observations, 8 predictors, every coefficient equal to 0.85.
n = 20
p = 8
true_betas = np.repeat(0.85, 8)
cor_factor = [0, 0.1, 0.3, 0.5, 0.7, 0.8]
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced values from 1e-4 to 1e1

store_X_test = []
store_y_test = []

# One simulated data set per correlation level; its X and y are stored and
# reused below as the fixed inputs/targets for that correlation level.
for rho in cor_factor:
    y_test, X_test, df_test = get_sim_data(n, p, rho, true_betas)
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Predictions for every correlation level, in the order given by cor_factor.
df_predictions_ls_all = [
    get_predictions(n, p, true_betas, rho, iterations, alphas, X_fixed)
    for rho, X_fixed in zip(cor_factor, store_X_test)
]
(
    df_predictions_ls_1,
    df_predictions_ls_2,
    df_predictions_ls_3,
    df_predictions_ls_4,
    df_predictions_ls_5,
    df_predictions_ls_6,
) = df_predictions_ls_all

# MSE, variance and squared bias for every correlation level.
mse_results_ls = [
    compute_mse(predictions, y_fixed)
    for predictions, y_fixed in zip(df_predictions_ls_all, store_y_test)
]
store_mse_ls_1, store_variance_ls_1, store_bias_sq_ls_1 = mse_results_ls[0]
store_mse_ls_2, store_variance_ls_2, store_bias_sq_ls_2 = mse_results_ls[1]
store_mse_ls_3, store_variance_ls_3, store_bias_sq_ls_3 = mse_results_ls[2]
store_mse_ls_4, store_variance_ls_4, store_bias_sq_ls_4 = mse_results_ls[3]
store_mse_ls_5, store_variance_ls_5, store_bias_sq_ls_5 = mse_results_ls[4]
store_mse_ls_6, store_variance_ls_6, store_bias_sq_ls_6 = mse_results_ls[5]




In [14]:
# Smallest MSE and the position where it occurs, for each series in store_mse_ls_1.
for mse_series in store_mse_ls_1:
    print(min(mse_series), np.argmin(mse_series))

0.6328685732064164 199
0.4157596358523855 181
0.41282818752350364 192
0.4161886833783439 193
0.4160358786171014 187


In [15]:
# Smallest MSE and the position where it occurs, for each series in store_mse_ls_2.
for mse_series in store_mse_ls_2:
    print(min(mse_series), np.argmin(mse_series))

2.4708848617041204 155
2.547768294838287 0
2.506256624737217 96
2.5418651874153566 79
2.5475430695676624 0


In [16]:
# Smallest MSE and the position where it occurs, for each series in store_mse_ls_3.
for mse_series in store_mse_ls_3:
    print(min(mse_series), np.argmin(mse_series))

0.5214577708046586 199
1.532239001309111 134
0.585327762543371 153
0.8819700161134203 147
1.1238047450280437 143


In [17]:
# Smallest MSE and the position where it occurs, for each series in store_mse_ls_4.
for mse_series in store_mse_ls_4:
    print(min(mse_series), np.argmin(mse_series))

0.30673624106238695 194
0.69148109757708 106
0.35408706097896225 140
0.45618938410309 135
0.5566821256918226 129


In [18]:
# Smallest MSE and the position where it occurs, for each series in store_mse_ls_5.
for mse_series in store_mse_ls_5:
    print(min(mse_series), np.argmin(mse_series))

0.7364908724774366 166
0.9055901649901404 81
0.7721661835330006 112
0.8317379452284769 105
0.8669102900869129 98


In [19]:
# Smallest MSE and the position where it occurs, for each series in store_mse_ls_6.
for mse_series in store_mse_ls_6:
    print(min(mse_series), np.argmin(mse_series))

0.1425766264124557 193
0.5245712202852323 125
0.1628614311142402 140
0.21961946470192778 137
0.2992716975351493 135


# High Dimensionality, sparsity, varying correlation

In [20]:
def compute_mse(predictions_df_list, y_test, iterations=None, test_index=14):
    """Compute MSE, variance and squared bias of predictions for one test observation.

    Every column of every DataFrame in ``predictions_df_list`` is scored against
    the single target value ``y_test.iloc[test_index]``.

    Parameters
    ----------
    predictions_df_list : iterable of pandas.DataFrame
        Frames of predictions. Each column is scored separately; presumably one
        row per simulation run and one column per penalty value -- TODO confirm
        against ``get_predictions``.
    y_test : pandas.Series or pandas.DataFrame
        Test targets; only the entry at position ``test_index`` is used.
    iterations : int, optional
        Divisor for the summed squared errors. Defaults to the number of rows of
        the frame being scored, so that MSE = variance + squared bias.
    test_index : int, default 14
        Position in ``y_test`` of the observation the predictions are compared to.

    Returns
    -------
    tuple of (list, list, list)
        ``(mse_lists, variance_lists, bias_sq_lists)``; each holds one inner list
        per input frame, with one value per column of that frame.
    """
    store_mse_lists = []
    store_variance_lists = []
    store_bias_sq_lists = []

    for predictions in predictions_df_list:
        target = y_test.iloc[test_index]
        n_draws = len(predictions) if iterations is None else iterations

        store_mse = []
        store_variance = []
        store_bias_sq = []

        # Walk the columns by position: .iloc is positional, so indexing it with
        # column labels is only correct when the labels happen to be 0..k-1.
        for position in range(predictions.shape[1]):
            column = predictions.iloc[:, position]
            values = np.asarray(column)
            mean_prediction = np.mean(column)

            store_mse.append(np.sum((values - target) ** 2) / n_draws)
            store_variance.append(np.mean((mean_prediction - values) ** 2))
            store_bias_sq.append((mean_prediction - target) ** 2)

        store_mse_lists.append(store_mse)
        store_variance_lists.append(store_variance)
        store_bias_sq_lists.append(store_bias_sq)

    return store_mse_lists, store_variance_lists, store_bias_sq_lists

In [5]:
# Coefficient design for the high-dimensional setting; presumably the counts of
# non-zero and zero coefficients (10 + 25 = 35 entries) -- TODO confirm against
# generate_true_betas.
non_zero_betas = [10]
zero_betas = [25]
# Defined here so the cell runs on a fresh kernel. It previously relied on a
# value left in the kernel by the later simulation cell, which uses the same 2.
size_non_zero = 2

true_betas_hd = generate_true_betas(non_zero_betas, zero_betas, size_non_zero)

len(true_betas_hd[0])

35

In [8]:
# Active seed for this experiment (other seeds tried: 123 -- "works fine!" --, 200, 210, 66).
np.random.seed(190)

non_zero_betas = [10]
zero_betas = [25]
size_non_zero = 2
true_betas_hd = generate_true_betas(non_zero_betas, zero_betas, size_non_zero)

# High-dimensional design: more predictors (p = 35) than observations (n = 30).
n = 30
p = 35
cor_factor = [0, 0.1, 0.3, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9]
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced values from 1e-4 to 1e1

store_X_test = []
store_y_test = []

# One simulated data set per correlation level; its X and y are stored and
# reused below as the fixed inputs/targets for that correlation level.
for rho in cor_factor:
    y_test, X_test, df_test = get_sim_data(n, p, rho, true_betas_hd[0])
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Predictions for every correlation level, in the order given by cor_factor.
df_predictions_hd_all = [
    get_predictions(n, p, true_betas_hd[0], rho, iterations, alphas, X_fixed)
    for rho, X_fixed in zip(cor_factor, store_X_test)
]
(
    df_predictions_hd_1,
    df_predictions_hd_2,
    df_predictions_hd_3,
    df_predictions_hd_4,
    df_predictions_hd_5,
    df_predictions_hd_6,
    df_predictions_hd_7,
    df_predictions_hd_8,
    df_predictions_hd_9,
) = df_predictions_hd_all

# MSE, variance and squared bias for every correlation level.
mse_results_hd = [
    compute_mse(predictions, y_fixed, iterations)
    for predictions, y_fixed in zip(df_predictions_hd_all, store_y_test)
]
store_mse_hd_1, store_variance_hd_1, store_bias_sq_hd_1 = mse_results_hd[0]
store_mse_hd_2, store_variance_hd_2, store_bias_sq_hd_2 = mse_results_hd[1]
store_mse_hd_3, store_variance_hd_3, store_bias_sq_hd_3 = mse_results_hd[2]
store_mse_hd_4, store_variance_hd_4, store_bias_sq_hd_4 = mse_results_hd[3]
store_mse_hd_5, store_variance_hd_5, store_bias_sq_hd_5 = mse_results_hd[4]
store_mse_hd_6, store_variance_hd_6, store_bias_sq_hd_6 = mse_results_hd[5]
store_mse_hd_7, store_variance_hd_7, store_bias_sq_hd_7 = mse_results_hd[6]
store_mse_hd_8, store_variance_hd_8, store_bias_sq_hd_8 = mse_results_hd[7]
store_mse_hd_9, store_variance_hd_9, store_bias_sq_hd_9 = mse_results_hd[8]


NameError: name 'iterations' is not defined

In [22]:
#2.98

# Smallest MSE and the position where it occurs, for each series in store_mse_hd_1.
for mse_series in store_mse_hd_1:
    print(min(mse_series), np.argmin(mse_series))

5.722529553607648 189
1.9477681259102655 120
4.7110870312262465 124
3.368037086637932 118
2.668716815920924 117


In [23]:
# 5.07

# Smallest MSE and the position where it occurs, for each series in store_mse_hd_2.
for mse_series in store_mse_hd_2:
    print(min(mse_series), np.argmin(mse_series))

9.411593333421497 169
4.078711445913463 119
7.931813540195717 112
6.049486716970318 114
5.1215344065827 113


In [24]:
#9.42

# Smallest MSE and the position where it occurs, for each series in store_mse_hd_3.
for mse_series in store_mse_hd_3:
    print(min(mse_series), np.argmin(mse_series))

15.714047611249931 135
6.57466636625762 103
11.092029259535344 20
10.504714116071582 91
8.569909531192156 97


In [25]:
#2.18

# Smallest MSE and the position where it occurs, for each series in store_mse_hd_4.
for mse_series in store_mse_hd_4:
    print(min(mse_series), np.argmin(mse_series))

4.973013440888867 176
1.6970645917525118 128
3.8193329846314747 124
2.562492063331387 129
2.0390959927532215 131


In [26]:
# 1.33 

# Smallest MSE and the position where it occurs, for each series in store_mse_hd_5.
for mse_series in store_mse_hd_5:
    print(min(mse_series), np.argmin(mse_series))

2.6407831326971545 168
1.346996941609988 122
2.0678403886953127 112
1.514983664737439 117
1.2995911852931374 120


In [27]:
#1.48

# Smallest MSE and the position where it occurs, for each series in store_mse_hd_6.
for mse_series in store_mse_hd_6:
    print(min(mse_series), np.argmin(mse_series))

3.2632563671166532 164
1.7160947171621663 123
2.659132100675153 111
1.8822090239682039 119
1.588781290366569 122


In [28]:
# 0.40

# Smallest MSE and the position where it occurs, for each series in store_mse_hd_7.
for mse_series in store_mse_hd_7:
    print(min(mse_series), np.argmin(mse_series))

1.0757929804698305 182
0.7343055682549087 128
0.8052974368154648 128
0.5374601099992359 132
0.4921354430580525 132


In [29]:
# Smallest MSE and the position where it occurs, for each series in store_mse_hd_8.
for mse_series in store_mse_hd_8:
    print(min(mse_series), np.argmin(mse_series))

1.976905220337567 175
1.2760203084945747 128
1.6124881253648364 122
1.151750704656661 128
1.045683234003479 130


In [30]:
# Smallest MSE and the position where it occurs, for each series in store_mse_hd_9.
for mse_series in store_mse_hd_9:
    print(min(mse_series), np.argmin(mse_series))

1.3562858581845878 180
1.7139667625002597 120
1.0570625744241764 127
0.8140912690392613 136
0.8605410173064758 139
