In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.preprocessing import StandardScaler
from functools import reduce  
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from mlxtend.evaluate import bias_variance_decomp
from sklearn import metrics
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test, test_index=14):
    """Collect Monte Carlo predictions for a single test observation.

    For each of ``iterations`` repetitions a fresh training set is simulated
    with ``get_sim_data``. Ridge, lasso and three elastic-net models
    (``l1_ratio`` of 0.2, 0.5 and 0.7) are fitted for every penalty in
    ``alphas``, and each model's prediction for row ``test_index`` of
    ``X_test`` is stored.

    Parameters
    ----------
    n : int
        Number of observations in every simulated training set.
    true_betas : sequence of float
        True coefficient vector. Its length is used as the number of features
        of the simulated training data.
    iterations : int
        Number of simulated training sets.
    alphas : sequence of float
        Penalty strengths passed as ``alpha`` to every estimator.
    cor_factor : float
        Correlation parameter forwarded to ``get_sim_data``.
    X_test : pandas.DataFrame
        Test features; all models predict on the full frame and the entry at
        position ``test_index`` is kept.
    test_index : int, default 14
        Position of the test observation whose predictions are stored.

    Returns
    -------
    list of pandas.DataFrame
        Five frames in the order ridge, lasso, elastic net (0.2), elastic net
        (0.5), elastic net (0.7). Each frame has one row per iteration and one
        column per alpha.
    """
    # Take the feature count from the coefficient vector rather than from a
    # notebook-level global, so the function is self-contained.
    n_features = len(true_betas)

    # (estimator class, extra keyword arguments) in the order of the returned frames.
    model_specs = (
        (Ridge, {}),
        (Lasso, {}),
        (ElasticNet, {"l1_ratio": 0.2}),
        (ElasticNet, {"l1_ratio": 0.5}),
        (ElasticNet, {"l1_ratio": 0.7}),
    )

    # predictions_per_model[m] is a list of rows; each row holds one value per alpha.
    predictions_per_model = [[] for _ in model_specs]

    for _ in range(iterations):

        # Draw a new training set for this repetition.
        y_train, X_train, _df_train = get_sim_data(n, n_features, cor_factor, true_betas)

        rows = [[] for _ in model_specs]

        for alpha in alphas:
            for row, (estimator, extra_kwargs) in zip(rows, model_specs):
                fitted = estimator(alpha=alpha, **extra_kwargs).fit(X_train, y_train)
                row.append(fitted.predict(X_test)[test_index])

        for store, row in zip(predictions_per_model, rows):
            store.append(row)

    # Build the frames once, after all repetitions (also well-defined for iterations == 0).
    return [pd.DataFrame(store) for store in predictions_per_model]




In [3]:
def compute_mse(predictions_df_list, y_test, test_index=14):
    """Compute MSE, variance and squared bias of predictions for one test point.

    Every frame in ``predictions_df_list`` is expected to hold one row per
    simulated training set and one column per penalty value. Each column is
    compared with the single target ``y_test.iloc[test_index]``.

    Parameters
    ----------
    predictions_df_list : list of pandas.DataFrame
        Prediction frames, e.g. as returned by ``get_predictions``.
    y_test : pandas.Series
        Test responses; only the entry at position ``test_index`` is used.
    test_index : int, default 14
        Position of the test observation the predictions refer to.

    Returns
    -------
    tuple of (list, list, list)
        ``(mse, variance, bias_squared)``. Each element is a list with one
        inner list per input frame, and each inner list has one value per
        column of that frame. For every entry ``mse == variance + bias_squared``
        up to floating-point rounding.

    Notes
    -----
    The target is the observed response, so when ``y_test`` contains noise the
    "squared bias" term also contains that observation's noise realisation.
    """
    target = y_test.iloc[test_index]

    store_mse_lists = []
    store_variance_lists = []
    store_bias_sq_lists = []

    for predictions in predictions_df_list:

        # Average over the rows actually present instead of a global counter.
        n_draws = len(predictions)

        mse_per_column = []
        variance_per_column = []
        bias_sq_per_column = []

        # Iterate by position so frames with non-integer column labels work too.
        for position in range(predictions.shape[1]):

            column = predictions.iloc[:, position]
            draws = np.asarray(column)
            mean_prediction = np.mean(column)

            mse_per_column.append(np.sum((draws - target) ** 2) / n_draws)
            variance_per_column.append(np.mean((mean_prediction - draws) ** 2))
            bias_sq_per_column.append((mean_prediction - target) ** 2)

        store_mse_lists.append(mse_per_column)
        store_variance_lists.append(variance_per_column)
        store_bias_sq_lists.append(bias_sq_per_column)

    return store_mse_lists, store_variance_lists, store_bias_sq_lists

In [4]:
def get_sim_data(n, p, cor_factor, true_betas):
    """Simulate a linear-regression data set with correlated Gaussian features.

    Features are drawn from a zero-mean multivariate normal whose correlation
    between features ``i`` and ``j`` is ``cor_factor ** |i - j|`` (unit
    variances). The response is ``X @ true_betas`` plus standard-normal noise.
    Random numbers come from NumPy's global ``np.random`` state.

    Parameters
    ----------
    n : int
        Number of observations to draw.
    p : int
        Number of features.
    cor_factor : float
        Base of the geometric correlation decay.
    true_betas : sequence of float
        Coefficient vector of length ``p``.

    Returns
    -------
    tuple
        ``(y, X, df)``: the response as a Series named ``"y"``, the features
        as a DataFrame with columns ``X_1 .. X_p``, and both concatenated
        column-wise with ``y`` first.
    """
    # Full correlation matrix in one pass; the diagonal is cor_factor ** 0 == 1.
    cor_matrix = np.array(
        [[cor_factor ** abs(row - col) for col in range(p)] for row in range(p)],
        dtype=float,
    ).reshape(p, p)

    # Unit standard deviations, so the covariance equals the correlation matrix.
    scale = np.diag(np.ones(p))
    sigma = scale @ cor_matrix @ scale

    # Draw order matters for reproducibility: features first, then the noise.
    features = np.random.multivariate_normal(np.zeros(p), sigma, n)
    noise = np.random.normal(0, 1, n)

    y = pd.Series(features.dot(true_betas) + noise, name="y")
    X = pd.DataFrame(features, columns=[f"X_{k}" for k in range(1, p + 1)])
    df = pd.concat([y, X], axis=1)

    return y, X, df


In [5]:
def generate_true_betas(non_zero_betas, zero_betas):
    """Build sparse coefficient vectors: a run of 2s followed by a run of 0s.

    Parameters
    ----------
    non_zero_betas : iterable of int
        Number of leading coefficients equal to 2, one entry per vector.
    zero_betas : iterable of int
        Number of trailing zero coefficients, paired element-wise with
        ``non_zero_betas`` (extra entries of the longer input are ignored).

    Returns
    -------
    list of numpy.ndarray
        One coefficient vector per ``(non_zero, zero)`` pair.
    """
    return [
        np.concatenate([np.repeat(2, n_active), np.repeat(0, n_null)])
        for n_active, n_null in zip(non_zero_betas, zero_betas)
    ]
    



In [6]:
# Low-dimensional sparse experiment: three non-zero coefficients out of eight,
# repeated for several values of the feature-correlation parameter.
# Seed NumPy's global RNG so the test draws and all training draws are reproducible.
#np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
np.random.seed(210)
#np.random.seed(66)


n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = [0, 0.1, 0.3, 0.5, 0.7, 0.8]  # one experiment per correlation parameter
iterations = 500  # simulated training sets per experiment
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1

store_X_test = []
store_y_test = []


# Draw one fixed test set per correlation parameter.
for i in cor_factor: 

    y_test, X_test, df_test= get_sim_data(n, p, i, true_betas) # get test data
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Predictions for the test point, one list of per-model frames per correlation parameter.
df_predictions_1 = get_predictions(n, true_betas, iterations, alphas, cor_factor[0], store_X_test[0])
df_predictions_2 = get_predictions(n, true_betas, iterations, alphas, cor_factor[1], store_X_test[1])
df_predictions_3 = get_predictions(n, true_betas, iterations, alphas, cor_factor[2], store_X_test[2])
df_predictions_4 = get_predictions(n, true_betas, iterations, alphas, cor_factor[3], store_X_test[3])
df_predictions_5 = get_predictions(n, true_betas, iterations, alphas, cor_factor[4], store_X_test[4])
df_predictions_6 = get_predictions(n, true_betas, iterations, alphas, cor_factor[5], store_X_test[5])


# MSE, variance and squared bias per model and per alpha for each experiment.
store_mse_1, store_variance_1, store_bias_sq_1 = compute_mse(df_predictions_1, store_y_test[0])
store_mse_2, store_variance_2, store_bias_sq_2 = compute_mse(df_predictions_2, store_y_test[1])
store_mse_3, store_variance_3, store_bias_sq_3 = compute_mse(df_predictions_3, store_y_test[2])
store_mse_4, store_variance_4, store_bias_sq_4 = compute_mse(df_predictions_4, store_y_test[3])
store_mse_5, store_variance_5, store_bias_sq_5 = compute_mse(df_predictions_5, store_y_test[4])
store_mse_6, store_variance_6, store_bias_sq_6 = compute_mse(df_predictions_6, store_y_test[5])



In [7]:
# cor_factor[0] = 0. One line per model (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7):
# minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_1: 
    
    print(min(i), np.argmin(i))

1.294973556133842 0
1.2951180617502536 0
1.2962359060542044 0
1.2958158266056012 0
1.2951455563509326 0


In [8]:
# cor_factor[1] = 0.1. One line per model (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7):
# minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_2: 
    
    print(min(i), np.argmin(i))

1.2380474762671343 151
0.5989540600341798 125
1.160345900187367 106
0.9793690331635143 116
0.6130707112546397 125


In [9]:
# cor_factor[2] = 0.3. One line per model (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7):
# minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_3: 
    
    print(min(i), np.argmin(i))

2.0706833810097005 128
1.3398163289517586 132
2.0337815376364103 91
1.8967791340267053 107
1.3696791518626947 132


In [10]:
# cor_factor[3] = 0.5. One line per model (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7):
# minimum MSE over the alpha grid and the index of the alpha that attains it.
# Only setting of this experiment in which elastic net (l1_ratio = 0.5 and 0.7)
# reaches a lower minimum MSE than both lasso and ridge.

for i in store_mse_4: 
    
    print(min(i), np.argmin(i))

0.6754013861143037 170
0.6269705722939336 135
0.631904903217388 120
0.5794044196240361 125
0.6188576862400641 134


In [11]:
# cor_factor[4] = 0.7. One line per model (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7):
# minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_5: 
    
    print(min(i), np.argmin(i))

0.9047012166206374 151
0.8286202879153189 111
0.8893109032297652 101
0.8644077122487904 105
0.8295943841382927 111


In [12]:
# cor_factor[5] = 0.8. One line per model (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7):
# minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_6: 
    
    print(min(i), np.argmin(i))

0.6731623057226193 164
0.521129891650058 125
0.6376483053664417 114
0.5839160663984087 117
0.521873694062772 125


In [13]:
"""ridge should do best here in all cases"""

# Low-dimensional dense experiment: all eight coefficients equal 0.85,
# repeated for several values of the feature-correlation parameter.
# Seed NumPy's global RNG so the test draws and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)


n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = np.repeat(0.85, 8)
cor_factor = [0, 0.1, 0.3, 0.5, 0.7, 0.8]  # one experiment per correlation parameter
iterations = 500  # simulated training sets per experiment
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1

store_X_test = []
store_y_test = []


# Draw one fixed test set per correlation parameter.
for i in cor_factor: 

    y_test, X_test, df_test= get_sim_data(n, p, i, true_betas) # get test data
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Predictions for the test point, one list of per-model frames per correlation parameter.
df_predictions_ls_1 = get_predictions(n, true_betas, iterations, alphas, cor_factor[0], store_X_test[0])
df_predictions_ls_2 = get_predictions(n, true_betas, iterations, alphas, cor_factor[1], store_X_test[1])
df_predictions_ls_3 = get_predictions(n, true_betas, iterations, alphas, cor_factor[2], store_X_test[2])
df_predictions_ls_4 = get_predictions(n, true_betas, iterations, alphas, cor_factor[3], store_X_test[3])
df_predictions_ls_5 = get_predictions(n, true_betas, iterations, alphas, cor_factor[4], store_X_test[4])
df_predictions_ls_6 = get_predictions(n, true_betas, iterations, alphas, cor_factor[5], store_X_test[5])


# MSE, variance and squared bias per model and per alpha for each experiment.
store_mse_ls_1, store_variance_ls_1, store_bias_sq_ls_1 = compute_mse(df_predictions_ls_1, store_y_test[0])
store_mse_ls_2, store_variance_ls_2, store_bias_sq_ls_2 = compute_mse(df_predictions_ls_2, store_y_test[1])
store_mse_ls_3, store_variance_ls_3, store_bias_sq_ls_3 = compute_mse(df_predictions_ls_3, store_y_test[2])
store_mse_ls_4, store_variance_ls_4, store_bias_sq_ls_4 = compute_mse(df_predictions_ls_4, store_y_test[3])
store_mse_ls_5, store_variance_ls_5, store_bias_sq_ls_5 = compute_mse(df_predictions_ls_5, store_y_test[4])
store_mse_ls_6, store_variance_ls_6, store_bias_sq_ls_6 = compute_mse(df_predictions_ls_6, store_y_test[5])




In [14]:
# Dense coefficients, cor_factor[0] = 0. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_ls_1: 
    
    print(min(i), np.argmin(i))

0.6328685732064164 199
0.4157596358523855 181
0.41282818752350364 192
0.4161886833783439 193
0.4160358786171014 187


In [15]:
# Dense coefficients, cor_factor[1] = 0.1. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_ls_2: 
    
    print(min(i), np.argmin(i))

2.4708848617041204 155
2.547768294838287 0
2.506256624737217 96
2.5418651874153566 79
2.5475430695676624 0


In [16]:
# Dense coefficients, cor_factor[2] = 0.3. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_ls_3: 
    
    print(min(i), np.argmin(i))

0.5214577708046586 199
1.532239001309111 134
0.585327762543371 153
0.8819700161134203 147
1.1238047450280437 143


In [17]:
# Dense coefficients, cor_factor[3] = 0.5. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_ls_4: 
    
    print(min(i), np.argmin(i))

0.30673624106238695 194
0.69148109757708 106
0.35408706097896225 140
0.45618938410309 135
0.5566821256918226 129


In [18]:
# Dense coefficients, cor_factor[4] = 0.7. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_ls_5: 
    
    print(min(i), np.argmin(i))

0.7364908724774366 166
0.9055901649901404 81
0.7721661835330006 112
0.8317379452284769 105
0.8669102900869129 98


In [19]:
# Dense coefficients, cor_factor[5] = 0.8. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_ls_6: 
    
    print(min(i), np.argmin(i))

0.1425766264124557 193
0.5245712202852323 125
0.1628614311142402 140
0.21961946470192778 137
0.2992716975351493 135


In [5]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)


n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)




#y, X, df = get_sim_data(n, p, cor_factor, true_betas)

In [6]:
# Results of the preceding simulation cell (cor_factor = 0 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

1.0373120636389146 164
0.4471576085506193 131
0.9227872022170516 117
0.7158751211699762 124
0.5892222001071664 127


In [7]:
# Sample correlation matrix of the features in the current test set (n = 20 rows when drawn above).
X_test.corr()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8
X_1,1.0,0.399211,-0.034945,0.17721,0.083332,0.243926,-0.258732,0.058009
X_2,0.399211,1.0,0.346084,-0.150646,-0.138666,0.43694,-0.234178,0.045517
X_3,-0.034945,0.346084,1.0,0.03088,-0.334731,0.198793,-0.055639,-0.073571
X_4,0.17721,-0.150646,0.03088,1.0,-0.064106,0.075042,0.280153,-0.198846
X_5,0.083332,-0.138666,-0.334731,-0.064106,1.0,-0.29776,-0.080714,0.517108
X_6,0.243926,0.43694,0.198793,0.075042,-0.29776,1.0,-0.309259,-0.08019
X_7,-0.258732,-0.234178,-0.055639,0.280153,-0.080714,-0.309259,1.0,0.045083
X_8,0.058009,0.045517,-0.073571,-0.198846,0.517108,-0.08019,0.045083,1.0


In [8]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.1.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.1
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)




#y, X, df = get_sim_data(n, p, cor_factor, true_betas)

In [9]:
# Results of the preceding simulation cell (cor_factor = 0.1 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

1.2148682339366823 156
0.5728647290499886 131
1.119064902174663 112
0.9323886853782712 121
0.7887306831835047 126


In [10]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.15.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.15
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [11]:
# Results of the preceding simulation cell (cor_factor = 0.15 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

1.34582120303462 103
1.2714946618066127 96
1.342254069249426 65
1.329097589613071 79
1.3126677056434068 86


In [12]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.2.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.2
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [13]:
# Results of the preceding simulation cell (cor_factor = 0.2 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

1.3453177751712506 105
1.2673694707965402 96
1.341230095148846 66
1.3268585166617417 79
1.3096213410321111 86


In [14]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.3.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.3
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [15]:
# Results of the preceding simulation cell (cor_factor = 0.3 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))


1.3435548967924682 110
1.2586324124196622 97
1.3380758792671414 69
1.3214066300246878 80
1.303044142803241 87


In [16]:
# Sample correlation matrix of the features in the current test set (n = 20 rows when drawn above).
X_test.corr()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8
X_1,1.0,-0.188385,0.158507,-0.016293,-0.030042,-0.404724,-0.144438,-0.337785
X_2,-0.188385,1.0,0.053836,0.175233,-0.019451,0.205784,0.216478,0.181493
X_3,0.158507,0.053836,1.0,0.46626,0.034806,-0.136054,-0.198228,0.091746
X_4,-0.016293,0.175233,0.46626,1.0,0.305983,0.039375,-0.080952,-0.043376
X_5,-0.030042,-0.019451,0.034806,0.305983,1.0,0.287845,0.478582,0.216042
X_6,-0.404724,0.205784,-0.136054,0.039375,0.287845,1.0,0.61446,0.512387
X_7,-0.144438,0.216478,-0.198228,-0.080952,0.478582,0.61446,1.0,0.683644
X_8,-0.337785,0.181493,0.091746,-0.043376,0.216042,0.512387,0.683644,1.0


In [17]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.5.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.5
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [18]:
# Results of the preceding simulation cell (cor_factor = 0.5 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

0.7141672027832446 173
0.5208295404148077 131
0.6490810112643433 124
0.5575327135425395 130
0.5183260290041167 132


In [19]:
# Sample correlation matrix of the features in the current test set (n = 20 rows when drawn above).
X_test.corr()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8
X_1,1.0,0.539375,0.41247,-0.067962,0.100296,-0.010962,0.050399,-0.128751
X_2,0.539375,1.0,0.334538,-0.153082,-0.227143,-0.080527,-0.166714,-0.159227
X_3,0.41247,0.334538,1.0,0.330921,0.193921,0.33507,0.182977,0.334739
X_4,-0.067962,-0.153082,0.330921,1.0,0.596353,0.678753,0.520633,0.396095
X_5,0.100296,-0.227143,0.193921,0.596353,1.0,0.558257,0.554436,0.237172
X_6,-0.010962,-0.080527,0.33507,0.678753,0.558257,1.0,0.773588,0.57399
X_7,0.050399,-0.166714,0.182977,0.520633,0.554436,0.773588,1.0,0.721378
X_8,-0.128751,-0.159227,0.334739,0.396095,0.237172,0.57399,0.721378,1.0


In [20]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.7.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.7
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [21]:
# Results of the preceding simulation cell (cor_factor = 0.7 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

0.5251125848952942 199
0.4907306107108244 129
0.49010181498603234 152
0.468954651593961 138
0.45759848794791524 133


In [22]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.8.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.8
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [23]:
# Results of the preceding simulation cell (cor_factor = 0.8 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

0.3699761897013376 191
0.3900069385407727 124
0.3761568259297662 138
0.3965915134278802 129
0.3956910507538307 126


In [24]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.9.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.9
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [25]:
# Results of the preceding simulation cell (cor_factor = 0.9 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

0.26337345900594267 187
0.36302860429737327 122
0.2698215976584662 135
0.30735051357883675 131
0.33626012842753356 126


In [26]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.95.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.95
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [27]:
# Results of the preceding simulation cell (cor_factor = 0.95 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

0.21212007537686547 184
0.33654440899963983 117
0.21958100715012147 130
0.25154789104467923 127
0.28237869667491666 123


In [28]:
# Single experiment with the sparse coefficient vector and cor_factor = 0.97.
# Seed NumPy's global RNG so the test draw and all training draws are reproducible.
np.random.seed(123)
#np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

n = 20  # observations per simulated data set
p = 8  # number of features
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = 0.97
iterations = 500  # simulated training sets
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1



y_test, X_test, df_test= get_sim_data(n, p, cor_factor, true_betas) # get test data


# Predictions for the test point, then MSE / variance / squared bias per model and alpha.
df_predictions = get_predictions(n, true_betas, iterations, alphas, cor_factor, X_test)
store_mse, store_variance, store_bias_sq = compute_mse(df_predictions, y_test)

In [29]:
# Results of the preceding simulation cell (cor_factor = 0.97 there). One line per model
# (ridge, lasso, elastic net with l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid
# and the index of the alpha that attains it.
for i in store_mse: 
    
    print(min(i), np.argmin(i))

0.19247958966465994 181
0.31613970944297776 114
0.2012723161259833 126
0.22804059755591463 124
0.2547486214524229 120


In [30]:
# Sample correlation matrix of the features in the current test set (n = 20 rows when drawn above).
X_test.corr()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8
X_1,1.0,0.988159,0.975294,0.969167,0.941142,0.910324,0.868551,0.823665
X_2,0.988159,1.0,0.981532,0.968949,0.954697,0.920528,0.885171,0.844819
X_3,0.975294,0.981532,1.0,0.97148,0.949859,0.909114,0.867255,0.827288
X_4,0.969167,0.968949,0.97148,1.0,0.970779,0.934431,0.875247,0.835929
X_5,0.941142,0.954697,0.949859,0.970779,1.0,0.97428,0.942135,0.896957
X_6,0.910324,0.920528,0.909114,0.934431,0.97428,1.0,0.973853,0.948604
X_7,0.868551,0.885171,0.867255,0.875247,0.942135,0.973853,1.0,0.976752
X_8,0.823665,0.844819,0.827288,0.835929,0.896957,0.948604,0.976752,1.0


# High Dimensionality, sparsity, varying correlation

In [6]:
# Build one coefficient vector with 10 entries equal to 2 followed by 25 zeros.
non_zero_betas = [10]
zero_betas = [25]
true_betas_hd = generate_true_betas(non_zero_betas, zero_betas)

# Sanity check: total number of coefficients (10 + 25).
len(true_betas_hd[0])

35

In [7]:
# High-dimensional sparse experiment (more features than observations: p = 35, n = 30),
# repeated for several values of the feature-correlation parameter.
# Seed NumPy's global RNG so the test draws and all training draws are reproducible.
#np.random.seed(123) works fine! 
np.random.seed(190)
#np.random.seed(200)
#np.random.seed(210)
#np.random.seed(66)

# One coefficient vector: 10 entries equal to 2 followed by 25 zeros.
non_zero_betas = [10]
zero_betas = [25]
true_betas_hd = generate_true_betas(non_zero_betas, zero_betas)

n = 30  # observations per simulated data set
p = 35  # number of features
cor_factor = [0, 0.1, 0.3, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9]  # one experiment per correlation parameter
iterations = 500  # simulated training sets per experiment
alphas = np.logspace(-4,1,200)  # 200 log-spaced penalties from 1e-4 to 1e1

store_X_test = []
store_y_test = []


# Draw one fixed test set per correlation parameter.
for i in cor_factor: 

    y_test, X_test, df_test= get_sim_data(n, p, i, true_betas_hd[0]) # get test data
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Predictions for the test point, one list of per-model frames per correlation parameter.
df_predictions_hd_1 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[0], store_X_test[0])
df_predictions_hd_2 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[1], store_X_test[1])
df_predictions_hd_3 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[2], store_X_test[2])
df_predictions_hd_4 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[3], store_X_test[3])
df_predictions_hd_5 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[4], store_X_test[4])
df_predictions_hd_6 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[5], store_X_test[5])
df_predictions_hd_7 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[6], store_X_test[6])
df_predictions_hd_8 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[7], store_X_test[7])
df_predictions_hd_9 = get_predictions(n, true_betas_hd[0], iterations, alphas, cor_factor[8], store_X_test[8])


# MSE, variance and squared bias per model and per alpha for each experiment.
store_mse_hd_1, store_variance_hd_1, store_bias_sq_hd_1 = compute_mse(df_predictions_hd_1, store_y_test[0])
store_mse_hd_2, store_variance_hd_2, store_bias_sq_hd_2 = compute_mse(df_predictions_hd_2, store_y_test[1])
store_mse_hd_3, store_variance_hd_3, store_bias_sq_hd_3 = compute_mse(df_predictions_hd_3, store_y_test[2])
store_mse_hd_4, store_variance_hd_4, store_bias_sq_hd_4 = compute_mse(df_predictions_hd_4, store_y_test[3])
store_mse_hd_5, store_variance_hd_5, store_bias_sq_hd_5 = compute_mse(df_predictions_hd_5, store_y_test[4])
store_mse_hd_6, store_variance_hd_6, store_bias_sq_hd_6 = compute_mse(df_predictions_hd_6, store_y_test[5])
store_mse_hd_7, store_variance_hd_7, store_bias_sq_hd_7 = compute_mse(df_predictions_hd_7, store_y_test[6])
store_mse_hd_8, store_variance_hd_8, store_bias_sq_hd_8 = compute_mse(df_predictions_hd_8, store_y_test[7])
store_mse_hd_9, store_variance_hd_9, store_bias_sq_hd_9 = compute_mse(df_predictions_hd_9, store_y_test[8])






In [8]:
# High-dimensional run, cor_factor[0] = 0. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
# NOTE(review): the bare number below is an unexplained annotation, presumably a value from an earlier run; confirm.
#2.98

for i in store_mse_hd_1: 
    
    print(min(i), np.argmin(i))

5.722529553607648 189
1.9477681259102655 120
2.668716815920924 117
2.527584930575625 116
2.392261614975107 117


In [9]:
# High-dimensional run, cor_factor[1] = 0.1. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
# NOTE(review): the bare number below is an unexplained annotation, presumably a value from an earlier run; confirm.
# 5.07

for i in store_mse_hd_2: 
    
    print(min(i), np.argmin(i))

9.411593333421497 169
4.078711445913463 119
5.1215344065827 113
4.89542862809702 114
4.679578911904398 115


In [10]:
# High-dimensional run, cor_factor[2] = 0.3. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
# NOTE(review): the bare number below is an unexplained annotation, presumably a value from an earlier run; confirm.
#9.42

for i in store_mse_hd_3: 
    
    print(min(i), np.argmin(i))

15.714047611249931 135
6.57466636625762 103
8.569909531192156 97
8.209360473522 97
7.843196459203529 100


In [11]:
# High-dimensional run, cor_factor[3] = 0.5. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
# NOTE(review): the bare number below is an unexplained annotation, presumably a value from an earlier run; confirm.
#2.18

for i in store_mse_hd_4: 
    
    print(min(i), np.argmin(i))

4.973013440888867 176
1.6970645917525118 128
2.0390959927532215 131
1.937973364776967 130
1.8545534128001808 130


In [12]:
# High-dimensional run, cor_factor[4] = 0.6. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
# NOTE(review): the bare number below is an unexplained annotation, presumably a value from an earlier run; confirm.
# 1.33 

for i in store_mse_hd_5: 
    
    print(min(i), np.argmin(i))

2.6407831326971545 168
1.346996941609988 122
1.2995911852931374 120
1.273022669903418 120
1.2596878836300784 121


In [13]:
# High-dimensional run, cor_factor[5] = 0.7. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
# NOTE(review): the bare number below is an unexplained annotation, presumably a value from an earlier run; confirm.
#1.48

for i in store_mse_hd_6: 
    
    print(min(i), np.argmin(i))

3.2632563671166532 164
1.7160947171621663 123
1.588781290366569 122
1.547647480773165 123
1.5198960301867455 123


In [14]:
# High-dimensional run, cor_factor[6] = 0.75. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
# NOTE(review): the bare number below is an unexplained annotation, presumably a value from an earlier run; confirm.
# 0.40

for i in store_mse_hd_7: 
    
    print(min(i), np.argmin(i))

1.0757929804698305 182
0.7343055682549087 128
0.4921354430580525 132
0.49673141668469367 132
0.5096643746121451 132


In [15]:
# High-dimensional run, cor_factor[7] = 0.8. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_hd_8: 
    
    print(min(i), np.argmin(i))

1.976905220337567 175
1.2760203084945747 128
1.045683234003479 130
1.04319399110841 130
1.050630253141762 131


In [16]:
# High-dimensional run, cor_factor[8] = 0.9. One line per model (ridge, lasso, elastic net with
# l1_ratio 0.2 / 0.5 / 0.7): minimum MSE over the alpha grid and the index of the alpha that attains it.
for i in store_mse_hd_9: 
    
    print(min(i), np.argmin(i))

1.3562858581845878 180
1.7139667625002597 120
0.8605410173064758 139
0.9228001966573318 138
1.0048390110511627 135
