In [1]:
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.preprocessing import StandardScaler
from functools import reduce  
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
from mlxtend.evaluate import bias_variance_decomp
from sklearn import metrics
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
from auxiliary_files.auxiliary_plots import * 
from auxiliary_files.auxiliary_analysis import * 
from auxiliary_files.auxiliary_tables import *

# 1. High Dimensionality, No Multicollinearity, Varying Sparsity

Sparsity decreases across the settings below. We expect lasso to do better at first; as sparsity declines, ridge should begin to do better.

In [3]:
# Case 1 coefficient settings. Each (non-zero, zero) pair adds up to 35, the
# value of p used by the case-1 simulation cell.
non_zero_betas = [5, 10, 20, 30, 35]
zero_betas = [35 - count for count in non_zero_betas]  # -> [30, 25, 15, 5, 0]
size_non_zero = 2  # presumably the value given to every non-zero beta -- TODO confirm in generate_true_betas

# One entry per setting; the simulation cell indexes entries 0..4.
true_betas_list = generate_true_betas(non_zero_betas, zero_betas, size_non_zero)

In [None]:
# Case 1 simulation: n = 30 observations, p = 35 regressors, no correlation
# between regressors, one run per entry of true_betas_list.
np.random.seed(900)

n = 30
p = 35
cor_factor = 0  # multicollinearity has not been introduced yet
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced penalties from 1e-4 to 1e1

# Draw one test set per beta vector (these are the *test* draws: they are
# stored as store_X_test / store_y_test and reused by the cells below).
store_X_test, store_y_test = [], []
for betas in true_betas_list:
    y_test, X_test, df_test = get_sim_data(n, p, cor_factor, betas)
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Same settings for every call; only the beta vector and its matching test
# design matrix change. The generator is consumed in order k = 0..4, so the
# calls happen in the same sequence as five separate statements would.
(
    df_predictions_case_1_5,
    df_predictions_case_1_10,
    df_predictions_case_1_20,
    df_predictions_case_1_30,
    df_predictions_case_1_35,
) = (
    get_predictions(n, p, true_betas_list[k], cor_factor, iterations, alphas, store_X_test[k])
    for k in range(5)
)



In [None]:
# Evaluate each case-1 prediction frame against its own test response.
# compute_mse's result is unpacked as (mse, variance, squared bias).
(
    (store_mse_case_1_5, store_variance_case_1_5, store_bias_sq_case_1_5),
    (store_mse_case_1_10, store_variance_case_1_10, store_bias_sq_case_1_10),
    (store_mse_case_1_20, store_variance_case_1_20, store_bias_sq_case_1_20),
    (store_mse_case_1_30, store_variance_case_1_30, store_bias_sq_case_1_30),
    (store_mse_case_1_35, store_variance_case_1_35, store_bias_sq_case_1_35),
) = (
    compute_mse(predictions, store_y_test[k], iterations)
    for k, predictions in enumerate((
        df_predictions_case_1_5,
        df_predictions_case_1_10,
        df_predictions_case_1_20,
        df_predictions_case_1_30,
        df_predictions_case_1_35,
    ))
)

In [None]:
# Plot every MSE curve stored in store_mse_case_1_10 over the alpha grid.
# The axes are created once up front instead of being re-requested with
# plt.subplot(1, 1, 1) on every loop pass, and the figure now has axis labels,
# a title and a legend so it can be read on its own.
fig, ax = plt.subplots()
for curve_idx, mse_curve in enumerate(store_mse_case_1_10):
    # Curves are labelled by position only; which model each position holds is
    # decided inside compute_mse -- TODO confirm and switch to model names.
    ax.plot(alphas, mse_curve, label=f"curve {curve_idx}")
ax.set_xlabel("alpha")
ax.set_ylabel("MSE")
ax.set_title("Case 1: MSE vs. alpha (store_mse_case_1_10)")
ax.legend();

In [None]:
# Compare the first two MSE curves of store_mse_case_1_20 on one set of axes.
# Labels are attached at plot time (rather than through a positional list
# handed to legend()) so each label stays bound to its curve; note that
# "mse_1" is entry [1] and "mse_2" is entry [0], as in the original legend.
fig, ax = plt.subplots()
ax.plot(alphas, store_mse_case_1_20[1], label="mse_1")
ax.plot(alphas, store_mse_case_1_20[0], label="mse_2")
ax.set_xlabel("alpha")
ax.set_ylabel("MSE")
ax.legend();



In [None]:
# Author's observation: lasso does best here, which is what we expect.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_1_5:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)


In [None]:
# Author's observation: lasso does best here, which is what we expect.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_1_10:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)


In [None]:
# Author's observation: lasso starts to fall behind as sparsity decreases;
# one of the elastic net models would be best here.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_1_20:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)


In [None]:
# Author's observation: lasso starts to fall behind as sparsity decreases;
# one of the elastic net models would be best here.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_1_30:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)



In [None]:
# Author's observation: ridge does best here, as we would expect.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_1_35:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

# 2. Low Dimensionality, Moderate to High Multicollinearity, Varying Sparsity

In [None]:
# Case 2 coefficient settings. Each (non-zero, zero) pair adds up to 10, the
# value of p used by the case-2 simulation cell.
non_zero_betas_2 = [2, 3, 4, 5, 10]
zero_betas_2 = [10 - count for count in non_zero_betas_2]  # -> [8, 7, 6, 5, 0]
size_non_zero = 2  # presumably the value given to every non-zero beta -- TODO confirm in generate_true_betas

true_betas_list_2 = generate_true_betas(non_zero_betas_2, zero_betas_2, size_non_zero)
# Display the generated beta vectors as the cell output.
true_betas_list_2

In [None]:
# Case 2 simulation: n = 30 observations, p = 10 regressors, correlation
# factor 0.8, one run per entry of true_betas_list_2.
np.random.seed(900)

n = 30
p = 10
cor_factor = 0.8
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced penalties from 1e-4 to 1e1

# Draw one test set per beta vector (these are the *test* draws: they are
# stored as store_X_test / store_y_test and reused by the cells below).
store_X_test, store_y_test = [], []
for betas in true_betas_list_2:
    y_test, X_test, df_test = get_sim_data(n, p, cor_factor, betas)
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Same settings for every call; only the beta vector and its matching test
# design matrix change. The generator is consumed in order k = 0..4.
(
    df_predictions_case_2_2,
    df_predictions_case_2_3,
    df_predictions_case_2_4,
    df_predictions_case_2_5,
    df_predictions_case_2_10,
) = (
    get_predictions(n, p, true_betas_list_2[k], cor_factor, iterations, alphas, store_X_test[k])
    for k in range(5)
)



In [None]:
# Evaluate each case-2 prediction frame against its own test response.
# compute_mse's result is unpacked as (mse, variance, squared bias).
(
    (store_mse_case_2_2, store_variance_case_2_2, store_bias_sq_case_2_2),
    (store_mse_case_2_3, store_variance_case_2_3, store_bias_sq_case_2_3),
    (store_mse_case_2_4, store_variance_case_2_4, store_bias_sq_case_2_4),
    (store_mse_case_2_5, store_variance_case_2_5, store_bias_sq_case_2_5),
    (store_mse_case_2_10, store_variance_case_2_10, store_bias_sq_case_2_10),
) = (
    compute_mse(predictions, store_y_test[k], iterations)
    for k, predictions in enumerate((
        df_predictions_case_2_2,
        df_predictions_case_2_3,
        df_predictions_case_2_4,
        df_predictions_case_2_5,
        df_predictions_case_2_10,
    ))
)

In [None]:
# Plot every MSE curve stored in store_mse_case_2_2 over the alpha grid.
# The axes are created once up front instead of being re-requested with
# plt.subplot(1, 1, 1) on every loop pass, and the figure now has axis labels,
# a title and a legend so it can be read on its own.
fig, ax = plt.subplots()
for curve_idx, mse_curve in enumerate(store_mse_case_2_2):
    # Curves are labelled by position only; which model each position holds is
    # decided inside compute_mse -- TODO confirm and switch to model names.
    ax.plot(alphas, mse_curve, label=f"curve {curve_idx}")
ax.set_xlabel("alpha")
ax.set_ylabel("MSE")
ax.set_title("Case 2: MSE vs. alpha (store_mse_case_2_2)")
ax.legend();


In [None]:
# Author's observation: lasso does best here, as we would expect.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_2_2:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# TODO: interpretation of this setting is still open (the original note was just "?").
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_2_3:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# TODO: interpretation of this setting is still open (the original note was just "?").
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_2_4:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Author's observation: lasso still does well here.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_2_5:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Author's observation: ridge does best here, as we would expect.
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_2_10:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

# 3. Low Dimensionality, High Sparsity, Varying Degrees of Multicollinearity

In [None]:
# Case 3 simulation: n = 20, p = 8, a fixed sparse beta vector, one run per
# correlation factor.
# Seed in use: 210. Alternative seeds that were listed here but disabled:
# 123, 190, 200, 66.
np.random.seed(210)

n = 20
p = 8
true_betas = [3, 1.5, 0, 0, 2, 0, 0, 0]
cor_factor = [0, 0.1, 0.3, 0.5, 0.7, 0.8]
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced penalties from 1e-4 to 1e1

# Draw one test set per correlation factor (these are the *test* draws: they
# are stored as store_X_test / store_y_test and reused by the cells below).
store_X_test, store_y_test = [], []
for rho in cor_factor:
    y_test, X_test, df_test = get_sim_data(n, p, rho, true_betas)
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Same settings for every call; only the correlation factor and its matching
# test design matrix change. The generator is consumed in order k = 0..5.
(
    df_predictions_case_3_1,
    df_predictions_case_3_2,
    df_predictions_case_3_3,
    df_predictions_case_3_4,
    df_predictions_case_3_5,
    df_predictions_case_3_6,
) = (
    get_predictions(n, p, true_betas, cor_factor[k], iterations, alphas, store_X_test[k])
    for k in range(6)
)


In [None]:
# Evaluate each case-3 prediction frame against its own test response.
# compute_mse's result is unpacked as (mse, variance, squared bias).
(
    (store_mse_case_3_1, store_variance_case_3_1, store_bias_sq_case_3_1),
    (store_mse_case_3_2, store_variance_case_3_2, store_bias_sq_case_3_2),
    (store_mse_case_3_3, store_variance_case_3_3, store_bias_sq_case_3_3),
    (store_mse_case_3_4, store_variance_case_3_4, store_bias_sq_case_3_4),
    (store_mse_case_3_5, store_variance_case_3_5, store_bias_sq_case_3_5),
    (store_mse_case_3_6, store_variance_case_3_6, store_bias_sq_case_3_6),
) = (
    compute_mse(predictions, store_y_test[k], iterations)
    for k, predictions in enumerate((
        df_predictions_case_3_1,
        df_predictions_case_3_2,
        df_predictions_case_3_3,
        df_predictions_case_3_4,
        df_predictions_case_3_5,
        df_predictions_case_3_6,
    ))
)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_3_1:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_3_2:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_3_3:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Author's observation: this is the only setting where elastic net outperforms
# both lasso and ridge (for l1 = 0.5 and 0.7).
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_3_4:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_3_5:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_3_6:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

# 4. Same as Case 3, but all betas are set to 0.85

In [None]:
"""ridge should do best here in all cases"""

# Case 4 simulation: same n, p and correlation factors as case 3, but every
# beta equals 0.85 (no sparsity).
# Seed in use: 123. Alternative seeds that were listed here but disabled:
# 190, 200, 210, 66.
np.random.seed(123)

n = 20
p = 8
true_betas = np.repeat(0.85, p)  # p is 8 here, so this is eight copies of 0.85
cor_factor = [0, 0.1, 0.3, 0.5, 0.7, 0.8]
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced penalties from 1e-4 to 1e1

# Draw one test set per correlation factor (these are the *test* draws: they
# are stored as store_X_test / store_y_test and reused by the cells below).
store_X_test, store_y_test = [], []
for rho in cor_factor:
    y_test, X_test, df_test = get_sim_data(n, p, rho, true_betas)
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Same settings for every call; only the correlation factor and its matching
# test design matrix change. The generator is consumed in order k = 0..5.
(
    df_predictions_case_4_1,
    df_predictions_case_4_2,
    df_predictions_case_4_3,
    df_predictions_case_4_4,
    df_predictions_case_4_5,
    df_predictions_case_4_6,
) = (
    get_predictions(n, p, true_betas, cor_factor[k], iterations, alphas, store_X_test[k])
    for k in range(6)
)


In [None]:
# Evaluate each case-4 prediction frame against its own test response.
# compute_mse's result is unpacked as (mse, variance, squared bias).
(
    (store_mse_case_4_1, store_variance_case_4_1, store_bias_sq_case_4_1),
    (store_mse_case_4_2, store_variance_case_4_2, store_bias_sq_case_4_2),
    (store_mse_case_4_3, store_variance_case_4_3, store_bias_sq_case_4_3),
    (store_mse_case_4_4, store_variance_case_4_4, store_bias_sq_case_4_4),
    (store_mse_case_4_5, store_variance_case_4_5, store_bias_sq_case_4_5),
    (store_mse_case_4_6, store_variance_case_4_6, store_bias_sq_case_4_6),
) = (
    compute_mse(predictions, store_y_test[k], iterations)
    for k, predictions in enumerate((
        df_predictions_case_4_1,
        df_predictions_case_4_2,
        df_predictions_case_4_3,
        df_predictions_case_4_4,
        df_predictions_case_4_5,
        df_predictions_case_4_6,
    ))
)


In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_4_1:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_4_2:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_4_3:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_4_4:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_4_5:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_4_6:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

# 5. High Dimensionality, High Sparsity, Varying Multicollinearity

In [None]:
# Case 5 simulation: n = 30, p = 35, a single beta setting generated from
# 10 / 25 / 2, one run per correlation factor.
# Seed in use: 190. Alternative seeds that were listed here but disabled:
# 123 (annotated "works fine!"), 200, 210, 66.
np.random.seed(190)

non_zero_betas = [10]
zero_betas = [25]
size_non_zero = 2
# Only entry [0] of the generated list is used below.
true_betas_hd = generate_true_betas(non_zero_betas, zero_betas, size_non_zero)

n = 30
p = 35
cor_factor = [0, 0.1, 0.3, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9]
iterations = 500
alphas = np.logspace(-4, 1, 200)  # 200 log-spaced penalties from 1e-4 to 1e1

# Draw one test set per correlation factor (these are the *test* draws: they
# are stored as store_X_test / store_y_test and reused by the cells below).
store_X_test, store_y_test = [], []
for rho in cor_factor:
    y_test, X_test, df_test = get_sim_data(n, p, rho, true_betas_hd[0])
    store_X_test.append(X_test)
    store_y_test.append(y_test)

# Same settings for every call; only the correlation factor and its matching
# test design matrix change. The generator is consumed in order k = 0..8.
(
    df_predictions_case_5_1,
    df_predictions_case_5_2,
    df_predictions_case_5_3,
    df_predictions_case_5_4,
    df_predictions_case_5_5,
    df_predictions_case_5_6,
    df_predictions_case_5_7,
    df_predictions_case_5_8,
    df_predictions_case_5_9,
) = (
    get_predictions(n, p, true_betas_hd[0], cor_factor[k], iterations, alphas, store_X_test[k])
    for k in range(9)
)


In [None]:
# Evaluate each case-5 prediction frame against its own test response.
# compute_mse's result is unpacked as (mse, variance, squared bias).
(
    (store_mse_case_5_1, store_variance_case_5_1, store_bias_sq_case_5_1),
    (store_mse_case_5_2, store_variance_case_5_2, store_bias_sq_case_5_2),
    (store_mse_case_5_3, store_variance_case_5_3, store_bias_sq_case_5_3),
    (store_mse_case_5_4, store_variance_case_5_4, store_bias_sq_case_5_4),
    (store_mse_case_5_5, store_variance_case_5_5, store_bias_sq_case_5_5),
    (store_mse_case_5_6, store_variance_case_5_6, store_bias_sq_case_5_6),
    (store_mse_case_5_7, store_variance_case_5_7, store_bias_sq_case_5_7),
    (store_mse_case_5_8, store_variance_case_5_8, store_bias_sq_case_5_8),
    (store_mse_case_5_9, store_variance_case_5_9, store_bias_sq_case_5_9),
) = (
    compute_mse(predictions, store_y_test[k], iterations)
    for k, predictions in enumerate((
        df_predictions_case_5_1,
        df_predictions_case_5_2,
        df_predictions_case_5_3,
        df_predictions_case_5_4,
        df_predictions_case_5_5,
        df_predictions_case_5_6,
        df_predictions_case_5_7,
        df_predictions_case_5_8,
        df_predictions_case_5_9,
    ))
)

In [None]:

# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_1:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:

# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_2:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:

# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_3:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:

# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_4:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:


# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_5:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:

# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_6:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:


# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_7:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_8:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)

In [None]:
# Per MSE curve: minimum MSE, the alpha at which it occurs, and that alpha's index.
for mse_curve in store_mse_case_5_9:
    best_idx = np.argmin(mse_curve)
    print(min(mse_curve), alphas[best_idx], best_idx)