In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import validation_curve
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.model_selection import cross_val_score

In [2]:
from auxiliary_files.auxiliary_plots import * 
from auxiliary_files.auxiliary_analysis import * 
from auxiliary_files.auxiliary_tables import *

In [None]:
# Fix NumPy's global RNG state so every draw in this cell is repeatable.
np.random.seed(135)

# Simulation dimensions and tuning grid.
# presumably n = number of observations and p = number of predictors — confirm in get_sim_data.
n, p = 30, 35
cor_factor = 0
iterations = 1500
alphas = np.logspace(-4, 1, num=200)  # 200 log-spaced penalties from 1e-4 to 1e1
true_betas = generate_true_betas([10], [25], 2)

# First simulated sample: used as the evaluation ("test") data below.
y_test, X_test, df_test = get_sim_data(n, p, cor_factor, true_betas[0])

# Predictions over the alpha grid for X_test, then the three summaries
# returned by compute_mse (unpacked as MSE, variance and squared bias).
df_predictions_1 = get_predictions(
    n, p, true_betas[0], cor_factor, iterations, alphas, X_test
)
store_mse_1, store_variance_1, store_bias_sq_1 = compute_mse(
    df_predictions_1, y_test, iterations
)

# Second simulated sample: the data the cross-validation cells are run on.
y_train, X_train, df_train = get_sim_data(n, p, cor_factor, true_betas[0])

In [None]:
# Ridge: 10-fold cross-validated error for every alpha on the grid.
train_scores_ridge, val_scores_ridge = validation_curve(
    Ridge(),
    X_train,
    y_train,
    param_name="alpha",
    param_range=alphas,
    cv=10,
    scoring="neg_mean_squared_error",
)

# The scorer returns negated MSE; flip the sign to work with plain MSE.
train_scores_ridge, val_scores_ridge = -train_scores_ridge, -val_scores_ridge

# Collapse the fold axis (axis 1) into a mean and spread per alpha.
val_mean_ridge = val_scores_ridge.mean(axis=1)
val_std_ridge = val_scores_ridge.std(axis=1)

In [None]:
# Ridge with cv=n folds (leave-one-out, provided X_train has n rows — confirm).
train_scores_ridge_loo, val_scores_ridge_loo = validation_curve(
    Ridge(),
    X_train,
    y_train,
    param_name="alpha",
    param_range=alphas,
    cv=n,
    scoring="neg_mean_squared_error",
)

# The scorer returns negated MSE; flip the sign to work with plain MSE.
train_scores_ridge_loo, val_scores_ridge_loo = (
    -train_scores_ridge_loo,
    -val_scores_ridge_loo,
)

# Collapse the fold axis (axis 1) into a mean and spread per alpha.
val_mean_ridge_loo = val_scores_ridge_loo.mean(axis=1)
val_std_ridge_loo = val_scores_ridge_loo.std(axis=1)

In [None]:
# Lasso: 10-fold cross-validated error for every alpha on the grid.
# NOTE(review): warnings are silenced globally in the import cell, so a Lasso
# ConvergenceWarning at small alphas would not be shown — confirm convergence.
train_scores_lasso, val_scores_lasso = validation_curve(
    Lasso(),
    X_train,
    y_train,
    param_name="alpha",
    param_range=alphas,
    cv=10,
    scoring="neg_mean_squared_error",
)

# The scorer returns negated MSE; flip the sign to work with plain MSE.
train_scores_lasso, val_scores_lasso = -train_scores_lasso, -val_scores_lasso

# Collapse the fold axis (axis 1) into a mean and spread per alpha.
val_mean_lasso = val_scores_lasso.mean(axis=1)
val_std_lasso = val_scores_lasso.std(axis=1)

In [None]:
# Lasso with cv=n folds (leave-one-out, provided X_train has n rows — confirm).
# NOTE(review): warnings are silenced globally in the import cell, so a Lasso
# ConvergenceWarning at small alphas would not be shown — confirm convergence.
train_scores_lasso_loo, val_scores_lasso_loo = validation_curve(
    Lasso(),
    X_train,
    y_train,
    param_name="alpha",
    param_range=alphas,
    cv=n,
    scoring="neg_mean_squared_error",
)

# The scorer returns negated MSE; flip the sign to work with plain MSE.
train_scores_lasso_loo, val_scores_lasso_loo = (
    -train_scores_lasso_loo,
    -val_scores_lasso_loo,
)

# Collapse the fold axis (axis 1) into a mean and spread per alpha.
val_mean_lasso_loo = val_scores_lasso_loo.mean(axis=1)
val_std_lasso_loo = val_scores_lasso_loo.std(axis=1)

In [None]:
# Disabled draft (not executed): elastic-net CV-10 validation curve.
#train_scores_elnet, val_scores_elnet = validation_curve(ElasticNet(),
                                  #X_train, y_train, param_name="alpha", param_range=alphas,
                                  #cv=10, scoring='neg_mean_squared_error')
#train_scores_elnet = -(train_scores_elnet)
#val_scores_elnet = -(val_scores_elnet)
#val_mean_elnet = np.mean(val_scores_elnet, axis=1)
#val_std_elnet = np.std(val_scores_elnet, axis=1)


In [None]:
# For each stored MSE curve, show its smallest value and the position of that minimum.
for mse_curve in store_mse_1:
    print(min(mse_curve), np.argmin(mse_curve))

In [None]:
# Draw the cross-validation figure with a helper that is not defined in this
# notebook (presumably star-imported from auxiliary_files.auxiliary_plots).
# NOTE(review): called without arguments, so the data it plots is not visible
# from this cell — confirm where it reads its inputs.
plot_cv_sim()

In [None]:
# Alpha at the minimum of: store_mse_1[0], the ridge CV-10 mean curve, the ridge LOOCV mean curve.
print(*(
    alphas[np.argmin(curve)]
    for curve in (store_mse_1[0], val_mean_ridge, val_mean_ridge_loo)
))



In [None]:
# Alpha at the minimum of: store_mse_1[1], the lasso CV-10 mean curve, the lasso LOOCV mean curve.
print(*(
    alphas[np.argmin(curve)]
    for curve in (store_mse_1[1], val_mean_lasso, val_mean_lasso_loo)
))



# Delete everything below. 

In [None]:
def plot_cv_distributions(store_optimal_alpha, store_min_mse, reference_mse=None):
    """Plot side-by-side KDEs of CV-selected alphas and CV minimum MSEs.

    The left panel shows the density of ``store_optimal_alpha`` and the right
    panel the density of ``store_min_mse``. Each panel marks the sample mean
    with a black dashed vertical line; the right panel additionally marks a
    reference MSE with a red dashed line.

    Parameters
    ----------
    store_optimal_alpha : sequence of float
        Alpha selected by cross-validation in each simulation run.
    store_min_mse : sequence of float
        Minimum mean cross-validation error in each simulation run.
    reference_mse : float, optional
        Position of the red dashed line on the right panel. When omitted, the
        notebook-level variable ``ridge_min_mse`` is used if it exists (the
        previous, implicit behaviour). If neither is available the red line
        is skipped instead of raising ``NameError``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    if reference_mse is None:
        # Backward-compatible fallback to the global this function used to
        # read implicitly; passing the value explicitly is preferred.
        reference_mse = globals().get("ridge_min_mse")

    sns.set(style="white", palette="muted", color_codes=True)

    # One row, two panels: selected alphas (left) and minimum MSEs (right).
    _, axes = plt.subplots(1, 2, figsize=(30, 10))

    alpha_ax = sns.kdeplot(data=store_optimal_alpha, ax=axes[0])
    alpha_ax.legend([], [], frameon=False)
    alpha_ax.axvline(x=np.mean(store_optimal_alpha), color='black', linestyle='--')
    alpha_ax.spines["bottom"].set_linestyle("dotted")
    alpha_ax.title.set_text("Distribution of Selected Lambdas from CV")

    mse_ax = sns.kdeplot(data=store_min_mse, ax=axes[1])
    mse_ax.legend([], [], frameon=False)
    mse_ax.axvline(x=np.mean(store_min_mse), color='black', linestyle='--')
    if reference_mse is not None:
        mse_ax.axvline(x=reference_mse, color='red', linestyle='--')
    mse_ax.spines["bottom"].set_linestyle("dotted")
    mse_ax.title.set_text("Distribution  of Min MSEs from CV")

In [None]:
np.random.seed(357)

# Evaluation ("test") sample: every model refitted below predicts on X_test.
y_test, X_test, df_test = get_sim_data(n, p, cor_factor, true_betas[0])

# Row of X_test whose prediction is recorded in each simulation run.
# NOTE(review): presumably has to match the observation compute_mse compares
# against — confirm in auxiliary_files.
test_obs_idx = 14

# Per-run CV results for ridge (10-fold and cv=n).
store_optimal_alpha_ridge = []
store_optimal_alpha_ridge_loo = []
store_min_mse_ridge = []
store_min_mse_ridge_loo = []

# One row per run, one column per alpha.
store_predictions_list_ridge = []
store_predictions_list_lasso = []


for i in range(iterations):

    # Fresh training sample for this run.
    y_train, X_train, df_train = get_sim_data(n, p, cor_factor, true_betas[0])

    train_scores_ridge, val_scores_ridge = validation_curve(Ridge(), X_train, y_train, param_name="alpha",
                                                            param_range=alphas, cv=10,
                                                            scoring='neg_mean_squared_error')

    train_scores_ridge_loo, val_scores_ridge_loo = validation_curve(Ridge(), X_train, y_train, param_name="alpha",
                                                                    param_range=alphas, cv=n,
                                                                    scoring='neg_mean_squared_error')

    # Scores are negated MSE; flip the sign, average over folds, and record
    # the alpha with the smallest mean CV error together with that error.
    train_scores_ridge = -(train_scores_ridge)
    val_scores_ridge = -(val_scores_ridge)
    val_mean_ridge = np.mean(val_scores_ridge, axis=1)
    optimal_alpha_ridge = alphas[np.argmin(val_mean_ridge)]
    min_mse_ridge = np.min(val_mean_ridge)
    store_optimal_alpha_ridge.append(optimal_alpha_ridge)
    store_min_mse_ridge.append(min_mse_ridge)

    train_scores_ridge_loo = -(train_scores_ridge_loo)
    val_scores_ridge_loo = -(val_scores_ridge_loo)
    val_mean_ridge_loo = np.mean(val_scores_ridge_loo, axis=1)
    optimal_alpha_ridge_loo = alphas[np.argmin(val_mean_ridge_loo)]
    min_mse_ridge_loo = np.min(val_mean_ridge_loo)
    store_optimal_alpha_ridge_loo.append(optimal_alpha_ridge_loo)
    store_min_mse_ridge_loo.append(min_mse_ridge_loo)

    store_predictions_ridge = []
    store_predictions_lasso = []

    # Refit both estimators at every alpha and keep the tracked prediction.
    for a in alphas:

        ridge = Ridge(alpha=a).fit(X_train, y_train)
        ridge_predict = ridge.predict(X_test)
        ridge_predict_select = ridge_predict[test_obs_idx]
        store_predictions_ridge.append(ridge_predict_select)

        lasso = Lasso(alpha=a).fit(X_train, y_train)
        lasso_predict = lasso.predict(X_test)
        lasso_predict_select = lasso_predict[test_obs_idx]
        store_predictions_lasso.append(lasso_predict_select)

    store_predictions_list_ridge.append(store_predictions_ridge)
    store_predictions_list_lasso.append(store_predictions_lasso)

# Build the frames once, after the loop: rebuilding them on every iteration
# was quadratic work and left them undefined when `iterations` is 0.
store_predictions_df_ridge = pd.DataFrame(store_predictions_list_ridge)
store_predictions_df_lasso = pd.DataFrame(store_predictions_list_lasso)

predictions_dfs = [store_predictions_df_ridge, store_predictions_df_lasso]
    

In [None]:
# Summaries for the ridge and lasso prediction frames (in that order).
# `iterations` is passed explicitly so this call agrees with the earlier
# compute_mse call in this notebook, which supplies it as the third argument.
store_mse_2, store_variance_2, store_bias_sq_2 = compute_mse(predictions_dfs, y_test, iterations)

In [None]:
# For each stored MSE curve, show its smallest value and the position of that minimum.
for mse_curve in store_mse_2:
    print(min(mse_curve), np.argmin(mse_curve))



In [None]:
# Smallest value of the first curve in store_mse_2 (presumably the ridge one,
# since the ridge frame comes first in predictions_dfs). It is used as the red
# reference line in the distribution plots.
ridge_min_mse = np.min(store_mse_2[0])

In [None]:
# Ridge, 10-fold CV: density of the selected alphas (left panel) and of the
# minimum mean CV errors (right panel) across the simulation runs.
plot_cv_distributions(store_optimal_alpha_ridge, store_min_mse_ridge)

In [None]:
# Ridge, cv=n folds: density of the selected alphas (left panel) and of the
# minimum mean CV errors (right panel) across the simulation runs.
plot_cv_distributions(store_optimal_alpha_ridge_loo, store_min_mse_ridge_loo)

In [None]:
# Density of the per-run minimum CV-10 errors for ridge, on its own figure.
f, axes = plt.subplots(1, 1)
fig_1 = sns.kdeplot(data=store_min_mse_ridge, ax=axes)
fig_1.legend([], [], frameon=False)

# Black dashed line: mean of the CV minima. Red dashed line: ridge_min_mse.
fig_1.axvline(
    x=np.mean(store_min_mse_ridge),
    color='black',
    linestyle='--',
)
fig_1.axvline(
    x=ridge_min_mse,
    color='red',
    linestyle='--',
)

fig_1.spines["bottom"].set_linestyle("dotted")
fig_1.title.set_text("Distribution of Min MSEs from CV")

In [None]:
# Density of the per-run minimum cv=n errors for ridge, on its own figure.
f, axes = plt.subplots(1, 1)
fig_1 = sns.kdeplot(data=store_min_mse_ridge_loo, ax=axes)
fig_1.legend([], [], frameon=False)

# Black dashed line: mean of the CV minima. Red dashed line: ridge_min_mse.
fig_1.axvline(
    x=np.mean(store_min_mse_ridge_loo),
    color='black',
    linestyle='--',
)
fig_1.axvline(
    x=ridge_min_mse,
    color='red',
    linestyle='--',
)

fig_1.spines["bottom"].set_linestyle("dotted")
fig_1.title.set_text("Distribution of Min MSEs from CV")

In [None]:
# Variance, across simulation runs, of the minimum mean CV error for ridge with cv=n folds.
np.var(store_min_mse_ridge_loo)

In [None]:
# Variance, across simulation runs, of the minimum mean CV error for ridge with 10-fold CV.
np.var(store_min_mse_ridge)