<a href="https://colab.research.google.com/github/robert-shepherd/fpl/blob/main/Project_3_6_elastic_net_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Elastic net model


---

The purpose of this notebook is to develop and tune the elastic net model.

Data sources:
* Data post feature engineering:  https://raw.githubusercontent.com/robert-shepherd/fpl/main/fpl_features.csv


In [None]:
# Loading libraries
import pandas as pd
import pickle
import numpy as np
import scipy.stats as stats
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Model libraries
from sklearn.linear_model import ElasticNet

# Import measures
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

## Setup

In [None]:
# Reading in files from static output
X_train_url = 'https://raw.githubusercontent.com/robert-shepherd/fpl/main/X_train.csv'
X_test_url = 'https://raw.githubusercontent.com/robert-shepherd/fpl/main/X_test.csv'
Y_train_url = 'https://raw.githubusercontent.com/robert-shepherd/fpl/main/Y_train.csv'
Y_test_url = 'https://raw.githubusercontent.com/robert-shepherd/fpl/main/Y_test.csv'

# Feature matrices stay as DataFrames
X_train = pd.read_csv(X_train_url)
X_test = pd.read_csv(X_test_url)

# Targets are collapsed from one-column DataFrames to Series.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis of the result is the documented replacement.
Y_train = pd.read_csv(Y_train_url).squeeze("columns")
Y_test = pd.read_csv(Y_test_url).squeeze("columns")

print('X_train      : ',X_train.shape)
print('X_test       : ',X_test.shape)
print('Y_train      : ',Y_train.shape)
print('Y_test       : ',Y_test.shape)

X_train      :  (40388, 40)
X_test       :  (10098, 40)
Y_train      :  (40388,)
Y_test       :  (10098,)


In [None]:
# Building copies of the train/test sets that keep only complete rows.
# A row is flagged when any of its feature columns is null; the same flag is
# applied to the matching target so features and targets stay in step.
train_na = X_train.isnull().any(axis=1)
test_na = X_test.isnull().any(axis=1)

X_train_no_na, Y_train_no_na = X_train[~train_na], Y_train[~train_na]
X_test_no_na, Y_test_no_na = X_test[~test_na], Y_test[~test_na]

# Report the shape of each filtered object
for _label, _subset in (
    ('X_train_no_na      : ', X_train_no_na),
    ('X_test_no_na       : ', X_test_no_na),
    ('Y_train_no_na      : ', Y_train_no_na),
    ('Y_test_no_na       : ', Y_test_no_na),
):
    print(_label, _subset.shape)

X_train_no_na      :  (32698, 40)
X_test_no_na       :  (8195, 40)
Y_train_no_na      :  (32698,)
Y_test_no_na       :  (8195,)


## Default Elastic Net regression model


In [None]:
# Training default Elastic Net model (alpha and l1_ratio left at their defaults)
# NOTE(review): the `normalize` argument is deprecated/removed in newer
# scikit-learn releases - confirm the installed version before re-running
elasticnet = ElasticNet(normalize=True)
elasticnet.fit(X_train_no_na,Y_train_no_na)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=True, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Predicting on test
# Scores the test rows that have no missing features with the current
# `elasticnet` estimator; the result is reused by the evaluation cells below
y_pred = elasticnet.predict(X_test_no_na)

In [None]:
# Checking min/max prediction
# If the two values are identical the model is predicting a single constant
# for every row. The array's own reductions are used instead of the Python
# builtins, which walk the array one element at a time.
min_pred = y_pred.min()
max_pred = y_pred.max()
print("Min prediction: {}".format(min_pred))
print("Max prediction: {}".format(max_pred))

Min prediction: 1.4166921524252247
Max prediction: 1.4166921524252247


## Evaluating bias/variance tradeoff


In [None]:
# 10-fold cross-validated error on the training rows.
# The scorer reports negated MSE, so the fold average is made positive
# before it is printed.
MSE_CV_scores = cross_val_score(
    estimator=elasticnet,
    X=X_train_no_na,
    y=Y_train_no_na,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

cv_mse = abs(MSE_CV_scores.mean())

print(f"CV MSE: {cv_mse}")

CV MSE: 6.527112984750157


In [None]:
# Mean squared error of the stored test predictions against the test targets
test_mse = mean_squared_error(y_true=Y_test_no_na, y_pred=y_pred)
print(f"Test MSE: {test_mse}")

Test MSE: 6.366269097205641


In [None]:
# Mean squared error on the training rows, for comparison with the
# cross-validated and test errors
y_pred_train = elasticnet.predict(X_train_no_na)
train_mse = mean_squared_error(y_true=Y_train_no_na, y_pred=y_pred_train)
print(f"Train MSE: {train_mse}")

Train MSE: 6.526318717450802


## Optimising L1 and L2 parameters

In [None]:
# Using grid search to identify the optimum alpha and l1_ratio
# (50 alphas x 11 l1_ratios, each scored with 10-fold cross validation)
alpha_space = np.logspace(-4, 0, 50)
# linspace states the number of points and both endpoints explicitly;
# arange with a float step relies on rounding to decide whether the
# final value is included
l1_ratio = np.linspace(0, 1, 11)
# The search gets its own unfitted estimator so the fitted model held in
# `elasticnet` is not overwritten by this cell
elasticnet_search = ElasticNet(normalize=True)
param_grid = {'alpha': alpha_space
              ,"l1_ratio": l1_ratio}

# n_jobs=-1 runs the fits in parallel, matching the cross_val_score call
gm_cv = GridSearchCV(estimator=elasticnet_search, param_grid=param_grid,
                     cv=10, n_jobs=-1)

gm_cv.fit(X_train_no_na,Y_train_no_na)
# Note: took around 20 minutes on the default Colab cluster when run serially

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  po

GridSearchCV(cv=10, error_score=nan,
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=True,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-...
       8.68511374e-02, 1.04811313e-01, 1.26485522e-01, 1.52641797e-01,
       1.84206997e-01, 2.22299648e-01, 2.68269580e-01, 3.23745754e-01,
       3.90693994e-01, 4.71486636e-01, 5.68986603e-01, 6.86648845e-01,
       8.28642773e-01, 1.00000000e+00]),
                         'l1_ratio': array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scorin

In [None]:
# Report the parameter combination selected by the grid search
print(f"Tuned ElasticNet best parameters: {gm_cv.best_params_}")

Tuned ElasticNet best parameters: {'alpha': 0.0001, 'l1_ratio': 1.0}


In [None]:
# Refitting Elastic Net with the parameter values reported by the grid search.
# The values are written out explicitly so this cell does not need the
# search object to exist.
tuned_params = {'alpha': 0.0001, 'l1_ratio': 1}
elasticnet = ElasticNet(normalize=True, **tuned_params)
elasticnet.fit(X_train_no_na, Y_train_no_na)

ElasticNet(alpha=0.0001, copy_X=True, fit_intercept=True, l1_ratio=1,
           max_iter=1000, normalize=True, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [None]:
# Test-set performance of the tuned model: R squared from the estimator's
# own score method, MSE from its predictions
y_pred = elasticnet.predict(X_test_no_na)
mse = mean_squared_error(y_true=Y_test_no_na, y_pred=y_pred)
r2 = elasticnet.score(X_test_no_na, Y_test_no_na)

print(f"Tuned ElasticNet R squared: {r2}")
print(f"Tuned ElasticNet MSE: {mse}")

Tuned ElasticNet R squared: 0.25137638871431034
Tuned ElasticNet MSE: 4.765933864316352


The tuned model was found to be identical to Lasso: the grid search selected `l1_ratio` = 1.0, which leaves only the L1 penalty.