In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("dodgers_training.csv")

In [3]:
df.shape

(56, 13)

The training dataset contains 56 rows and 13 columns

In [4]:
dodgers_training_X = df[["month","day_of_week","temp","skies","bobblehead"]] # X refers to the predictors

In [5]:
dodgers_training_Y = df[["attend"]] #Y refers to the target variable

In [6]:
dodgers_training_X.shape

(56, 5)

There are 5 columns set as predictors

In [7]:
dodgers_training_Y.shape

(56, 1)

This is the dataset for target variable

In [8]:
#performing one hot encoding of categorical variables for training dataset
dodgers_training_X_ohe = pd.get_dummies(dodgers_training_X)

In [9]:
dodgers_training_X_ohe.shape

(56, 19)

The above dataset contains 19 columns

In [10]:
scaler = preprocessing.StandardScaler().fit(dodgers_training_X_ohe)

In [11]:
dodgers_training_X_ohe_scaled = scaler.transform(dodgers_training_X_ohe)

Scaling the data to remove mean and std deviation

In [12]:
print('original means of the training data= ', np.mean(dodgers_training_X_ohe, axis=0))

original means of the training data=  temp                     73.642857
month_APR                 0.160714
month_AUG                 0.214286
month_JUL                 0.089286
month_JUN                 0.089286
month_MAY                 0.232143
month_OCT                 0.017857
month_SEP                 0.196429
day_of_week_Friday        0.178571
day_of_week_Monday        0.142857
day_of_week_Saturday      0.160714
day_of_week_Sunday        0.196429
day_of_week_Thursday      0.089286
day_of_week_Tuesday       0.089286
day_of_week_Wednesday     0.142857
skies_Clear               0.803571
skies_Cloudy              0.196429
bobblehead_NO             0.910714
bobblehead_YES            0.089286
dtype: float64


The above describes the original mean of the predictors 

In [13]:
print('original stds of the training data= ', np.std(dodgers_training_X_ohe, axis=0))

original stds of the training data=  temp                     8.868944
month_APR                0.367267
month_AUG                0.410326
month_JUL                0.285156
month_JUN                0.285156
month_MAY                0.422200
month_OCT                0.132432
month_SEP                0.397296
day_of_week_Friday       0.382993
day_of_week_Monday       0.349927
day_of_week_Saturday     0.367267
day_of_week_Sunday       0.397296
day_of_week_Thursday     0.285156
day_of_week_Tuesday      0.285156
day_of_week_Wednesday    0.349927
skies_Clear              0.397296
skies_Cloudy             0.397296
bobblehead_NO            0.285156
bobblehead_YES           0.285156
dtype: float64


The above gives the standard deviation of predictors for the training dataset

<h3>Performing the same transformation for testing dataset

In [14]:
df_test = pd.read_csv("dodgers_testing.csv")

In [15]:
dodgers_testing_X = df_test[["month","day_of_week","temp","skies","bobblehead"]] # X refers to the predictors

In [16]:
dodgers_testing_Y = df_test[["attend"]] #Y refers to the target variable
dodgers_testing_Y.shape

(25, 1)

In [17]:
dodgers_testing_X_ohe = pd.get_dummies(dodgers_testing_X)

Performing one hot encoding for the test predictors

In [18]:
dodgers_testing_X_ohe.shape

(25, 18)

In [19]:
dodgers_training_X_ohe.shape

(56, 19)

The training and the testing dataset have unequal number of columns. We need to make the number of columns same. 

In [20]:
list1 = dodgers_training_X_ohe.columns.tolist()

In [21]:
list2 = dodgers_testing_X_ohe.columns.tolist()

In [22]:
rem_col = list(set(list1) - set(list2))
rem_col

['day_of_week_Thursday']

The above column is missing in the test dataset. We will be adding 'day_of_week_Thursday' in test dataset

In [23]:
list1

['temp',
 'month_APR',
 'month_AUG',
 'month_JUL',
 'month_JUN',
 'month_MAY',
 'month_OCT',
 'month_SEP',
 'day_of_week_Friday',
 'day_of_week_Monday',
 'day_of_week_Saturday',
 'day_of_week_Sunday',
 'day_of_week_Thursday',
 'day_of_week_Tuesday',
 'day_of_week_Wednesday',
 'skies_Clear ',
 'skies_Cloudy',
 'bobblehead_NO',
 'bobblehead_YES']

In [24]:
list2

['temp',
 'month_APR',
 'month_AUG',
 'month_JUL',
 'month_JUN',
 'month_MAY',
 'month_OCT',
 'month_SEP',
 'day_of_week_Friday',
 'day_of_week_Monday',
 'day_of_week_Saturday',
 'day_of_week_Sunday',
 'day_of_week_Tuesday',
 'day_of_week_Wednesday',
 'skies_Clear ',
 'skies_Cloudy',
 'bobblehead_NO',
 'bobblehead_YES']

In [25]:
idx = 12

In [26]:
dodgers_testing_X_ohe.insert(loc = 12, column= rem_col[0], value=0 )

In [27]:
list3 = dodgers_testing_X_ohe.columns.tolist()
list3

['temp',
 'month_APR',
 'month_AUG',
 'month_JUL',
 'month_JUN',
 'month_MAY',
 'month_OCT',
 'month_SEP',
 'day_of_week_Friday',
 'day_of_week_Monday',
 'day_of_week_Saturday',
 'day_of_week_Sunday',
 'day_of_week_Thursday',
 'day_of_week_Tuesday',
 'day_of_week_Wednesday',
 'skies_Clear ',
 'skies_Cloudy',
 'bobblehead_NO',
 'bobblehead_YES']

In [28]:
dodgers_testing_X_ohe.head()

Unnamed: 0,temp,month_APR,month_AUG,month_JUL,month_JUN,month_MAY,month_OCT,month_SEP,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,skies_Clear,skies_Cloudy,bobblehead_NO,bobblehead_YES
0,75,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0
1,68,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0
2,79,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
3,66,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1
4,75,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1


In [29]:
dodgers_testing_X_ohe_scaled = scaler.transform(dodgers_testing_X_ohe)

In [30]:
print('original means of the testing data= ', np.mean(dodgers_testing_X_ohe, axis=0))

original means of the testing data=  temp                     72.04
month_APR                 0.12
month_AUG                 0.12
month_JUL                 0.28
month_JUN                 0.16
month_MAY                 0.20
month_OCT                 0.08
month_SEP                 0.04
day_of_week_Friday        0.12
day_of_week_Monday        0.16
day_of_week_Saturday      0.16
day_of_week_Sunday        0.08
day_of_week_Thursday      0.00
day_of_week_Tuesday       0.32
day_of_week_Wednesday     0.16
skies_Clear               0.68
skies_Cloudy              0.32
bobblehead_NO             0.76
bobblehead_YES            0.24
dtype: float64


The above describes the mean of the testing dataset predictors

In [31]:
print('original stds of the testing data= ', np.std(dodgers_testing_X_ohe, axis=0))

original stds of the testing data=  temp                     6.587746
month_APR                0.324962
month_AUG                0.324962
month_JUL                0.448999
month_JUN                0.366606
month_MAY                0.400000
month_OCT                0.271293
month_SEP                0.195959
day_of_week_Friday       0.324962
day_of_week_Monday       0.366606
day_of_week_Saturday     0.366606
day_of_week_Sunday       0.271293
day_of_week_Thursday     0.000000
day_of_week_Tuesday      0.466476
day_of_week_Wednesday    0.366606
skies_Clear              0.466476
skies_Cloudy             0.466476
bobblehead_NO            0.427083
bobblehead_YES           0.427083
dtype: float64


The above describes the standard deviation of the testing dataset predictors

<h3>Performing Ridge Regression 

In [32]:
l2_cv = RidgeCV(cv=None, store_cv_values=True, alphas=np.linspace(0.1,1000,100))
l2_cv.fit(dodgers_training_X_ohe_scaled, dodgers_training_Y)
print('alpha = ', l2_cv.alpha_) 
print('coef = ', l2_cv.coef_)
print('R2 = ', l2_cv.score(dodgers_training_X_ohe_scaled,dodgers_training_Y))

alpha =  50.6
coef =  [[  515.85380007  -273.56102806   457.54654532  -216.88248689
   1102.57100111  -605.71237164   -75.78927596  -186.41964828
     67.69322043  -835.67182144   490.25315842   238.14165655
   -341.53918562  1070.28517076  -617.19884398   271.19412682
   -271.19412682 -1145.58152754  1145.58152754]]
R2 =  0.484627459673377


<h3>Performing Lasso Regression 

In [33]:
no_of_folds = dodgers_training_X_ohe_scaled.shape[0]
l1_cv = LassoCV(cv = no_of_folds,alphas=np.linspace(0.1,1000,100))
l1_cv.fit(dodgers_training_X_ohe_scaled, dodgers_training_Y)

  y = column_or_1d(y, warn=True)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


LassoCV(alphas=array([1.000e-01, 1.020e+01, 2.030e+01, 3.040e+01, 4.050e+01, 5.060e+01,
       6.070e+01, 7.080e+01, 8.090e+01, 9.100e+01, 1.011e+02, 1.112e+02,
       1.213e+02, 1.314e+02, 1.415e+02, 1.516e+02, 1.617e+02, 1.718e+02,
       1.819e+02, 1.920e+02, 2.021e+02, 2.122e+02, 2.223e+02, 2.324e+02,
       2.425e+02, 2.526e+02, 2.627e+02, 2.728e+02, 2.829e+02, 2.930e+02,
       3.031e+02, 3.132e+02, 3.233e+0...
       8.485e+02, 8.586e+02, 8.687e+02, 8.788e+02, 8.889e+02, 8.990e+02,
       9.091e+02, 9.192e+02, 9.293e+02, 9.394e+02, 9.495e+02, 9.596e+02,
       9.697e+02, 9.798e+02, 9.899e+02, 1.000e+03]),
        copy_X=True, cv=56, eps=0.001, fit_intercept=True, max_iter=1000,
        n_alphas=100, n_jobs=None, normalize=False, positive=False,
        precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
        verbose=False)

In [34]:
print('alpha = ', l1_cv.alpha_) 
print('coef = ', l1_cv.coef_)
print('R2 = ', l1_cv.score(dodgers_training_X_ohe_scaled,dodgers_training_Y))

alpha =  484.9
coef =  [  822.02575876    -0.           753.55887489     0.
  1933.39956554  -124.74778286    -0.            -0.
     0.         -1166.94119794   174.2067818      0.
  -480.10809056  1382.28372468  -885.40353153     0.
    -0.         -2640.57436296     0.        ]
R2 =  0.5177144037948875


In [35]:
np_cols_list = np.asarray(dodgers_training_X_ohe.columns.tolist())
np_cols_list

array(['temp', 'month_APR', 'month_AUG', 'month_JUL', 'month_JUN',
       'month_MAY', 'month_OCT', 'month_SEP', 'day_of_week_Friday',
       'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday',
       'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday', 'skies_Clear ', 'skies_Cloudy',
       'bobblehead_NO', 'bobblehead_YES'], dtype='<U21')

In [36]:
import numpy.ma as ma
ma.masked_where(l1_cv.coef_ == 0 , np_cols_list)

masked_array(data=['temp', --, 'month_AUG', --, 'month_JUN', 'month_MAY',
                   --, --, --, 'day_of_week_Monday',
                   'day_of_week_Saturday', --, 'day_of_week_Thursday',
                   'day_of_week_Tuesday', 'day_of_week_Wednesday', --, --,
                   'bobblehead_NO', --],
             mask=[False,  True, False,  True, False, False,  True,  True,
                    True, False, False,  True, False, False, False,  True,
                    True, False,  True],
       fill_value='N/A',
            dtype='<U21')

In [37]:
type(dodgers_training_X_ohe_scaled)

numpy.ndarray

In [38]:
type(dodgers_training_Y)

pandas.core.frame.DataFrame

<h3>linear regression without regularisation

In [39]:
ols = linear_model.LinearRegression(fit_intercept=True)

In [40]:
ols.fit(dodgers_training_X_ohe_scaled, dodgers_training_Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

<h3> RMSE values for the linear models

In [41]:
#calculating RMSE on the Ridge L2 Regression with LOOCV 
l2_train_pred = l2_cv.predict(dodgers_training_X_ohe_scaled)
print('L2 - RMSE train = ', np.sqrt(mean_squared_error(l2_train_pred,dodgers_training_Y)))


l2_test_pred = l2_cv.predict(dodgers_testing_X_ohe_scaled)
print('L2 - RMSE test = ', np.sqrt(mean_squared_error(l2_test_pred,dodgers_testing_Y)))


#calculating RMSE on the Ridge L1 Regression with LOOCV 
l1_train_pred = l1_cv.predict(dodgers_training_X_ohe_scaled)
print('L1 - RMSE train = ', np.sqrt(mean_squared_error(l1_train_pred,dodgers_training_Y)))

l1_test_pred = l1_cv.predict(dodgers_testing_X_ohe_scaled)
print('L1 - RMSE test = ', np.sqrt(mean_squared_error(l1_test_pred,dodgers_testing_Y)))

#calculating RMSE for the linear model with no regularisation
ols_train_pred =ols.predict(dodgers_training_X_ohe_scaled)

print('simple Linear regression train error = ', np.sqrt(mean_squared_error(ols_train_pred,dodgers_training_Y)))

ols_test_pred = ols.predict(dodgers_testing_X_ohe_scaled)
print('simple Linear regression test error = ', np.sqrt(mean_squared_error(ols_test_pred,dodgers_testing_Y)))

L2 - RMSE train =  5648.864216385288
L2 - RMSE test =  6710.582066280918
L1 - RMSE train =  5464.527851899147
L1 - RMSE test =  6804.544002873916
simple Linear regression train error =  5254.5387249249425
simple Linear regression test error =  7058.68452678261


<h2>Regarding Expected RMSE

In [42]:
l2_emrs_values = np.sqrt(l2_cv.cv_values_.mean(axis=0))
l2_emrs_values

array([[7708.54106583, 7075.16783524, 6865.99968379, 6777.98459642,
        6743.94251164, 6737.66094884, 6746.7084851 , 6764.50773761,
        6787.32243574, 6812.92419597, 6839.93849484, 6867.49853321,
        6895.05100865, 6922.24194653, 6948.84702062, 6974.72769677,
        6999.80291926, 7024.03044277, 7047.39430834, 7069.89632101,
        7091.55018431, 7112.37742773, 7132.40456188, 7151.661084  ,
        7170.17807893, 7187.98724046, 7205.12019148, 7221.60801771,
        7237.48095466, 7252.76818461, 7267.49771277, 7281.69630003,
        7295.38943628, 7308.6013421 , 7321.35499051, 7333.67214216,
        7345.57338942, 7357.07820605, 7368.20499989, 7378.97116677,
        7389.39314452, 7399.48646603, 7409.26581084, 7418.74505472,
        7427.93731724, 7436.85500685, 7445.50986368, 7453.91299994,
        7462.07493793, 7470.00564579, 7477.71457107, 7485.21067213,
        7492.50244765, 7499.59796415, 7506.50488179, 7513.23047847,
        7519.78167238, 7526.16504306, 7532.38685

The above values are the values for each of the cross validation dataset.
The expected EMRS of the model is the lowest Mean ERMS value beacuse the minimisation objection of the Ridge function is to find alpha for which EMRS is minimum

In [43]:
print("Expected RMSE for L2 model:",np.min(l2_emrs_values))

Expected RMSE for L2 model: 6737.660948839508


similarly we go for Lasso

In [44]:
l1_emrs_values = np.sqrt(l1_cv.mse_path_.mean(axis=1))
l1_emrs_values

array([7119.87119225, 7116.72422944, 7113.7633959 , 7110.92428887,
       7108.25081733, 7105.40715323, 7102.49996441, 7099.83695787,
       7097.32342698, 7094.66573241, 7091.67731755, 7088.50726245,
       7085.54392242, 7082.75097676, 7080.23753576, 7077.89310148,
       7075.85524742, 7074.12526699, 7072.77612531, 7071.6959351 ,
       7070.96188729, 7070.23719484, 7069.4345852 , 7068.56251068,
       7067.35398716, 7066.01305894, 7064.77562861, 7063.39825444,
       7061.24920045, 7059.17043229, 7057.184863  , 7054.98658952,
       7052.83763434, 7050.38933878, 7047.62694666, 7044.87088903,
       7042.26037308, 7039.7966271 , 7037.46765972, 7035.25261231,
       7033.14260996, 7031.15559944, 7029.42428596, 7028.13356611,
       7027.09725465, 7026.35527866, 7025.68950892, 7023.96398739,
       7022.38723557, 7020.99029856, 7019.85013223, 7018.95537342,
       7019.81841145, 7022.21215893, 7025.08705177, 7028.34901019,
       7031.84781738, 7035.32505543, 7038.65835404, 7042.20209

The above values are the values for each of the cross validation dataset.
The expected EMRS of the model is the lowest Mean ERMS value beacuse the minimisation objection of the Lasso function is to find alpha for which EMRS is minimum

In [45]:
print("Expected RMSE for L1 model:",np.min(l1_emrs_values))

Expected RMSE for L1 model: 7018.9553734195615


Please note that shape of the array for the error array has different shape. That's the reason we have taken aggregate over the rows and in the second case we have taken aggreate over column

<h3> Sample Prediction

let's try to predict the attendance on a <b>clear Monday in June</b> when <b>the expected temperature
is 72 </b>for all three models <b>with and without bobbleheads</b>.
Please note that the inputs are loaded form a sample file to simpify things. 

In [46]:
no_booble_head = pd.read_csv("input_no_bobble_head.csv")

In [47]:
no_booble_head

Unnamed: 0,temp,month_APR,month_AUG,month_JUL,month_JUN,month_MAY,month_OCT,month_SEP,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,skies_Clear,skies_Cloudy,bobblehead_NO,bobblehead_YES
0,72,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0


In [48]:
yes_bobble_head = pd.read_csv("input_yes_bobble_head.csv")

In [49]:
yes_bobble_head

Unnamed: 0,temp,month_APR,month_AUG,month_JUL,month_JUN,month_MAY,month_OCT,month_SEP,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,skies_Clear,skies_Cloudy,bobblehead_NO,bobblehead_YES
0,72,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1


In [50]:
no_booble_head_scaled = scaler.transform(no_booble_head)
yes_booble_head_scaled = scaler.transform(yes_bobble_head)

<h3>Predictions using L2 - Ridge, L1- Lasso, Simple Regression Model

In [51]:
#Ridge
no_booble_head_scaled_prediction_l2 = l2_cv.predict(no_booble_head_scaled)
yes_booble_head_scaled_prediction_l2 = l2_cv.predict(yes_booble_head_scaled)

In [52]:
no_booble_head_scaled_prediction_l2

array([[40638.44252892]])

In [53]:
yes_booble_head_scaled_prediction_l2

array([[48673.22150451]])

In [54]:
print("Difference in attendance:",(yes_booble_head_scaled_prediction_l2[0] - no_booble_head_scaled_prediction_l2[0])[0])

Difference in attendance: 8034.778975592846


In [55]:
#Laaso
no_booble_head_scaled_prediction_l1 = l1_cv.predict(no_booble_head_scaled)
yes_booble_head_scaled_prediction_l1 = l1_cv.predict(yes_booble_head_scaled)

In [56]:
no_booble_head_scaled_prediction_l1

array([41679.9190964])

In [57]:
yes_booble_head_scaled_prediction_l1

array([50940.03322401])

In [58]:
print("Difference in attendance:",(yes_booble_head_scaled_prediction_l1[0] - no_booble_head_scaled_prediction_l1[0]))

Difference in attendance: 9260.114127606044


In [59]:
#simple regression
no_booble_head_scaled_prediction_ols = ols.predict(no_booble_head_scaled)
yes_booble_head_scaled_prediction_ols = ols.predict(yes_booble_head_scaled)

In [60]:
no_booble_head_scaled_prediction_ols

array([[41823.22849175]])

In [61]:
yes_booble_head_scaled_prediction_ols

array([[51684.84997165]])

In [62]:
print("Difference in attendance:",(yes_booble_head_scaled_prediction_ols[0] - no_booble_head_scaled_prediction_ols[0])[0])

Difference in attendance: 9861.621479891895


<h3> Result 

<b>We can clearly see that if we give the bobble head there is an increase in the number of attendees. So giving bobble heads is benificial for the game</b>