In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load the advertising dataset from a CSV file given by a path relative to the working directory.
df = pd.read_csv("Advertising.csv")

In [5]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [6]:
# Feature matrix: every column of the frame except the `sales` target.
X = df.drop(columns="sales")

In [7]:
# Target vector: the `sales` column.
y = df.loc[:, "sales"]

#### In this exercise, we will try to reduce overfitting by introducing a penalty term, which is done by Ridge regression
Also called L2 regularization

In [8]:
from sklearn.preprocessing import PolynomialFeatures

In [9]:
# Generate every polynomial and interaction term up to degree 3,
# leaving out the constant (bias) column.
poly_converter = PolynomialFeatures(
    include_bias=False,
    degree=3,
)

In [10]:
# Fit the converter on the raw features, then expand them into polynomial terms.
poly_features = poly_converter.fit(X).transform(X)

In [11]:
# (number of samples, number of generated polynomial features)
poly_features.shape

(200, 19)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the rows as the test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features,
    y,
    random_state=101,
    test_size=0.3,
)

In [14]:
# First training row, shown before any scaling has been applied.
X_train[0]

array([1.93200000e+02, 1.84000000e+01, 6.57000000e+01, 3.73262400e+04,
       3.55488000e+03, 1.26932400e+04, 3.38560000e+02, 1.20888000e+03,
       4.31649000e+03, 7.21142957e+06, 6.86802816e+05, 2.45233397e+06,
       6.54097920e+04, 2.33555616e+05, 8.33945868e+05, 6.22950400e+03,
       2.22433920e+04, 7.94234160e+04, 2.83593393e+05])

In [15]:
# First test row, shown before any scaling has been applied.
X_test[0]

array([7.47000000e+01, 4.94000000e+01, 4.57000000e+01, 5.58009000e+03,
       3.69018000e+03, 3.41379000e+03, 2.44036000e+03, 2.25758000e+03,
       2.08849000e+03, 4.16832723e+05, 2.75656446e+05, 2.55010113e+05,
       1.82294892e+05, 1.68641226e+05, 1.56010203e+05, 1.20553784e+05,
       1.11524452e+05, 1.03171406e+05, 9.54439930e+04])

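We need to apply feature scaling to both *X_train* and *X_test*. The scaler is fitted on *X_train* only and the same fitted transformation is then applied to *X_test*, so that no information from the test data leaks into training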

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
# Standardizer instance; it is fitted on the training split in a later cell.
scaler = StandardScaler()

In [18]:
# Confirm which class the scaler object is an instance of.
type(scaler)

sklearn.preprocessing._data.StandardScaler

In [19]:
# Learn the scaling statistics from the training split only, then reuse that
# same fitted transformation on the test split so no test information leaks in.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# Shape of the scaled training split: (rows, polynomial features).
X_train.shape

(140, 19)

In [21]:
# First training row again, now after standardization.
X_train[0]

array([ 0.49300171, -0.33994238,  1.61586707,  0.28407363, -0.02568776,
        1.49677566, -0.59023161,  0.41659155,  1.6137853 ,  0.08057172,
       -0.05392229,  1.01524393, -0.36986163,  0.52457967,  1.48737034,
       -0.66096022, -0.16360242,  0.54694754,  1.37075536])

In the above, the values are standardized

#### Now we will apply Ridge regression or L2 regularization method

In [20]:
from sklearn.linear_model import Ridge

In [21]:
# Ridge regressor with a fixed, hand-picked regularization strength (alpha=10).
ridge_model = Ridge(alpha=10)

In [22]:
# Train the ridge model on the scaled training split.
ridge_model.fit(X_train, y_train)

Ridge(alpha=10)

In [23]:
# Predict sales for the scaled hold-out test split.
test_predictions = ridge_model.predict(X_test)

In [24]:
# Error metrics on the test split: mean absolute error and root mean squared error.
from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(y_true=y_test, y_pred=test_predictions)
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_predictions))

In [25]:
# Report both error metrics, one per line.
print(f"MAE: {MAE}", f"RMSE: {RMSE}", sep="\n")

MAE: 0.5774404204714181
RMSE: 0.8946386461319672


#### Now we will use cross validation to tune the ridge regression model
We will put aside *X_test* (our hold-out test dataset) for now and use *X_train* as the combined training and validation dataset

In [26]:
# Checking the error metrics (scorer names) that can be passed as `scoring`.
# `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.2 and removed in 1.3;
# `get_scorer_names()` (available since 1.0) is its replacement. The name `SCORERS`
# is rebuilt as a name -> scorer mapping so the cells below keep working on any version.
from sklearn.metrics import get_scorer

try:
    from sklearn.metrics import get_scorer_names
except ImportError:
    # scikit-learn < 1.0: the legacy registry is still the only option.
    from sklearn.metrics import SCORERS
else:
    SCORERS = {name: get_scorer(name) for name in get_scorer_names()}

In [27]:
# List the names of all available scorers; any of them can be used as a `scoring` value.
SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [28]:
from sklearn.linear_model import RidgeCV

In [29]:
# By default cv=None, which means the efficient leave-one-out cross validation is used.
# By default scoring=None, which means negative mean squared error under leave-one-out
# (and r2 for any other cv); it is passed explicitly here to make the metric visible.
ridge_cv_model = RidgeCV(alphas=(0.05, 0.1, 1.0, 10.0), scoring='neg_mean_squared_error', cv=None)

In [30]:
# Fit on the training split; cross validation over the candidate alphas happens inside this call.
ridge_cv_model.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.05,  0.1 ,  1.  , 10.  ]),
        scoring='neg_mean_squared_error')

In [31]:
# Predict on the hold-out test split with the cross-validated ridge model.
test_predictions = ridge_cv_model.predict(X_test)

In [32]:
# MAE and RMSE on the hold-out test split (y_test).
# These numbers are for reporting only and must NOT be used to tune the model.
MAE = mean_absolute_error(y_true=y_test, y_pred=test_predictions)
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_predictions))

In [33]:
# Report both error metrics, one per line.
print(f"MAE: {MAE}", f"RMSE: {RMSE}", sep="\n")

MAE: 0.4108099689476472
RMSE: 0.5982211275186771


In [34]:
# Regularization strength selected by cross validation from the candidate alphas.
ridge_cv_model.alpha_

0.05

In [None]:
# Cross-validation score obtained with the selected alpha
# (a negative MSE, given the `scoring` passed to RidgeCV).
ridge_cv_model.best_score_

## LASSO regression or L1 regularization

In [35]:
from sklearn.linear_model import LassoCV

In [36]:
# Cross-validated Lasso; every argument is spelled out even though all of them
# equal the library defaults (see the `LassoCV()` repr printed after fitting).
lasso_cv_model = LassoCV(
    eps=0.001,
    n_alphas=100,
    cv=None,
    max_iter=1000,
)

In [37]:
# Fit on the training split; with max_iter=1000 this run emits a convergence warning.
lasso_cv_model.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


LassoCV()

There are 2 ways of overcoming the convergence warning. 
- Increasing max_iter above 1000 (which is the default value)
- Increasing the eps value (which is the ratio alpha_min / alpha_max). A larger eps shortens the path of alphas by dropping the smallest ones, which are the slowest to converge, so fewer iterations are needed

Below is the first step

In [48]:
# Same Lasso search as before, but with a much larger iteration budget so that
# coordinate descent can converge.
lasso_cv_model = LassoCV(
    eps=0.001,
    n_alphas=100,
    max_iter=1000000,
    cv=None,
)

In [49]:
# Refit on the training split using the larger iteration budget.
lasso_cv_model.fit(X_train, y_train)

LassoCV(max_iter=1000000)

In [50]:
# Predict on the hold-out test split with the cross-validated Lasso model.
test_predictions = lasso_cv_model.predict(X_test)

In [51]:
# Fitted coefficients; entries equal to zero are features the L1 penalty dropped.
lasso_cv_model.coef_

array([ 4.86023329,  0.12544598,  0.20746872, -4.99250395,  4.38026519,
       -0.22977201, -0.        ,  0.07267717, -0.        ,  1.77780246,
       -0.69614918, -0.        ,  0.12044132, -0.        , -0.        ,
       -0.        ,  0.        ,  0.        , -0.        ])

In [52]:
# Mean absolute error of the Lasso predictions on the hold-out test split.
MAE = mean_absolute_error(y_true=y_test, y_pred=test_predictions)
print(MAE)

0.43350346185900707


In [53]:
# Root mean squared error of the Lasso predictions on the hold-out test split.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_predictions))
print(RMSE)

0.6063140748984027


Using the 2nd method

In [42]:
# Second approach: raise eps (alpha_min / alpha_max) to 0.1 and keep the default max_iter.
lasso_cv_model = LassoCV(
    n_alphas=100,
    eps=0.1,
    cv=None,
)

In [43]:
# Fit the eps=0.1 Lasso search on the training split.
lasso_cv_model.fit(X_train, y_train)

LassoCV(eps=0.1)

In [44]:
# Predict on the hold-out test split with the eps=0.1 Lasso model.
test_predictions = lasso_cv_model.predict(X_test)

In [45]:
# Fitted coefficients of the eps=0.1 model; zero entries are features the L1 penalty dropped.
lasso_cv_model.coef_

array([1.002651  , 0.        , 0.        , 0.        , 3.79745279,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

In [46]:
# Mean absolute error of the eps=0.1 Lasso model on the hold-out test split.
MAE = mean_absolute_error(y_true=y_test, y_pred=test_predictions)
print(MAE)

0.6541723161252854


In [47]:
# Root mean squared error of the eps=0.1 Lasso model on the hold-out test split.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_predictions))
print(RMSE)

1.1308001022762533


Considering the complexity of the model, the lasso regression model has performed quite well

## Elastic Net regularization

In [22]:
from sklearn.linear_model import ElasticNetCV

In [23]:
# Cross-validated elastic net over several L1/L2 mixing ratios, with the default iteration budget.
elastic_net_cv_model = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    eps=0.001,
    n_alphas=100,
    max_iter=1000,
    cv=None,
)

In [24]:
# Train the elastic net model on the training split; with max_iter=1000 this run
# emits a convergence warning.
elastic_net_cv_model.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1])

In [25]:
# Rebuild the model with a much larger max_iter.
# The candidate l1_ratio values are deliberately skewed towards 1 (the Lasso end).
elastic_net_cv_model = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    eps=0.001,
    n_alphas=100,
    max_iter=1000000,
    cv=None,
)

In [26]:
# Training and validation both happen inside X_train (via cross validation).
# X_test remains the hold-out test dataset, the same pattern used for the Ridge and Lasso models above.
elastic_net_cv_model.fit(X_train, y_train)

ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000000)

In [27]:
# Predict on the hold-out test split with the cross-validated elastic net model.
test_predictions = elastic_net_cv_model.predict(X_test)

In [32]:
# Candidate l1_ratio values that were passed to the constructor (the search space, not the result).
print(elastic_net_cv_model.l1_ratio)

[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]


In [33]:
# The l1_ratio value selected by cross validation from the candidate list above
# (the trailing underscore marks a fitted attribute rather than a constructor argument).
elastic_net_cv_model.l1_ratio_

1.0

As we can see above, the selected l1_ratio is 1.0, which means the model has dropped the Ridge (L2) part of the penalty and is using only the Lasso (L1) part.
Below you will see that the coefficients are the same as those of the Lasso regression

In [34]:
# Fitted coefficients of the selected elastic net model.
elastic_net_cv_model.coef_

array([ 4.86023329,  0.12544598,  0.20746872, -4.99250395,  4.38026519,
       -0.22977201, -0.        ,  0.07267717, -0.        ,  1.77780246,
       -0.69614918, -0.        ,  0.12044132, -0.        , -0.        ,
       -0.        ,  0.        ,  0.        , -0.        ])

In [39]:
# Regularization strength selected by cross validation.
elastic_net_cv_model.alpha_

0.004943070909225827

In [40]:
# Computing the error metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [41]:
# Mean absolute error of the elastic net predictions on the hold-out test split.
MAE = mean_absolute_error(y_true=y_test, y_pred=test_predictions)

In [42]:
# Root mean squared error of the elastic net predictions on the hold-out test split.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=test_predictions))

In [43]:
# Report MAE and RMSE, one per line.
print(MAE, RMSE, sep="\n")

0.43350346185900707
0.6063140748984027


##### Actually in most cases, it makes sense to directly go into elastic net cross validation modelling and compute the error metrics, rather than going individually into ridge or lasso