# Regularization with SciKit-Learn

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the advertising data set; the path is relative to the notebook's working directory.
df = pd.read_csv('Data/Advertising.csv')

In [3]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
# Target is the `sales` column; every remaining column is a predictor.
y = df['sales']
X = df.drop(columns='sales')

### Polynomial Conversion

In [5]:
from sklearn.preprocessing import PolynomialFeatures

In [6]:
# Degree-3 expansion: powers and interaction terms of the raw columns.
# include_bias=False leaves out the constant (all-ones) column.
poly_converter = PolynomialFeatures(degree=3, include_bias=False)

In [7]:
# Expand the raw predictors into their polynomial and interaction terms.
poly_features = poly_converter.fit_transform(X)

In [8]:
poly_features.shape

(200, 19)

### Train | Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features,
    y,
    test_size=0.3,
    random_state=101,
)

-----

# Scaling the Data

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()

In [13]:
# Fit the scaler on the training split only. Fitting on the test split as well
# would leak test-set statistics into the preprocessing (data leakage).
scaler.fit(X_train)

StandardScaler()

In [14]:
# Replace both splits with their scaled versions, using the scaler fitted on
# the training split.
# NOTE: this rebinds X_train / X_test, so the cell is not idempotent:
# re-running it without re-running the train/test split scales the
# already-scaled data a second time.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

We can see that after scaling, the values are much smaller (standardized around zero) than the raw polynomial features.

In [15]:
X_train[0]

array([ 0.49300171, -0.33994238,  1.61586707,  0.28407363, -0.02568776,
        1.49677566, -0.59023161,  0.41659155,  1.6137853 ,  0.08057172,
       -0.05392229,  1.01524393, -0.36986163,  0.52457967,  1.48737034,
       -0.66096022, -0.16360242,  0.54694754,  1.37075536])

In [16]:
# Unscaled values for comparison. NOTE: this is row 0 of the full, unsplit
# data, while X_train[0] is row 0 of the shuffled training split, so the two
# are not necessarily the same observation; compare magnitudes only.
poly_features[0]

array([2.30100000e+02, 3.78000000e+01, 6.92000000e+01, 5.29460100e+04,
       8.69778000e+03, 1.59229200e+04, 1.42884000e+03, 2.61576000e+03,
       4.78864000e+03, 1.21828769e+07, 2.00135918e+06, 3.66386389e+06,
       3.28776084e+05, 6.01886376e+05, 1.10186606e+06, 5.40101520e+04,
       9.88757280e+04, 1.81010592e+05, 3.31373888e+05])

--------

-------

# Ridge Regression

In [17]:
from sklearn.linear_model import Ridge

In [18]:
# alpha is the regularization strength; 10 is a hand-picked starting value
# (a cross-validated choice follows in the RidgeCV section).
ridge_model = Ridge(alpha=10)

In [19]:
ridge_model.fit(X_train, y_train)

Ridge(alpha=10)

In [20]:
# Predict on the held-out (scaled) test split.
test_predictions = ridge_model.predict(X_test)

In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [22]:
MAE = mean_absolute_error(y_test, test_predictions)

In [23]:
MAE

0.5774404204714166

In [24]:
# Root mean squared error: square root of the MSE, in the same units as the target.
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))

In [25]:
RMSE

0.8946386461319675

In [31]:
# Score the ridge model on the data it was fitted on, to compare with the
# test-set errors. This rebinds MAE and RMSE to the training-set values.
train_predictions = ridge_model.predict(X_train)
RMSE = np.sqrt(mean_squared_error(y_train, train_predictions))
MAE = mean_absolute_error(y_train, train_predictions)

(MAE, RMSE)

(0.5288348183025329, 0.8491805208255253)

---------

## Choosing an alpha value with Cross-Validation

In [33]:
from sklearn.linear_model import RidgeCV

In [42]:
# Try three candidate alphas and keep the one with the best cross-validated
# score. The metric is negated MAE, so a higher score means a lower error.
ridge_cv_model = RidgeCV(alphas=(0.1, 1.0, 10.0), scoring='neg_mean_absolute_error')

In [43]:
ridge_cv_model.fit(X_train, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), scoring='neg_mean_absolute_error')

In [44]:
# Alpha selected by cross-validation from the candidates passed to RidgeCV.
ridge_cv_model.alpha_

0.1

In [49]:
ridge_cv_model.coef_

array([ 5.40769392,  0.5885865 ,  0.40390395, -6.18263924,  4.59607939,
       -1.18789654, -1.15200458,  0.57837796, -0.1261586 ,  2.5569777 ,
       -1.38900471,  0.86059434,  0.72219553, -0.26129256,  0.17870787,
        0.44353612, -0.21362436, -0.04622473, -0.06441449])

As we can see from the coefficients, ridge regression keeps every feature: none of the coefficients is exactly zero.

------


If you don't remember which key to use for the `scoring` parameter, you can look up the available names like this.

In [39]:
# `SCORERS` was deprecated in scikit-learn 1.1 and removed in 1.3, so a plain
# `from sklearn.metrics import SCORERS` raises ImportError on current releases.
try:
    from sklearn.metrics import SCORERS
except ImportError:
    # Rebuild an equivalent name -> scorer mapping from the supported public API
    # so that `SCORERS.keys()` keeps working.
    from sklearn.metrics import get_scorer, get_scorer_names
    SCORERS = {name: get_scorer(name) for name in get_scorer_names()}

In [41]:
SCORERS.keys()
# These are the names accepted by the `scoring` parameter. Pick one that suits
# the task (regression vs. classification).
# For the `neg_*` scorers (e.g. neg_mean_squared_error) HIGHER is BETTER: they are
# the negated error, and for the underlying error lower is better.

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

-------

In [45]:
# Check performance: predict on the held-out test split with the cross-validated model.
test_predictions = ridge_cv_model.predict(X_test)

In [48]:
# Test-set errors of the cross-validated ridge model.
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))
MAE = mean_absolute_error(y_test, test_predictions)

(MAE, RMSE)

(0.42737748843249934, 0.6180719926923581)

Compared with the alpha = 10 model (MAE 0.5774404204714166, RMSE 0.8946386461319675), the test-set results with the cross-validated alpha are much better.

--------

--------

# Lasso Regression
- LASSO (Least Absolute Shrinkage and Selection Operator)
- **The smaller the `eps` value, the wider the range of alpha values that is searched** (`eps` is the ratio `alpha_min / alpha_max`)

In [60]:
from sklearn.linear_model import LassoCV

In [69]:
# `eps` is the ratio alpha_min / alpha_max of the alpha grid that is searched,
# so a smaller eps reaches down to smaller (weaker) regularization strengths.
# Wider-range alternative, kept for reference:
# lasso_cv_model = LassoCV(eps=0.001,n_alphas=100,cv=5, max_iter=1000000)

# Narrower range: only alphas down to one tenth of alpha_max are tried.
lasso_cv_model = LassoCV(eps=0.1,n_alphas=100,cv=5)

In [70]:
lasso_cv_model.fit(X_train,y_train)

LassoCV(cv=5, eps=0.1)

In [71]:
# Alpha selected by 5-fold cross-validation along the searched alpha grid.
lasso_cv_model.alpha_

0.49430709092258285

In [72]:
test_predictions = lasso_cv_model.predict(X_test)

In [76]:
# Test-set errors of the cross-validated lasso model.
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))
MAE = mean_absolute_error(y_test, test_predictions)

In [77]:
MAE, RMSE

(0.6541723161252858, 1.1308001022762542)

Compared with the previous Ridge Regression results, this model does not perform as well.

In [78]:
lasso_cv_model.coef_

array([1.002651  , 0.        , 0.        , 0.        , 3.79745279,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        ])

We can check the coefficients of the lasso regression model. As shown above, the model uses only 2 features; the coefficients of all other features are 0, so the model ignores them.

Depending on the context, if we want a model that uses only two features, Lasso may be the better choice. However, we need to note that its MAE and RMSE are not as good as Ridge's.

However, we can widen the range of alpha values that is searched (by using a smaller `eps`), which lets cross-validation choose a smaller alpha and therefore a more complex model.

--------
-------

# Elastic Net (L1 + L2)

In [79]:
from sklearn.linear_model import ElasticNetCV

In [81]:
# Cross-validate over both the L1/L2 mix and the regularization strength.
# l1_ratio=1 is a pure lasso penalty; values below 1 blend in a ridge (L2) penalty.
elastic_cv_model = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    eps=0.001,
    n_alphas=100,
    max_iter=1000000,  # generous iteration budget for the solver
)

In [82]:
elastic_cv_model.fit(X_train, y_train)

ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000000)

In [83]:
# Best L1 ratio found by cross-validation (1.0 means a pure lasso penalty).
elastic_cv_model.l1_ratio_

1.0

In [85]:
elastic_cv_model.alpha_

0.004943070909225827

In [86]:
test_predictions = elastic_cv_model.predict(X_test)

In [87]:
# Test-set errors of the cross-validated elastic net model.
MAE = mean_absolute_error(y_test, test_predictions)
RMSE = np.sqrt(mean_squared_error(y_test, test_predictions))

In [88]:
MAE, RMSE

(0.43350346185900673, 0.6063140748984029)

In [90]:
elastic_cv_model.coef_

array([ 4.86023329,  0.12544598,  0.20746872, -4.99250395,  4.38026519,
       -0.22977201, -0.        ,  0.07267717, -0.        ,  1.77780246,
       -0.69614918, -0.        ,  0.12044132, -0.        , -0.        ,
       -0.        ,  0.        ,  0.        , -0.        ])