## Regularization 

<ul>
<li>A model will have a low accuracy if it is overfitting. </li>
<li>Overfitting occurs when model is trying too hard to capture the noise (samples that don't represent true pattern)
    in your training dataset.</li>
<li>When model is more flexible, it is prone to overfitting</li>
<li>Regularization shrinks the coefficients (parameter or slops) towards zero to discourages a more complex or flexible model, so as to avoid the risk of overfitting</li>
    <li>Ridge and Lasso are two options </li>
    <li>Regularization strength is provided by parameter <b>alpha</b></li>
</ul>    

In [1]:
# import pandas library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
cars = pd.read_csv("final_cars.csv")

In [3]:
## create X and Y
y = cars['price']
X = cars.drop(columns=['price'])

In [4]:
X  = pd.get_dummies(X)

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [6]:
X_scaled = scaler.fit_transform(X)

## Ridge Regression

In [7]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [8]:
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.3, random_state = 0)

In [21]:
ridge = Ridge(normalize=True, alpha=3.0)
ridge.fit(X_train,y_train)

Ridge(alpha=3.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='auto', tol=0.001)

In [22]:
# Display coefficient for each column
for t in zip(X.columns, ridge.coef_):
  print(f"{t[0]:25s} {t[1]:10.2f}")

length                         47.18
width                         309.88
curb-weight                     1.38
engine-size                    19.52
highway-mpg                   -79.83
make_alfa-romero              263.41
make_audi                     495.93
make_bmw                     2370.04
make_chevrolet                -37.02
make_dodge                   -392.30
make_honda                   -348.79
make_isuzu                   -963.03
make_jaguar                  3208.70
make_mazda                   -341.74
make_mercedes-benz           2911.89
make_mercury                  244.55
make_mitsubishi              -461.80
make_nissan                  -270.77
make_peugot                  -330.66
make_plymouth                -500.27
make_porsche                    0.00
make_renault                 -634.03
make_saab                     785.69
make_subaru                  -645.10
make_toyota                  -546.51
make_volkswagen              -242.33
make_volvo                    382.27
f

In [23]:
y_pred = ridge.predict(X_test)

In [24]:
mse = mean_squared_error(y_test,y_pred)
print("MSE  : ", mse)
print("RMSE : ", np.sqrt(mse))

MSE  :  36969484.09747219
RMSE :  6080.253621147081


## LassoCV

In [26]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score

In [33]:
lm = LassoCV(cv=5,alphas=(50,45,40,35,25,10))
lm.fit(X_scaled,y)

LassoCV(alphas=(50, 45, 40, 35, 25, 10), copy_X=True, cv=5, eps=0.001,
        fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=None,
        normalize=False, positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [35]:
lm.alpha_

40

In [36]:
# Display coefficient for each column
for t in zip(X.columns, lm.coef_):
  print(f"{t[0]:25s} {t[1]:10.2f}")

length                         -0.00
width                         905.02
curb-weight                  1955.97
engine-size                  2190.40
highway-mpg                  -370.92
make_alfa-romero                0.00
make_audi                     516.36
make_bmw                     1702.47
make_chevrolet                181.73
make_dodge                    -22.07
make_honda                     48.54
make_isuzu                   -208.47
make_jaguar                   607.24
make_mazda                    164.15
make_mercedes-benz           1497.46
make_mercury                    0.00
make_mitsubishi              -290.29
make_nissan                  -156.25
make_peugot                  -333.60
make_plymouth                -111.93
make_porsche                 1836.32
make_renault                 -197.59
make_saab                     361.65
make_subaru                  -202.34
make_toyota                  -439.62
make_volkswagen                 0.00
make_volvo                    270.47
f

In [38]:
lm.coef_[np.abs(lm.coef_) == 0]

array([-0.,  0.,  0.,  0.,  0., -0.,  0., -0.,  0.,  0.])

In [39]:
# Take a part of data for final testing 
y_test  = y[:50]
X_test = X_scaled[:50]
y_pred = lm.predict(X_test)

In [40]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,y_pred)
print("MSE  : ",mse)
print("RMSE : ", np.sqrt(mse))

MSE  :  4888595.6603671415
RMSE :  2211.016883781565


In [41]:
r2score = r2_score(y_test,y_pred)
print(f"R2 Score: {r2score:0.2f}")

R2 Score: 0.95
