## Regularization 

<ul>
<li>A model will have a low accuracy if it is overfitting. </li>
<li>Overfitting occurs when model is trying too hard to capture the noise (samples that don't represent true pattern)
    in your training dataset.</li>
<li>When model is more flexible, it is prone to overfitting</li>
<li>Regularization shrinks the coefficients (parameter or slops) towards zero to discourages a more complex or flexible model, so as to avoid the risk of overfitting</li>
    <li>Ridge and Lasso are two options </li>
    <li>Regularization strength is provided by parameter <b>alpha</b></li>
</ul>    

In [2]:
# import pandas library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
cars = pd.read_csv("final_cars.csv")

In [4]:
## create X and Y
y = cars['price']
X = cars.drop(columns=['price'])

In [5]:
X  = pd.get_dummies(X)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [7]:
X_scaled = scaler.fit_transform(X)

## Ridge Regression

In [8]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [9]:
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.3, random_state = 0)

In [24]:
ridge = Ridge(normalize=True, alpha=0.10)
ridge.fit(X_train,y_train)

Ridge(alpha=0.1, normalize=True)

In [25]:
# Display coefficient for each column
for t in zip(X.columns, ridge.coef_):
  print(f"{t[0]:25s} {t[1]:10.2f}")

length                         27.59
width                         494.19
curb-weight                     2.88
engine-size                    39.40
highway-mpg                   -83.82
make_alfa-romero              154.21
make_audi                    1372.21
make_bmw                     7650.93
make_chevrolet               1884.43
make_dodge                   -795.11
make_honda                   -299.11
make_isuzu                  -2910.37
make_jaguar                  5710.25
make_mazda                   -472.71
make_mercedes-benz           6881.49
make_mercury                 -416.77
make_mitsubishi             -1724.46
make_nissan                  -587.28
make_peugot                 -1789.85
make_plymouth               -1257.75
make_porsche                    0.00
make_renault                -2510.34
make_saab                    2274.94
make_subaru                 -1597.33
make_toyota                 -1567.72
make_volkswagen              -293.71
make_volvo                    651.34
f

In [26]:
y_pred = ridge.predict(X_train)
mse = mean_squared_error(y_train,y_pred)
print("MSE  : ", mse)
print("RMSE : ", np.sqrt(mse))

MSE  :  3541967.8639164567
RMSE :  1882.0116535017673


In [27]:
y_pred = ridge.predict(X_test)

In [28]:
mse = mean_squared_error(y_test,y_pred)
print("MSE  : ", mse)
print("RMSE : ", np.sqrt(mse))

MSE  :  22134540.55063879
RMSE :  4704.735970342947


## LassoCV

In [29]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score

In [33]:
lm = LassoCV(cv=5,alphas=(50,45,40,35,25,10))
lm.fit(X_scaled,y)

LassoCV(alphas=(50, 45, 40, 35, 25, 10), cv=5)

In [34]:
lm.alpha_

40

In [35]:
# Display coefficient for each column
for t in zip(X.columns, lm.coef_):
  print(f"{t[0]:25s} {t[1]:10.2f}")

length                         -0.00
width                         905.02
curb-weight                  1955.97
engine-size                  2190.40
highway-mpg                  -370.92
make_alfa-romero                0.00
make_audi                     516.36
make_bmw                     1702.47
make_chevrolet                181.73
make_dodge                    -22.07
make_honda                     48.54
make_isuzu                   -208.47
make_jaguar                   607.24
make_mazda                    164.15
make_mercedes-benz           1497.46
make_mercury                    0.00
make_mitsubishi              -290.29
make_nissan                  -156.25
make_peugot                  -333.60
make_plymouth                -111.93
make_porsche                 1836.32
make_renault                 -197.59
make_saab                     361.65
make_subaru                  -202.34
make_toyota                  -439.62
make_volkswagen                 0.00
make_volvo                    270.47
f

In [36]:
lm.coef_[np.abs(lm.coef_) == 0]

array([-0.,  0.,  0.,  0.,  0., -0.,  0., -0.,  0.,  0.])

In [37]:
# Take a part of data for final testing 
y_test  = y[:50]
X_test = X_scaled[:50]
y_pred = lm.predict(X_test)

In [38]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,y_pred)
print("MSE  : ",mse)
print("RMSE : ", np.sqrt(mse))

MSE  :  4888595.6603671415
RMSE :  2211.016883781565


In [39]:
r2score = r2_score(y_test,y_pred)
print(f"R2 Score: {r2score:0.2f}")

R2 Score: 0.95
