# *Lasso, Ridge, Elastic Net Python Implementation*

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): with no category/module filter this suppresses every warning
# raised after this cell (deprecations, convergence warnings, ...). Consider
# narrowing it to the specific warning that is noisy.
warnings.filterwarnings("ignore")

In [18]:
# Load seaborn's "mpg" example dataset into a DataFrame.
df = sns.load_dataset(name="mpg")

In [19]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [20]:
# Drop the car-name column; it is not used as a feature in this notebook.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# the cell idempotent: re-running it no longer raises KeyError once the column
# has already been removed.
df = df.drop(columns="name", errors="ignore")

In [21]:
# Confirm that the `name` column is gone.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa


In [22]:
# Print column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [23]:
# Per-column dtypes; `origin` is still a non-numeric (object) column here.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

In [24]:
# (rows, columns) of the working frame.
df.shape

(398, 8)

In [25]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

Since we have not treated outliers, it is safer to replace the missing values with the median, which (unlike the mean) is robust to extreme values.

In [26]:
# Inspect the only column that contains missing values.
df.loc[:, 'horsepower']

0      130.0
1      165.0
2      150.0
3      150.0
4      140.0
       ...  
393     86.0
394     52.0
395     84.0
396     79.0
397     82.0
Name: horsepower, Length: 398, dtype: float64

In [27]:
# Median of the observed horsepower values (NaN entries are skipped).
df['horsepower'].median(skipna=True)

np.float64(93.5)

In [28]:
# Fill the missing horsepower entries with the column median.
# NOTE(review): the median is computed on the full frame, i.e. before the
# train/test split further down, so test rows influence the imputed value —
# confirm whether that leakage is acceptable for this demo.
horsepower_median = df['horsepower'].median()
df['horsepower'] = df['horsepower'].fillna(horsepower_median)

In [29]:
# Re-check non-null counts after imputing `horsepower`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [30]:
# Frequency of each category in `origin` before encoding.
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

Data encoding 

In [31]:
# Encode the categorical `origin` column as integer codes.
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time (when the column already holds 1/2/3) used to
# wipe the whole column. The dtype guard makes the cell safe to re-run.
# NOTE(review): `origin` is nominal, yet these codes impose an order
# (usa < japan < europe) on the linear models fitted below; one-hot encoding
# (pd.get_dummies) is probably a better fit, but it changes the feature count —
# confirm before switching.
origin_codes = {'usa': 1, 'japan': 2, 'europe': 3}
if not pd.api.types.is_numeric_dtype(df['origin']):
    encoded_origin = df['origin'].map(origin_codes)
    # Fail loudly on an unexpected category instead of passing NaN to the models.
    if encoded_origin.isna().any():
        raise ValueError("Unmapped value(s) found in the 'origin' column")
    df['origin'] = encoded_origin.astype('int64')

In [32]:
# Frequency of each integer code in `origin` after encoding.
df['origin'].value_counts()

origin
1    249
2     79
3     70
Name: count, dtype: int64

In [33]:
# Verify that every column is numeric now that `origin` is encoded.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int64
dtype: object

Separate data into X and y

In [34]:
# Split the frame into the target (mpg) and the predictor matrix.
target_column = 'mpg'
y = df[target_column]
X = df.drop(columns=target_column)

In [35]:
# Display the predictor matrix (every column except `mpg`).
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1
2,8,318.0,150.0,3436,11.0,70,1
3,8,304.0,150.0,3433,12.0,70,1
4,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
394,4,97.0,52.0,2130,24.6,82,3
395,4,135.0,84.0,2295,11.6,82,1
396,4,120.0,79.0,2625,18.6,82,1


In [36]:
# Display the target series (`mpg`).
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

Train test split

In [37]:
from sklearn.model_selection import train_test_split    

In [38]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [39]:
# Shapes of the four pieces produced by the split, in the order
# (X_train, X_test, y_train, y_test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((278, 7), (120, 7), (278,), (120,))

## Multiple Linear Regression (MLR) Model

In [40]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline with scikit-learn's default settings.
regression_model = LinearRegression()

In [41]:
# Display the (not yet fitted) estimator and its parameters.
regression_model

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [42]:
# Fit the linear model on the training split only.
regression_model.fit(X=X_train, y=y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [43]:
# Fitted coefficients as a raw array, in the column order of X_train.
regression_model.coef_

array([-0.31761423,  0.02623748, -0.01827076, -0.00748775,  0.05040673,
        0.84709514,  1.51909584])

In [44]:
# Pair each feature name with its fitted linear-regression coefficient.
for feature, weight in zip(X_train.columns, regression_model.coef_):
    print(f"The coefficient for {feature} is : {weight}")

The coefficient for cylinders is : -0.3176142302799369
The coefficient for displacement is : 0.026237482599078946
The coefficient for horsepower is : -0.018270764913124595
The coefficient for weight is : -0.007487750398361897
The coefficient for acceleration is : 0.0504067346197138
The coefficient for model_year is : 0.8470951427061365
The coefficient for origin is : 1.5190958387975024


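The coefficients are relatively small, so a change in any single independent variable (IV) produces only a modest change in the prediction; such a model is sometimes called a **smoother model**. Note that the features are on different scales (e.g. `weight` is in the thousands while `cylinders` ranges from 4 to 8), so the coefficient magnitudes are not directly comparable across features.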

In [45]:
from sklearn.metrics import mean_squared_error, r2_score

In [46]:
# Predict mpg for the held-out test rows with the linear model.
y_pred_linear = regression_model.predict(X=X_test)

In [47]:
# Inspect the linear model's predictions for the test set.
y_pred_linear

array([21.16196121, 27.89684387, 20.45045592, 27.12361164, 24.36117063,
       15.87763934, 29.93157794, 34.02155729, 17.08992155, 10.56782304,
       30.53231377, 16.48854992, 22.4061424 , 27.76978226, 36.0209892 ,
       23.79725872, 10.82747269, 20.27707855,  8.86935273, 32.48801009,
       25.36507567, 32.75235387, 20.95486868, 24.54530695, 25.77582154,
       30.20140405, 32.01102103, 31.96692512, 15.25929349, 30.41225966,
       27.50427715, 10.93370544, 21.42816438, 28.08300976, 25.03368839,
       13.67199264, 26.67769394,  9.04050101, 32.03270673, 23.97429191,
       24.18855895, 24.60440771, 21.16368861, 34.53665774, 26.31981331,
       22.23170907, 21.0865992 , 11.65432984, 27.9398198 , 18.98058597,
       23.69821181, 26.86564242, 17.04794305, 12.03955477, 28.70876897,
       24.26227131, 10.20293895, 13.03594704, 29.96910853, 35.35029687,
       37.01162788, 35.38558158, 18.04991116, 27.90304164, 20.67174751,
       33.83899858, 27.02537633, 26.73184442, 29.93216787, 12.33

In [48]:
# R^2 of the linear model on the test set.
r2_linear = r2_score(y_true=y_test, y_pred=y_pred_linear)

In [49]:
# Report the test-set R^2 of the linear baseline.
linear_r2_message = f"The R square value for Linear Regression is : {r2_linear}  "
print(linear_r2_message)

The R square value for Linear Regression is : 0.8348001123742285  


# Regularised Model

## Ridge Regression Model

In [50]:
from sklearn.linear_model import Ridge

In [51]:
# Ridge regression: least squares plus an L2 penalty whose strength is `alpha`.
# NOTE(review): the features appear to reach the model unscaled; the ridge
# penalty is scale-sensitive, so standardising them first may change these
# results — confirm.
ridge_alpha = 1.0
ridge_regression_model = Ridge(alpha=ridge_alpha)
ridge_regression_model

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [52]:
# Fit the ridge model on the same training split as the linear baseline.
ridge_regression_model.fit(X=X_train, y=y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [53]:
# Pair each feature name with its fitted ridge coefficient.
for feature, weight in zip(X_train.columns, ridge_regression_model.coef_):
    print(f"The coefficient for {feature} is : {weight}")

The coefficient for cylinders is : -0.31159794459952483
The coefficient for displacement is : 0.02599797725360628
The coefficient for horsepower is : -0.01819643128246157
The coefficient for weight is : -0.0074835421258030175
The coefficient for acceleration is : 0.05003261423029815
The coefficient for model_year is : 0.8462126659579847
The coefficient for origin is : 1.5028233365628925


Evaluating the ridge regression model on the test set:

In [55]:
# Score the ridge model on the held-out test set.
y_pred_ridge = ridge_regression_model.predict(X=X_test)
r2_score_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)

In [56]:
# Report the test-set R^2 of the ridge model.
ridge_r2_message = f"The R square value for Ridge Regression is : {r2_score_ridge}  "
print(ridge_r2_message)

The R square value for Ridge Regression is : 0.8348814925912453  
