In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings('ignore')

In [3]:
# Load seaborn's bundled "mpg" example dataset into a DataFrame.
df = sns.load_dataset(name='mpg')

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [5]:
# Drop the free-text car name; it is not used as a model feature.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is already gone is a no-op
# rather than a KeyError.
df = df.drop(columns="name", errors="ignore")

In [7]:
# Count the missing values in each column.
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [8]:
# Median horsepower, used below to impute the missing entries.
df.horsepower.median()

93.5

In [9]:
# Fill the missing horsepower values with the column median.
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/test split performed later in the notebook.
hp_median = df['horsepower'].median()
df['horsepower'] = df['horsepower'].fillna(hp_median)

In [10]:
# Re-check the per-column missing-value counts after imputation.
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [11]:
# Summarize column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [16]:
# Show the dtype of each column; `origin` is still a non-numeric (object) column here.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

In [17]:
# Frequency of each origin label.
df.origin.value_counts()

usa       249
japan      79
europe     70
Name: origin, dtype: int64

In [18]:
# Encode the origin labels as integer codes.
# The guard makes the cell safe to re-run: once the column is numeric, mapping
# it again would find no matching string keys and turn every value into NaN.
# NOTE(review): the codes 1/2/3 impose an artificial order on a nominal
# category when fed to a linear model; consider one-hot encoding instead.
origin_codes = {'usa': 1, "japan": 2, "europe": 3}
if not pd.api.types.is_numeric_dtype(df['origin']):
    df['origin'] = df['origin'].map(origin_codes)

In [19]:
# Make sure the encoded origin column is stored with an integer dtype.
df = df.astype({'origin': int})

In [20]:
# Confirm that every column, including the encoded `origin`, is now numeric.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int64
dtype: object

In [21]:
# Split the frame into the regression target (mpg) and the feature matrix
# (every remaining column).
target_col = 'mpg'
y = df[target_col]
X = df.drop(columns=[target_col])

In [22]:
# Display the feature matrix (pandas truncates the rendering of long frames).
X

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1
2,8,318.0,150.0,3436,11.0,70,1
3,8,304.0,150.0,3433,12.0,70,1
4,8,302.0,140.0,3449,10.5,70,1
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,1
394,4,97.0,52.0,2130,24.6,82,3
395,4,135.0,84.0,2295,11.6,82,1
396,4,120.0,79.0,2625,18.6,82,1


In [23]:
# Display the target series (mpg).
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [25]:
#train test split
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [28]:
# (rows, columns) of the training and test feature matrices.
tuple(part.shape for part in (X_train, X_test))

((278, 7), (120, 7))

In [29]:
# Ordinary least-squares linear regression: the unregularized baseline that
# the penalized models below are compared against.
from sklearn.linear_model import LinearRegression

# Created with default settings; fitting happens in a later cell.
regression_model = LinearRegression()

In [30]:
# Display the (not yet fitted) estimator.
regression_model

In [31]:
# Fit the baseline model on the training split only.
regression_model.fit(X=X_train, y=y_train)

In [33]:
# Print each feature name next to its fitted linear-regression coefficient.
for col_name, coef in zip(X_train.columns, regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.3176142302799304
The coefficient for displacement is 0.026237482599078935
The coefficient for horsepower is -0.018270764913124574
The coefficient for weight is -0.007487750398361904
The coefficient for acceleration is 0.0504067346197142
The coefficient for model_year is 0.847095142706137
The coefficient for origin is 1.5190958387975046


In [34]:
# The coefficients are relatively small: if one independent variable changes
# slightly, the prediction does not change much.
# This is sometimes called a smoother model.

In [35]:
from sklearn.metrics import r2_score

# Score the baseline linear model on the held-out test split.
y_pred_linear = regression_model.predict(X_test)
r2_linear = r2_score(y_true=y_test, y_pred=y_pred_linear)
print(f"R square of linear regression {r2_linear}")

R square of linear regression 0.8348001123742286


In [36]:
# Ridge regression: least squares with an L2 penalty on the coefficients.
# NOTE(review): the features are on very different scales (e.g. weight in the
# thousands, cylinders in single digits) and are not standardized, so the
# penalty does not act on them evenly.
from sklearn.linear_model import Ridge

ridge_regression_model = Ridge(alpha=0.1).fit(X_train, y_train)

# Print each feature name next to its fitted Ridge coefficient.
for col_name, coef in zip(X_train.columns, ridge_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.31700321010067906
The coefficient for displacement is 0.02621324975798342
The coefficient for horsepower is -0.018263252481449534
The coefficient for weight is -0.00748732605021309
The coefficient for acceleration is 0.050368969474425776
The coefficient for model_year is 0.8470062938903167
The coefficient for origin is 1.5174528285653937


In [37]:
# Score the Ridge model on the held-out test split.
y_pred_ridge = ridge_regression_model.predict(X_test)
r2_ridge = r2_score(y_true=y_test, y_pred=y_pred_ridge)
print(f"R-squared score for Ridge Regression: {r2_ridge}")

R-squared score for Ridge Regression: 0.8348084889168355


In [38]:
# We don't see much variation in the Ridge regression coefficients compared to those of plain linear regression.

In [39]:
from sklearn.linear_model import Lasso

In [40]:
# Lasso regression: least squares with an L1 penalty, which can shrink
# coefficients to exactly zero.
# NOTE(review): the features are not standardized before fitting, so the
# penalty does not act on them evenly.
lasso_regression_model = Lasso(alpha=0.5).fit(X_train, y_train)

# Print each feature name next to its fitted Lasso coefficient.
for col_name, coef in zip(X_train.columns, lasso_regression_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.006208198888300358
The coefficient for horsepower is -0.011058382987169565
The coefficient for weight is -0.0069826731680230885
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.744654952003819
The coefficient for origin is 0.0


In [None]:
# Three feature coefficients (cylinders, acceleration, origin) are exactly 0: Lasso helps with feature selection.

In [41]:
# Score the Lasso model on the held-out test split.
y_pred_lasso = lasso_regression_model.predict(X_test)
r2_lasso = r2_score(y_true=y_test, y_pred=y_pred_lasso)
print(f"R-squared score for Lasso Regression: {r2_lasso}")

R-squared score for Lasso Regression: 0.8277934716635555


In [43]:
from sklearn.linear_model import ElasticNet

# Elastic Net: l1_ratio=0.5 weights the L1 and L2 penalties equally.
elastic_net_model = ElasticNet(
    alpha=1,
    l1_ratio=0.5,
)
elastic_net_model.fit(X=X_train, y=y_train)

In [44]:
# Print each feature name next to its fitted Elastic Net coefficient.
for col_name, coef in zip(X_train.columns, elastic_net_model.coef_):
    print(f"The coefficient for {col_name} is {coef}")

The coefficient for cylinders is -0.0
The coefficient for displacement is 0.005888869953667563
The coefficient for horsepower is -0.012403874933570126
The coefficient for weight is -0.006934550516257631
The coefficient for acceleration is 0.0
The coefficient for model_year is 0.7133150744603874
The coefficient for origin is 0.0


In [45]:
# Score the Elastic Net model on the held-out test split.
y_pred_elastic_net = elastic_net_model.predict(X_test)
r2_elastic_net = r2_score(y_true=y_test, y_pred=y_pred_elastic_net)

print(f"R-squared score for Elastic Net Regression: {r2_elastic_net}")

R-squared score for Elastic Net Regression: 0.8284840073256804


In [46]:
from sklearn.linear_model import LassoCV

# Lasso with the penalty strength selected by 5-fold cross-validation on the
# training split, then scored on the held-out test split.
lassocv = LassoCV(cv=5).fit(X_train, y_train)
y_pred = lassocv.predict(X_test)

score = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score", score)

R2 Score 0.8082805983844751


In [47]:
from sklearn.linear_model import RidgeCV

# Ridge with the penalty strength selected by 5-fold cross-validation on the
# training split, then scored on the held-out test split.
ridgecv = RidgeCV(cv=5).fit(X_train, y_train)
y_pred = ridgecv.predict(X_test)

score = r2_score(y_true=y_test, y_pred=y_pred)
print("R2 Score", score)

R2 Score 0.8354145247502054


In [48]:
# get_params() lists the constructor hyperparameters (including the candidate
# `alphas` grid and `cv`), not anything learned during fitting.
# NOTE(review): the alpha actually selected by cross-validation should be
# available as `ridgecv.alpha_` — confirm against the scikit-learn docs.
ridgecv.get_params()

{'alpha_per_target': False,
 'alphas': (0.1, 1.0, 10.0),
 'cv': 5,
 'fit_intercept': True,
 'gcv_mode': None,
 'scoring': None,
 'store_cv_values': False}