### Importing dataset and required libraries


In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np

In [2]:
# Load the Auto MPG dataset; the path is relative to the notebook's working directory.
csv_path = "archive/auto-mpg.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows (the default for head) to sanity-check the load.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [4]:
# Per-column count of missing values. It has to be printed explicitly: a
# notebook cell only auto-displays its *last* expression, so a bare
# `df.isna().sum()` placed ahead of `df.info()` is computed and then discarded.
# Note: this only counts real NaN values; the "?" placeholders in `horsepower`
# (handled in the next cell) are strings and are not counted here.
print(df.isna().sum())

# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [5]:
# Unknown horsepower readings are stored as the string "?". Replace them with
# NaN so the column can be cast to float, then drop every row containing a NaN.
horsepower_numeric = df["horsepower"].replace("?", np.nan).astype("float64")
df["horsepower"] = horsepower_numeric
df = df.dropna()

# Confirm the new dtype and the remaining row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB


### L2 Normalization

In [6]:
# Drop the free-text identifier column; it is not numeric and cannot be
# normalized. `errors='ignore'` keeps this cell re-runnable: without it, a
# second execution raises KeyError because 'car name' is already gone.
df = df.drop(columns=['car name'], errors='ignore')

# NOTE(review): preprocessing.normalize defaults to axis=1, so each *row* is
# scaled to unit L2 norm. `mpg` is still a column at this point, so the response
# is rescaled together with (and contributes to the norm of) the predictors.
# Confirm this is intended rather than per-feature scaling of predictors only.
df_scaled = preprocessing.normalize(df, norm='l2')

# normalize() returns a bare ndarray; restore the column labels.
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)

# Column 0 is mpg (the response); every remaining column is a predictor.
features = df_scaled.iloc[:, 1:]
response = df_scaled.iloc[:, 0]

print(features)
print('*****************************************')
print(response)

     cylinders  displacement  horsepower    weight  acceleration  model year  \
0     0.002272      0.087201    0.036926  0.995286      0.003409    0.019883   
1     0.002154      0.094240    0.044427  0.994364      0.003096    0.018848   
2     0.002316      0.092048    0.043419  0.994580      0.003184    0.020262   
3     0.002319      0.088104    0.043472  0.994936      0.003478    0.020287   
4     0.002308      0.087137    0.040395  0.995153      0.003030    0.020197   
..         ...           ...         ...       ...           ...         ...   
387   0.001431      0.050068    0.030756  0.997778      0.005579    0.029325   
388   0.001874      0.045433    0.024356  0.997650      0.011522    0.038407   
389   0.001737      0.058639    0.036487  0.996865      0.005039    0.035618   
390   0.001521      0.045620    0.030033  0.997938      0.007071    0.031174   
391   0.001468      0.043665    0.030088  0.998048      0.007118    0.030088   

       origin  
0    0.000284  
1    0.

### Splitting the data for training and testing

In [7]:
from sklearn.model_selection import train_test_split,GridSearchCV

In [8]:
# Hold out one third of the rows for testing; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(
    features,
    response,
    test_size=1/3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts
print(X_train)

     cylinders  displacement  horsepower    weight  acceleration  model year  \
180   0.001620      0.043334    0.034829  0.997893      0.006277    0.030779   
119   0.001392      0.042105    0.038973  0.997992      0.005394    0.025402   
307   0.001861      0.045591    0.035357  0.997428      0.006839    0.037217   
219   0.002054      0.078296    0.037222  0.996022      0.003209    0.019766   
139   0.001798      0.044061    0.037317  0.997662      0.007418    0.033270   
..         ...           ...         ...       ...           ...         ...   
71    0.002047      0.077800    0.038388  0.996045      0.003199    0.018426   
106   0.002142      0.082814    0.035695  0.995547      0.005354    0.026058   
270   0.001398      0.052770    0.029705  0.997738      0.006151    0.027259   
348   0.001677      0.041098    0.027259  0.998087      0.008681    0.033969   
102   0.001595      0.079748    0.029906  0.996252      0.002791    0.014554   

       origin  
180  0.000810  
119  0.

### Implement Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn.metrics import mean_squared_error
# Ordinary least-squares baseline: fit on the training split, then score on the
# held-out split with mean squared error.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("linear regression mse:{}".format(error))

# One fitted coefficient per predictor, labelled by column name.
coefficients_cols = pd.DataFrame(
    data=model.coef_,
    index=X_train.columns,
    columns=['Coefficients'],
)
print(coefficients_cols)

linear regression mse:1.867365317882146e-06
              Coefficients
cylinders         0.839891
displacement     -0.117330
horsepower       -0.171236
weight           -1.621712
acceleration     -0.278295
model year        0.569352
origin            0.823610


##### The model year coefficient is 0.5693, so this feature contributes a substantial positive linear effect on the response attribute. By absolute magnitude it is the fourth-largest coefficient in the table above, after weight, cylinders, and origin.

### Ridge Regression

#### Grid search CV to select the value of alpha

In [38]:
# Candidate regularization strengths for Ridge, searched by cross-validated
# (negated) mean squared error. The number of CV folds is set to
# `feature_count` (7).
feature_count = 7
alphas = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]}
ridge = Ridge()
ridge_model = GridSearchCV(
    estimator=ridge,
    param_grid=alphas,
    scoring='neg_mean_squared_error',
    cv=feature_count,
)


In [39]:
# Run the grid search; afterwards the search object predicts with the
# best estimator it found.
ridge_model.fit(X_train, y_train)
best_ridge_model = ridge_model.best_estimator_

# Score the selected model on the held-out split.
y_pred = ridge_model.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Ridge regression mse:{}".format(error))
print("Best alpha value: ", ridge_model.best_params_)

Ridge regression mse:1.8668374775885816e-06
Best alpha value:  {'alpha': 0.0001}


In [40]:
# Coefficients of the best Ridge model, one row per predictor.
coefficients_cols_r = pd.DataFrame(
    data=best_ridge_model.coef_,
    index=X_train.columns,
    columns=['Coefficients'],
)
print(coefficients_cols_r)

              Coefficients
cylinders         0.055028
displacement     -0.004863
horsepower       -0.061296
weight           -0.200234
acceleration      0.038674
model year        0.612884
origin            0.186435


### Lasso Regression

In [35]:
# Candidate regularization strengths for Lasso, searched by cross-validated
# (negated) mean squared error. The number of CV folds is set to
# `feature_count` (7).
feature_count = 7
lasso = Lasso()

# One value per decade from 1e-8 to 10. The previous grid listed 1e-7 three
# times (redundant fits) and ended in `110`.
# NOTE(review): `110` was presumably a typo for `1, 10` (cf. the Ridge grid) —
# confirm.
alphas = {'alpha': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]}
lasso_model = GridSearchCV(
    lasso,
    alphas,
    scoring='neg_mean_squared_error',
    cv=feature_count,
)


In [36]:
# Run the grid search; afterwards the search object predicts with the
# best estimator it found.
lasso_model.fit(X_train, y_train)
best_lasso_model = lasso_model.best_estimator_

# Score the selected model on the held-out split.
y_pred = lasso_model.predict(X_test)
error = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Lasso regression mse:{}".format(error))
print("Best alpha value: ", lasso_model.best_params_)

Lasso regression mse:1.8817715660294576e-06
Best alpha value:  {'alpha': 1e-07}


In [37]:
# Coefficients of the best Lasso model, one row per predictor.
coefficients_cols_l = pd.DataFrame(
    data=best_lasso_model.coef_,
    index=X_train.columns,
    columns=['Coefficients'],
)
print(coefficients_cols_l)

              Coefficients
cylinders         0.000000
displacement      0.009309
horsepower       -0.049586
weight           -0.000000
acceleration      0.000000
model year        0.642626
origin            0.000000
