In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

## Understanding Data

In [2]:
# Load the mtcars data from the local CSV and preview the first five rows.
motors = pd.read_csv('dataset/mtcars.csv')
motors.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


1. disp : amount of air that is displaced by the cylinders when they are at the bottom of their stroke
2. hp : gross horsepower — the engine's power output, which is proportional to torque × rotational speed (RPM)
3. drat : rear axle ratio
4. vs : engine layout (0 = V-shaped, 1 = straight / inline)
5. am : transmission type (0 = automatic, 1 = manual)
6. mpg (target) : miles per gallon

In [6]:
# The dataset is tiny, so any model built on it is prone to error.
# Target: fuel economy. Features: every column except the car name and the target.
y = motors['mpg']
x = motors.drop(columns=['model', 'mpg'])
x.shape, y.shape

((32, 10), (32,))

In [9]:
# Deterministic split instead of random sampling: the first 6 rows are held
# out for testing and every remaining row is used for training.
# NOTE(review): the original comment ended at "so as to avoid" — presumably
# run-to-run variation in the split; confirm the intended reason.
n_test = 6

x_test, y_test = x.iloc[:n_test], y.iloc[:n_test]
x_train, y_train = x.iloc[n_test:], y.iloc[n_test:]

## Build Model

In [11]:
# Unpenalized linear regression baseline: fit on the training rows, then
# predict mpg for the held-out rows.
reg = LinearRegression().fit(x_train, y_train)
predictions = reg.predict(x_test)

In [13]:
# Test-set RMSE of the linear regression baseline.
baseline_mse = mean_squared_error(y_test, predictions)
np.sqrt(baseline_mse)

3.010512533406115

In [21]:
# Fitted coefficients and intercept of the linear regression model.
reg.coef_, reg.intercept_

(array([-0.08477517,  0.00763515, -0.0255906 , -0.44631173, -2.95798848,
         0.67960416,  1.02384134,  4.58818205,  0.10867342, -0.21611603]),
 19.902325524595028)

### Can we do better ?
## Lasso

In [15]:
# Lasso with a fixed, hand-picked penalty strength; the cell displays its
# test-set RMSE.
lasso = Lasso(alpha=10).fit(x_train, y_train)
lasso_predictions = lasso.predict(x_test)
np.sqrt(mean_squared_error(y_test, lasso_predictions))

2.6977452393170736

In [22]:
# Coefficients of the Lasso fit; exact zeros mark features the L1 penalty dropped.
lasso.coef_

array([-0.        , -0.0334088 , -0.02224576,  0.        , -0.        ,
       -0.        ,  0.        ,  0.        ,  0.        , -0.        ])

## Ridge

In [19]:
# Ridge with a fixed, hand-picked penalty strength; the cell displays its
# test-set RMSE.
ridge = Ridge(alpha=10).fit(x_train, y_train)
ridge_predictions = ridge.predict(x_test)
np.sqrt(mean_squared_error(y_test, ridge_predictions))

2.4853642157334113

In [23]:
# Coefficients of the Ridge fit, for comparison with the Lasso and unpenalized models.
ridge.coef_

array([-0.54178614, -0.016409  , -0.01668308,  0.29684408, -0.786796  ,
       -0.17103069,  0.10146613,  0.8470123 ,  0.48936665, -0.6704062 ])

## Choosing the Value of alpha

In [29]:
# 5-fold cross-validated search over Lasso penalty strengths, run on the
# training rows only; the cell displays the best alpha found.
alpha_grid = [1e-3, 0.01, 0.1, 1, 10, 100, 1000, 10000]
params = {'alpha': alpha_grid}
lasso = Lasso()
reg1 = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
)
reg1.fit(x_train, y_train)
reg1.best_params_

{'alpha': 0.1}

In [26]:
# 5-fold cross-validated search over Ridge penalty strengths, run on the
# training rows only.
# NOTE: this rebinds `reg`, which earlier held the plain LinearRegression model.
params = dict(alpha=[1e-3, 0.01, 0.1, 1, 10, 100, 1000, 10000])
ridge = Ridge()
reg = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
)
reg.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
             scoring='neg_mean_squared_error')

In [27]:
# Alpha with the best cross-validated score from the Ridge grid search.
reg.best_params_

{'alpha': 10}

In [30]:
# Refit Lasso with the alpha chosen by the cross-validated grid search (`reg1`)
# instead of a hard-coded copy of its output, so this cell stays in sync if the
# grid or the data changes; the cell displays the resulting test-set RMSE.
best_alpha = reg1.best_params_['alpha']
lasso = Lasso(alpha=best_alpha)
lasso.fit(x_train, y_train)
np.sqrt(mean_squared_error(y_test, lasso.predict(x_test)))

2.8597848620975026

Because this alpha was chosen by cross-validation rather than picked by hand, the resulting test error is probably closer to the model's true performance than the earlier estimate.