In [69]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, train_test_split
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [109]:
cars=pd.read_csv('mtcars.csv')

In [110]:
cars.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [111]:
cars=cars.iloc[:,1:] # removing the name columns

In [112]:
cars.head() # we have our dataset ready for prediction with mpg as target variable

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [113]:
X=list(cars.iloc[:,2:].columns)

In [114]:
Y=['mpg']

### Function to iterate over 100 samples with 80:20 train test split

In [118]:
def test(models, data, iterations = 100):
    index=['train','test']
    results = {}
    for i in models:
        r2_train = []
        r2_test = []
        for j in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(data[X], 
                                                                data[Y], 
                                                                test_size= 0.2)
            r2_test.append(metrics.r2_score(y_test,
                                            models[i].fit(X_train, 
                                                         y_train).predict(X_test)))
            r2_train.append(metrics.r2_score(y_train, 
                                             models[i].fit(X_train, 
                                                          y_train).predict(X_train)))
        results[i] = [np.mean(r2_train), np.mean(r2_test)]
    return pd.DataFrame(results,index=index)

In [119]:
models_Linear_reg = {'OLS': linear_model.LinearRegression(),
         'Lasso': linear_model.Lasso(),
         'Ridge': linear_model.Ridge(),}

## Mean R-sq value from 100 sampling iterations

In [121]:
test(models_Linear_reg, cars)

Unnamed: 0,OLS,Lasso,Ridge
train,0.881507,0.757262,0.878032
test,0.425216,0.353671,0.568288


### Woahh! Ridge performed far better than OLS on test data while maintaining decent score on train 

### However, Lasso can also outperform other two! 

### lets use Grid Search CV to find the BEST ESTIMATOR

In [90]:
lasso_params = {'alpha':[0.005, 0.02, 0.03, 0.05, 0.06]}
ridge_params = {'alpha':[550, 580, 600, 620, 650]}

models2 = {'OLS': linear_model.LinearRegression(),
           'Lasso': GridSearchCV(linear_model.Lasso(), 
                               param_grid=lasso_params).fit(cars[X], cars[Y]).best_estimator_,
           'Ridge': GridSearchCV(linear_model.Ridge(), 
                               param_grid=ridge_params).fit(cars[X], cars[Y]).best_estimator_,}

### lets check the best estimator for Lasso

In [94]:
GridSearchCV(linear_model.Lasso(),param_grid=lasso_params).fit(cars[X], cars[Y]).best_estimator_

Lasso(alpha=0.05)

### lets check the best estimator for Ridge

In [95]:
GridSearchCV(linear_model.Ridge(),param_grid=ridge_params).fit(cars[X], cars[Y]).best_estimator_

Ridge(alpha=550)

### Now , lets compare the model with above BEST parameters

In [93]:
test(models2, cars)

Unnamed: 0,OLS,Lasso,Ridge
train,0.888409,0.876164,0.761059
test,0.364675,0.59488,0.451484


### Looks like we got Lasso as the winner!

### What if we want to know the best features for our model ?


In [133]:
lasso=linear_model.Lasso(alpha=0.05,max_iter=100000).fit(cars[X], cars[Y])

In [134]:
lasso.coef_

array([ 0.0061085 , -0.02134248,  0.68297162, -3.22844631,  0.67984934,
        0.        ,  2.07189838,  0.73535894, -0.32026222])

In [135]:
final_features = []
for i in range(len(list(X))):
    if lasso.coef_[i]:
        final_features.append(list(X)[i])

In [136]:
final_features

['disp', 'hp', 'drat', 'wt', 'qsec', 'am', 'gear', 'carb']

In [128]:
# original features passed to the model
X

['disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']

### looks like the feature vs , i.e. type of engine (V-shaped or straight) is not contributing enough to the model

                            ----------- ### Thank You ### --------------