EDA shows that some features are highly collinear. Highly collinear features can affect the stability of linear models.
We will use Lasso Regression because it automatically eliminates unnecessary features, and Random Forest because it is robust to feature collinearity.

Other options we could explore to address feature collinearity are:
- PCA
- Feature selection (eliminating features that don't improve evaluation scores)

We will use the Mean Squared Error metric for model evaluation.

In [45]:
# NOTE(review): `os` is not referenced in the cells visible here — confirm before removing.
import os
# for nice dark theme in Jupyter Notebooks
from jupyterthemes import jtplot
jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)
import warnings
# 'once': report each distinct warning only the first time it is raised.
warnings.filterwarnings('once')

# scikit-learn estimators and model-selection utilities.
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.dummy import DummyRegressor

import pandas as pd
import numpy as np

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from src.features.build_features import make_final_train_set

In [3]:
# Build the training data with the project helper, which yields three values.
# Presumably: the feature column names, the preprocessing pipeline, and the
# prepared training DataFrame — TODO confirm against src.features.build_features.
column_names, full_pipeline, df_train = make_final_train_set()



In [4]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders,year,x0_ASIA,x0_USA,mpg
0,1.090196,1.266232,0.552826,-1.319334,1.527188,-1.696667,0.0,1.0,16.0
1,-0.922996,-0.407925,-0.999667,-0.413182,-0.850515,-1.696667,1.0,0.0,27.0
2,-0.98135,-0.947975,-1.124772,0.927922,-0.850515,1.638975,1.0,0.0,37.0
3,-0.98135,-1.163996,-1.392854,0.275493,-0.850515,0.527094,1.0,0.0,36.1
4,-0.747936,-0.218907,-0.327675,-0.231952,-0.850515,-0.306816,0.0,0.0,23.0


In [6]:
# (n_rows, n_columns) of the training frame, target column included.
df_train.shape

(318, 9)

In [11]:
# Separate the regression target (`mpg`) from the predictor columns.
y_train = df_train['mpg']
X_train = df_train.drop(columns='mpg')

## Baseline Regression model 

In [46]:
# Baseline: always predict the mean target of the fitting split, scored on a
# 20% hold-out. With no informative model, the score is expected to sit near 0.
(X_train_dummy, X_val_dummy,
 y_train_dummy, y_val_dummy) = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

dummy_mean = DummyRegressor(strategy='mean').fit(X_train_dummy, y_train_dummy)
dummy_mean.score(X_val_dummy, y_val_dummy)

-0.004781699033904019

## Algorithm selection 

In [91]:
# Nested cross-validation: each of the 10 outer folds fits a LassoCV, which in
# turn picks its own alpha from the candidate grid using an inner 10-fold CV.
lasso_cv = LassoCV(alphas=[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1], cv=10, random_state=42)
cv_results = cross_validate(lasso_cv, X_train, y_train,
                            cv=10,
                            return_estimator=True,
                            return_train_score=True,
                           )
# No `scoring` argument is given, so the scores come from the estimator's
# default scorer (R^2 for scikit-learn regressors).
# The summary is printed inline so this cell does not rely on a helper that is
# only defined further down the notebook (which fails on Restart & Run All).
for split in ('train_score', 'test_score'):
    print('%s: %.3f +/- %.3f' % (split,
                                 cv_results[split].mean(),
                                 cv_results[split].std()))
# Alpha selected by the inner CV on each outer fold.
alphas = [model.alpha_ for model in cv_results['estimator']]
print('alphas:', alphas)


train_score: 0.817 +/- 0.003
test_score: 0.800 +/- 0.032
alphas: [0.07, 0.04, 0.08, 0.03, 0.02, 0.08, 0.08, 0.07, 0.02, 0.03]


In [78]:
def accuracy_cv(cv_result):
    """Print the mean +/- standard deviation of train and test CV scores.

    Parameters
    ----------
    cv_result : mapping
        Must provide 'train_score' and 'test_score' entries whose values
        support ``.mean()`` and ``.std()`` (for example the dict returned by
        ``cross_validate(..., return_train_score=True)``).

    Returns
    -------
    None
        The two summary lines are written to stdout.
    """
    # Read from the argument rather than a notebook-level `cv_results`
    # variable, so the helper reports on whatever result it is handed.
    for split in ('train_score', 'test_score'):
        scores = cv_result[split]
        print('%s: %.3f +/- %.3f' % (split, scores.mean(), scores.std()))


# Cells in this notebook call the helper as `errors_cv`; keep that name usable.
errors_cv = accuracy_cv




In [85]:
# Summarise the cross-validation scores with the helper defined in this
# notebook (`errors_cv` is not defined anywhere, so it fails on a fresh kernel).
accuracy_cv(cv_results)

train_score: 0.817 +/- 0.003
test_score: 0.802 +/- 0.032


In [88]:
# Gather, per outer fold, the alpha chosen by LassoCV and the fitted coefficients.
alphas, params = [], []
for model in cv_results['estimator']:
    alphas.append(model.alpha_)
    params.append(model.coef_)

print('alphas:', alphas)
print('params:', params)

alphas: [0.1, 0.03, 0.1, 0.03, 0.03, 0.1, 0.03, 0.1, 0.03, 0.03]
params: [array([-0.        , -0.2538364 , -4.82833848,  0.01721716, -0.        ,
        2.75550805,  0.        , -2.03585072]), array([ 1.03942535, -0.2920954 , -5.49614889,  0.18045098,  0.        ,
        2.92136871, -0.        , -2.39527932]), array([-0.        , -0.18163169, -4.92435909,  0.        ,  0.        ,
        2.75983791,  0.        , -2.02867338]), array([ 0.94526203, -0.34546441, -5.52524736,  0.06502501, -0.        ,
        2.90383542,  0.        , -2.13042341]), array([ 1.14072787, -0.56783512, -5.41696756,  0.16337792, -0.06621791,
        2.85464956, -0.        , -2.28945258]), array([ 0.        , -0.11687899, -5.08843341,  0.        , -0.        ,
        2.83376711,  0.        , -1.32147412]), array([ 0.80263118, -0.42985098, -5.31825708,  0.01050869, -0.        ,
        2.87538186,  0.        , -2.23001858]), array([-0.        , -0.31323382, -4.87548118,  0.        , -0.        ,
        2.8094

In [82]:
# Display the current contents of `alphas`.
alphas

[0.1, 0.03, 0.1, 0.03, 0.01, 0.1, 0.03, 0.1, 0.03, 0.03]

In [54]:
# Dump the raw cross-validation result dict (timings, fitted estimators, scores).
# NOTE(review): the saved output of this cell shows 5 folds and LassoCV(cv=3)
# with a different alpha grid than the current LassoCV cell — it looks stale; re-run.
cv_results

{'fit_time': array([0.01011467, 0.00977993, 0.00763392, 0.007797  , 0.00777602]),
 'score_time': array([0.00145102, 0.00203395, 0.0010922 , 0.00102305, 0.00100875]),
 'estimator': (LassoCV(alphas=[0.01, 0.03, 0.1, 0.3, 1, 3, 10], copy_X=True, cv=3, eps=0.001,
      fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=None,
      normalize=False, positive=False, precompute='auto', random_state=42,
      selection='cyclic', tol=0.0001, verbose=False),
  LassoCV(alphas=[0.01, 0.03, 0.1, 0.3, 1, 3, 10], copy_X=True, cv=3, eps=0.001,
      fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=None,
      normalize=False, positive=False, precompute='auto', random_state=42,
      selection='cyclic', tol=0.0001, verbose=False),
  LassoCV(alphas=[0.01, 0.03, 0.1, 0.3, 1, 3, 10], copy_X=True, cv=3, eps=0.001,
      fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=None,
      normalize=False, positive=False, precompute='auto', random_state=42,
      selection='cyclic', tol=0.0001, ver

In [92]:
# Show the fitted coefficient vector of every estimator kept by cross_validate,
# one line per fold, to see which features Lasso zeroes out.
for model in cv_results['estimator']:
    fold_coef = model.coef_
    print('coef: ', fold_coef)
    

coef:  [ 0.         -0.22731362 -4.82474644  0.04467388  0.          2.77847757
  0.         -2.17335886]
coef:  [ 0.75294082 -0.22663649 -5.33484503  0.15241441  0.          2.89881262
 -0.         -2.24909384]
coef:  [ 0.         -0.19050158 -4.90414369  0.          0.          2.77342814
  0.         -2.12489468]
coef:  [ 0.94526203 -0.34546441 -5.52524736  0.06502501 -0.          2.90383542
  0.         -2.13042341]
coef:  [ 1.63782291 -0.65898584 -5.55982807  0.18279823 -0.28236264  2.87623369
 -0.05707    -2.48638134]
coef:  [ 0.         -0.12361433 -5.07079715  0.          0.          2.84945122
 -0.         -1.41629889]
coef:  [ 0.         -0.13796389 -4.95558639  0.          0.          2.79049414
  0.         -1.70288533]
coef:  [ 0.         -0.32042961 -4.85197881  0.          0.          2.83125848
  0.         -1.77432546]
coef:  [ 1.60518747 -0.         -6.2041685   0.23798672  0.          3.03861157
  0.         -2.4812899 ]
coef:  [ 1.04419726 -0.47580894 -5.32389006  0

In [None]:
param = 