### Importing libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
automobile = pd.read_csv('datasets/cars_processed.csv')

automobile.head(10)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Origin,Age
0,18.0,8,307.0,130,3504,12.0,US,49
1,16.0,8,304.0,150,3433,12.0,US,49
2,17.0,8,302.0,140,3449,10.5,US,49
3,14.0,8,454.0,220,4354,9.0,US,49
4,23.551429,8,440.0,215,4312,8.5,US,49
5,14.0,8,455.0,225,4425,8.5,US,49
6,15.0,8,390.0,190,3850,8.5,US,49
7,15.0,8,383.0,170,3563,10.0,US,49
8,14.0,5,340.0,160,3609,8.0,US,49
9,23.551429,8,400.0,150,3761,9.5,US,49


In [3]:
X = automobile.drop(['MPG', 'Origin'], axis=1)

Y = automobile['MPG']

### Regularization Technique
#### Lasso
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html

In [4]:
from sklearn.linear_model import Lasso

In [5]:
lasso = Lasso(alpha=0.8)
lasso.fit(X, Y)

Lasso(alpha=0.8, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [6]:
predictors = X.columns

coef = pd.Series(lasso.coef_, predictors).sort_values()
print(coef)

Age            -0.673784
Weight         -0.006603
Horsepower     -0.004511
Cylinders      -0.000000
Acceleration    0.000000
Displacement    0.001366
dtype: float64


In [7]:
lasso_features = ['Age', 'Weight']

X[lasso_features].head()

Unnamed: 0,Age,Weight
0,49,3504
1,49,3433
2,49,3449
3,49,4354
4,49,4312


#### Decision Tree
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor

In [8]:
from sklearn.tree import DecisionTreeRegressor

In [9]:
decision_tree = DecisionTreeRegressor(max_depth=4)
decision_tree.fit(X,Y)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [10]:
predictors = X.columns

coef = pd.Series(decision_tree.feature_importances_, predictors).sort_values()
print(coef)

Cylinders       0.000000
Acceleration    0.006328
Weight          0.058285
Age             0.112716
Horsepower      0.177081
Displacement    0.645590
dtype: float64


In [11]:
decision_tree_features = ['Displacement', 'Horsepower']

X[decision_tree_features].head()

Unnamed: 0,Displacement,Horsepower
0,307.0,130
1,304.0,150
2,302.0,140
3,454.0,220
4,440.0,215


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [13]:
def build_model(X, Y, test_frac):
    
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
    
    model = LinearRegression().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    print("Test_score : ", r2_score(y_test, y_pred))

In [14]:
build_model(X[lasso_features], Y, 0.2)

Test_score :  0.7566710129840005


In [15]:
build_model(X[decision_tree_features], Y, 0.2)

Test_score :  0.643899658822721
