In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
cars_data = pd.read_csv("datasets/car_price_data.csv")

cars_data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [3]:
cars_data.drop(["car_ID", "symboling", "CarName", "fueltype", "aspiration",
                "carbody", "enginelocation", "enginetype", "fuelsystem",
                "cylindernumber"], axis = 1, inplace = True)

In [4]:
cars_data["drivewheel"].unique()

array(['rwd', 'fwd', '4wd'], dtype=object)

In [5]:
cars_data["doornumber"].unique()

array(['two', 'four'], dtype=object)

In [6]:
doornumber_dict = {'two':0, 'four':1}

cars_data['doornumber'].replace(doornumber_dict, inplace=True)

cars_data.head()

Unnamed: 0,doornumber,drivewheel,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,0,rwd,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0
1,0,rwd,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0
2,0,rwd,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0
3,1,fwd,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950.0
4,1,4wd,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450.0


In [7]:
pd.get_dummies(cars_data, columns=['drivewheel'])

Unnamed: 0,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,drivewheel_4wd,drivewheel_fwd,drivewheel_rwd
0,0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0,0,0,1
1,0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0,0,0,1
2,0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0,0,0,1
3,1,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,10.0,102,5500,24,30,13950.0,0,1,0
4,1,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,8.0,115,5500,18,22,17450.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,1,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,9.5,114,5400,23,28,16845.0,0,0,1
201,1,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,8.7,160,5300,19,25,19045.0,0,0,1
202,1,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,8.8,134,5500,18,23,21485.0,0,0,1
203,1,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,23.0,106,4800,26,27,22470.0,0,0,1


In [8]:
cars_data = pd.get_dummies(cars_data, columns=['drivewheel'])

cars_data.head(10)

Unnamed: 0,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,drivewheel_4wd,drivewheel_fwd,drivewheel_rwd
0,0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0,0,0,1
1,0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0,0,0,1
2,0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0,0,0,1
3,1,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950.0,0,1,0
4,1,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450.0,1,0,0
5,0,99.8,177.3,66.3,53.1,2507,136,3.19,3.4,8.5,110,5500,19,25,15250.0,0,1,0
6,1,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25,17710.0,0,1,0
7,1,105.8,192.7,71.4,55.7,2954,136,3.19,3.4,8.5,110,5500,19,25,18920.0,0,1,0
8,1,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140,5500,17,20,23875.0,0,1,0
9,0,99.5,178.2,67.9,52.0,3053,131,3.13,3.4,7.0,160,5500,16,22,17859.167,1,0,0


In [9]:
x = cars_data.drop("price", axis = 1)

y = cars_data["price"]

### Regularized regression technique also used for model selection

The coefficients of less significant features are reduced to be close to zero

In [10]:
lasso = Lasso(alpha = 0.8)

lasso.fit(x, y)

Lasso(alpha=0.8, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [11]:
predictors = x.columns

coef = pd.Series(lasso.coef_, predictors).sort_values()

print(coef)

stroke             -2307.882575
boreratio          -2069.438781
drivewheel_fwd     -1352.333728
citympg             -320.099399
carlength            -67.877556
curbweight            -0.360044
drivewheel_4wd        -0.000000
peakrpm                2.159219
horsepower            27.356996
wheelbase             27.836604
enginesize           123.285281
doornumber           187.753940
highwaympg           213.450582
compressionratio     256.420149
carheight            272.587146
carwidth             771.639576
drivewheel_rwd      1180.145247
dtype: float64


In [12]:
lasso_features = ["stroke", "boreratio", "drivewheel_fwd", "drivewheel_rwd", "carwidth"]

In [13]:
x[lasso_features].head()

Unnamed: 0,stroke,boreratio,drivewheel_fwd,drivewheel_rwd,carwidth
0,2.68,3.47,0,1,64.1
1,2.68,3.47,0,1,64.1
2,3.47,2.68,0,1,65.5
3,3.4,3.19,1,0,66.2
4,3.4,3.19,0,0,66.4


In [14]:
decision_tree = DecisionTreeRegressor(max_depth = 5)

decision_tree.fit(x, y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [15]:
predictors = x.columns

coef = pd.Series(decision_tree.feature_importances_, predictors).sort_values()
print(coef)

doornumber          0.000000
drivewheel_4wd      0.000000
peakrpm             0.000000
compressionratio    0.000000
carheight           0.000000
drivewheel_fwd      0.000000
boreratio           0.000000
drivewheel_rwd      0.000093
carlength           0.000783
wheelbase           0.001089
citympg             0.009623
carwidth            0.013061
horsepower          0.014528
stroke              0.014585
highwaympg          0.015310
curbweight          0.236146
enginesize          0.694781
dtype: float64


In [16]:
decision_tree_features = ["enginesize", "curbweight", "highwaympg"]

x[decision_tree_features].head()

Unnamed: 0,enginesize,curbweight,highwaympg
0,130,2548,27
1,130,2548,27
2,152,2823,26
3,109,2337,30
4,136,2824,22


In [17]:
def build_model(x, y, test_frac):
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_frac)
    
    model = LinearRegression().fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    print("Test_score : ", r2_score(y_test, y_pred))

In [18]:
build_model(x[lasso_features], y, 0.2) 

Test_score :  0.7763423177993528


In [19]:
build_model(x[decision_tree_features], y, 0.2)

Test_score :  0.876176409985714
