In [34]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import minmax_scale

In [35]:
cars_data = pd.read_csv("datasets/car_price_data.csv")

cars_data.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [36]:
numeric_data = cars_data.drop(["car_ID", "symboling", "CarName", "aspiration",
                               "carbody", "enginelocation", "enginetype", "fuelsystem",
                               "cylindernumber", "fueltype", "doornumber", "drivewheel"], axis = 1)

numeric_data.head()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950.0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450.0


In [55]:
x = numeric_data.drop(["price"], axis = 1)

y = numeric_data["price"]

In [56]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

x_train.head()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
112,107.9,186.7,68.4,56.7,3252,152,3.7,3.52,21.0,95,4150,28,33
26,93.7,157.3,63.8,50.6,1989,90,2.97,3.23,9.4,68,5500,31,38
30,86.6,144.6,63.9,50.8,1713,92,2.91,3.41,9.6,58,4800,49,54
33,93.7,150.0,64.0,52.6,1940,92,2.91,3.41,9.2,76,6000,30,34
175,102.4,175.6,66.5,53.9,2414,122,3.31,3.54,8.7,92,4200,27,32


### Baseline model with all features

In [57]:
linear_model = LinearRegression(normalize = True).fit(x_train, y_train)

print("Training Score :", linear_model.score(x_train, y_train))

Training Score : 0.8620549982757401


In [58]:
y_pred = linear_model.predict(x_test)

print("Testing Score :", r2_score(y_pred, y_test))

Testing Score : 0.7997811016977021


In [59]:
x.var()

wheelbase               36.261782
carlength              152.208688
carwidth                 4.601900
carheight                5.970800
curbweight          271107.874319
enginesize            1734.113917
boreratio                0.073356
stroke                   0.098343
compressionratio        15.777104
horsepower            1563.741129
peakrpm             227515.303682
citympg                 42.799617
highwaympg              47.423099
dtype: float64

### Scale the data before applying variance thresholding so it is easier to compare variance across features

In [60]:
x_scaled = pd.DataFrame(minmax_scale(x, feature_range = (0, 10)), columns = x.columns)

x_scaled.var()

wheelbase           3.082201
carlength           3.390704
carwidth            3.195764
carheight           4.146389
curbweight          4.079212
enginesize          2.469368
boreratio           3.742669
stroke              2.230002
compressionratio    6.162931
horsepower          2.714828
peakrpm             3.790342
citympg             3.302440
highwaympg          3.284148
dtype: float64

### Choose a cut-off for the variance

In [61]:
select_features = VarianceThreshold(threshold = 4)

x_new = select_features.fit_transform(x_scaled)

select_features.get_support()

array([False, False, False,  True,  True, False, False, False,  True,
       False, False, False, False])

### Display the features which have a variance above the cut-off threshold

In [62]:
x_scaled.columns[select_features.get_support()]

Index(['carheight', 'curbweight', 'compressionratio'], dtype='object')

In [63]:
highvar_data = cars_data[['carheight', 'curbweight', 'compressionratio', 'price']]

In [64]:
x = highvar_data.drop(["price"], axis = 1)

y = highvar_data["price"]

In [65]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

x_train.head()

Unnamed: 0,carheight,curbweight,compressionratio
31,50.8,1819,9.2
109,58.7,3230,8.4
149,54.9,2650,7.7
64,55.5,2425,8.6
138,53.7,2050,9.0


In [66]:
linear_model = LinearRegression(normalize = True).fit(x_train, y_train)

print("Training Score :", linear_model.score(x_train, y_train))

Training Score : 0.7105807413831181


In [67]:
y_pred = linear_model.predict(x_test)

print("Testing Score :", r2_score(y_pred, y_test))

Testing Score : 0.5524139555140472
