In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

Data source: the Car Price Prediction dataset on Kaggle — https://www.kaggle.com/hellbuoy/car-price-prediction

In [13]:
# Read the car price dataset from the local CSV file into a DataFrame.
cars_data = pd.read_csv(filepath_or_buffer="datasets/car_price_data_null_values.csv")

# Preview the first five records.
cars_data.head(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,,22,17450.0


### Drop certain features so we're working with a more manageable dataset

In [14]:
# Remove the identifier and categorical features that this walkthrough does
# not use, keeping the remaining columns unchanged.
cars_data = cars_data.drop(
    columns=[
        "car_ID", "symboling", "CarName", "fueltype", "aspiration",
        "carbody", "enginelocation", "enginetype", "fuelsystem",
        "cylindernumber",
    ]
)

# Preview the reduced frame.
cars_data.head(n=5)

Unnamed: 0,doornumber,drivewheel,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,two,rwd,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27,13495.0
1,two,rwd,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27,16500.0
2,two,rwd,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19.0,26,16500.0
3,four,fwd,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24.0,30,13950.0
4,four,4wd,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,,22,17450.0


### Label-encode the `doornumber` feature, i.e. represent each category of the variable with a unique integer value

In [15]:
# Integer code assigned to each door-count label.
doornumber_dict = {'two':0, 'four':1}

# Assign the encoded column back to the frame. Calling
# `.replace(..., inplace=True)` on `cars_data['doornumber']` operates on an
# intermediate Series: pandas 2.x warns about it and, with Copy-on-Write
# enabled, the frame itself is left unchanged.
# Values that are not keys of the mapping are left as they are, so re-running
# this cell does not corrupt an already encoded column.
cars_data['doornumber'] = cars_data['doornumber'].replace(doornumber_dict)

cars_data.head()

Unnamed: 0,doornumber,drivewheel,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,0,rwd,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27,13495.0
1,0,rwd,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27,16500.0
2,0,rwd,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19.0,26,16500.0
3,1,fwd,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24.0,30,13950.0
4,1,4wd,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,,22,17450.0


### One-hot encode the `drivewheel` feature, i.e. represent each of its categories as a separate 0/1 indicator column

- The original `drivewheel` column is replaced with 3 columns `drivewheel_4wd`, `drivewheel_fwd`, and `drivewheel_rwd`

In [16]:
# Replace `drivewheel` with one indicator column per category.
# `dtype=int` pins the indicators to 0/1 integers; without it, pandas >= 2.0
# produces boolean (True/False) columns, which would not match the 0/1
# encoding this notebook displays and describes.
cars_data = pd.get_dummies(cars_data, columns=['drivewheel'], dtype=int)

cars_data.head()

Unnamed: 0,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,drivewheel_4wd,drivewheel_fwd,drivewheel_rwd
0,0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27,13495.0,0,0,1
1,0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111.0,5000.0,21.0,27,16500.0,0,0,1
2,0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154.0,5000.0,19.0,26,16500.0,0,0,1
3,1,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102.0,5500.0,24.0,30,13950.0,0,1,0
4,1,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115.0,5500.0,,22,17450.0,1,0,0


In [17]:
# Columns holding label / one-hot codes, where 0 is a legitimate value and
# must not be treated as missing.
encoded_columns = ["doornumber", "drivewheel_4wd", "drivewheel_fwd", "drivewheel_rwd"]
measured_columns = cars_data.columns.difference(encoded_columns)

# Treat 0 in every other column as a missing value and write the result back
# into the frame. The previous form,
# `cars_data.drop(..., inplace=False).replace(0, np.nan, inplace=True)`,
# modified only the temporary copy returned by `drop`, so `cars_data` itself
# was never changed.
# NOTE(review): if the data contains zeros in these columns, the missing-value
# counts below will be higher than the ones recorded from the earlier run.
cars_data[measured_columns] = cars_data[measured_columns].replace(0, np.nan)

# Number of missing values per column.
cars_data.isna().sum()

doornumber            0
wheelbase            10
carlength             0
carwidth              0
carheight             0
curbweight            0
enginesize            0
boreratio           104
stroke               45
compressionratio    133
horsepower            4
peakrpm               3
citympg             180
highwaympg            0
price                 0
drivewheel_4wd        0
drivewheel_fwd        0
drivewheel_rwd        0
dtype: int64

### Calculate what percentage of the records have missing values for each of these fields

In [18]:
# Percentage of records whose `peakrpm` value is missing.
cars_data["peakrpm"].isnull().sum() / cars_data.shape[0] * 100

1.4634146341463417

In [19]:
# Percentage of records whose `wheelbase` value is missing.
cars_data["wheelbase"].isnull().sum() / cars_data.shape[0] * 100

4.878048780487805

In [20]:
# Percentage of records whose `stroke` value is missing.
cars_data["stroke"].isnull().sum() / cars_data.shape[0] * 100

21.951219512195124

In [21]:
# Percentage of records whose `citympg` value is missing.
cars_data["citympg"].isnull().sum() / cars_data.shape[0] * 100

87.8048780487805

In [22]:
# Percentage of records whose `compressionratio` value is missing.
cars_data["compressionratio"].isnull().sum() / cars_data.shape[0] * 100

64.8780487804878

In [23]:
# Percentage of records whose `boreratio` value is missing.
cars_data["boreratio"].isnull().sum() / cars_data.shape[0] * 100

50.73170731707317

In [24]:
# Display the column labels currently present in the frame.
cars_data.columns

Index(['doornumber', 'wheelbase', 'carlength', 'carwidth', 'carheight',
       'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio',
       'horsepower', 'peakrpm', 'citympg', 'highwaympg', 'price',
       'drivewheel_4wd', 'drivewheel_fwd', 'drivewheel_rwd'],
      dtype='object')

### Choose a missing-value threshold of 50%: columns with more than 50% of their values missing will be dropped

In [25]:
# Largest fraction of missing values a column may have and still be kept.
missing_threshold = 0.50

# Fraction of missing values in each column.
missing_fraction = cars_data.isna().mean()

# Keep only the columns at or below the threshold, i.e. drop every column with
# more than 50% of its values missing. Filtering on the exact fraction avoids
# the truncation in `thresh = int(n_rows * .50)`: with an odd number of rows
# that rounds the required non-null count down, so a column slightly over 50%
# missing would still be kept.
cars_data_trimmed = cars_data.loc[:, missing_fraction <= missing_threshold]

cars_data_trimmed.head()

Unnamed: 0,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,stroke,horsepower,peakrpm,highwaympg,price,drivewheel_4wd,drivewheel_fwd,drivewheel_rwd
0,0,88.6,168.8,64.1,48.8,2548,130,2.68,111.0,5000.0,27,13495.0,0,0,1
1,0,88.6,168.8,64.1,48.8,2548,130,2.68,111.0,5000.0,27,16500.0,0,0,1
2,0,94.5,171.2,65.5,52.4,2823,152,3.47,154.0,5000.0,26,16500.0,0,0,1
3,1,99.8,176.6,66.2,54.3,2337,109,3.4,102.0,5500.0,30,13950.0,0,1,0
4,1,99.4,176.6,66.4,54.3,2824,136,3.4,115.0,5500.0,22,17450.0,1,0,0


In [26]:
# Fill every remaining missing value with the mean of its own column.
# NOTE(review): the means are computed over all rows, before the train/test
# split performed further down, so test rows contribute to the imputed
# values — confirm this is acceptable for the analysis.
column_means = cars_data_trimmed.mean()
cars_data_trimmed = cars_data_trimmed.fillna(value=column_means)

# Confirm that no missing values are left in any column.
cars_data_trimmed.isnull().sum()

doornumber        0
wheelbase         0
carlength         0
carwidth          0
carheight         0
curbweight        0
enginesize        0
stroke            0
horsepower        0
peakrpm           0
highwaympg        0
price             0
drivewheel_4wd    0
drivewheel_fwd    0
drivewheel_rwd    0
dtype: int64

In [27]:
# `price` is the regression target; every other column is a feature.
target_column = "price"

y = cars_data_trimmed[target_column]

x = cars_data_trimmed.drop(columns=[target_column])

In [28]:
# Hold out 25% of the records for testing.
# `random_state` fixes the shuffle so that the split, and therefore the
# scores reported below, are the same on every run; without it each run
# produces a different split.
# NOTE(review): seeding changes which rows land in each set, so the outputs
# recorded from the earlier unseeded run will not be reproduced exactly.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

x_train.head()

Unnamed: 0,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,stroke,horsepower,peakrpm,highwaympg,drivewheel_4wd,drivewheel_fwd,drivewheel_rwd
123,1,103.3,174.6,64.6,59.8,2535,122,3.46,88.0,5000.0,30,0,1,0
160,1,95.7,166.3,64.4,53.0,2094,98,3.242,70.0,4800.0,47,0,1,0
27,0,98.924103,157.3,63.8,50.6,2191,98,3.39,102.0,5500.0,30,0,1,0
169,0,98.4,176.2,65.6,52.0,2551,146,3.242,116.0,4800.0,30,0,0,1
101,1,100.4,181.7,66.5,55.1,3095,181,3.27,152.0,5200.0,22,0,1,0


In [29]:
# Fit an ordinary least-squares regression on the training split.
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)

# Report the score on the data the model was fitted on.
train_score = linear_model.score(x_train, y_train)
print("Training Score :", train_score)

Training Score : 0.8706998028758303


In [30]:
# Predict prices for the held-out records.
y_pred = linear_model.predict(x_test)

# Report R^2 between the true and predicted test prices.
test_score = r2_score(y_test, y_pred)
print("Testing Score :", test_score)

Testing Score : 0.7413313497941009
