

# Used Cars Price Prediction 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

# metrics and models
from sklearn.metrics import r2_score,mean_squared_error
import xgboost as xgb

import warnings
# NOTE(review): with no category/module arguments this hides every warning for
# the rest of the session (including deprecation notices from the libraries
# imported above). Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

## Read the dataset

In [3]:
# Load the listings file from the working directory and preview the first rows.
df = pd.read_csv('vehicles_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,55483,7315914053,https://sandiego.craigslist.org/esd/ctd/d/el-c...,san diego,https://sandiego.craigslist.org,0,2018.0,ram,promaster 2500,excellent,...,,van,,https://images.craigslist.org/00101_7UYL6JvF6q...,East County Pre-Owned Superstore provides a 10...,,ca,32.7928,-116.9665,2021-05-02T13:30:34-0700
1,162368,7310885048,https://omaha.craigslist.org/ctd/d/omaha-2017-...,omaha / council bluffs,https://omaha.craigslist.org,13995,2017.0,mazda,cx-3,,...,,SUV,white,https://images.craigslist.org/00t0t_2tlVeJAW5Z...,"2017 *Mazda* *CX-3* Sport AWD SUV - $13,995CAL...",,ia,41.207382,-96.023096,2021-04-22T11:20:19-0500
2,234393,7308243856,https://charlotte.craigslist.org/ctd/d/charlot...,charlotte,https://charlotte.craigslist.org,19990,2019.0,mitsubishi,eclipse cross sp,good,...,,hatchback,white,https://images.craigslist.org/00000_3c4A3S9r5d...,Carvana is the safer way to buy a car During t...,,nc,35.19,-80.83,2021-04-17T11:11:13-0400
3,276110,7315817729,https://newyork.craigslist.org/brk/ctd/d/passa...,new york city,https://newyork.craigslist.org,0,2019.0,honda,cr-v,,...,,SUV,orange,https://images.craigslist.org/00A0A_8tKj1EDSRK...,2019 Honda CR-V EX AWD Offered by: NASA AU...,,ny,40.854573,-74.120219,2021-05-02T13:24:50-0400
4,349033,7301620999,https://greenville.craigslist.org/ctd/d/newry-...,greenville / upstate,https://greenville.craigslist.org,42900,2015.0,chevrolet,corvette,excellent,...,,convertible,black,https://images.craigslist.org/00w0w_8pj8aAH74G...,Video link below!!! Stunning 2015 Chevrolet Co...,,sc,34.755562,-82.906419,2021-04-04T09:18:23-0400


In [4]:
# Columns removed up front; they are not carried into the modelling frame.
drop_columns = [
    'Unnamed: 0', 'id', 'url', 'region', 'region_url',
    'title_status', 'VIN', 'size', 'image_url',
    'lat', 'long', 'county', 'description', 'posting_date',
]
# `columns=` already targets the column axis, so no separate `axis` is needed.
df = df.drop(columns=drop_columns)

In [5]:
# (rows, columns) after removing the unused columns.
df.shape

(64032, 13)

In [6]:
# Missing-value count per column, to see which features are sparsely populated.
df.isna().sum()

price               0
year              158
manufacturer     2569
model             802
condition       26097
cylinders       26511
fuel              424
odometer          669
transmission      353
drive           19471
type            13785
paint_color     19505
state               0
dtype: int64

In [7]:
# Keep only rows with no missing value in any column, then preview the result.
df = df.dropna(how='any')
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
5,0,2006.0,chrysler,300,like new,8 cylinders,gas,149000.0,automatic,rwd,sedan,white,fl
9,20995,2011.0,chevrolet,silverado 1500,excellent,8 cylinders,gas,92001.0,automatic,4wd,truck,blue,wi
15,50995,2017.0,gmc,yukon denali,like new,8 cylinders,gas,70227.0,automatic,4wd,SUV,grey,ak
22,13500,2014.0,chevrolet,tahoe,good,8 cylinders,gas,96007.0,automatic,rwd,SUV,white,fl
29,34990,2016.0,gmc,canyon crew cab sle pickup,good,6 cylinders,gas,34425.0,other,4wd,pickup,red,ma


In [8]:
# (rows, columns) after discarding rows with missing values.
df.shape

(17491, 13)

In [9]:
# Summary statistics for the numeric columns (price, year, odometer).
# NOTE(review): the displayed extremes (price from 0 up to 17,000,000, odometer
# up to 10,000,000, year down to 1918) point to outliers / placeholder values;
# only price is range-filtered later in this notebook.
df.describe()

Unnamed: 0,price,year,odometer
count,17491.0,17491.0,17491.0
mean,16325.45,2009.535247,112261.6
std,129442.9,9.596493,215514.9
min,0.0,1918.0,0.0
25%,5600.0,2006.0,55787.0
50%,10950.0,2012.0,102567.0
75%,22500.0,2015.0,148000.0
max,17000000.0,2022.0,10000000.0


## Filter Categorical features

In [10]:
# dtype names treated as numeric; any column with another dtype is categorical.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']

features = df.columns.values.tolist()

# Preserve column order while keeping only the non-numeric columns.
categorical_columns = [col for col in features if df[col].dtype not in numerics]

## Encoding categorical columns using get_dummies. 

In [11]:
# One-hot encode the categorical columns, keeping every level (no reference
# level is dropped, which is the get_dummies default).
df_dummies = pd.get_dummies(df[categorical_columns])

In [12]:
# Preview the one-hot encoded indicator columns.
df_dummies.head()

Unnamed: 0,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_datsun,manufacturer_dodge,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# (rows, indicator columns) produced by the encoding step.
df_dummies.shape

(17491, 4328)

In [14]:
# Attach the indicator columns to the original frame, aligned on the row index.
# NOTE(review): this cell is not idempotent - running it a second time joins
# columns that already exist in `df`, which pandas rejects as overlapping.
df = df.join(df_dummies)

In [15]:
# (rows, columns) after the join: original columns plus the indicator columns.
df.shape

(17491, 4341)

In [16]:
# Preview the joined frame; the raw categorical columns are still present here.
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,2006.0,chrysler,300,like new,8 cylinders,gas,149000.0,automatic,rwd,...,0,0,0,0,0,0,0,0,0,0
9,20995,2011.0,chevrolet,silverado 1500,excellent,8 cylinders,gas,92001.0,automatic,4wd,...,0,0,0,0,0,0,0,1,0,0
15,50995,2017.0,gmc,yukon denali,like new,8 cylinders,gas,70227.0,automatic,4wd,...,0,0,0,0,0,0,0,0,0,0
22,13500,2014.0,chevrolet,tahoe,good,8 cylinders,gas,96007.0,automatic,rwd,...,0,0,0,0,0,0,0,0,0,0
29,34990,2016.0,gmc,canyon crew cab sle pickup,good,6 cylinders,gas,34425.0,other,4wd,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Remove the raw categorical columns now that their indicator columns exist.
df = df.drop(columns=categorical_columns)

In [18]:
# Preview two rows of the fully numeric frame.
df.head(2)

Unnamed: 0,price,year,odometer,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,2006.0,149000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,20995,2011.0,92001.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


### Selecting realistic data

In [19]:
# Keep listings priced strictly between 1,000 and 40,000 (both bounds exclusive).
price = df['price']
df = df[(price > 1000) & (price < 40000)]

In [20]:
# Row count, column dtypes and memory footprint after the price filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15672 entries, 9 to 64031
Columns: 4331 entries, price to state_wy
dtypes: float64(2), int64(1), uint8(4328)
memory usage: 65.2 MB


## Split the dataset into features and label

In [21]:
# Features are every column except the price; the price itself is the target.
X = df.drop(columns=['price'])
y = df['price']

In [22]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
train_X, test_x, train_Y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

# XGB

In [23]:
import xgboost as xgb
xgb = xgb.XGBRegressor()

In [24]:
xgb.fit(train_X, train_Y)



XGBRegressor()

In [25]:
y_pred = xgb.predict(test_x)

In [26]:
# Coefficient of determination (R^2) of the predictions on the held-out split.
r2_score(test_y,y_pred)

0.8344943724269679

# Tune the hyperparameters and check whether the score improves