

# Used Cars Price Prediction 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
import pandas_profiling as pp

# models
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb



import warnings
# Suppress every warning for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation and data-quality warnings raised by
# pandas / scikit-learn / xgboost; consider narrowing it with `category=` or `module=`.
warnings.filterwarnings("ignore")

## Read the dataset

In [2]:
# Load the vehicle listings from the CSV file in the working directory,
# then display the first five rows as a quick sanity check.
df = pd.read_csv('vehicles_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,55483,7315914053,https://sandiego.craigslist.org/esd/ctd/d/el-c...,san diego,https://sandiego.craigslist.org,0,2018.0,ram,promaster 2500,excellent,...,,van,,https://images.craigslist.org/00101_7UYL6JvF6q...,East County Pre-Owned Superstore provides a 10...,,ca,32.7928,-116.9665,2021-05-02T13:30:34-0700
1,162368,7310885048,https://omaha.craigslist.org/ctd/d/omaha-2017-...,omaha / council bluffs,https://omaha.craigslist.org,13995,2017.0,mazda,cx-3,,...,,SUV,white,https://images.craigslist.org/00t0t_2tlVeJAW5Z...,"2017 *Mazda* *CX-3* Sport AWD SUV - $13,995CAL...",,ia,41.207382,-96.023096,2021-04-22T11:20:19-0500
2,234393,7308243856,https://charlotte.craigslist.org/ctd/d/charlot...,charlotte,https://charlotte.craigslist.org,19990,2019.0,mitsubishi,eclipse cross sp,good,...,,hatchback,white,https://images.craigslist.org/00000_3c4A3S9r5d...,Carvana is the safer way to buy a car During t...,,nc,35.19,-80.83,2021-04-17T11:11:13-0400
3,276110,7315817729,https://newyork.craigslist.org/brk/ctd/d/passa...,new york city,https://newyork.craigslist.org,0,2019.0,honda,cr-v,,...,,SUV,orange,https://images.craigslist.org/00A0A_8tKj1EDSRK...,2019 Honda CR-V EX AWD Offered by: NASA AU...,,ny,40.854573,-74.120219,2021-05-02T13:24:50-0400
4,349033,7301620999,https://greenville.craigslist.org/ctd/d/newry-...,greenville / upstate,https://greenville.craigslist.org,42900,2015.0,chevrolet,corvette,excellent,...,,convertible,black,https://images.craigslist.org/00w0w_8pj8aAH74G...,Video link below!!! Stunning 2015 Chevrolet Co...,,sc,34.755562,-82.906419,2021-04-04T09:18:23-0400


In [3]:
# Columns that are not used as model inputs.
drop_columns = [
    'Unnamed: 0', 'id', 'url', 'region', 'region_url',
    'title_status', 'VIN', 'size', 'image_url',
    'lat', 'long', 'county', 'description', 'posting_date',
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
# Re-running this cell without reloading the CSV raises KeyError, because the
# columns are already gone from `df`.
df = df.drop(columns=drop_columns)

In [4]:
# (rows, columns) remaining after the unused columns were dropped.
df.shape

(64032, 13)

In [5]:
# Count of missing values in each remaining column.
df.isna().sum()

price               0
year              158
manufacturer     2569
model             802
condition       26097
cylinders       26511
fuel              424
odometer          669
transmission      353
drive           19471
type            13785
paint_color     19505
state               0
dtype: int64

In [6]:
# Keep only fully populated rows: a row is removed if any of its columns is missing.
# The original row labels are kept (the index is not reset).
df = df.dropna(axis=0, how='any')
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
5,0,2006.0,chrysler,300,like new,8 cylinders,gas,149000.0,automatic,rwd,sedan,white,fl
9,20995,2011.0,chevrolet,silverado 1500,excellent,8 cylinders,gas,92001.0,automatic,4wd,truck,blue,wi
15,50995,2017.0,gmc,yukon denali,like new,8 cylinders,gas,70227.0,automatic,4wd,SUV,grey,ak
22,13500,2014.0,chevrolet,tahoe,good,8 cylinders,gas,96007.0,automatic,rwd,SUV,white,fl
29,34990,2016.0,gmc,canyon crew cab sle pickup,good,6 cylinders,gas,34425.0,other,4wd,pickup,red,ma


In [7]:
# (rows, columns) after removing rows with missing values.
df.shape

(17491, 13)

In [8]:
# Summary statistics of the numeric columns; useful for spotting implausible
# extremes (e.g. zero prices or very large odometer readings) before filtering.
df.describe()

Unnamed: 0,price,year,odometer
count,17491.0,17491.0,17491.0
mean,16325.45,2009.535247,112261.6
std,129442.9,9.596493,215514.9
min,0.0,1918.0,0.0
25%,5600.0,2006.0,55787.0
50%,10950.0,2012.0,102567.0
75%,22500.0,2015.0,148000.0
max,17000000.0,2022.0,10000000.0


In [9]:

# Identify the categorical features: every column whose dtype is not one of
# the numeric dtypes listed below.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']

features = df.columns.values.tolist()

categorical_columns = [col for col in features if df[col].dtype not in numerics]

# Integer-encode each categorical feature.
# The fitted encoders are kept in `label_encoders` (column name -> LabelEncoder)
# so the codes can be mapped back with `inverse_transform` or applied to new
# data with `transform`; previously they were discarded after each iteration.
# LabelEncoder assigns codes in sorted order of the string labels, so the
# integers carry no ordinal meaning (e.g. for `condition`).
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    # Convert to strings once and fit + transform in a single pass.
    labels = list(df[col].astype(str).values)
    df[col] = le.fit_transform(labels)
    label_encoders[col] = le

In [11]:
# Preview the frame after encoding: the categorical columns now hold integer codes.
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
5,0,2006.0,7,161,3,6,2,149000.0,0,2,9,10,9
9,20995,2011.0,6,3351,0,6,2,92001.0,0,0,10,1,48
15,50995,2017.0,13,4147,3,6,2,70227.0,0,0,0,5,0
22,13500,2014.0,6,3653,2,6,2,96007.0,0,2,0,10,9
29,34990,2016.0,13,851,2,5,2,34425.0,2,0,8,8,19


In [12]:
# Check dtypes and non-null counts after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17491 entries, 5 to 64031
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         17491 non-null  int64  
 1   year          17491 non-null  float64
 2   manufacturer  17491 non-null  int64  
 3   model         17491 non-null  int64  
 4   condition     17491 non-null  int64  
 5   cylinders     17491 non-null  int64  
 6   fuel          17491 non-null  int64  
 7   odometer      17491 non-null  float64
 8   transmission  17491 non-null  int64  
 9   drive         17491 non-null  int64  
 10  type          17491 non-null  int64  
 11  paint_color   17491 non-null  int64  
 12  state         17491 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 1.9 MB


### Selecting realistic data

In [13]:
# Keep only listings priced strictly between 1,000 and 40,000
# (both bounds are exclusive), applied as a single combined mask.
df = df[(df['price'] > 1000) & (df['price'] < 40000)]

In [14]:
# Row count and dtypes remaining after the price filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15672 entries, 9 to 64031
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         15672 non-null  int64  
 1   year          15672 non-null  float64
 2   manufacturer  15672 non-null  int64  
 3   model         15672 non-null  int64  
 4   condition     15672 non-null  int64  
 5   cylinders     15672 non-null  int64  
 6   fuel          15672 non-null  int64  
 7   odometer      15672 non-null  float64
 8   transmission  15672 non-null  int64  
 9   drive         15672 non-null  int64  
 10  type          15672 non-null  int64  
 11  paint_color   15672 non-null  int64  
 12  state         15672 non-null  int64  
dtypes: float64(2), int64(11)
memory usage: 1.7 MB


In [16]:
# Summary statistics after filtering; every column is numeric at this point,
# so the encoded categorical columns are included as well.
df.describe()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
count,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0,15672.0
mean,14628.795559,2009.213502,17.586077,2110.26206,1.192637,4.600881,1.95291,113720.7,0.278139,0.829952,6.008486,5.575932,24.028905
std,10522.984302,9.373542,11.11504,1195.354356,1.136227,1.263622,0.49251,174538.1,0.639189,0.782515,4.054207,4.002485,14.836002
min,1100.0,1918.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6200.0,2006.0,9.0,1098.0,0.0,3.0,2.0,61796.0,0.0,0.0,2.0,1.0,9.0
50%,10996.5,2011.0,13.0,2066.0,2.0,5.0,2.0,106424.5,0.0,1.0,8.0,6.0,23.0
75%,20995.5,2015.0,28.0,3245.0,2.0,6.0,2.0,150000.0,0.0,1.0,9.0,9.0,37.0
max,39999.0,2022.0,39.0,4186.0,5.0,7.0,4.0,10000000.0,2.0,2.0,12.0,11.0,50.0


## Correlation

In [15]:
# Pairwise correlation between all columns (pandas default method: Pearson).
# NOTE(review): the categorical columns were label-encoded with arbitrary
# integer codes, so their coefficients depend on the code order and should
# not be read as meaningful linear relationships.
df.corr()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
price,1.0,0.365629,-0.063317,0.044708,0.113452,0.308654,-0.043515,-0.225551,0.48089,-0.062936,0.035673,0.055619,-0.003258
year,0.365629,1.0,0.045193,0.046446,0.018426,-0.17683,0.073875,-0.155168,0.12094,-0.206506,0.016615,0.020187,0.009275
manufacturer,-0.063317,0.045193,1.0,0.013222,-0.02709,-0.25157,0.023902,0.017731,-0.004443,-0.118914,0.006027,-0.018046,-0.016123
model,0.044708,0.046446,0.013222,1.0,-0.003162,0.080125,0.023951,0.003002,-0.010415,-0.163773,-0.106811,0.008621,0.025645
condition,0.113452,0.018426,-0.02709,-0.003162,1.0,0.068665,0.043579,-0.022735,0.222409,0.049836,0.035288,0.005069,0.02069
cylinders,0.308654,-0.17683,-0.25157,0.080125,0.068665,1.0,-0.110999,0.01801,0.153805,0.015704,0.008297,0.03281,0.017029
fuel,-0.043515,0.073875,0.023902,0.023951,0.043579,-0.110999,1.0,-0.06727,0.128162,0.045451,-0.11258,-0.052829,-0.019516
odometer,-0.225551,-0.155168,0.017731,0.003002,-0.022735,0.01801,-0.06727,1.0,-0.13722,-0.016554,-0.001251,-0.002651,-0.008964
transmission,0.48089,0.12094,-0.004443,-0.010415,0.222409,0.153805,0.128162,-0.13722,1.0,0.054125,0.000714,-0.00101,0.000727
drive,-0.062936,-0.206506,-0.118914,-0.163773,0.049836,0.015704,0.045451,-0.016554,0.054125,1.0,0.152097,0.072733,-0.066366


## Train/Test Split

In [17]:
# Separate the feature matrix (all columns except `price`) from the target (`price`).
X = df.drop(columns=['price'])
y = df['price']

In [18]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split reproducible.
train_X, test_x, train_Y, test_y = train_test_split(X, y, test_size=0.25, random_state=10)

## XGBoost regression model

In [19]:
# Create an XGBoost regressor with default hyperparameters.
# NOTE(review): xgboost is already imported in the first cell, so this import is redundant.
import xgboost as xgb
# NOTE(review): this rebinds `xgb` from the xgboost module to the model instance,
# so the module is no longer reachable under that name afterwards. The following
# cells rely on `xgb` being the model; renaming it requires updating them together.
xgb = xgb.XGBRegressor()

In [20]:
# Train the regressor on the training split.
xgb.fit(train_X, train_Y)



XGBRegressor()

In [21]:
# Predict prices for the held-out test rows.
y_pred = xgb.predict(test_x)

In [22]:
# Coefficient of determination (R^2) of the predictions on the held-out test set.
r2_score(y_true=test_y, y_pred=y_pred)

0.8312090494420467