# Automobile price prediction

The service for the sale of cars with mileage "Autoland" is developing an application to attract new customers. It helps you quickly find out the market value of your car. We have historical data in our disposal: technical characteristics, features and prices of cars. We need to build a model to determine the cost.

In [4]:
import os
import numpy as np
import pandas as pd
import datetime as dt


from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor


from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import warnings
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()


%config InlineBackend.figure_format = 'retina'

warnings.filterwarnings("ignore")

In [5]:
cwd = os.getcwd()

In [6]:
try:
    df = pd.read_csv(cwd+'/autos.csv')
except:
    df = pd.read_csv('/datasets/autos.csv')

# Overview

In [7]:
df.shape

(354369, 16)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354369 entries, 0 to 354368
Data columns (total 16 columns):
DateCrawled          354369 non-null object
Price                354369 non-null int64
VehicleType          316879 non-null object
RegistrationYear     354369 non-null int64
Gearbox              334536 non-null object
Power                354369 non-null int64
Model                334664 non-null object
Kilometer            354369 non-null int64
RegistrationMonth    354369 non-null int64
FuelType             321474 non-null object
Brand                354369 non-null object
NotRepaired          283215 non-null object
DateCreated          354369 non-null object
NumberOfPictures     354369 non-null int64
PostalCode           354369 non-null int64
LastSeen             354369 non-null object
dtypes: int64(7), object(9)
memory usage: 43.3+ MB


Let's delete all useless columns

In [9]:
unused_cols = ['DateCrawled', 'RegistrationMonth', 'DateCreated', 'PostalCode', 'LastSeen', 'NumberOfPictures']
cat_cols = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'NotRepaired']
num_cols = ['RegistrationYear', 'Power', 'Kilometer']

In [10]:
df = df.drop(unused_cols, axis=1)

## Check data relevance

In [11]:
df.Price.describe()

count    354369.000000
mean       4416.656776
std        4514.158514
min           0.000000
25%        1050.000000
50%        2700.000000
75%        6400.000000
max       20000.000000
Name: Price, dtype: float64

In [12]:
df.RegistrationYear.describe()

count    354369.000000
mean       2004.234448
std          90.227958
min        1000.000000
25%        1999.000000
50%        2003.000000
75%        2008.000000
max        9999.000000
Name: RegistrationYear, dtype: float64

In [13]:
df = df[df['Price']>0]

In [14]:
df = df[(df['RegistrationYear']< 2021) & (df['RegistrationYear']> 1940)]

Let's check other features

In [15]:
df.Gearbox.unique()

array(['manual', 'auto', nan], dtype=object)

In [16]:
df.VehicleType.unique()

array([nan, 'coupe', 'suv', 'small', 'sedan', 'convertible', 'bus',
       'wagon', 'other'], dtype=object)

In [17]:
df.FuelType.unique()

array(['petrol', 'gasoline', nan, 'lpg', 'other', 'hybrid', 'cng',
       'electric'], dtype=object)

In [18]:
df.Brand.unique()

array(['volkswagen', 'audi', 'jeep', 'skoda', 'bmw', 'peugeot', 'ford',
       'mazda', 'nissan', 'renault', 'mercedes_benz', 'opel', 'seat',
       'citroen', 'honda', 'fiat', 'mini', 'smart', 'hyundai',
       'sonstige_autos', 'alfa_romeo', 'subaru', 'volvo', 'mitsubishi',
       'kia', 'suzuki', 'lancia', 'toyota', 'chevrolet', 'dacia',
       'daihatsu', 'trabant', 'chrysler', 'jaguar', 'daewoo', 'porsche',
       'rover', 'saab', 'land_rover', 'lada'], dtype=object)

In [19]:
df.Model.unique()

array(['golf', nan, 'grand', 'fabia', '3er', '2_reihe', 'c_max',
       '3_reihe', 'passat', 'navara', 'polo', 'twingo', 'a_klasse',
       'scirocco', '5er', 'meriva', 'arosa', 'other', 'c4', 'civic',
       'transporter', 'punto', 'e_klasse', 'clio', 'kadett', 'kangoo',
       'one', 'fortwo', '1er', 'b_klasse', 'signum', 'astra', 'a8',
       'jetta', 'fiesta', 'c_klasse', 'micra', 'vito', 'sprinter', '156',
       'escort', 'forester', 'xc_reihe', 'scenic', 'a4', 'ka', 'a1',
       'insignia', 'combo', 'focus', 'tt', 'corsa', 'a6', 'jazz', 'omega',
       'slk', '7er', '80', '147', '100', 'z_reihe', 'sportage', 'sorento',
       'v40', 'ibiza', 'mustang', 'eos', 'touran', 'getz', 'a3', 'almera',
       'megane', 'lupo', 'r19', 'zafira', 'caddy', 'mondeo', 'cordoba',
       'colt', 'impreza', 'vectra', 'berlingo', 'tiguan', 'i_reihe',
       'espace', 'sharan', '6_reihe', 'panda', 'up', 'seicento', 'ceed',
       '5_reihe', 'yeti', 'octavia', 'mii', 'rx_reihe', '6er', 'modus',
     

Everything is ОК in categorical data

In [20]:
df.Power.describe()

count    343352.000000
mean        111.009399
std         187.874492
min           0.000000
25%          69.000000
50%         105.000000
75%         143.000000
max       20000.000000
Name: Power, dtype: float64

In [21]:
df[df['Power']==0].sample(3)

Unnamed: 0,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,FuelType,Brand,NotRepaired
211914,650,,2000,manual,0,e_klasse,150000,petrol,mercedes_benz,no
162565,2500,small,1997,manual,0,a3,150000,,audi,no
326490,500,,2017,auto,0,,150000,petrol,audi,no


In [22]:
df[df['Power']>500].sort_values('Power', ascending=False).head()

Unnamed: 0,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,FuelType,Brand,NotRepaired
219584,4300,coupe,1999,auto,20000,clk,150000,petrol,mercedes_benz,no
299180,1500,wagon,1997,manual,19312,5er,150000,,bmw,no
114106,9999,sedan,2006,manual,19211,1er,125000,gasoline,bmw,
132485,2100,wagon,2001,manual,19208,5er,150000,,bmw,yes
63986,3250,sedan,2001,auto,17932,omega,150000,petrol,opel,


In [23]:
df[df['Power']>500].sort_values('Power').head()

Unnamed: 0,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,FuelType,Brand,NotRepaired
23188,11999,wagon,2002,auto,504,other,150000,petrol,audi,no
221255,12699,,2017,auto,504,,150000,petrol,audi,yes
124994,12900,wagon,2003,auto,504,other,150000,petrol,audi,no
177451,11999,wagon,2002,auto,505,other,150000,petrol,audi,no
241096,2500,coupe,2005,auto,507,m_reihe,125000,petrol,bmw,yes


In [24]:
df = df[(df['Power']>0) & (df['Power']<500)]

there is no BMW from 2002 with 504 horse power, data is erroneous

In [25]:
def na_describe(df):
    na_df = pd.concat([df.isna().sum(), df.isna().sum()/len(df)*100], axis=1)
    na_df.columns = ["missing count", "missing %"]
    na_df = na_df[na_df["missing count"] > 0]
    na_df = na_df.sort_values(by="missing %", ascending=False)
    if na_df.empty:
        return f'The dataset has no missing values'
    return na_df

In [26]:
na_describe(df)

Unnamed: 0,missing count,missing %
NotRepaired,47002,15.324424
VehicleType,21297,6.943625
FuelType,19715,6.427833
Model,12421,4.049714
Gearbox,6062,1.976441


We've got a lot of missing data in the <code> NotRepaired </code> column

In [27]:
df.groupby('NotRepaired')['Price'].agg(['mean', 'median', 'min', 'max', 'count'])

Unnamed: 0_level_0,mean,median,min,max,count
NotRepaired,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,5494.159758,3900,1,20000,230011
yes,2130.826667,1000,1,20000,29700


We can't really fill this empty values with for example the median value because we risk to distort  the data, so let's just make them<code> 'unknown' </code>

In [28]:
df = df.fillna('unknown')

In [29]:
na_describe(df)

'The dataset has no missing values'

In [30]:
df.duplicated().sum()

39771

In [31]:
df = df.drop_duplicates()

In [32]:
df.duplicated().sum()

0

# Preprocessing for the models

Well test the following models on our data:
- Random Forest
- XGB
- LightGBM
- Catboost

We won't use any linear models because we've got a lot of categorical data and using OHE we'd get +300 new columns and it would be hard ( and long ) to train 

LightGBM and Catboost can deal with categorical data, let's make two datasets - one for XGB and RandomForest and the other one for the Catboost and LightGBM

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266942 entries, 1 to 354368
Data columns (total 10 columns):
Price               266942 non-null int64
VehicleType         266942 non-null object
RegistrationYear    266942 non-null int64
Gearbox             266942 non-null object
Power               266942 non-null int64
Model               266942 non-null object
Kilometer           266942 non-null int64
FuelType            266942 non-null object
Brand               266942 non-null object
NotRepaired         266942 non-null object
dtypes: int64(4), object(6)
memory usage: 22.4+ MB


In [34]:
features = df.drop('Price', axis=1)
target = df['Price']

In [35]:
features[cat_cols] = features[cat_cols].astype('category')

In [36]:
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(features, target, test_size=0.2, random_state=42)

In [37]:
ordinal_encoder = OrdinalEncoder()
encoded_cols = pd.DataFrame(ordinal_encoder.fit_transform(features[cat_cols]), index=features.index)
features_encoded = pd.concat([features.drop(cat_cols, axis=1), encoded_cols], axis=1)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

In [39]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((213553, 9), (53389, 9), (213553,), (53389,))

In [40]:
X_train_cat.shape, X_test_cat.shape, y_train_cat.shape, y_test_cat.shape

((213553, 9), (53389, 9), (213553,), (53389,))

In [41]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [42]:
scaler_cat = StandardScaler()
X_train_cat[num_cols] = scaler.fit_transform(X_train_cat[num_cols])
X_test_cat[num_cols] = scaler.transform(X_test_cat[num_cols])

In [43]:
X_test_cat

Unnamed: 0,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,FuelType,Brand,NotRepaired
270075,coupe,0.368666,manual,0.784954,4_reihe,0.604856,petrol,peugeot,no
119375,sedan,-1.186810,manual,-0.390652,golf,0.604856,unknown,volkswagen,unknown
137957,small,-0.338368,manual,-1.137069,2_reihe,-0.064851,petrol,peugeot,no
53607,coupe,-1.186810,manual,1.344767,3er,-0.064851,petrol,bmw,no
37741,small,-1.045403,manual,-1.416976,polo,0.604856,petrol,volkswagen,no
...,...,...,...,...,...,...,...,...,...
268428,bus,-0.196962,manual,0.542369,sharan,0.604856,petrol,volkswagen,no
172326,unknown,1.924141,auto,0.542369,zafira,0.604856,unknown,opel,no
225911,unknown,1.782734,manual,-0.857163,golf,0.604856,unknown,volkswagen,yes
123542,small,-1.186810,manual,-1.416976,corsa,0.604856,petrol,opel,unknown


# Testing Models

In [44]:
def rmse(target, pred):
    return mean_squared_error(target, pred) ** 0.5

rmse_ = make_scorer(rmse, greater_is_better=False)

In [45]:
def eval_model(model, params, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    grid = GridSearchCV(model, param_grid=params, cv=skf, scoring=rmse_)
    if model in [lgbm, cat]:
          X_train = X_train_cat
          y_train = y_train_cat
          X_test = X_test_cat
          y_test = y_test_cat
    grid.fit(X_train, y_train)
    begin_time = dt.datetime.now()
    grid.best_estimator_.fit(X_train, y_train)
    learn_time = dt.datetime.now() - begin_time
    begin_predict = dt.datetime.now()
    pred = grid.predict(X_test)
    predict_time = dt.datetime.now() - begin_predict
    score = rmse(y_test, pred)
    
    return score, learn_time, predict_time

In [46]:
xgb_params = {
    'min_child_weight': [1, 3, 5],
    'max_depth': [3, 6, 10],
    
}

rf_params = {
    'max_depth': [5, 10, 25],
    'n_estimators': [10, 50, 200]
    
}

categorical_features_indices = np.where(X_train_cat.dtypes == 'category')[0]

cat_boost_params = {
    'silent':[True], 
    'cat_features': [categorical_features_indices],
    'depth': [3, 6, 10],
    'iterations': [500, 1000, 1500]
}

lightgbm_params = {
    'max_depth': [15, 30, 50],
    'num_leaves': [10, 150, 300]
}

In [47]:
xgb = XGBRegressor()
rf = RandomForestRegressor()
cat = CatBoostRegressor(silent=True)
lgbm = LGBMRegressor()

In [48]:
%%script echo (1669.5060534729346, datetime.timedelta(seconds=98, microseconds=673960), datetime.timedelta(seconds=4, microseconds=86420))
eval_model(rf, rf_params, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

(1669.5060534729346, datetime.timedelta(seconds=98, microseconds=673960), datetime.timedelta(seconds=4, microseconds=86420))


In [49]:
%%script echo (1600.755878216347, datetime.timedelta(seconds=8, microseconds=965798), datetime.timedelta(microseconds=43552))
eval_model(xgb, xgb_params)

(1600.755878216347, datetime.timedelta(seconds=8, microseconds=965798), datetime.timedelta(microseconds=43552))


In [50]:
%%script echo (1569.407988738034, datetime.timedelta(seconds=1, microseconds=844018), datetime.timedelta(microseconds=196565))
eval_model(lgbm, lightgbm_params)

(1569.407988738034, datetime.timedelta(seconds=1, microseconds=844018), datetime.timedelta(microseconds=196565))


In [51]:
%%script echo (1584.3221276760344, datetime.timedelta(seconds=220, microseconds=258046), datetime.timedelta(microseconds=423979))
eval_model(cat, cat_boost_params)

(1584.3221276760344, datetime.timedelta(seconds=220, microseconds=258046), datetime.timedelta(microseconds=423979))


In [52]:
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
pred = dummy.predict(X_test)
rmse(pred, y_test)

4610.932408327369

In [53]:
models = pd.Series(['Catboost', 'XGB', 'LightGBM', 'RandomForest'])

In [54]:
results = pd.DataFrame({'rmse' : [1584, 1600, 1569, 1669],
              'train_time': [220.26, 8.96, 1.84, 98],
              'predict_time': [0.42, 0.96, 0.19, 0.67]
               }, index=models)


In [55]:
results.sort_values('rmse')

Unnamed: 0,rmse,train_time,predict_time
LightGBM,1569,1.84,0.19
Catboost,1584,220.26,0.42
XGB,1600,8.96,0.96
RandomForest,1669,98.0,0.67


# Conclusion

Our winner is - <b> LightGBM </b> with a <code> rmse = 1569 </code> it only took <code> 1.84 </code> to train