# Cleansing, EDA and Random Forest model including explanations for Nigerian used car prices

**Table of Contents:**
* [Data Cleansing](#1)

* [Numerical Features](#2)

* [Categorical Features](#3)

* [Price vs Features](#4)

* [Other Evaluations](#5)

* [Predictive Model for Price](#6)

* [Local Explanations for Predictions](#7)

* [Can we do better?](#8)

In [None]:
# packages 
import numpy as np
import pandas as pd
import time

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

from sklearn.metrics import mean_absolute_error

import h2o
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# import data
df = pd.read_csv('../input/nigerian-used-car-marketplace/car_scrape.csv')
df.head()

In [None]:
# dimensions of table
df.shape

In [None]:
# summary stats
df.describe(include='all')

<a id='1'></a>
# Data cleansing

### A few years are obviously wrong:

In [None]:
# show rows having unreasonable years
df[(df.year<1980) | (df.year>2020)]

In [None]:
# these are only 5 rows, so let's remove them
# (bounds are inclusive; .copy() makes the result an independent frame so the
# column assignments in later cells do not write to a slice of the raw data)
df = df[df.year.between(1980, 2020)].copy()

### Levels for paint are not clean:

In [None]:
# convert to upper case in a first step
# (the vectorised .str accessor passes missing values through as NaN, whereas
# mapping str.upper over the column raises TypeError on any non-string cell)
df.paint = df.paint.str.upper()
levels = df.paint.value_counts()
levels.index

In [None]:
# now correct misspellings, redundant spaces, etc. in a single pass;
# each key is compared against the complete cell value, and no corrected
# value is itself a key, so one combined mapping is sufficient
paint_fixes = {
    # stray leading / trailing whitespace
    'BLUE ': 'BLUE',
    'GREY ': 'GREY',
    'GOLD ': 'GOLD',
    'BLACK ': 'BLACK',
    ' BLACK': 'BLACK',
    'DARK BLUE ': 'DARK BLUE',
    'CREAM ': 'CREAM',
    'SILVER ': 'SILVER',
    'GREEN ': 'GREEN',
    'WHITE ': 'WHITE',
    '  BROWN': 'BROWN',
    'DARK SILVER ': 'DARK SILVER',
    'LIGHT SILVER ': 'LIGHT SILVER',
    ' BLACK/RED': 'BLACK/RED',
    # misspellings and stray characters
    'SLIVER': 'SILVER',
    'SLIVER ': 'SILVER',
    'GERY': 'GREY',
    'BLACK.': 'BLACK',
    'BLAC': 'BLACK',
    'REDL': 'RED',
    'SKYE BLUE': 'SKY BLUE',
    'OFF WHITE L': 'OFF WHITE',
    'WHINE ': 'WINE',
    'WHITE ORCHILD PEARL': 'WHITE ORCHID PEARL',
    # unify GRAY -> GREY
    'GRAY': 'GREY',
    'GRAY ': 'GREY',
    'DARK GRAY': 'DARK GREY',
    'MAGNETIC GRAY': 'MAGNETIC GREY',
    'MAGNETIC GRAY METALLIC': 'MAGNETIC GREY METALLIC',
}
df.paint = df.paint.replace(paint_fixes)

In [None]:
# check
paint_count = df.paint.value_counts()
paint_count

In [None]:
# for the following reduce paints to only those that occur more than 10 times
# (strictly greater than 10); every other paint is pooled into '_OTHER_'
paint_top = list(paint_count[paint_count.values>10].index)
df['paint_reduced'] = df.paint.where(df.paint.isin(paint_top), '_OTHER_')
df.paint_reduced.value_counts()

### Extract manufacturer from "title":

In [None]:
def first_piece(i_string):
    """Return the first whitespace-separated token of ``i_string``.

    Parameters
    ----------
    i_string : str
        Text to take the leading token from.

    Returns
    -------
    str
        The first token, or an empty string when ``i_string`` is empty or
        contains only whitespace (instead of raising ``IndexError``).
    """
    # maxsplit=1: only the leading token is needed, so stop after one split
    pieces = i_string.split(maxsplit=1)
    return pieces[0] if pieces else ''

# the manufacturer is taken to be the first word of each title
df['manufacturer'] = [first_piece(car_title) for car_title in df.title]

In [None]:
# minor adjustment
df.manufacturer = df.manufacturer.replace({'Land': 'Land Rover'})

In [None]:
# count frequencies
manu_count = df.manufacturer.value_counts()
manu_count

In [None]:
# again reduce number of levels: manufacturers with more than 10 listings
# keep their name, all others are pooled into '_OTHER_'
manu_top = manu_count[manu_count > 10].index.tolist()
is_frequent_manu = df.manufacturer.isin(manu_top)
df['manufacturer_reduced'] = df.manufacturer.where(is_frequent_manu, '_OTHER_')
df.manufacturer_reduced.value_counts()

<a id='2'></a>
# Numerical features

In [None]:
# define numerical features
features_num = ['year', 'odometer', 'price']

In [None]:
# plot mileage
df.odometer.plot(kind='hist', bins=50)
plt.title('Mileage')
plt.grid()
plt.show()

In [None]:
# same in log scale
np.log10(1+df.odometer).plot(kind='hist', bins=50)
plt.title('Log10(1+Mileage)')
plt.grid()
plt.show()

#### Bar at 0 in the previous plot represents cars with 0 miles!

In [None]:
# count the listings whose odometer reads exactly zero
zero_mileage_rows = df.loc[df.odometer == 0]
print('Number of cars with mileage 0:', len(zero_mileage_rows))

In [None]:
# Price
df.price.plot(kind='hist', bins=50)
plt.title('Price')
plt.grid()
plt.show()

In [None]:
# Price - log plot
np.log10(df.price).plot(kind='hist', bins=50)
plt.title('Log10(Price)')
plt.grid()
plt.show()

In [None]:
# Year
plt.figure(figsize=(8,4))
df.year.value_counts().sort_index().plot(kind='bar')
plt.title('Year')
plt.grid()
plt.show()

In [None]:
# pairwise scatterplot of numerical features
sns.pairplot(df[features_num])
plt.show()

In [None]:
# add transformed version of variables to data frame
df['odometer_trafo'] = np.log10(1+df.odometer)
df['log_price'] = np.log10(df.price)

In [None]:
# pairwise scatterplot using transformed features
sns.pairplot(df[['year','odometer_trafo','log_price']])
plt.show()

In [None]:
# evaluate rank correlation
corr_spearman = df[features_num].corr(method='spearman')

# plot matrix
sns.heatmap(corr_spearman, annot=True, cmap="RdYlGn")
plt.title('Spearman correlation')
plt.show()

<a id='3'></a>
# Categorical features

In [None]:
# define categorical features for the following
features_cat = ['location', 'isimported', 
                'engine', 'transmission',
                'fuel', 'paint_reduced',
                'manufacturer_reduced']

In [None]:
# plot distribution of categorical features
for f in features_cat:
    plt.figure(figsize=(16,4))
    df[f].value_counts().plot(kind='bar')
    plt.title(f)
    plt.grid()
    plt.show()

#### Evaluate also "title". Here we have 240 different values therefore we just look at the 25 most frequent ones

In [None]:
# frequency plot for title: bar chart of the 25 most common titles
# (value_counts sorts by descending frequency, so head(25) is the top 25)
top_title_counts = df.title.value_counts().head(25)
plt.figure(figsize=(16, 4))
top_title_counts.plot(kind='bar')
plt.title('Title (Top 25)')
plt.grid()
plt.show()

<a id='4'></a>
# Price vs features

### Numeric features

In [None]:
plt.scatter(df.year, df.log_price, alpha=0.25)
plt.grid()
plt.title('Log10(Price) vs Year')
plt.show()

#### We see an increasing price trend over time starting around year 2000.

In [None]:
# log price against log mileage; odometer_trafo already holds log10(1 + odometer)
plt.scatter(df.odometer_trafo, df.log_price, alpha=0.25)
plt.title('Log10(Price) vs Log10(1+Miles)')
plt.grid()
plt.show()

#### Price shows expected decreasing behavior with increasing number of miles driven.

### Categorical features

In [None]:
# plot impact of categorical features on (log) price using violinplots
for f in features_cat:
    plt.figure(figsize=(16,4))
    sns.violinplot(x=f, y='log_price', data=df)
    plt.title(f)
    plt.grid()
    plt.xticks(rotation=90)
    plt.show()

#### Look at title separately once again - we use only the most frequent 25:

In [None]:
# impact of title on (log) price, restricted to the 25 most frequent titles
title_top = df.title.value_counts().head(25).index
# temporary data frame containing only listings with one of those titles
df_temp = df.loc[df.title.isin(title_top)]

plt.figure(figsize=(16, 4))
sns.violinplot(data=df_temp, x='title', y='log_price')
plt.title('Title - Top 25')
plt.xticks(rotation=90)
plt.grid()
plt.show()

<a id='5'></a>
# Other evaluations

### Is there a dependency between manufacturer and paint?

In [None]:
# cross table with absolute counts
ctab = pd.crosstab(df.manufacturer_reduced, df.paint_reduced)
ctab

In [None]:
# normalize table for each manufacturer: divide every row by its row total,
# so each row holds that manufacturer's paint shares and sums to 1
row_totals = ctab.sum(axis=1)
ctab_norm = ctab.div(row_totals, axis=0)
ctab_norm

In [None]:
# visualize the matrix
plt.rcParams['figure.figsize']=(11,7)
sns.heatmap(ctab_norm, cmap=plt.cm.plasma, annot=True)
plt.title('Paint by manufacturer')
plt.show()

#### BMWs are mostly black (61.54%), whereas white is the most common paint for Volvos (45.45%)

<a id='6'></a>
# Predictive Model for Price

In [None]:
# select predictors
predictors = ['odometer', 'year'] + features_cat
print('Number of predictors: ', len(predictors))
print(predictors)

# define target
target='price'

In [None]:
# start H2O
h2o.init(max_mem_size='12G', nthreads=4)

In [None]:
# upload data frame in H2O environment
df_hex = h2o.H2OFrame(df)

# train / test split (80/20)
train_hex, test_hex = df_hex.split_frame(ratios=[0.8], seed=999)

In [None]:
# define (distributed) random forest model
fit_DRF = H2ORandomForestEstimator(ntrees=300,
                                   max_depth=15,
                                   min_rows=1,
                                   nfolds=5,
                                   seed=999)

In [None]:
# train model
t1 = time.time()
fit_DRF.train(x=predictors,
              y=target,
              training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show training scoring history
plt.rcParams['figure.figsize']=(7,4)
fit_DRF.plot()

In [None]:
# variable importance
fit_DRF.varimp_plot()

In [None]:
# alternative: use (global) shap plot => see also direction and severity of feature impact
fit_DRF.shap_summary_plot(train_hex);

In [None]:
# show performance on training data
perf_train = fit_DRF.model_performance(train=True)
print(perf_train)

In [None]:
# show cross validation metrics
fit_DRF.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations (one figure per fold)
# The fold models are fetched once and iterated directly, so the loop follows
# however many folds the model was trained with instead of a hard-coded 5.
for fold_no, cv_model_temp in enumerate(fit_DRF.cross_validation_models(), start=1):
    # scoring_history() is the current name of the deprecated score_history()
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [RMSE]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_rmse,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_rmse,
                c='darkorange', label='cross val.')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.legend()
    plt.grid()
    plt.show()

### Predict on Training Data

In [None]:
pred_train = fit_DRF.predict(train_hex)
# select the one needed column inside H2O before converting, so only that
# column (not the entire training frame) is transferred into pandas
y_train_act = train_hex['price'].as_data_frame()['price'].values # actual values
y_train_pred = pred_train.as_data_frame()['predict'].values # predictions

In [None]:
# plot predictions vs actuals
plt.scatter(y_train_act, y_train_pred, alpha=0.25)
plt.title('Prediction vs Actual - Training Data')
plt.grid()
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

### Please note that we are only using the manufacturer but not the specific model of the car for predicting. Therefore we cannot yet expect a very precise model!

In [None]:
print('Correlations - Training Data')
print('Correlation Pearson:', stats.pearsonr(y_train_act, y_train_pred))
print('Correlation Spearman:', stats.spearmanr(y_train_act, y_train_pred))

In [None]:
# mean absolute error
print('MAE (train): ', np.round(mean_absolute_error(y_train_act, y_train_pred),2))

### Predict on Test Set

In [None]:
pred_test = fit_DRF.predict(test_hex)
# select the one needed column inside H2O before converting, so only that
# column (not the entire test frame) is transferred into pandas
y_test_act = test_hex['price'].as_data_frame()['price'].values # actual values
y_test_pred = pred_test.as_data_frame()['predict'].values # predictions

In [None]:
# plot predictions vs actuals
plt.scatter(y_test_act, y_test_pred, alpha=0.25)
plt.title('Prediction vs Actual - Test Data')
plt.grid()
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

In [None]:
print('Correlations - Test Set')
print('Correlation Pearson:', stats.pearsonr(y_test_act, y_test_pred))
print('Correlation Spearman:', stats.spearmanr(y_test_act, y_test_pred))

In [None]:
# mean absolute error
print('MAE (test): ', np.round(mean_absolute_error(y_test_act, y_test_pred),2))

<a id='7'></a>
# Local explanation for predictions

### H2O provides nice explanations by just using one line of code:

In [None]:
# select individual row (from training data)
my_row = 8
train_hex[my_row,:]

In [None]:
# and show corresponding prediction
print('Prediction:', y_train_pred[my_row])

In [None]:
# now show detailed explanations for this individual prediction
fit_DRF.explain_row(frame=train_hex, row_index=my_row);

### Let's interpret the SHAP explanation plot: The predicted price is driven upward by the relatively low age of the car, the paint color black and the foreign use. On the other hand, the 4-cylinder I4 engine, the relatively high mileage and the manufacturer Toyota drive the prediction downward.

<a id='8'></a>
# Can we do better?
Spoiler: Yes, we can!

#### Let's simply try to add the "title" feature (even if it has 240 different values).

In [None]:
predictors_plus = predictors + ['title']
predictors_plus

In [None]:
# define (distributed) random forest model
fit_DRF_plus = H2ORandomForestEstimator(ntrees=300,
                                   max_depth=15,
                                   min_rows=1,
                                   nfolds=5,
                                   seed=999)

In [None]:
# train model
t1 = time.time()
fit_DRF_plus.train(x=predictors_plus,
              y=target,
              training_frame=train_hex)
t2 = time.time()
print('Elapsed time [s]: ', np.round(t2-t1,2))

In [None]:
# show training scoring history
plt.rcParams['figure.figsize']=(7,4)
fit_DRF_plus.plot()

In [None]:
# variable importance
fit_DRF_plus.varimp_plot()

In [None]:
# cross validation metrics
fit_DRF_plus.cross_validation_metrics_summary()

In [None]:
# show scoring history - training vs cross validations (one figure per fold)
# The fold models are fetched once and iterated directly, so the loop follows
# however many folds the model was trained with instead of a hard-coded 5.
for fold_no, cv_model_temp in enumerate(fit_DRF_plus.cross_validation_models(), start=1):
    # scoring_history() is the current name of the deprecated score_history()
    df_cv_score_history = cv_model_temp.scoring_history()
    my_title = 'CV ' + str(fold_no) + ' - Scoring History [RMSE]'
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.training_rmse,
                c='blue', label='training')
    plt.scatter(df_cv_score_history.number_of_trees,
                y=df_cv_score_history.validation_rmse,
                c='darkorange', label='cross val.')
    plt.title(my_title)
    plt.xlabel('Number of Trees')
    plt.legend()
    plt.grid()
    plt.show()

### Predict on Training Data

In [None]:
pred_train_plus = fit_DRF_plus.predict(train_hex)
y_train_pred_plus = pred_train_plus.as_data_frame().predict.values # predictions

In [None]:
# plot predictions vs actuals
plt.scatter(y_train_act, y_train_pred_plus, c='red', alpha=0.25)
plt.title('Prediction vs Actual - Improved Model - Training Data')
plt.grid()
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

In [None]:
# compare with 1st model (training data)
# both series are labelled so the legend says which colour is which model
plt.scatter(y_train_act, y_train_pred_plus, c='red', alpha=0.25, label='2nd model (incl. title)')
plt.scatter(y_train_act, y_train_pred, c='green', alpha=0.25, label='1st model')
plt.title('Prediction vs Actual - Training Data')
plt.grid()
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.legend()
plt.show()

In [None]:
print('Correlations - Training Data (2nd model)')
print('Correlation Pearson:', stats.pearsonr(y_train_act, y_train_pred_plus))
print('Correlation Spearman:', stats.spearmanr(y_train_act, y_train_pred_plus))

In [None]:
# mean absolute error
print('1st model - MAE (train): ', np.round(mean_absolute_error(y_train_act, y_train_pred),2))
print('2nd model - MAE (train): ', np.round(mean_absolute_error(y_train_act, y_train_pred_plus),2))

In [None]:
pred_test_plus = fit_DRF_plus.predict(test_hex)
y_test_pred_plus = pred_test_plus.as_data_frame().predict.values # predictions

# plot predictions vs actuals
plt.scatter(y_test_act, y_test_pred_plus, c='red', alpha=0.25)
plt.title('Prediction vs Actual - Improved Model - Test Data')
plt.grid()
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.show()

In [None]:
# compare with 1st model (test data)
# both series are labelled so the legend says which colour is which model
plt.scatter(y_test_act, y_test_pred_plus, c='red', alpha=0.25, label='2nd model (incl. title)')
plt.scatter(y_test_act, y_test_pred, c='green', alpha=0.25, label='1st model')
# this figure shows test-set predictions, so the title must say "Test Data"
plt.title('Prediction vs Actual - Test Data')
plt.grid()
plt.xlabel('Actual')
plt.ylabel('Prediction')
plt.legend()
plt.show()

In [None]:
print('Correlations - Test Set (2nd model)')
print('Correlation Pearson:', stats.pearsonr(y_test_act, y_test_pred_plus))
print('Correlation Spearman:', stats.spearmanr(y_test_act, y_test_pred_plus))

In [None]:
# mean absolute error
print('1st model - MAE (test): ', np.round(mean_absolute_error(y_test_act, y_test_pred),2))
print('2nd model - MAE (test): ', np.round(mean_absolute_error(y_test_act, y_test_pred_plus),2))

### Ok, we have clearly improved the model significantly by adding "title"...