In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt

In [7]:
data = pd.read_csv("craiglist_clean.csv")

In [8]:
data_train = pd.read_csv("craigslist_train.csv")

In [9]:
data_test = pd.read_csv("craigslist_test.csv")

In [10]:
data

Unnamed: 0,fuel,odometer,paint color,price,transmission,year,brand,model,variant
0,gas,111111,unknown,200000,manual,1970,plymouth,road,runner
1,gas,90000,white,20000,automatic,2002,lexus,sc430,unknown
2,gas,160600,grey,11500,automatic,2013,honda,civic,sedan
3,gas,199000,white,3300,automatic,1995,ford,van,unknown
4,gas,107292,silver,7000,automatic,2012,vw,jetta,unknown
...,...,...,...,...,...,...,...,...,...
20724,diesel,114000,grey,27500,manual,2003,ford,f350,unknown
20725,diesel,155000,white,1650,manual,2007,chrysler,300,unknown
20726,gas,102500,unknown,15000,automatic,2012,acura,tl,unknown
20727,gas,157000,unknown,13999,automatic,1993,gmc,sierra,unknown


In [11]:
data_train

Unnamed: 0,fuel,odometer,paint color,transmission,year,brand,model,variant,price
0,gas,121500,white,automatic,2011,ford,crown,victoria,6300
1,gas,195000,white,automatic,2002,chevrolet,silverado,1500,3800
2,gas,142000,unknown,automatic,2007,nissan,maxima,unknown,2800
3,gas,152250,red,automatic,2009,gmc,sierra,1500 sle,14000
4,gas,208000,unknown,automatic,2013,chevy,impala,unknown,4800
...,...,...,...,...,...,...,...,...,...
14505,gas,97800,black,automatic,2015,ram,1500,sport,25900
14506,gas,116000,unknown,automatic,2015,chevrolet,silverado,4x4,31000
14507,gas,164000,unknown,automatic,2002,bmw,330i,unknown,2800
14508,gas,158000,unknown,manual,1998,chevrolet,corvette,unknown,11500


In [12]:
data_test

Unnamed: 0,fuel,odometer,paint color,transmission,year,brand,model,variant,price
0,gas,97000,red,automatic,1991,chevrolet,corvette,convertible,19750
1,gas,13842,black,manual,2017,ford,fiesta,st,15000
2,gas,136000,white,automatic,2008,bmw,135i,convertible,3500
3,gas,7500,grey,automatic,2005,dodge,dakota,unknown,8000
4,gas,5675,white,automatic,2018,isuzu,npr,box truck,45000
...,...,...,...,...,...,...,...,...,...
6214,diesel,212200,unknown,manual,2012,international,4300,26' box truck,25000
6215,gas,86000,black,automatic,2002,mercedes-benz,cl500,unknown,2900
6216,gas,99999,blue,automatic,1965,ford,mustang,convertible,11750
6217,gas,42242,unknown,automatic,2016,honda,fit,unknown,12900


In [13]:
# Label encoding to handle categorical value for regression

from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
data_train[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']] = data_train[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']].apply(labelencoder.fit_transform)
data_test[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']] = data_test[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']].apply(labelencoder.fit_transform)
data[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']] = data[['fuel', 'paint color', 'transmission', 'brand', 'model', 'variant']].apply(labelencoder.fit_transform)

## Assigning the Split Data & Feature Scaling

In [14]:
# Split the data for X and Y values for the RF regressor
#X = data.iloc[:,:-1].values
X = data.loc[:,["odometer","year","fuel","transmission","paint color","brand","model","variant"]].values
y = data.loc[:,"price"].values

In [15]:
# Split the data for features and labels accordingly
X_train = data_train.loc[:,["odometer","year","fuel","transmission","paint color","brand","model","variant"]].values

In [16]:
X_test = data_test.loc[:,["odometer","year","fuel","transmission","paint color","brand","model","variant"]].values

In [17]:
y_train = data_train.loc[:,"price"].values

In [18]:
y_test = data_test.loc[:,"price"].values

In [61]:
# Feature scaling on dataset to standardize the independent features present in the data in a fixed range
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.fit_transform(X_test)

## Random Forest using default parameters

In [70]:
# Use Random Forest classifier
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X, y) 

RandomForestRegressor(random_state=0)

In [71]:
# fit the random forest model to the training data
regressorModel = regressor.fit(X_train_sc,y_train)

In [72]:
# Prediction of y using the test data X_test
y_pred_r = regressorModel.predict(X_test_sc)

In [73]:
# Import regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [74]:
# Calculating the performance of the random forest model. 

mae_r = mean_absolute_error(y_true = y_test,y_pred = y_pred_r)
mse_r = mean_squared_error(y_true = y_test,y_pred = y_pred_r)
rmse_r = np.sqrt(mse_r)
r2_score_r = r2_score(y_test, y_pred_r)

print(f"MAE: {round(mae_r,2)} \nMSE: {round(mse_r,2)} \nRMSE: {round(rmse_r,2)} \nr2_score: {round(r2_score_r,2)}")

MAE: 9062.4 
MSE: 16372034032.78 
RMSE: 127953.25 
r2_score: 0.02


In [44]:
# Variable Importance

feature_list = list(data.columns)
# Get numerical feature importances
importances = list(regressorModel.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: odometer             Importance: 0.26
Variable: fuel                 Importance: 0.25
Variable: brand                Importance: 0.15
Variable: model                Importance: 0.1
Variable: paint color          Importance: 0.09
Variable: year                 Importance: 0.08
Variable: price                Importance: 0.03
Variable: transmission         Importance: 0.03


## With hyperparameter tuning using Random Search Cross Validation 

Using Scikit-Learn’s RandomizedSearchCV method, we can define a grid of hyperparameter ranges, and randomly sample from the grid, performing K-Fold CV with each combination of values.

In [45]:
# Create a parameter grid to sample from during fitting

n_estimators = [5,20,50,100] # number of trees in the random forest
max_features = ['auto', 'sqrt'] # number of features in consideration at every split
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
bootstrap = [True, False] # method used to sample data points

random_grid = {'n_estimators': n_estimators,

'max_features': max_features,

'max_depth': max_depth,

'min_samples_split': min_samples_split,

'min_samples_leaf': min_samples_leaf,

'bootstrap': bootstrap}

In [46]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 0)

In [47]:
# Use RandomizedSearchCV for hyperparamater tuning
# Random search of parameters, using 10 fold cross validation, 
# search across 100 different combinations, and use all available cores

from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf,param_distributions = random_grid,
               n_iter = 100, cv = 10, verbose=2, random_state=0, n_jobs = -1)

In [48]:
# Fit the random search model
rf_random.fit(X_train_sc, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      120],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4],
                                        'min_samples_split': [2, 6, 10],
                                        'n_estimators': [5, 20, 50, 100]},
                   random_state=0, verbose=2)

In [49]:
# View the best parameters from fitting the random search:

print ('Random grid: ', random_grid, '\n')
# print the best parameters
print ('Best Parameters: ', rf_random.best_params_, ' \n')

Random grid:  {'n_estimators': [5, 20, 50, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120], 'min_samples_split': [2, 6, 10], 'min_samples_leaf': [1, 3, 4], 'bootstrap': [True, False]} 

Best Parameters:  {'n_estimators': 100, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}  



In [50]:
# Use the best parameters printed above for our random forest model
randmf = RandomForestRegressor(n_estimators = 100, min_samples_split = 6, min_samples_leaf= 1, max_features = 'sqrt', max_depth= 40, bootstrap=False) 
regressorModel_tuned = randmf.fit(X_train_sc, y_train) 

In [51]:
# Plot in the table what are the actual target variable (price) vs predicted target variable using the y test data
y_pred_rf1 = pd.DataFrame({ "actual_price": y_test, 
"predicted_price": randmf.predict((X_test_sc)) }) 

# Print the dataframe
y_pred_rf1

Unnamed: 0,actual_price,predicted_price
0,19750,12681.979667
1,15000,41032.620000
2,3500,10491.614833
3,8000,16034.015000
4,45000,52608.492833
...,...,...
6214,25000,27407.578833
6215,2900,11114.552500
6216,11750,15718.885167
6217,12900,28957.188667


In [52]:
# Prediction of y using the test data with best parameters
y_pred_r_tuned = regressorModel_tuned.predict(X_test_sc)

In [53]:
# Import regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [54]:
# Calculating the performance of the random forest model (with hyperparamater tuning)

mae_rs = mean_absolute_error(y_true = y_test,y_pred = y_pred_r_tuned)
mse_rs = mean_squared_error(y_true = y_test,y_pred = y_pred_r_tuned)
rmse_rs = np.sqrt(mse_rs)
r2_score_rs = r2_score(y_test, y_pred_r_tuned)

print(f"MAE: {round(mae_rs,2)} \nMSE: {round(mse_rs,2)} \nRMSE: {round(rmse_rs,2)} \nr2_score: {round(r2_score_rs,2)}")

MAE: 9044.46 
MSE: 16457273452.06 
RMSE: 128285.91 
r2_score: 0.02


## Evaluate the Random Search Cross Validation results

In [55]:
# Compare the results from base Random Forest model with the hyperparameter tuned Random Forest model
base_rf =[mae_r, mse_r, rmse_r, r2_score_r]
tuned_rf =[mae_rs, mse_rs, rmse_rs, r2_score_rs]
result = pd.DataFrame({"baseRandomForest": base_rf,"tunedRandomForest": tuned_rf},index=["MAE","MSE","RMSE","R2_Score"])

# Add thousand separator with two decimal points for the dataframe
result.loc[:,"baseRandomForest"] = result["baseRandomForest"].map('{:,.2f}'.format)
result.loc[:,"tunedRandomForest"] = result["tunedRandomForest"].map('{:,.2f}'.format)

# Print the results in dataframe
result

Unnamed: 0,baseRandomForest,tunedRandomForest
MAE,9062.4,9044.46
MSE,16372034032.78,16457273452.06
RMSE,127953.25,128285.91
R2_Score,0.02,0.02
