In [23]:
# import packages
import dmba
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary

In [24]:
# load dataset
df = dmba.load_data('ToyotaCorolla.csv')

In [25]:
df.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,0,1,0,1,0,0,0,0


In [26]:
df.drop(columns = 'Id', inplace = True)

In [27]:
# Convert column Fuel_Type and Color to categories
df.Fuel_Type = df.Fuel_Type.astype('category')
df.Color = df.Color.astype('category')

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Model              1436 non-null   object  
 1   Price              1436 non-null   int64   
 2   Age_08_04          1436 non-null   int64   
 3   Mfg_Month          1436 non-null   int64   
 4   Mfg_Year           1436 non-null   int64   
 5   KM                 1436 non-null   int64   
 6   Fuel_Type          1436 non-null   category
 7   HP                 1436 non-null   int64   
 8   Met_Color          1436 non-null   int64   
 9   Color              1436 non-null   category
 10  Automatic          1436 non-null   int64   
 11  CC                 1436 non-null   int64   
 12  Doors              1436 non-null   int64   
 13  Cylinders          1436 non-null   int64   
 14  Gears              1436 non-null   int64   
 15  Quarterly_Tax      1436 non-null   int64   
 16  Weight

In [29]:
predictors = ['Age_08_04','KM','Fuel_Type','HP','Automatic','Doors','Quarterly_Tax','Mfr_Guarantee','Guarantee_Period','Airco','Automatic_airco','CD_Player','Powered_Windows','Sport_Model','Tow_Bar']
outcome = 'Price'

In [30]:
X = pd.get_dummies(df[predictors])
y = df[outcome]

In [31]:
# partition the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [33]:
fullTree = DecisionTreeRegressor(random_state = 1)

In [34]:
fullTree.fit(X_train, y_train)

DecisionTreeRegressor(random_state=1)

In [42]:
regressionSummary(y_train, fullTree.predict(X_train))


Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 0.0000
            Mean Absolute Error (MAE) : 0.0000
          Mean Percentage Error (MPE) : 0.0000
Mean Absolute Percentage Error (MAPE) : 0.0000


In [43]:
regressionSummary(y_valid, fullTree.predict(X_valid))


Regression statistics

                      Mean Error (ME) : 73.9461
       Root Mean Squared Error (RMSE) : 1440.1110
            Mean Absolute Error (MAE) : 1107.1843
          Mean Percentage Error (MPE) : -0.4775
Mean Absolute Percentage Error (MAPE) : 10.9358


In [44]:
importances = fullTree.feature_importances_

In [45]:
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance':importances})
importance_df = importance_df.sort_values('importance')
print(importance_df)

             feature  importance
16  Fuel_Type_Petrol    0.000034
15  Fuel_Type_Diesel    0.000059
14     Fuel_Type_CNG    0.000305
3          Automatic    0.001328
10         CD_Player    0.001894
7   Guarantee_Period    0.002301
13           Tow_Bar    0.002783
6      Mfr_Guarantee    0.003969
4              Doors    0.004188
8              Airco    0.004615
12       Sport_Model    0.004678
11   Powered_Windows    0.005146
5      Quarterly_Tax    0.008880
9    Automatic_airco    0.013358
2                 HP    0.052097
1                 KM    0.059568
0          Age_08_04    0.834798


In [46]:
# RMSE of training data, to no surprise is 0 as we used a full tree. This may have caused overfitting in the tree (too many branches) causing the RMSE of the validation set to be 1440
# The top 3 features of importance in determining price seem to be Age, KM, and HP.

In [63]:
smallTree = DecisionTreeRegressor(max_depth = 5, random_state = 1)

In [64]:
smallTree.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=5, random_state=1)

In [65]:
regressionSummary(y_train, smallTree.predict(X_train))


Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 1028.0279
            Mean Absolute Error (MAE) : 773.2770
          Mean Percentage Error (MPE) : -1.0039
Mean Absolute Percentage Error (MAPE) : 7.6715


In [67]:
regressionSummary(y_valid, smallTree.predict(X_valid))


Regression statistics

                      Mean Error (ME) : 67.4074
       Root Mean Squared Error (RMSE) : 1177.3943
            Mean Absolute Error (MAE) : 909.1902
          Mean Percentage Error (MPE) : -0.6160
Mean Absolute Percentage Error (MAPE) : 9.2327


In [68]:
# Here we see with a max depth of 5, the RMSE for the training data is much higher than for a full tree, but that is to be expected.\
# The RMSE for the validation set has been decreased greatly, however, indicating that this may be a better depth for future data.

In [69]:
param_grid = {
    'max_depth': [5,10,15,20,25],
    'min_impurity_decrease':[0,0.001,0.005,0.01],
    'min_samples_split':[10,20,30,40,50],
}

In [70]:
gridSearch = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5, n_jobs = 1)

In [71]:
gridSearch.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=1,
             param_grid={'max_depth': [5, 10, 15, 20, 25],
                         'min_impurity_decrease': [0, 0.001, 0.005, 0.01],
                         'min_samples_split': [10, 20, 30, 40, 50]})

In [72]:
print('Initial parameters ', gridSearch.best_params_)

Initial parameters  {'max_depth': 10, 'min_impurity_decrease': 0.001, 'min_samples_split': 20}


In [73]:
param_grid = {
    'max_depth': [3,4,5,6,7,8,9,10,11,12],
    'min_impurity_decrease':[0,0.001,0.002,0.003,0.004,0.005,0.006,0.007,0.008],
    'min_samples_split':[10,12,14,16,18,20,22,24,26,28,30],
}

In [74]:
gridSearch = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5, n_jobs = 1)

In [75]:
gridSearch.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=1,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'min_impurity_decrease': [0, 0.001, 0.002, 0.003,
                                                   0.004, 0.005, 0.006, 0.007,
                                                   0.008],
                         'min_samples_split': [10, 12, 14, 16, 18, 20, 22, 24,
                                               26, 28, 30]})

In [76]:
print('Improved parameters ', gridSearch.best_params_)

Improved parameters  {'max_depth': 6, 'min_impurity_decrease': 0, 'min_samples_split': 20}


In [77]:
regTree = gridSearch.best_estimator_

In [78]:
regressionSummary(y_train, regTree.predict(X_train))


Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 1082.6992
            Mean Absolute Error (MAE) : 786.5953
          Mean Percentage Error (MPE) : -0.9986
Mean Absolute Percentage Error (MAPE) : 7.6224


In [79]:
regressionSummary(y_valid, regTree.predict(X_valid))


Regression statistics

                      Mean Error (ME) : 24.8976
       Root Mean Squared Error (RMSE) : 1251.3861
            Mean Absolute Error (MAE) : 958.1684
          Mean Percentage Error (MPE) : -1.0544
Mean Absolute Percentage Error (MAPE) : 9.5594
