# Model Training

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [3]:
# Set display
pd.set_option('display.max_columns', 100)
pd.set_option("display.max_rows", 500)

### Evaluation Metrics

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score, log_loss

def mean_absolute_percentage_error(y_true, y_pred): 
    total = 0
    for i in range(y_test.size):
        total += np.abs((y_true[i] - y_pred[i]) / y_true[i]) * 100
    return total / y_test.size

def mean_error(y_true, y_pred): 
    total = 0
    for i in range(y_test.size):
        total += y_true[i] - y_pred[i]
    return total / y_test.size

### Read Data

In [3]:
y = pd.read_csv("./dwelltime.csv")
display(y.head())

Unnamed: 0,DwellTime
0,0.083333
1,0.083333
2,0.8
3,0.083333
4,0.083333


### One-Hot Encoder
### ****Note: Run this line when training Ridge Regression and Logistic Regression

In [None]:
x = pd.read_csv("./indpt_vars.csv",index_col = None)

### Numerical Encoder
### ****Note: Run this line when training Random Forest and Gradient Boosting

In [None]:
x = pd.read_csv("./indpt_vars_num.csv",index_col = None)

In [13]:
y.shape

(1758839, 1)

In [396]:
x.shape

(1758839, 201)

# Part A - Regression Approach

### Split Train/Test Data

In [10]:
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set -- 70% training and 30% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y)
display(x_train.head())
display(y_train.head())

Unnamed: 0,Miles,MilesToNextStop,ClusterId,ArriveTimeUpdateType,BounceCount,TotalPallets,TotalWeight,Hot,DnBIndustry,ScheduleType,EquipmentType,EquipmentLength,LoadStopType,LoadStopSequence,WorkType,OnTime,DayOfWeek,HourOfDay,PeakHour,facility_traffic,driver_exp,driver_complexity,facility_complexity,avg_dt,median_dt,min_dt,max_dt,std_dt
1748899,23.0,0.0,19,0,0.0,4,1525.0,0,2,0,0,53.0,0,2,1,1,3,9,1,1,342,2,2,1.36,1.15,0.5,2.05,0.664747
280247,378.0,378.0,106,1,0.0,31,45408.352,0,0,0,0,53.0,1,1,0,0,3,14,1,64,583,62,1433,2.274595,2.0,0.016667,6.0,1.287106
1686029,2972.0,2972.0,49,0,0.0,32,25000.0,0,0,2,0,53.0,1,1,0,1,3,12,1,1,229,49,5,1.601852,1.5,0.5,3.0,0.828924
471518,302.0,0.0,78,0,0.0,52,25899.0,0,9,0,0,53.0,0,2,0,1,1,8,1,64,2118,1,286,1.693054,2.0,0.016667,6.0,0.839518
103745,72.0,0.0,28,0,0.0,48,42768.0,0,2,0,1,53.0,0,2,0,1,1,5,0,14,22,5,1072,1.653821,1.5,0.016667,6.0,0.902175


Unnamed: 0,DwellTime
1748899,1.083333
280247,0.983333
1686029,1.5
471518,1.0
103745,2.25


## Model 1 – RidgeRegression

In [393]:
from sklearn.linear_model import Ridge

reg = Ridge(normalize=True, alpha=0.01)
reg.fit(x_train, y_train) 
y_pred = reg.predict(x_test)

In [118]:
print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAPE: ' + str(mean_absolute_percentage_error(y_test.values.ravel(), y_pred)[0]))
print('Mean Error: ' + str(mean_error(y_test.values.ravel(), y_pred)[0]))

MAE: 0.7751842965206802
RMSE: 1.0404202751615836
MAPE: 121.09877100489096
Mean Error: 0.0005378251687322052


In [119]:
print('R2: ' + str(reg.score(x_train, y_train)))
print('OSR2: ' + str(reg.score(x_test, y_test)))

R2: 0.16705887349655776
OSR2: 0.16535468676678566


### Label Encoder

In [397]:
# set min/max
y_pred[y_pred<0] = 0.1

In [398]:
y_pred.max()

5.550114640141779

In [399]:
y_train_bin = np.ceil(y_train) #le.transform()
y_test_bin = np.ceil(y_test)
y_pred_bin = np.ceil(y_pred)

In [400]:
print('Accuracy: ' + str(accuracy_score(y_test_bin, y_pred_bin)))
print('F1 score: ' + str(f1_score(y_test_bin, y_pred_bin, average='weighted')))

Accuracy: 0.37017958806182866
F1 score: 0.30873261701783994


In [401]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_bin, y_pred_bin)

array([[ 21274, 141161,  29648,    337,      4,      0],
       [  2902, 144632,  52208,    907,     11,      0],
       [   350,  43428,  28632,    731,     28,      0],
       [    92,  15995,  17216,    741,     33,      0],
       [    33,   6613,  10093,    738,     43,      2],
       [    16,   3161,   6031,    524,     64,      4]])

### Variable Coefficients

In [122]:
# Get variable coefficients
coefs = reg.coef_.tolist()[0]
# List of tuples with variable and coefficients
feature_coef = [(feature, round(coef, 3)) for feature, coef in zip(x, coefs)]
# Sort the coefficients by highest first
feature_coef= sorted(feature_coef, key = lambda x: x[1], reverse = True)
# Print out the feature and coefficients 
[print('Variable: {:20} Coefficient: {}'.format(*pair)) for pair in feature_coef];

Variable: avg_dt               Coefficient: 0.817
Variable: C136                 Coefficient: 0.151
Variable: Utilities            Coefficient: 0.139
Variable: None                 Coefficient: 0.086
Variable: C14                  Coefficient: 0.083
Variable: median_dt            Coefficient: 0.081
Variable: C128                 Coefficient: 0.073
Variable: std_dt               Coefficient: 0.065
Variable: Hot                  Coefficient: 0.061
Variable: Arts, Entertainment, and Recreation Coefficient: 0.049
Variable: C40                  Coefficient: 0.041
Variable: Public Administration Coefficient: 0.04
Variable: BounceCount          Coefficient: 0.038
Variable: C97                  Coefficient: 0.034
Variable: Real Estate and Rental and Leasing Coefficient: 0.033
Variable: Mon                  Coefficient: 0.031
Variable: C100                 Coefficient: 0.029
Variable: Manual               Coefficient: 0.029
Variable: R                    Coefficient: 0.028
Variable: C121       

## Model 2 – RandomForestRegressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(max_depth=10, min_samples_leaf=3, n_estimators=300, n_jobs=-1)
rfr.fit(x_train, y_train.values.ravel())
y_pred = rfr.predict(x_test)

In [325]:
print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAPE: ' + str(mean_absolute_percentage_error(y_test.values.ravel(), y_pred)))
print('Mean Error: ' + str(mean_error(y_test.values.ravel(), y_pred)))

MAE: 0.7639943196865182
RMSE: 1.0317027860564651
MAPE: 118.34082275492895
Mean Error: 0.0005891269767494292


In [142]:
print('R2: ' + str(rfr.score(x_train, y_train)))
print('OSR2: ' + str(rfr.score(x_test, y_test)))

R2: 0.19123826633363053
OSR2: 0.17929607297749162


### Label Encoder

In [14]:
y_train_bin = np.ceil(y_train)
y_test_bin = np.ceil(y_test)
y_pred_bin = np.ceil(y_pred)

In [329]:
print('Accuracy: ' + str(accuracy_score(y_test_bin, y_pred_bin)))
print('F1 score: ' + str(f1_score(y_test_bin, y_pred_bin, average='weighted')))

Accuracy: 0.37242728161742966
F1 score: 0.30676394452066036


In [330]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_bin, y_pred_bin)

array([[ 19817, 144745,  27495,    363,      4,      0],
       [  2201, 147968,  49507,    964,     20,      0],
       [   227,  44239,  27762,    906,     35,      0],
       [    56,  16233,  16830,    911,     46,      1],
       [    21,   6662,   9875,    909,     53,      2],
       [    14,   3172,   5901,    635,     77,      1]])

### Variable Coefficients

In [141]:
# Get numerical feature importances
importances = list(rfr.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(x, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: avg_dt               Importance: 0.855
Variable: Miles                Importance: 0.016
Variable: ArriveTimeUpdateType Importance: 0.015
Variable: HourOfDay            Importance: 0.014
Variable: driver_exp           Importance: 0.012
Variable: driver_complexity    Importance: 0.011
Variable: std_dt               Importance: 0.011
Variable: TotalPallets         Importance: 0.009
Variable: facility_complexity  Importance: 0.009
Variable: MilesToNextStop      Importance: 0.008
Variable: TotalWeight          Importance: 0.006
Variable: OnTime               Importance: 0.006
Variable: facility_traffic     Importance: 0.006
Variable: median_dt            Importance: 0.004
Variable: ClusterId            Importance: 0.003
Variable: max_dt               Importance: 0.003
Variable: ScheduleType         Importance: 0.002
Variable: EquipmentType        Importance: 0.002
Variable: DayOfWeek            Importance: 0.002
Variable: DnBIndustry          Importance: 0.001
Variable: LoadStopSe

In [149]:
from sklearn.inspection import permutation_importance
result = permutation_importance(rfr, x_test, y_test, n_repeats=10, n_jobs=-1)

In [150]:
# Get numerical feature importances
importances = list(result.importances_mean)
# List of tuples with variable and importance
permut_importances = [(feature, round(importance/result.importances_mean.sum(), 3)) for feature, importance in zip(x, importances)]
# Sort the permutation importances by most important first
permut_importances = sorted(permut_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in permut_importances];

Variable: avg_dt               Importance: 0.842
Variable: ArriveTimeUpdateType Importance: 0.038
Variable: Miles                Importance: 0.021
Variable: HourOfDay            Importance: 0.019
Variable: driver_exp           Importance: 0.012
Variable: driver_complexity    Importance: 0.011
Variable: facility_complexity  Importance: 0.01
Variable: MilesToNextStop      Importance: 0.009
Variable: OnTime               Importance: 0.009
Variable: TotalPallets         Importance: 0.005
Variable: std_dt               Importance: 0.005
Variable: ScheduleType         Importance: 0.003
Variable: facility_traffic     Importance: 0.003
Variable: median_dt            Importance: 0.003
Variable: max_dt               Importance: 0.003
Variable: TotalWeight          Importance: 0.002
Variable: EquipmentType        Importance: 0.002
Variable: ClusterId            Importance: 0.001
Variable: LoadStopSequence     Importance: 0.001
Variable: DayOfWeek            Importance: 0.001
Variable: BounceCount

## Model 3 – GradientBoostingRegressor

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=5, n_estimators=30, learning_rate=1.0)
gbrt.fit(x_train, y_train.values.ravel())
y_pred = gbrt.predict(x_test)

In [9]:
print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAPE: ' + str(mean_absolute_percentage_error(y_test.values.ravel(), y_pred)))
print('Mean Error: ' + str(mean_error(y_test.values.ravel(), y_pred)))

MAE: 0.764033146036543
RMSE: 1.0312611307006379
MAPE: 117.10888414542615
Mean Error: 0.0003916000240295572


In [11]:
print('R2: ' + str(gbrt.score(x_train, y_train)))
print('OSR2: ' + str(gbrt.score(x_test, y_test)))

R2: 0.18981631970247204
OSR2: 0.17998528964204863


### Label Encoder

In [14]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(list(range(0, 7)))
print(le.classes_)

[0 1 2 3 4 5 6]


In [15]:
# set min/max
y_pred[y_pred<0] = 0.1
y_pred[y_pred>6] = 5.9

In [16]:
y_train_bin = le.transform(np.ceil(y_train))
y_test_bin = le.transform(np.ceil(y_test))
y_pred_bin = le.transform(np.ceil(y_pred))

  y = column_or_1d(y, warn=True)


In [18]:
print('Accuracy: ' + str(accuracy_score(y_test_bin, y_pred_bin)))
print('F1 score: ' + str(f1_score(y_test_bin, y_pred_bin, average='weighted')))

Accuracy: 0.38017670737531556
F1 score: 0.32715802492902507


In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_bin, y_pred_bin)

array([[ 26182, 137655,  27866,    688,     25,      8],
       [  3699, 144488,  50830,   1574,     64,      5],
       [   445,  42862,  28372,   1419,     63,      8],
       [   126,  15643,  16793,   1438,     66,     11],
       [    55,   6499,   9566,   1295,     97,     10],
       [    29,   2993,   5751,    900,    103,     24]])

# Part B - Classification Approach (1-hr Binning)

### Split Train/Test Data

In [402]:
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set -- 70% training and 30% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y)
display(x_train.head())
display(y_train.head())

Unnamed: 0,Miles,MilesToNextStop,BounceCount,TotalPallets,TotalWeight,Hot,EquipmentLength,LoadStopSequence,OnTime,HourOfDay,PeakHour,facility_traffic,driver_exp,driver_complexity,facility_complexity,avg_dt,median_dt,min_dt,max_dt,std_dt,C0,C1,C10,C100,C101,C102,C103,C104,C105,C106,C107,C108,C109,C11,C110,C111,C112,C113,C114,C115,C116,C117,C118,C119,C12,C120,C121,C122,C123,C124,...,C94,C95,C96,C97,C98,C99,Automated,Manual,Accommodation and Food Services,Administrative and Support and Waste Management and Remediation Services,"Agriculture, Forestry, Fishing and Hunting","Arts, Entertainment, and Recreation",Construction,Educational Services,Finance and Insurance,Health Care and Social Assistance,Information,Management of Companies and Enterprises,Manufacturing,"Mining, Quarrying, and Oil and Gas Extraction",Nonclassified establishment,Other Services (except Public Administration),"Professional, Scientific, and Technical Services",Public Administration,Real Estate and Rental and Leasing,Retail Trade,Transportation and Warehousing,Utilities,Wholesale Trade,Appt,None,Notice,Open,R,V,Delivery,Pick Up,Assist/Check,Driver Count,Driver Load,Lumper,No Touch,Unknown,Fri,Mon,Sat,Sun,Thu,Tue,Wed
1748899,23.0,0.0,0.0,4,1525.0,False,53.0,2,1,9,1,1,342,2,2,1.36,1.15,0.5,2.05,0.664747,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
280247,378.0,378.0,0.0,31,45408.352,False,53.0,1,0,14,1,64,583,62,1433,2.274595,2.0,0.016667,6.0,1.287106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
1686029,2972.0,2972.0,0.0,32,25000.0,False,53.0,1,1,12,1,1,229,49,5,1.601852,1.5,0.5,3.0,0.828924,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0
471518,302.0,0.0,0.0,52,25899.0,False,53.0,2,1,8,1,64,2118,1,286,1.693054,2.0,0.016667,6.0,0.839518,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
103745,72.0,0.0,0.0,48,42768.0,False,53.0,2,1,5,0,14,22,5,1072,1.653821,1.5,0.016667,6.0,0.902175,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0


Unnamed: 0,DwellTime
1748899,1.083333
280247,0.983333
1686029,1.5
471518,1.0
103745,2.25


### Label Encoder

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(list(range(1, 7)))
print(le.classes_)

[1 2 3 4 5 6]


In [404]:
y_train_bin = le.transform(np.ceil(y_train))
y_test_bin = le.transform(np.ceil(y_test))

  y = column_or_1d(y, warn=True)


## Model 4 – LogisticRegression

In [405]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(solver='saga', n_jobs=-1, penalty='l2')
log.fit(x_train, y_train_bin)
y_pred = le.inverse_transform(log.predict(x_test)) - 0.5



In [406]:
print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAPE: ' + str(mean_absolute_percentage_error(y_test.values.ravel(), y_pred)))
print('Mean Error: ' + str(mean_error(y_test.values.ravel(), y_pred)))

MAE: 0.9657124076727339
RMSE: 1.3357097200291288
MAPE: 97.1988039010463
Mean Error: 0.6378255302105406


In [407]:
y_pred_bin = log.predict(x_test)
y_pred_prob = log.predict_proba(x_test)
print('Accuracy: ' + str(accuracy_score(y_test_bin, y_pred_bin)))
print('F1 score: ' + str(f1_score(y_test_bin, y_pred_bin, average='weighted')))
print('Log loss: ' + str(log_loss(y_test_bin, y_pred_prob)))

Accuracy: 0.41988090635494607
F1 score: 0.35748675390866624
Log loss: 1.359781637045141


In [408]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_bin, y_pred_bin)

array([[ 90840, 101379,    205,      0,      0,      0],
       [ 69768, 130630,    260,      2,      0,      0],
       [ 21697,  51391,     80,      1,      0,      0],
       [  9435,  24620,     21,      1,      0,      0],
       [  4692,  12819,     11,      0,      0,      0],
       [  2537,   7256,      7,      0,      0,      0]])

In [433]:
# Get variable coefficients
coefs = reg.coef_.tolist()[0]
# List of tuples with variable and coefficients
feature_coef = [(feature, round(coef, 3)) for feature, coef in zip(x, coefs)]
# Sort the coefficients by highest first
feature_coef= sorted(feature_coef, key = lambda x: x[1], reverse = True)
# Print out the feature and coefficients 
[print('Variable: {:20} Coefficient: {}'.format(*pair)) for pair in feature_coef];

Variable: OnTime               Coefficient: 0.817
Variable: DayOfWeek            Coefficient: 0.081
Variable: facility_traffic     Coefficient: 0.065
Variable: TotalPallets         Coefficient: 0.061
Variable: ClusterId            Coefficient: 0.038
Variable: avg_dt               Coefficient: 0.029
Variable: HourOfDay            Coefficient: 0.025
Variable: driver_complexity    Coefficient: 0.009
Variable: PeakHour             Coefficient: 0.008
Variable: ArriveTimeUpdateType Coefficient: 0.002
Variable: Miles                Coefficient: 0.0
Variable: MilesToNextStop      Coefficient: 0.0
Variable: BounceCount          Coefficient: 0.0
Variable: TotalWeight          Coefficient: 0.0
Variable: EquipmentLength      Coefficient: -0.0
Variable: LoadStopType         Coefficient: -0.0
Variable: LoadStopSequence     Coefficient: 0.0
Variable: WorkType             Coefficient: -0.0
Variable: facility_complexity  Coefficient: -0.002
Variable: ScheduleType         Coefficient: -0.008
Variable: d

## Model 5 – RandomForestClassifier

In [376]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=10, min_samples_leaf=3, n_estimators=300)
rfc.fit(x_train, y_train_bin)
y_pred = le.inverse_transform(rfc.predict(x_test)) - 0.5

In [377]:
print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAPE: ' + str(mean_absolute_percentage_error(y_test.values.ravel(), y_pred)))
print('Mean Error: ' + str(mean_error(y_test.values.ravel(), y_pred)))

MAE: 0.8611045727108014
RMSE: 1.215682585594272
MAPE: 91.6649725538986
Mean Error: 0.5131941443729232


In [379]:
y_pred_bin = rfc.predict(x_test)
y_pred_prob = rfc.predict_proba(x_test)
print('Accuracy: ' + str(accuracy_score(y_test_bin, y_pred_bin)))
print('F1 score: ' + str(f1_score(y_test_bin, y_pred_bin, average='weighted')))
print('Log loss: ' + str(log_loss(y_test_bin, y_pred_prob)))

Accuracy: 0.4699972709285666
F1 score: 0.4011268626994227
Log loss: 1.2397043047529985


In [380]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_bin, y_pred_bin)

array([[ 83919, 108322,    169,     11,      2,      1],
       [ 37285, 162795,    556,     23,      0,      1],
       [  9281,  62745,   1045,     63,      6,     29],
       [  3379,  30124,    376,    187,      6,      5],
       [  1510,  15742,    202,     43,     24,      1],
       [   759,   8895,     95,     19,      7,     25]])

In [92]:
# Get numerical feature importances
importances = list(rfc.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 3)) for feature, importance in zip(x, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: median_dt            Importance: 0.264
Variable: avg_dt               Importance: 0.222
Variable: std_dt               Importance: 0.108
Variable: ArriveTimeUpdateType Importance: 0.099
Variable: max_dt               Importance: 0.06
Variable: min_dt               Importance: 0.048
Variable: Miles                Importance: 0.033
Variable: driver_exp           Importance: 0.032
Variable: driver_complexity    Importance: 0.026
Variable: facility_complexity  Importance: 0.02
Variable: TotalPallets         Importance: 0.014
Variable: facility_traffic     Importance: 0.014
Variable: TotalWeight          Importance: 0.011
Variable: HourOfDay            Importance: 0.011
Variable: MilesToNextStop      Importance: 0.007
Variable: ClusterId            Importance: 0.006
Variable: DnBIndustry          Importance: 0.006
Variable: EquipmentType        Importance: 0.005
Variable: LoadStopSequence     Importance: 0.004
Variable: ScheduleType         Importance: 0.002
Variable: WorkType    

In [81]:
from sklearn.inspection import permutation_importance
result = permutation_importance(rfc, x_test, y_test_bin, n_repeats=10, n_jobs=-1)

In [91]:
# Get numerical feature importances
importances = list(result.importances_mean)
# List of tuples with variable and importance
permut_importances = [(feature, round(importance/result.importances_mean.sum(), 3)) for feature, importance in zip(x, importances)]
# Sort the permutation importances by most important first
permut_importances = sorted(permut_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in permut_importances];

Variable: median_dt            Importance: 0.427
Variable: avg_dt               Importance: 0.166
Variable: ArriveTimeUpdateType Importance: 0.139
Variable: std_dt               Importance: 0.058
Variable: max_dt               Importance: 0.034
Variable: facility_complexity  Importance: 0.028
Variable: driver_exp           Importance: 0.024
Variable: min_dt               Importance: 0.021
Variable: driver_complexity    Importance: 0.019
Variable: Miles                Importance: 0.018
Variable: TotalPallets         Importance: 0.015
Variable: HourOfDay            Importance: 0.015
Variable: TotalWeight          Importance: 0.008
Variable: facility_traffic     Importance: 0.008
Variable: MilesToNextStop      Importance: 0.007
Variable: DnBIndustry          Importance: 0.005
Variable: PeakHour             Importance: 0.003
Variable: EquipmentType        Importance: 0.002
Variable: LoadStopSequence     Importance: 0.002
Variable: ScheduleType         Importance: 0.001
Variable: WorkType  

## Model 6 – GradientBoostingClassifier

In [24]:
from sklearn.ensemble import GradientBoostingClassifier

gbct = GradientBoostingClassifier(max_depth=5, n_estimators=30, learning_rate=1.0)
gbct.fit(x_train, y_train_bin)
y_pred = le.inverse_transform(gbct.predict(x_test)) - 0.5

In [25]:
print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAPE: ' + str(mean_absolute_percentage_error(y_test.values.ravel(), y_pred)))
print('Mean Error: ' + str(mean_error(y_test.values.ravel(), y_pred)))

MAE: 0.8791217506993247
RMSE: 1.2358500142937512
MAPE: 92.30313634790942
Mean Error: 0.49616587574132176


In [26]:
y_pred_bin = gbct.predict(x_test)
y_pred_prob = gbct.predict_proba(x_test)
print('Accuracy: ' + str(accuracy_score(y_test_bin, y_pred_bin)))
print('F1 score: ' + str(f1_score(y_test_bin, y_pred_bin, average='weighted')))
print('Log loss: ' + str(log_loss(y_test_bin, y_pred_prob)))

Accuracy: 0.466625730595165
F1 score: 0.41682253893549503
Log loss: 1.861982161463932


In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_bin, y_pred_bin)

array([[ 92147,  96626,   2444,    917,    152,    138],
       [ 45843, 148038,   4013,   2353,    272,    141],
       [ 12414,  54720,   4296,   1444,    147,    148],
       [  5129,  25187,   2188,   1320,    133,    120],
       [  2480,  12615,   1281,    847,    276,     23],
       [  1312,   7007,    766,    559,     17,    139]])

In [387]:
from sklearn.ensemble import GradientBoostingClassifier

gbct = GradientBoostingClassifier(max_depth=10, n_estimators=30, learning_rate=1.0)
gbct.fit(x_train, y_train_bin)
y_pred = le.inverse_transform(gbct.predict(x_test)) - 0.5

In [388]:
print('MAE: ' + str(mean_absolute_error(y_test, y_pred)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(y_test, y_pred))))
print('MAPE: ' + str(mean_absolute_percentage_error(y_test.values.ravel(), y_pred)))
print('Mean Error: ' + str(mean_error(y_test.values.ravel(), y_pred)))

MAE: 0.8966942542938657
RMSE: 1.2626167879869292
MAPE: 93.47072836249696
Mean Error: 0.48740631451536365


In [389]:
y_pred_bin = gbct.predict(x_test)
y_pred_prob = gbct.predict_proba(x_test)
print('Accuracy: ' + str(accuracy_score(y_test_bin, y_pred_bin)))
print('F1 score: ' + str(f1_score(y_test_bin, y_pred_bin, average='weighted')))
print('Log loss: ' + str(log_loss(y_test_bin, y_pred_prob)))

Accuracy: 0.4673117888305171
F1 score: 0.42423205798521924
Log loss: 1.5433664261643396


In [390]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_bin, y_pred_bin)

array([[ 97970,  88888,   3224,   1310,    548,    484],
       [ 50675, 140716,   5872,   1841,    861,    695],
       [ 14488,  50594,   5640,   1480,    558,    409],
       [  5989,  23059,   2819,   1398,    480,    332],
       [  2886,  11510,   1550,    761,    518,    297],
       [  1537,   6227,    900,    555,    245,    336]])