In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from twilio_sms import send_message
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the train/test feature matrices and targets from CSV; the "index"
# column of each file is used as the DataFrame index.
X_train, X_test = (
    pd.read_csv(csv_path, index_col="index")
    for csv_path in ("datasets/X_train_f5.csv", "datasets/X_test_f5.csv")
)
y_train, y_test = (
    pd.read_csv(csv_path, index_col="index")
    for csv_path in ("datasets/y_train_f5.csv", "datasets/y_test_f5.csv")
)

In [3]:
def print_results(gsearch):
    """Print a summary of a fitted grid search.

    Prints, in order:
      * the sum of the per-candidate mean fit times, converted to minutes
        (labelled "wall-time" in the output);
      * every parameter combination with its negated mean test score;
      * the best parameter combination with its negated best score.

    Scores are negated so that a ``neg_*`` scorer reads as a positive error.

    Parameters
    ----------
    gsearch : object
        Anything exposing ``cv_results_`` (with "mean_fit_time", "params" and
        "mean_test_score" entries), ``best_params_`` and ``best_score_``,
        e.g. a fitted ``GridSearchCV``.
    """
    results = gsearch.cv_results_
    total_fit_minutes = sum(results["mean_fit_time"]) / 60
    print("wall-time: %.2f min" % total_fit_minutes)
    for candidate, mean_score in zip(results["params"], results["mean_test_score"]):
        print(candidate, -mean_score)
    print("best:", gsearch.best_params_, -gsearch.best_score_)

### Replicate Results from Local Machine

In [4]:
import xgboost as xgb
# Baseline model with fixed settings, fitted on the full training split.
xgr = xgb.XGBRegressor(
    objective='reg:linear',
    n_estimators=200,
    max_depth=5,
    base_score=0.5,
    random_state=42,
)
xgr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [5]:
from sklearn.metrics import mean_absolute_error, r2_score
# Score the fitted baseline model on the test split.
y_test_pred = xgr.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)

print(
    "avg mean_absolute_error: %.2f" % mae,
    "r2_score: %.3f" % r2_score(y_test, y_test_pred),
    sep="\n",
)

avg mean_absolute_error: 13491.91
r2_score: 0.654


### Hyperparameter Tuning
Use the __cross-validation__ error to tune the hyperparameters. Evaluate on the __test set__ only at the end.

In [5]:
from sklearn.model_selection import GridSearchCV

In [6]:
# Coarse joint grid: max_depth in steps of 2, min_child_weight in steps of 3.
param_test1 = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 4, 7],
}

param_test1

{'max_depth': [3, 5, 7], 'min_child_weight': [1, 4, 7]}

In [8]:
# Stage 1: coarse joint search over max_depth and min_child_weight.
# The max_depth / min_child_weight given to the base estimator are only
# placeholders: GridSearchCV overrides them with each candidate from the grid.
# NOTE(review): nthread=4 inside n_jobs=4 parallel workers may oversubscribe
# the CPU -- confirm against the core count of the machine.
gsearch1 = GridSearchCV(
    estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=140, max_depth=5,
                               min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                               objective='reg:linear', nthread=4, scale_pos_weight=1, seed=0),
    param_grid=param_test1,
    scoring='neg_mean_absolute_error',  # negated MAE: higher is better
    n_jobs=4, iid=False, cv=5)

gsearch1.fit(X_train, y_train)

# Report through the shared helper instead of repeating its body inline.
print_results(gsearch1)

wall-time: 22.97 min
{'max_depth': 3, 'min_child_weight': 1} 15387.434002313963
{'max_depth': 3, 'min_child_weight': 4} 15240.775977722962
{'max_depth': 3, 'min_child_weight': 7} 15236.380889588134
{'max_depth': 5, 'min_child_weight': 1} 14627.949400639647
{'max_depth': 5, 'min_child_weight': 4} 14634.520900086183
{'max_depth': 5, 'min_child_weight': 7} 14668.530760296755
{'max_depth': 7, 'min_child_weight': 1} 14366.845154738407
{'max_depth': 7, 'min_child_weight': 4} 14429.456026231064
{'max_depth': 7, 'min_child_weight': 7} 14462.610975154419
best: {'max_depth': 7, 'min_child_weight': 1} 14366.845154738407


__Observation__: min_child_weight is best set to 1; search max_depth at a finer granularity.

In [17]:
# Fine-grained max_depth grid around the best value from the coarse search.
param_test2 = {
    'max_depth': [6, 7, 8],
}

param_test2

{'max_depth': [6, 7, 8]}

In [18]:
# Stage 2: finer search over max_depth only (min_child_weight fixed at 1).
# The max_depth given to the base estimator is only a placeholder:
# GridSearchCV overrides it with each candidate from the grid.
# NOTE(review): nthread=4 inside n_jobs=4 parallel workers may oversubscribe
# the CPU -- confirm against the core count of the machine.
gsearch2 = GridSearchCV(
    estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=140, max_depth=5,
                               min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                               objective='reg:linear', nthread=4, scale_pos_weight=1, seed=0),
    param_grid=param_test2,
    scoring='neg_mean_absolute_error',  # negated MAE: higher is better
    n_jobs=4, iid=False, cv=5)

gsearch2.fit(X_train, y_train)

# Report through the shared helper instead of repeating its body inline.
print_results(gsearch2)

wall-time: 11.62 min
{'max_depth': 6} 14490.723691292387
{'max_depth': 7} 14366.845154738407
{'max_depth': 8} 14298.482048833923
best: {'max_depth': 8} 14298.482048833923


__Observation__: Depth 8 is still not overfitting; try deeper trees.

In [39]:
# Deeper trees: max_depth from 8 to 16 in steps of 2.
param_test3 = {
    'max_depth': [8, 10, 12, 14, 16],
}
param_test3

{'max_depth': [8, 10, 12, 14, 16]}

In [43]:
# Stage 3: search deeper trees.
# The max_depth given to the base estimator is only a placeholder:
# GridSearchCV overrides it with each candidate from the grid.
# NOTE(review): nthread=4 inside n_jobs=4 parallel workers may oversubscribe
# the CPU -- confirm against the core count of the machine.
gsearch3 = GridSearchCV(
    estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=140, max_depth=7,
                               min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                               objective='reg:linear', nthread=4, scale_pos_weight=1, seed=0),
    param_grid=param_test3,
    scoring='neg_mean_absolute_error',  # negated MAE: higher is better
    n_jobs=4, iid=False, cv=5)

gsearch3.fit(X_train, y_train)

# Report through the shared helper (including the "best:" line), then notify
# via the project-local send_message helper.
print_results(gsearch3)
send_message(gsearch3)

wall-time: 44.43 min
{'max_depth': 8} 14298.482048833923
{'max_depth': 10} 14265.829552426148
{'max_depth': 12} 14308.254388208743
{'max_depth': 14} 14475.683507333937
{'max_depth': 16} 14621.671539762605
best: {'max_depth': 10} 14265.829552426148
message body Finished gsearch, 
    wall-time: 44.43 min, 
    best params {'max_depth': 10}, 
    mae 14265.82955
message sent SMdc7080be6a794b6a91059a5196e90de5


__Observation__: Max depth 10 is optimal.

In [44]:
# gamma candidates: 0.0 through 0.9 in steps of 0.1.
param_test4 = {
    'gamma': [tenths / 10.0 for tenths in range(10)],
}
param_test4

{'gamma': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}

In [45]:
# Stage 4: tune gamma with max_depth fixed at 10.
# The gamma given to the base estimator is only a placeholder:
# GridSearchCV overrides it with each candidate from the grid.
# NOTE(review): nthread=4 inside n_jobs=4 parallel workers may oversubscribe
# the CPU -- confirm against the core count of the machine.
gsearch4 = GridSearchCV(
    estimator=xgb.XGBRegressor(learning_rate=0.1, n_estimators=140, max_depth=10,
                               min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                               objective='reg:linear', nthread=4, scale_pos_weight=1, seed=0),
    param_grid=param_test4,
    scoring='neg_mean_absolute_error',  # negated MAE: higher is better
    n_jobs=4, iid=False, cv=5)

gsearch4.fit(X_train, y_train)

# Report through the shared helper instead of repeating its body inline
# (this also prints the "best:" line), then notify via send_message.
print_results(gsearch4)
send_message(gsearch4)

wall-time: 40.72 min
{'gamma': 0.0} 14366.845154738407
{'gamma': 0.1} 14366.845154738407
{'gamma': 0.2} 14366.845154738407
{'gamma': 0.3} 14366.845154738407
{'gamma': 0.4} 14366.845154738407
{'gamma': 0.5} 14366.845154738407
{'gamma': 0.6} 14366.845154738407
{'gamma': 0.7} 14366.845154738407
{'gamma': 0.8} 14366.845154738407
{'gamma': 0.9} 14366.845154738407
message body Finished gsearch, 
    wall-time: 40.72 min, 
    best params {'gamma': 0.0}, 
    mae 14366.84515
message sent SM1a1d0aed0cf14b4fb52b5fe876a37438


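__Observation__: gamma has no effect on the CV error anywhere in the range 0–0.9 (all scores are identical), so keep gamma = 0. Note that the reported MAE (14366.85) equals the earlier max_depth=7 result rather than the max_depth=10 result (14265.83), so this output may predate the switch to max_depth=10 in the cell above; re-run to confirm.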

In [46]:
# Joint grid over subsample and colsample_bytree: 0.6 to 0.9 in steps of 0.1.
param_test5 = {
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
}
param_test5

{'colsample_bytree': [0.6, 0.7, 0.8, 0.9], 'subsample': [0.6, 0.7, 0.8, 0.9]}

In [None]:
# Stage 5: joint search over subsample and colsample_bytree. The values of
# those two parameters on the base estimator are replaced by each grid candidate.
gsearch5 = GridSearchCV(
    estimator=xgb.XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=0,
    ),
    param_grid=param_test5,
    scoring='neg_mean_absolute_error',
    cv=5,
    iid=False,
    n_jobs=4,
)

gsearch5.fit(X_train, y_train)

In [48]:
# Report through the shared helper instead of repeating its body inline
# (this also prints the "best:" line), then notify via send_message.
print_results(gsearch5)
send_message(gsearch5)

wall-time: 109.95 min
{'colsample_bytree': 0.6, 'subsample': 0.6} 14297.416682924988
{'colsample_bytree': 0.6, 'subsample': 0.7} 14197.824454082885
{'colsample_bytree': 0.6, 'subsample': 0.8} 14151.900348332823
{'colsample_bytree': 0.6, 'subsample': 0.9} 14180.547296141298
{'colsample_bytree': 0.7, 'subsample': 0.6} 14294.527731534059
{'colsample_bytree': 0.7, 'subsample': 0.7} 14261.763590499633
{'colsample_bytree': 0.7, 'subsample': 0.8} 14190.671222262452
{'colsample_bytree': 0.7, 'subsample': 0.9} 14141.545293378726
{'colsample_bytree': 0.8, 'subsample': 0.6} 14350.902895609619
{'colsample_bytree': 0.8, 'subsample': 0.7} 14278.39242803467
{'colsample_bytree': 0.8, 'subsample': 0.8} 14265.829552426148
{'colsample_bytree': 0.8, 'subsample': 0.9} 14146.855755669525
{'colsample_bytree': 0.9, 'subsample': 0.6} 14304.437102858996
{'colsample_bytree': 0.9, 'subsample': 0.7} 14211.96640664618
{'colsample_bytree': 0.9, 'subsample': 0.8} 14224.785304292416
{'colsample_bytree': 0.9, 'subsampl

__Observation__: need to try a finer resolution for colsample_bytree and subsample.

In [52]:
# Finer grid (steps of 0.05) around the best region of the previous search.
param_test6 = {
    'subsample': [0.85, 0.9, 0.95],
    'colsample_bytree': [0.65, 0.7, 0.75],
}
param_test6

{'colsample_bytree': [0.65, 0.7, 0.75], 'subsample': [0.85, 0.9, 0.95]}

In [None]:
# Stage 6: finer joint search over subsample and colsample_bytree. The values
# of those two parameters on the base estimator are replaced by each grid candidate.
gsearch6 = GridSearchCV(
    estimator=xgb.XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=0,
    ),
    param_grid=param_test6,
    scoring='neg_mean_absolute_error',
    cv=5,
    iid=False,
    n_jobs=4,
)

gsearch6.fit(X_train, y_train)
send_message(gsearch6)

In [55]:
# Report through the shared helper instead of repeating its body inline.
print_results(gsearch6)

wall-time: 59.83 min
{'colsample_bytree': 0.65, 'subsample': 0.85} 14194.016213601746
{'colsample_bytree': 0.65, 'subsample': 0.9} 14233.138986253174
{'colsample_bytree': 0.65, 'subsample': 0.95} 14169.474725975491
{'colsample_bytree': 0.7, 'subsample': 0.85} 14150.934154231567
{'colsample_bytree': 0.7, 'subsample': 0.9} 14141.545293378726
{'colsample_bytree': 0.7, 'subsample': 0.95} 14154.664346210542
{'colsample_bytree': 0.75, 'subsample': 0.85} 14169.782717311677
{'colsample_bytree': 0.75, 'subsample': 0.9} 14221.981842573607
{'colsample_bytree': 0.75, 'subsample': 0.95} 14140.736857633437
best: {'colsample_bytree': 0.75, 'subsample': 0.95} 14140.736857633437


__Observation__: For a fixed colsample_bytree, subsample values of 0.85 and 0.95 give similar performance. To reduce the risk of overfitting, choose subsample = 0.85 and colsample_bytree = 0.7; this costs only about 10 in MAE compared with the best combination.

In [None]:
# Stage 7: tune reg_alpha on a logarithmic grid, with subsample and
# colsample_bytree fixed at the values chosen in the previous step.
param_test7 = {
    'reg_alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
}

gsearch7 = GridSearchCV(
    estimator=xgb.XGBRegressor(
        objective='reg:linear',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=1,
        gamma=0,
        subsample=0.85,
        colsample_bytree=0.7,
        scale_pos_weight=1,
        nthread=4,
        seed=0,
    ),
    param_grid=param_test7,
    scoring='neg_mean_absolute_error',
    cv=5,
    iid=False,
    n_jobs=4,
)

gsearch7.fit(X_train, y_train)
send_message(gsearch7)

In [58]:
# print_results is already defined near the top of the notebook with an
# identical body; the redundant redefinition that used to live in this cell
# silently shadowed it, so only the call is kept.
print_results(gsearch7)

wall-time: 43.90 min
{'reg_alpha': 0.0001} 14150.93416957733
{'reg_alpha': 0.001} 14150.934202255554
{'reg_alpha': 0.01} 14150.934238186954
{'reg_alpha': 0.1} 14147.684856400989
{'reg_alpha': 1} 14147.63814258606
{'reg_alpha': 10} 14196.656291939085
{'reg_alpha': 100} 14162.209224667724
best: {'reg_alpha': 1} 14147.63814258606


#### Final Model

In [65]:
# Final model: the hyperparameters selected by the searches above, fitted on
# the full training split.
xgr = xgb.XGBRegressor(
    objective='reg:linear',
    learning_rate=0.1,
    n_estimators=140,
    max_depth=10,
    min_child_weight=1,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.7,
    reg_alpha=1,
    scale_pos_weight=1,
    nthread=4,
    seed=0,
)
xgr.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=140, n_jobs=1,
       nthread=4, objective='reg:linear', random_state=0, reg_alpha=1,
       reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.85)

##### Test Set Performance

In [83]:
from sklearn.metrics import mean_absolute_error, r2_score
# Score the final model on the test split.
y_test_pred = xgr.predict(X_test)
mae = mean_absolute_error(y_test, y_test_pred)

print(
    "avg mean_absolute_error: %.3f" % mae,
    "r2_score: %.3f" % r2_score(y_test, y_test_pred),
    sep="\n",
)

avg mean_absolute_error: 12848.090
r2_score: 0.672
