In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pickle

In [2]:
# Import cleaned train and test data
X_train = pd.read_csv("train_X_In-Car-Rec.csv")
y_train = pd.read_csv("train_y_In-Car-Rec.csv")
X_test = pd.read_csv("test_X_In-Car-Rec.csv")
y_test = pd.read_csv("test_y_In-Car-Rec.csv")

In [3]:
# View columns to spot check values
X_train.columns

Index(['TEMPERATURE', 'HAS_CHILDREN', 'TOCOUPON_GEQ5MIN', 'TOCOUPON_GEQ15MIN',
       'TOCOUPON_GEQ25MIN', 'DIRECTION_SAME', 'DIRECTION_OPP',
       'DESTINATION_HOME', 'DESTINATION_NO_URGENT_PLACE', 'DESTINATION_WORK',
       ...
       'RESTAURANTLESSTHAN20_1~3', 'RESTAURANTLESSTHAN20_4~8',
       'RESTAURANTLESSTHAN20_GT8', 'RESTAURANTLESSTHAN20_LESS1',
       'RESTAURANTLESSTHAN20_NEVER', 'RESTAURANT20TO50_1~3',
       'RESTAURANT20TO50_4~8', 'RESTAURANT20TO50_GT8',
       'RESTAURANT20TO50_LESS1', 'RESTAURANT20TO50_NEVER'],
      dtype='object', length=109)

In [4]:
# Verify imported data is complete
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(10147, 109)
(10147, 1)
(2537, 109)
(2537, 1)


In [5]:
# Verify columns imported correctly, sometimes an extra index column is present when writing to csv
print(X_train.head())
print(y_train.head())
print(X_test.head())
print(y_test.head())

   TEMPERATURE  HAS_CHILDREN  TOCOUPON_GEQ5MIN  TOCOUPON_GEQ15MIN  \
0         80.0           0.0               1.0                1.0   
1         30.0           1.0               1.0                0.0   
2         55.0           0.0               1.0                0.0   
3         55.0           0.0               1.0                1.0   
4         80.0           0.0               1.0                1.0   

   TOCOUPON_GEQ25MIN  DIRECTION_SAME  DIRECTION_OPP  DESTINATION_HOME  \
0                0.0             0.0            1.0                 0   
1                0.0             0.0            1.0                 0   
2                0.0             1.0            0.0                 0   
3                1.0             0.0            1.0                 0   
4                0.0             0.0            1.0                 0   

   DESTINATION_NO_URGENT_PLACE  DESTINATION_WORK  ...  \
0                            1                 0  ...   
1                            1  

In [6]:
# simplify the model with a variable for later use
xgb = XGBClassifier()

The section below sets up the hyperparameters for tuning the model via random search. Due to the large number of combinations, two rounds of tuning will ensue. The first round will feature half the parameters with defaults selected for the others. After the first round, the best model will be used to set a fixed value for the first hyperparameters and the second set will be tested with a distribution via random search.

In [226]:
# Criterion used to guide data splits
booster = ['gbtree'] # This is the default value. Linear booster is rarely used due to poor performance
# max_depth round 1 parameter: [int(x) for x in np.linspace(1,100, num=20)]
max_depth = [60] # Any positive value, default 6
# min_child_weight round 1 parameter: [int(x) for x in np.linspace(1,10000, num=100)]
min_child_weight = [1] # Any positive value, default 0, larger = less overfitting
# subsample round 1 parameter: [x for x in np.linspace(0,1, num=10)]
subsample = [0.77] # any value 0-1, default 1, lower = less over fitting but may underfit
# colsample_bytree round 1 parameter: [x for x in np.linspace(0,1, num=10)]
colsample_bytree = [0.44] # any value 0-1, ratio of colmns selected for each tree
# learning_rate round 2 parameter: [x for x in np.linspace(0,1, num=100)]
learning_rate = [0.01] # any value 0-1, default 0.3
# gamma round 2 parameter: [0,0.1,1,10 ]
gamma = [0,0.1,1,10] # Any positive value, default 0, larger = conservative
# n_estimators round 3 parameter: [int(x) for x in np.linspace(0,1000, num=100)]
n_estimators = [int(x) for x in np.linspace(0,1000, num=100)]# Number of trees in model, more = overfit

# Tune hyperparameters stepwise
# GROUP 1: max_depth , min_child_weight, subsample, colsample_bytree
# GROUP 2: learning_rate, gamma,n_estimators

# Create the random grid
param_grid_random = { 'booster': booster,# Default, stated for clarity
                        'max_depth' : max_depth,# Round 1
                        'min_child_weight' : min_child_weight,# Round 1
                        'subsample' : subsample,# Round 1
                        'colsample_bytree' : colsample_bytree,# Round 1
                        'learning_rate': learning_rate,# Round 2
                        'gamma': gamma,# Round 2
                        'n_estimators' : n_estimators# Round 2
                     }

In [9]:
# Create a custom score to optimize model
f2_scorer = make_scorer(fbeta_score, beta=2)

In [228]:
%%time
random_search = RandomizedSearchCV(xgb, param_grid_random, n_iter=60, cv=5, random_state=42,
                                  scoring = f2_scorer, n_jobs = -1)
random_search.fit(X_train, y_train)

# This code block was used multiple tiems to tune parameters in a step wise manner

Wall time: 1h 3min 56s


In [229]:
# Store best estimator
best_estimator = random_search.best_estimator_

# Get the best parameters and score
best_random_params = random_search.best_params_
best_random_score = random_search.best_score_
best_random_params, best_random_score

({'subsample': 0.77,
  'n_estimators': 616,
  'min_child_weight': 1,
  'max_depth': 60,
  'learning_rate': 0.010101010101010102,
  'gamma': 0,
  'colsample_bytree': 0.44,
  'booster': 'gbtree'},
 0.818844534668821)

In [230]:
# Save preliminary model and view cofusion matrix to spot check
# This model had isssues with underfitting the data
y_pred_xgb_random = best_estimator.predict(X_test)

cm1 = confusion_matrix(y_test, y_pred_xgb_random)
print(cm1)


[[ 748  330]
 [ 224 1235]]
[[4312   84]
 [  33 5718]]


# Second Try - possible underfitting

The second try section records the results for my second attempt at hyperparameter tuning as I ran into serious underfitting issues the first time around

### Round 1

Below are the optimal parameters after round one of random search (time to complete- 39:48). These will be implimented as static values for round two. The round two parameters will now feature a distribution of values instead of defaults. Round one used 1000 random selections from 2 given parameters: 
1. max_depth
2. min_child_weight
3. subsample
4. colsample_bytree

({'subsample': 0.7777777777777777,
  'min_child_weight': 1,
  'max_depth': 58,
  'colsample_bytree': 0.4444444444444444,
  'booster': 'gbtree'},
 0.7942921145067275)

### Round 2


Below are the optimal parameters after round two of random search (time to complete- 1:03:56 mins). These will be implimented as static values for round three. The round two parameters will now feature a distribution of values instead of defaults. Round two used 60 random selections from 3 given parameters and was able to explore all combinations: 
1. learning_rate
2. gamma
3. n_estimators

({'subsample': 0.77,
  'n_estimators': 616,
  'min_child_weight': 1,
  'max_depth': 60,
  'learning_rate': 0.010101010101010102,
  'gamma': 0,
  'colsample_bytree': 0.44,
  'booster': 'gbtree'},
 0.818844534668821)

---

# First Try - possible underfitting

### Round 1 Results

Below are the optimal parameters after round one of random search (time to complete- 3:07:22). These will be implimented as static values for round two. The round two parameters will now feature a distribution of values instead of defaults. Round one used 7000 random selections from 4 given parameters: 
1. max_depth
2. min_child_weight
3. subsample
4. colsample_bytree

({'subsample': 0.5050505050505051,  
  'min_child_weight': 950,  
  'max_depth': 11,  
  'colsample_bytree': 0.33333333333333337,  
  'booster': 'gbtree'},
 0.8673946364835304)

### Round 2 Results

Below are the optimal parameters after round two of random search (time to complete- 34 mins). These will be implimented as static values for round three. The round two parameters will now feature a distribution of values instead of defaults. Round two used 1400 random selections from 2 given parameters and was able to explore all combinations: 
1. learning_rate
2. gamma

({'subsample': 0.5,  
  'min_child_weight': 950,  
  'max_depth': 11,  
  'learning_rate': 0.010101010101010102,  
  'gamma': 0.001,  
  'colsample_bytree': 0.33,  
  'booster': 'gbtree'},  
 0.8673946364835304)

### Round 3 Results

Round three yeilded the result that 10 was the optimal number for n_estimators given the other static parameters. Training time was 12:03. Results were as follows:

({'subsample': 0.5,  
  'n_estimators': 10,  
  'min_child_weight': 950,  
  'max_depth': 11,  
  'learning_rate': 0.01,  
  'gamma': 0.001,  
  'colsample_bytree': 0.33,  
  'booster': 'gbtree'},  
 0.8673946364835304)

### Grid Search

In [10]:
# Criterion used to guide data splits
booster = ['gbtree'] # This is the default value. Linear booster is rarely used due to poor performance
max_depth = [50,60,70] # Any positive value, default 6
min_child_weight = [1,5,10] # Any positive value, default 0, larger = less overfitting
subsample = [0.7,0.8,0.9] # any value 0-1, default 1, lower = less over fitting but may underfit
colsample_bytree = [0.4,0.45, 0.5] # any value 0-1, ratio of colmns selected for each tree
learning_rate = [0.01, 0.05] # any value 0-1, default 0.3
gamma = [0, 1, 10] # Any positive value, default 0, larger = conservative
n_estimators = [500, 600, 700]# Number of trees in model, more = overfit

# Create the grid
param_grid = { 'booster': booster,# Default, stated for clarity
                'max_depth' : max_depth,
                'min_child_weight' : min_child_weight,
                'subsample' : subsample,
                'colsample_bytree' : colsample_bytree,
                'learning_rate': learning_rate,
                'gamma': gamma,
                'n_estimators' : n_estimators
                     }

In [11]:
%%time
best_grid_search_model = GridSearchCV(xgb, param_grid, cv = 5,
                                      scoring=f2_scorer, n_jobs = -1)

_ = best_grid_search_model.fit(X_train, y_train)

# Obtain the best model through grid search

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


CPU times: total: 1min 1s
Wall time: 43min 24s


In [12]:
# Get the best parameters and score
best_params = best_grid_search_model.best_params_
best_score = best_grid_search_model.best_score_
best_params, best_score

({'booster': 'gbtree',
  'colsample_bytree': 0.4,
  'gamma': 0,
  'learning_rate': 0.01,
  'max_depth': 50,
  'min_child_weight': 1,
  'n_estimators': 500,
  'subsample': 0.7},
 0.8221619337168005)

In [13]:
pickle.dump(best_grid_search_model, open('xgbV2.pkl','wb'))

In [14]:
y_pred_xgb = best_grid_search_model.predict(X_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [15]:
cm = confusion_matrix(y_test, y_pred_xgb)
print(cm)

[[ 724  354]
 [ 219 1240]]


In [16]:
fbeta_score(y_test, y_pred_xgb, beta = 2)

0.8344549125168237