In [116]:
# Load Necessary Libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score

seed = 5

# Load and Prepare Saved Data

In [63]:
## Load preprocessed file

Xy_train = pd.read_csv('season_train.csv', sep = ',')
Xy_test = pd.read_csv('season_test.csv', sep = ',')

In [64]:
features = list(Xy_train.drop(columns = 'season').columns)

In [65]:
# Separating outcome and explanatory variables into their resp. datasets

X_train = pd.DataFrame(Xy_train[features])
y_train = pd.DataFrame(Xy_train['season'])
X_test = pd.DataFrame(Xy_test[features])
y_test = pd.DataFrame(Xy_test['season'])

In [59]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(29923, 15) (29923, 1)
(12825, 15) (12825, 1)


In [67]:
# Converting the outcome variable since our model algorithms expects it as 1-d array

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

(y_train.shape, y_test.shape)

((29923,), (12825,))

# Metric

We will use accuracy metric to measure performance of our models

Note: Accuracy Metric requires the data to be balanced otherwise it leads to bias scores

In [105]:
np.round((Xy_train['season'].value_counts().values / Xy_train.shape[0]) * 100, 2)

array([49.37, 37.56,  8.65,  4.42])

To handle unbalanced data when modelling we can:

* use upsampling/downsmapling technique 
* assign weights to the respective classes
* build separate models and/or model within model
in addition to the above methods when appropriate

In [104]:
from sklearn.utils import class_weight

In [103]:
classes_weights = class_weight.compute_sample_weight(
        class_weight='balanced',
        y = Xy_train['season']
        )

In [101]:
np.unique(classes_weights)

array([0.50641416, 0.6656656 , 2.88943608, 5.65011329])

# Training

We will demonstrate now how to use XGBoost and  CatBoost algorithm for training and building our model

In [69]:
!pip install xgboost



In [70]:
from xgboost.sklearn import XGBClassifier

### Baseline XGBoost Model

In [106]:
model_xgb = XGBClassifier(random_state = seed)
XGB = model_xgb.fit(X_train, y_train, sample_weight = classes_weights)

In [107]:
model_predict = XGB.predict(X_test)

In [117]:
print('Mean ROC AUC: %.3f' % accuracy_score(y_test, model_predict))

Mean ROC AUC: 0.470


### Tuning XGBoost Hyperparameters

Now we will start tuning our XGB classifier model

Note: We can also use XGBoost inbuilt early stopping callback function to automatically stop searching over the grid parameters when chosen loss stops decreasing with respective to increase in number of trees.
Optuna is another library useful for the same and gives better results in more efficient way.

In [119]:
# define evaluation procedure
fold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)

Tuning Number of Trees/Estimators

In [121]:
model = XGBClassifier(learning_rate = 0.01, eval_metric = 'auc', 
                      sample_weight = classes_weights, seed = seed)

param_grid = {'n_estimators': [15,50,100,150]}

GR = GridSearchCV(estimator = model, param_grid = param_grid, 
                  scoring = 'accuracy', cv = fold, n_jobs = -1)    

GR_XGB = GR.fit(X_train,y_train)

print("Best acc score:: %f using %s "%(GR_XGB.best_score_, GR_XGB.best_params_))

Best acc score:: 0.512148 using {'n_estimators': 150} 


Note: We can increase the number of trees to see if it would further help in any significant  increasing of our score.
We can then lower learning rate and then again tune number of trees if required thus tuning both of them side by side as they are both closely related

Tuning 'max depth' and 'min child weight'

* 'max depth' - Increasing this value will make the model more complex and more likely to overfit

* 'min child weight' - Minimum sum of instance weight (hessian) needed in a child. The larger min_child_weight is, the more conservative the algorithm will be.

In [134]:
model = XGBClassifier(learning_rate = 0.01, n_estimators = 150,
                      sample_weight = classes_weights, 
                      eval_metric = 'auc', seed = 7)

param_grid = {'max_depth':range(3,10,2),  
              'min_child_weight':range(1,6,2) }

GR = GridSearchCV(estimator = model, param_grid = param_grid, 
                  scoring = 'accuracy', cv = fold, n_jobs = -1)    

GR_XGB = GR.fit(X_train,y_train)

print("Best acc score:: %f using %s "%(GR_XGB.best_score_,GR_XGB.best_params_))

Best acc score:: 0.516793 using {'max_depth': 5, 'min_child_weight': 1} 


Tuning parameter 'gamma'

* Gamma - Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be

In [135]:
model = XGBClassifier(learning_rate = 0.01, n_estimators = 150, max_depth = 5,
                      sample_weight = classes_weights, min_child_weight = 1, 
                      eval_metric = 'auc', seed = 7)

param_grid = { 'gamma':[i/10.0 for i in range(0,5)]}

GR = GridSearchCV(estimator = model, param_grid = param_grid, 
                  scoring = 'accuracy', cv = fold, n_jobs = -1)    

GR_XGB = GR.fit(X_train,y_train)

print("Best acc score:: %f using %s "%(GR_XGB.best_score_,GR_XGB.best_params_))

Best acc score:: 0.516793 using {'gamma': 0.0} 


Tuning parameters 'subsample' and colsample_bytree

* Subsample - Ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees. and this will prevent overfitting

* colsample_bytree - It is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed.

In [140]:
model = XGBClassifier(learning_rate = 0.01, n_estimators = 150, max_depth = 5,  
                      gamma = 0, min_child_weight = 1, eval_metric = 'auc', 
                      sample_weight = classes_weights, seed = 7)

param_grid = {'subsample':[i/10.0 for i in range(6,10)],
              'colsample_bytree':[i/10.0 for i in range(6,10)]}

GR = GridSearchCV(estimator = model, param_grid = param_grid, 
                  scoring = 'accuracy', cv = fold, n_jobs = -1)    

GR_XGB = GR.fit(X_train,y_train)

print("Best acc score:: %f using %s "%(GR_XGB.best_score_, GR_XGB.best_params_))

Best acc score:: 0.518631 using {'colsample_bytree': 0.9, 'subsample': 0.9} 


Tuning parameter 'alpha'

* alpha - L1 regularization term on weights. Increasing this value will make model more conservative.

In [141]:
model = XGBClassifier(learning_rate = 0.01, n_estimators = 150, max_depth = 5,  
                      gamma = 0, min_child_weight = 1, eval_metric = 'auc',
                      colsample_bytree = 0.9, subsample = 0.9, 
                      sample_weight = classes_weights, refit = True,
                      seed = 7)

param_grid = { 'alpha':[1e-5, 1e-2, 0.1, 1, 100]}

GR = GridSearchCV(estimator = model, param_grid = param_grid, 
                  scoring = 'accuracy', cv = fold, n_jobs = -1)    

GR_XGB = GR.fit(X_train,y_train)

print("Best acc score:: %f using %s "%(GR_XGB.best_score_, GR_XGB.best_params_))

Best acc score:: 0.518631 using {'alpha': 1e-05} 


### Evaluating XGBoost

In [153]:
model_predict = GR_XGB.predict(X_test)

In [154]:
print('Mean ROC AUC: %.5f' % accuracy_score(y_test, model_predict))

Mean ROC AUC: 0.51072


### Baseline CatBoost Model

In [139]:
pip install catboost

Collecting catboost
  Downloading catboost-1.0.5-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.4 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.5


In [144]:
from catboost import CatBoostClassifier

In [147]:
model_cat = CatBoostClassifier(eval_metric = 'AUC', auto_class_weights = 'Balanced', 
                               random_state = seed)
cat = model_cat.fit(X_train, y_train, verbose = False)

In [148]:
model_predict = cat.predict(X_test)

In [152]:
print('Mean ROC AUC: %.5f' % accuracy_score(y_test, model_predict))

Mean ROC AUC: 0.52000


We notice catboost model without tuning also performs slightly better than XGBoost. The reason for this might be that most of our explanatory variables are categorical in nature