# 0. Importing modules and data

In [31]:
import xgboost as xgb                                                           # model library
import pandas as pd                                                             # dataset processing
import numpy as np                                                              # dataset filtering

from sklearn.preprocessing import OrdinalEncoder                                # sklearn's preprocessing module
from sklearn.model_selection import train_test_split, GridSearchCV              # sklearn's model selection module
from sklearn.feature_selection import SelectFromModel                           # sklearn's feature selection module
from sklearn.metrics import classification_report                               # sklearn's metrics module

from imblearn.over_sampling import SMOTENC                                      # SMOTENC module from imbalanecd-learning package

In [32]:
# loading dataset
data = pd.read_csv("data/bank-additional-full.csv", sep=";", index_col=None)


# 1. Preprocessing steps

In [33]:
ordinal_ec = OrdinalEncoder() # defining an instance of the ordinal encoder

## Applying a bunch of basic dataframe transformations using pd and np

data['education'] = data['education'].replace(to_replace=['basic.4y','basic.6y','basic.9y'], value = 'basic')
data['education'] = ordinal_ec.fit_transform(data[['education']])

data['pdays'] = np.where(data['pdays'] != 999, 1, 0)

data['y'] = data['y'].map({'no': 0, 'yes': 1})

data.drop('duration', axis=1, inplace=True)


# 2. Train-test splitting and class imbalance using SMOTE-NC

In [34]:
# this step is necessary since xgboost can't handle features that are of type `object`, they must be converted to `category` first or xgboost will error out

object_columns = data.select_dtypes(include=['object']).columns # filters for `object` columns

data[object_columns] = data[object_columns].astype('category') # transforms `object` columns to `category`

In [35]:
# the dataset's target class is imbalanced (as is the case with most datasets really) so we need to balance it using SMOTE.
# but since we have categorical variables in the data, we need to use SMOTE-NC (essentially an extension to SMOTE that supports categorical features)

# categorical columns, in hindsight this could have also been done by filtering using dtypes like the code cell above but wtv for now
cat_cols_smote_nc = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']

# isolating the target class (y) from the features (X)
X = data.loc[:, data.columns != 'y']
y = data['y']

# using sklearn's test_train_split() method to split the data into 4 components
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                test_size = 0.2,
                                                random_state = 0)

# applying SMOTE-NC algorithm on data to balance it. SMOTE-NC requires specifying which columns are categorical (hence why `cat_cols_smote_nc` exists)
smotenc = SMOTENC(cat_cols_smote_nc,random_state = 0)
X_train, y_train = smotenc.fit_resample(X_train, y_train)

# 3. Model fitting and hyperparameter tuning

In [36]:
# creating the baseline xgboost model
clf = xgb.XGBClassifier(enable_categorical=True, tree_method='approx')

# defining the parameters that will be used for hyperparameter tuning (this can be very computationally expensive so select carefully!)
params_grid = {
    'n_estimators': [50, 100],
    'max_depth': [8, 10],
    'learning_rate': [0.05, 0.1, 0.3],
    'eval_metric': ['error', 'auc'],
}


# creating and fitting instance of grid search on balanced dataset.
# `cv` parameter is cross-validation (how many subsets of the training data will be sampled to measure the model on unseen data)
# `verbose` set to 2 is nice because it tells you how many combinations GridSearch will iterate over and the parameters, score, 
# and computation time of every 'fit'. can be removed altogether tho, makes no difference 
# `error_score` always leave that to 'raise', will prevent wasting lots of time by directly flagging errors and exiting out of GridSearch for debugging
grid_search = GridSearchCV(clf, params_grid, cv=5, n_jobs=-1, verbose=2, error_score='raise')
grid_search.fit(X_train, y_train)

# grid search objects have a few attributes, .best_params_ pull the parameters that were fitted
# .best_estimator_ is the model fit that had the highest evaluation score
grid_best_params = grid_search.best_params_
grid_best_model = grid_search.best_estimator_


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=50; total time=   1.4s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=50; total time=   1.5s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=50; total time=   1.5s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=50; total time=   1.5s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=50; total time=   1.5s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=100; total time=   2.8s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=100; total time=   2.9s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=8, n_estimators=100; total time=   3.0s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=10, n_estimators=50; total time=   1.5s
[CV] END eval_metric=error, learning_rate=0.05, max_depth=10,

# 4. Feature selection and final model evaluation

In [37]:
# SelectFromModel is a class of sklearn that lets you select features after a model has been fitted based on 
# how the model itself ranks the features. 

feature_selector = SelectFromModel(grid_best_model, prefit=True)
feature_selector.fit(X_train, X_test)

# get the indices of the selected features (essentially the number of the column on a 0-based index)
selected_feature_indices = feature_selector.get_support(indices=True)

# pull the corresponding column names of the selected indices
selected_feature_names = X_train.columns[selected_feature_indices]

# update the X_ train+test subsets to reflect the selected features
X_train_selected = feature_selector.transform(X_train)
X_test_selected = feature_selector.transform(X_test)

# still using the best estimator and parameters from gridsearch, fit it with the selected features
# this should technically give the best model possible if done right.
final_model = grid_best_model.set_params(**grid_best_params)
final_model.fit(X_train_selected, y_train)

# classification_report is a cool function that generates a group of evaluation metrics for the model
# instead of having to manually compute evaluation metrics seperately then consolidate them together! 
y_pred = final_model.predict(X_test_selected)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.72      0.82      7319
           1       0.24      0.70      0.36       919

    accuracy                           0.72      8238
   macro avg       0.59      0.71      0.59      8238
weighted avg       0.87      0.72      0.77      8238

