In [None]:
import warnings
warnings.filterwarnings("ignore")

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

import pickle
import sys
from importlib import reload
# Import the project helper modules, re-executing them when they are already
# cached in sys.modules so that edits to the .py files are picked up without a
# kernel restart.  The value returned by reload() is bound explicitly: a module
# can be present in sys.modules while its name is absent from the notebook
# namespace (e.g. after `%reset`, or when it was only imported indirectly), in
# which case the line that uses the module would otherwise raise NameError.
if "visualizations" in sys.modules:
    visualizations = reload(sys.modules["visualizations"])
else:
    import visualizations
vis = visualizations.Visualizations()
if "modelling" in sys.modules:
    modelling = reload(sys.modules["modelling"])
else:
    import modelling
mod = modelling.Modelling(vis=vis)

# Seed passed as `random_state` to the sampler, models and CV objects below.
RANDOM_SEED = 1243

In [None]:
def printTimeLapsed(startTime, endTime):
    """Print the time elapsed between two timestamps as hours:minutes:seconds.

    Parameters
    ----------
    startTime : int or float
        Timestamp, in seconds, taken before the timed work.
    endTime : int or float
        Timestamp, in seconds, taken after the timed work.

    The three fields are truncated to integers and printed without zero
    padding, e.g. ``Time Lapsed = 1:2:5``.  Nothing is returned.
    """
    # Peel off seconds first, then split the remaining whole minutes into
    # hours and minutes.
    totalMinutes, secs = divmod(endTime - startTime, 60)
    hrs, mins = divmod(totalMinutes, 60)
    fields = [int(hrs), int(mins), int(secs)]
    print("Time Lapsed = {0}:{1}:{2}".format(*fields))

In [None]:
# Load the pre-built train/test split.  The context manager closes the file
# handle as soon as the pickle has been read, instead of leaving the handle
# returned by open() dangling until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open("data/modelData.pkl", "rb") as modelDataFile:
    modelData = pickle.load(modelDataFile)
X_train = modelData["X_train"]
y_train = modelData["y_train"]
X_test = modelData["X_test"]
y_test = modelData["y_test"]

print("Shape of the train/test data: %s / %s" % (str(X_train.shape), str(X_test.shape)))

In [None]:
# Oversample the training data with SMOTE (seeded for reproducibility).
# Only the training split is resampled; X_test / y_test are not touched here.
smoteModel = SMOTE(random_state=RANDOM_SEED)
X_train_sm, y_train_sm = smoteModel.fit_resample(X_train, y_train)

In [None]:
# Baseline model: a 200-tree random forest fitted on the SMOTE-resampled
# training data, then used to predict labels for the original test set.
rfModel = RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED)
rfModel.fit(X_train_sm, y_train_sm)
rfPreds = rfModel.predict(X_test)

In [None]:
# Report the random forest's test-set performance through the project helper.
mod.getModelPerformance(
    trueVals=y_test,
    preds=rfPreds,
    figSize=(5, 5),
    plotTitle="Random Forests Performance",
    targetNames=["Stable", "Bankrupt"],
)

In [None]:
# Baseline XGBoost classifier fitted on the SMOTE-resampled training data.
# The number of boosting rounds is passed as `n_estimators`: the scikit-learn
# wrapper has no `nrounds` parameter (that is the R interface's name), so the
# original `nrounds=1000` keyword did not control the number of trees.
xgbModel = xgb.XGBClassifier(
    n_estimators=1000, max_depth=3, eta=0.1, objective="binary:logistic", eval_metric="auc",
    verbosity=0, use_label_encoder=False, random_state=RANDOM_SEED)
xgbModel.fit(X_train_sm, y_train_sm)
# Hard class predictions on the original (non-resampled) test set.
xgbPreds = xgbModel.predict(X_test)


In [None]:
# Report the baseline XGBoost model's test-set performance.
mod.getModelPerformance(
    trueVals=y_test,
    preds=xgbPreds,
    figSize=(5, 5),
    plotTitle="XGBoost Performance",
    targetNames=["Stable", "Bankrupt"],
)

In [None]:
# Candidate values sampled by the randomized hyper-parameter search.
# The commented-out entries are further dimensions that are currently
# excluded from the search.
xgbParams = {
    "n_estimators": [1000, 1500, 2000],
    "eta": [0.01, 0.03, 0.1],
    "gamma": [0.03, 0.1, 0.3],
    "lambda": [1, 3, 10],
    #"max_depth": [3, 6, 10],
    #"min_child_weight": [1, 3, 5],
    #"subsample": [0.67, 1.0],
    #"colsample_bytree": [0.67, 1.0]
    }
# Base estimator cloned for every candidate.  `n_jobs=1` keeps each fit
# single-threaded (parallelism comes from the search itself) and `verbosity=0`
# silences XGBoost's logging.  The original keywords `nthreads` and `silent`
# are not recognised by XGBClassifier, so neither setting was taking effect.
xgbcvModel = xgb.XGBClassifier(
    objective="binary:logistic", eval_metric="auc", n_jobs=1, verbosity=0,
    use_label_encoder=False, random_state=RANDOM_SEED
    )

In [None]:
# Randomized search configuration: 50 sampled parameter combinations, each
# scored by ROC AUC over a seeded, shuffled 5-fold stratified split.
nfolds = 5
nParamCombos = 50
stratKF = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=RANDOM_SEED)
# Pass the splitter object itself rather than `stratKF.split(...)`: the latter
# is a one-shot generator tied to one specific pair of arrays, whereas the
# splitter lets the search generate its splits from whatever data is passed to
# fit(), every time fit() is called.  The splitter is seeded, so the folds are
# the same on each run.
# NOTE(review): the folds are drawn from data that was already SMOTE-resampled,
# so synthetic samples derived from a validation fold's rows can end up in the
# training folds; the CV score is presumably optimistic — consider resampling
# inside each fold.
randomSearch = RandomizedSearchCV(
    xgbcvModel, param_distributions=xgbParams, n_iter=nParamCombos, scoring="roc_auc",
    n_jobs=4, cv=stratKF, verbose=3, random_state=RANDOM_SEED
    )

In [None]:
# Run the hyper-parameter search and report how long it took.
# perf_counter() is used instead of time() because the wall clock can be
# adjusted (e.g. by NTP) during a long fit, which would distort or even negate
# the measured interval; only the difference of the two readings is meaningful.
startTime = time.perf_counter()
randomSearch.fit(X_train_sm, y_train_sm)
endTime = time.perf_counter()
printTimeLapsed(startTime=startTime, endTime=endTime)

In [None]:
# Show the best parameter combination found and its cross-validated score
# (the search was configured with scoring="roc_auc").
print(randomSearch.best_params_)
print("Best score from randomized grid search = %.4f" % randomSearch.best_score_)

In [None]:
# Evaluate the tuned model on the test set.  The predictions produced by the
# search (`xgbcvPreds`) are the ones reported; the original call passed
# `xgbPreds`, which re-displayed the untuned baseline's metrics.
xgbcvPreds = randomSearch.predict(X_test)
mod.getModelPerformance(trueVals=y_test, preds=xgbcvPreds, figSize=(5,5), plotTitle="XGBoost Performance", targetNames=["Stable","Bankrupt"])