# Import Packages

In [1]:
import warnings 
warnings.filterwarnings('ignore')

# for data analytics
import pandas as pd
import numpy as np
from collections import Counter

# for visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# for data preparation
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# imblean provides tools for us to deal with imbalanced class sizes
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

# machine learning models
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier 
import pickle # to save models and results

# hyperopt - used for Bayesian hyperparameter tuning
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.pyll import scope

# for evaluation of machine learning models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score
from sklearn.metrics import confusion_matrix

# for timing
from time import time
from tqdm import tqdm

# Initialisation of Data/Handling of Categorical Variables

### Reading Data

In [2]:
# read data
siccodes = pd.read_csv('sic-codes.csv', low_memory=False, header=0)
test = pd.read_csv('test.csv', low_memory=False, header=0)
train = pd.read_excel('train.xlsx',engine='openpyxl')
variable_list = pd.read_csv('Variable_List.csv', low_memory=False, header=0)

# View train dataset
print(f'Original Train Shape: {train.shape}')
train.head(5)

Original Train Shape: (26819, 30)


Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,xido,xopr,mkvalt,sic,bankrupt,Altman_X1,Altman_X2,Altman_X3,Altman_X4,Altman_X5
0,1240,2002-01-31,ALBERTSON'S INC,15967.0,26094.0,407.0,0.0,1787.0,11837.0,501.0,...,0.0,35118.0,11701.25,5411,0,0.06432,0.339074,0.111918,1.164072,2.375587
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,-24.338,326.107,101.5744,7200,0,0.161449,0.488845,0.044199,0.68006,1.203524
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,-0.245,438.069,240.4004,2860,0,0.300186,0.438319,0.072352,1.508575,1.511419
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0.0,768.484,2281.404,7370,0,0.214106,0.079664,0.146052,6.114234,1.049917
4,2436,2002-01-31,CALERES INC,700.898,1044.11,17.484,0.0,54.842,711.738,0.949,...,-4.911,1674.299,278.695,3140,0,0.321669,0.206953,0.078245,0.62735,2.505141


In [3]:
# View test dataset
print(f'Original Test Shape: {test.shape}')
test.head(5)

Original Test Shape: (7711, 28)


Unnamed: 0,id,at,cogs,csho,dvp,ebit,gp,ib,lct,lt,...,xido,xopr,mkvalt,sic,Altman_X1,Altman_X2,Altman_X3,Altman_X4,Altman_X5,bankrupt
0,0,438.049,364.536,8.33,0.0,23.46,45.416,26.495,37.241,156.206,...,1.775,374.912,214.081,2860,0.204995,0.740381,0.053556,1.370504,0.935859,
1,1,3227.8,113.6,225.9,0.0,354.3,2102.0,285.3,954.1,1344.9,...,0.0,1745.8,8132.4,7370,0.291778,0.160326,0.109765,6.046844,0.686412,
2,2,1227.476,1520.884,41.971,0.0,65.41,1061.94,8.923,524.58,813.76,...,14.998,2458.303,396.626,3140,0.236712,0.241455,0.053288,0.487399,2.104175,
3,3,394.559,358.532,11.088,0.0,82.269,177.235,63.046,56.518,83.832,...,0.0,443.516,630.0202,3537,0.394286,0.739892,0.208509,7.515271,1.357888,
4,4,551.089,574.176,29.163,0.0,96.475,357.282,64.834,156.993,184.41,...,0.0,813.158,781.86,5621,0.49382,0.532903,0.175062,4.239792,1.690213,


### Handling of Categorical Variables
- Encoding by SIC variable

In [4]:
# for categorical variables, perform one-hot-encoding
sic_dummy = pd.get_dummies(train['sic'], prefix='sic')  

# join dummy variables to original dataframe
train = train.join([sic_dummy]) 
test = test.join([sic_dummy])

# drop original sic variable
train = train.drop("sic", axis=1)
test = test.drop("sic", axis=1)

# view train dataset aftter one-hot encoding
print(f'Train Shape after one-hot encoding: {train.shape}')
train.head(5)

Train Shape after one-hot encoding: (26819, 432)


Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_8700,sic_8711,sic_8721,sic_8731,sic_8734,sic_8741,sic_8742,sic_8744,sic_9995,sic_9997
0,1240,2002-01-31,ALBERTSON'S INC,15967.0,26094.0,407.0,0.0,1787.0,11837.0,501.0,...,0,0,0,0,0,0,0,0,0,0
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,0,0,0,0,0,0,0,0,0,0
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,0,0,0,0,0,0,0,0,0,0
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0,0,0,0,0,0,0,0,0,0
4,2436,2002-01-31,CALERES INC,700.898,1044.11,17.484,0.0,54.842,711.738,0.949,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# view test dataset after one-hot encoding
print(f'Test Shape: {test.shape}')
test.head(5)

Test Shape: (7711, 430)


Unnamed: 0,id,at,cogs,csho,dvp,ebit,gp,ib,lct,lt,...,sic_8700,sic_8711,sic_8721,sic_8731,sic_8734,sic_8741,sic_8742,sic_8744,sic_9995,sic_9997
0,0.0,438.049,364.536,8.33,0.0,23.46,45.416,26.495,37.241,156.206,...,0,0,0,0,0,0,0,0,0,0
1,1.0,3227.8,113.6,225.9,0.0,354.3,2102.0,285.3,954.1,1344.9,...,0,0,0,0,0,0,0,0,0,0
2,2.0,1227.476,1520.884,41.971,0.0,65.41,1061.94,8.923,524.58,813.76,...,0,0,0,0,0,0,0,0,0,0
3,3.0,394.559,358.532,11.088,0.0,82.269,177.235,63.046,56.518,83.832,...,0,0,0,0,0,0,0,0,0,0
4,4.0,551.089,574.176,29.163,0.0,96.475,357.282,64.834,156.993,184.41,...,0,0,0,0,0,0,0,0,0,0


# Q1: Altman Score
- Altman_X1 = working capital / total assets = "wcap"/ "at"
- Altman_X2 = retained earnings / total assets = "re" / "at"
- Altman_X3 = earnings before interest and tax / total assets = "ebit" / "at"
- Altman_X4 = market value of equity / total liabilities = "mkvalt" / "lt"
- Altman_X5 = sales / total assets = "sale" / "at"

In [6]:
# make copy of train dataframe for q1
train_q1 = train.copy(deep=True)

In [7]:
# calculate z-score
train_q1['Altman_Z_score'] = 1.2*train_q1['Altman_X1']+1.4*train_q1['Altman_X2']+3.3*train_q1['Altman_X3']+0.6*train_q1['Altman_X4']+1.0*train_q1['Altman_X5']

# view train dataframe
train_q1.head(5)

Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_8711,sic_8721,sic_8731,sic_8734,sic_8741,sic_8742,sic_8744,sic_9995,sic_9997,Altman_Z_score
0,1240,2002-01-31,ALBERTSON'S INC,15967.0,26094.0,407.0,0.0,1787.0,11837.0,501.0,...,0,0,0,0,0,0,0,0,0,3.995249
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,0,0,0,0,0,0,0,0,0,2.63554
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,0,0,0,0,0,0,0,0,0,3.629197
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0,0,0,0,0,0,0,0,0,5.568886
4,2436,2002-01-31,CALERES INC,700.898,1044.11,17.484,0.0,54.842,711.738,0.949,...,0,0,0,0,0,0,0,0,0,3.815497


In [8]:
# Classification using cutoff=2.675 where 0 refers to not bankrupt, 1 refers to bankrupt
train_q1['altman_bankrupt_classification'] = np.where(train_q1['Altman_Z_score']<2.675, 1,0 )

# view train dataframe
train_q1.head(5)

Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_8721,sic_8731,sic_8734,sic_8741,sic_8742,sic_8744,sic_9995,sic_9997,Altman_Z_score,altman_bankrupt_classification
0,1240,2002-01-31,ALBERTSON'S INC,15967.0,26094.0,407.0,0.0,1787.0,11837.0,501.0,...,0,0,0,0,0,0,0,0,3.995249,0
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,0,0,0,0,0,0,0,0,2.63554,1
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,0,0,0,0,0,0,0,0,3.629197,0
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0,0,0,0,0,0,0,0,5.568886,0
4,2436,2002-01-31,CALERES INC,700.898,1044.11,17.484,0.0,54.842,711.738,0.949,...,0,0,0,0,0,0,0,0,3.815497,0


In [9]:
# evaluate confusion matrix 
def evaluate_results(y_test, y_pred):
    '''
    This is a helper function that we will call to print basic results statistics.
    '''
    # Create confusion matrix of training data
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    print("Confusion Matrix: \n", cm)
    print("TN: %s, FP: %s, FN: %s, TP: %s" %(TN, FP, FN, TP))
    print("\n")

    # Evaluation Statistics
    # Performance of decision tree model on validation data
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Recall: ", recall_score(y_test, y_pred))
    print("Precision: ", precision_score(y_test, y_pred))
    
    
    print("F1: ", f1_score(y_test, y_pred))


In [10]:
y_test = train_q1['bankrupt']
y_pred = train_q1['altman_bankrupt_classification']
evaluate_results(y_test, y_pred)

Confusion Matrix: 
 [[15433 11205]
 [   17   164]]
TN: 15433, FP: 11205, FN: 17, TP: 164


Accuracy:  0.5815653081770387
Recall:  0.9060773480662984
Precision:  0.01442519130970182
F1:  0.028398268398268398


# Q2: CART & Logistic Regression

In [11]:
# make copy of train dataframe for q2
train_q2 = train.copy(deep=True)

# view train dataframe
train_q2.head(5)

Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_8700,sic_8711,sic_8721,sic_8731,sic_8734,sic_8741,sic_8742,sic_8744,sic_9995,sic_9997
0,1240,2002-01-31,ALBERTSON'S INC,15967.0,26094.0,407.0,0.0,1787.0,11837.0,501.0,...,0,0,0,0,0,0,0,0,0,0
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,0,0,0,0,0,0,0,0,0,0
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,0,0,0,0,0,0,0,0,0,0
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0,0,0,0,0,0,0,0,0,0
4,2436,2002-01-31,CALERES INC,700.898,1044.11,17.484,0.0,54.842,711.738,0.949,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# define xvar and yvar
x_var = ['Altman_X1','Altman_X2','Altman_X3','Altman_X4','Altman_X5']
y_var = 'bankrupt'

print(f'x_var used: ', x_var)
print(f'y_var used: ', y_var)

x_var used:  ['Altman_X1', 'Altman_X2', 'Altman_X3', 'Altman_X4', 'Altman_X5']
y_var used:  bankrupt


## Decision Tree

In [13]:
# define empty lists to store results
all_y_preds = [] # predictions
all_y_true = [] # true labels

# define stratified KFold object and split train data
# Use shuffle = false for assignment
kf = StratifiedKFold(n_splits=5, shuffle=False)
kf.get_n_splits(train_q2[x_var], train_q2[y_var])

for train_index, test_index in kf.split(train_q2[x_var], train_q2[y_var]):
    # get the current train and test sets (for this particular fold)
    curr_train, curr_test = train_q2.iloc[train_index], train_q2.iloc[test_index]
    X_train = curr_train[x_var]
    y_train = curr_train[y_var]
    
    X_test = curr_test[x_var]
    y_test = curr_test[y_var]
    
    # define DT Classifier
    tree_1 = DecisionTreeClassifier(random_state=1)# Set seed to 1

    # fit model on training data
    simple_tree_model = tree_1.fit(X_train, y_train)

    # predict on validation set
    y_preds = simple_tree_model.predict(X_test)
    all_y_preds.extend(y_preds)
    all_y_true.extend(y_test)

# Evaluate results
print('Decision Tree Results')
evaluate_results(all_y_true, all_y_preds)

Decision Tree Results
Confusion Matrix: 
 [[26462   176]
 [  156    25]]
TN: 26462, FP: 176, FN: 156, TP: 25


Accuracy:  0.9876207166561021
Recall:  0.13812154696132597
Precision:  0.12437810945273632
F1:  0.13089005235602091


## Logistic Regression

In [14]:
# define empty lists to store results
all_y_preds = [] # predictions
all_y_true = [] # true labels

# define stratified KFold object and split train data
# Use shuffle = false for assignment
kf = StratifiedKFold(n_splits=5, shuffle=False)
kf.get_n_splits(train_q2[x_var], train_q2[y_var])

for train_index, test_index in kf.split(train_q2[x_var], train_q2[y_var]):
    # get the current train and test sets (for this particular fold)
    curr_train, curr_test = train_q2.iloc[train_index], train_q2.iloc[test_index]
    X_train = curr_train[x_var]
    y_train = curr_train[y_var]
    
    X_test = curr_test[x_var]
    y_test = curr_test[y_var]
    
    # define DT Classifier
    lr = LogisticRegression()

    # fit model on training data
    lr_model = lr.fit(X_train, y_train)

    # predict on validation set
    y_preds = lr_model.predict(X_test)
    all_y_preds.extend(y_preds)
    all_y_true.extend(y_test)

# Evaluate results
print('Logistic Regression Results')
evaluate_results(all_y_true, all_y_preds)

Logistic Regression Results
Confusion Matrix: 
 [[26630     8]
 [  173     8]]
TN: 26630, FP: 8, FN: 173, TP: 8


Accuracy:  0.9932510533576941
Recall:  0.04419889502762431
Precision:  0.5
F1:  0.08121827411167512


# Q3: Handling Unbalanced Classifiers (CART)

In [15]:
train_q3 = train.copy(deep=True)
train_q3.head(5)

Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_8700,sic_8711,sic_8721,sic_8731,sic_8734,sic_8741,sic_8742,sic_8744,sic_9995,sic_9997
0,1240,2002-01-31,ALBERTSON'S INC,15967.0,26094.0,407.0,0.0,1787.0,11837.0,501.0,...,0,0,0,0,0,0,0,0,0,0
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,0,0,0,0,0,0,0,0,0,0
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,0,0,0,0,0,0,0,0,0,0
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0,0,0,0,0,0,0,0,0,0
4,2436,2002-01-31,CALERES INC,700.898,1044.11,17.484,0.0,54.842,711.738,0.949,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Check for unbalanced data
print('Checking for unbalanced data')
print(train_q3['bankrupt'].value_counts())
print('Conclusion: Data is unbalanced')

Checking for unbalanced data
0    26638
1      181
Name: bankrupt, dtype: int64
Conclusion: Data is unbalanced


## (1) SMOTE

In [17]:
# define empty lists to store results
all_y_preds = [] # predictions
all_y_true = [] # true labels

# define stratified KFold object and split train data
# Use shuffle = false for assignment
kf = StratifiedKFold(n_splits=5, shuffle=False)
kf.get_n_splits(train_q3[x_var], train_q3[y_var])

for train_index, test_index in kf.split(train_q3[x_var], train_q3[y_var]):
    # get the current train and test sets (for this particular fold)
    curr_train, curr_test = train_q3.iloc[train_index], train_q3.iloc[test_index]
    X_train = curr_train[x_var]
    y_train = curr_train[y_var]

    print(f'Original dataset shape: {Counter(y_train)}')

    # SMOTE oversampling
    smote_only = SMOTE(random_state=1, sampling_strategy=0.1)
    X_res_smote, y_res_smote = smote_only.fit_resample(X_train, y_train)
    print(f'SMOTE resampled dataset shape: {Counter(y_res_smote)}')
    
    X_test = curr_test[x_var]
    y_test = curr_test[y_var]
    
    # define DT Classifier
    tree_1 = DecisionTreeClassifier(random_state=1)# Set seed to 1

    # fit model on training data
    simple_tree_model = tree_1.fit(X_res_smote, y_res_smote)

    # predict on validation set
    y_preds = simple_tree_model.predict(X_test)
    all_y_preds.extend(y_preds)
    all_y_true.extend(y_test)

# Evaluate results
print("")
print('SMOTE Results')
evaluate_results(all_y_true, all_y_preds)

Original dataset shape: Counter({0: 21310, 1: 145})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
Original dataset shape: Counter({0: 21310, 1: 145})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
Original dataset shape: Counter({0: 21310, 1: 145})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
Original dataset shape: Counter({0: 21311, 1: 144})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})
Original dataset shape: Counter({0: 21311, 1: 145})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})

SMOTE Results
Confusion Matrix: 
 [[26026   612]
 [  135    46]]
TN: 26026, FP: 612, FN: 135, TP: 46


Accuracy:  0.9721466124762296
Recall:  0.2541436464088398
Precision:  0.06990881458966565
F1:  0.1096543504171633


## (2) SMOTE + Edited NN

In [18]:
# define empty lists to store results
all_y_preds = [] # predictions
all_y_true = [] # true labels

# define stratified KFold object and split train data
# Use shuffle = false for assignment
kf = StratifiedKFold(n_splits=5, shuffle=False)
kf.get_n_splits(train_q3[x_var], train_q3[y_var])

for train_index, test_index in kf.split(train_q3[x_var], train_q3[y_var]):
    # get the current train and test sets (for this particular fold)
    curr_train, curr_test = train_q3.iloc[train_index], train_q3.iloc[test_index]
    X_train = curr_train[x_var]
    y_train = curr_train[y_var]

    print(f'Original dataset shape: {Counter(y_train)}')

    # SMOTE oversampling
    #USE RANDOM STATE 1 FOR ASSIGNMENT
    smote_only = SMOTE(random_state=1, sampling_strategy=0.1)

    # Hybrid
    ennObj = EditedNearestNeighbours(n_neighbors=5)
    smote_enn = SMOTEENN(smote=smote_only , enn= ennObj)
    X_res_smoteENN, y_res_smoteENN = smote_enn.fit_resample(X_train, y_train)
    print(f'SMOTEENN Resampled dataset shape: {Counter(y_res_smoteENN)}')

    
    X_test = curr_test[x_var]
    y_test = curr_test[y_var]
    
    # define DT Classifier
    tree_1 = DecisionTreeClassifier(random_state=1) # Set seed to 1

    # fit model on training data
    simple_tree_model = tree_1.fit(X_res_smoteENN, y_res_smoteENN)

    # predict on validation set
    y_preds = simple_tree_model.predict(X_test)
    all_y_preds.extend(y_preds)
    all_y_true.extend(y_test)

# Evaluate results
print("")
print("SMOTE + Edited NN Results")
evaluate_results(all_y_true, all_y_preds)

Original dataset shape: Counter({0: 21310, 1: 145})
SMOTEENN Resampled dataset shape: Counter({0: 20025, 1: 2131})
Original dataset shape: Counter({0: 21310, 1: 145})
SMOTEENN Resampled dataset shape: Counter({0: 19866, 1: 2131})
Original dataset shape: Counter({0: 21310, 1: 145})
SMOTEENN Resampled dataset shape: Counter({0: 20070, 1: 2131})
Original dataset shape: Counter({0: 21311, 1: 144})
SMOTEENN Resampled dataset shape: Counter({0: 19949, 1: 2131})
Original dataset shape: Counter({0: 21311, 1: 145})
SMOTEENN Resampled dataset shape: Counter({0: 19924, 1: 2131})

SMOTE + Edited NN Results
Confusion Matrix: 
 [[25697   941]
 [  107    74]]
TN: 25697, FP: 941, FN: 107, TP: 74


Accuracy:  0.9609232260710691
Recall:  0.4088397790055249
Precision:  0.0729064039408867
F1:  0.1237458193979933


## (3) Over weighting

In [19]:
# define empty lists to store results
all_y_preds = [] # predictions
all_y_true = [] # true labels

# define stratified KFold object and split train data
# Use shuffle = false for assignment
kf = StratifiedKFold(n_splits=5, shuffle=False)
kf.get_n_splits(train_q3[x_var], train_q3[y_var])

for train_index, test_index in kf.split(train_q3[x_var], train_q3[y_var]):
    # get the current train and test sets (for this particular fold)
    curr_train, curr_test = train_q3.iloc[train_index], train_q3.iloc[test_index]
    X_train = curr_train[x_var]
    y_train = curr_train[y_var]

    X_test = curr_test[x_var]
    y_test = curr_test[y_var]

    # Define weighted tree (all is the same as above except now we specify class_weight)
    tree_weighted = DecisionTreeClassifier(
                                    class_weight={0: 1, 1: 10},
                                    random_state=1 # Set seed to 1
                                    )

    # Fit weighted tree model
    simple_tree_model_class_weights = tree_weighted.fit(X_train, y_train)

    # predict on validation set
    y_preds = simple_tree_model_class_weights.predict(X_test)
    all_y_preds.extend(y_preds)
    all_y_true.extend(y_test)

# Evaluate results
print("")
print('Overweight using class weight Results')
evaluate_results(all_y_true, all_y_preds)


Overweight using class weight Results
Confusion Matrix: 
 [[26443   195]
 [  162    19]]
TN: 26443, FP: 195, FN: 162, TP: 19


Accuracy:  0.9866885417055072
Recall:  0.10497237569060773
Precision:  0.08878504672897196
F1:  0.09620253164556962


Q3 Observations: No, all three methods did not help to get better results; they have lower F1 scores as compared to the basic CART Decision Tree in Q2. The method using class weights performed the worst with the lowest F1 score, and the method using SMOTE+ENN performed the best with the highest F1 score amongst the three balancing methods. 

# Q4

In [20]:
# make copy of train dataset for q4
train_q4 = train.copy(deep=True)

# view train dataset
train_q4.head(5)

Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_8700,sic_8711,sic_8721,sic_8731,sic_8734,sic_8741,sic_8742,sic_8744,sic_9995,sic_9997
0,1240,2002-01-31,ALBERTSON'S INC,15967.0,26094.0,407.0,0.0,1787.0,11837.0,501.0,...,0,0,0,0,0,0,0,0,0,0
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,0,0,0,0,0,0,0,0,0,0
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,0,0,0,0,0,0,0,0,0,0
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0,0,0,0,0,0,0,0,0,0
4,2436,2002-01-31,CALERES INC,700.898,1044.11,17.484,0.0,54.842,711.738,0.949,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# define xvar and yvar
x_var = [i for i in list(train_q4.columns) if i not in ['bankrupt','gvkey','datadate','conm']]
y_var = 'bankrupt'

## Hyperparameter tuning: Carrying out one iteration and finding the best sampling method

### Functions ###

In [22]:
# This cell contains helper functions to save tuning results.
if not os.path.exists('Tuning'): os.mkdir('Tuning')
    
def save_file(path, data):
    output = open(path, 'wb')
    pickle.dump(data, output)
    output.close()

def load_file(path):
    pkl_file = open(path, 'rb')
    data = pickle.load(pkl_file)
    pkl_file.close()
    return data

In [23]:
# This function evaluates the PRAUC scores for the overweighting method
def evaluate_prauc(i):
    kf = StratifiedKFold(n_splits=5, shuffle=False)

    all_y_preds = [] # predictions
    all_y_proba = [] # probabilities
    all_y_true = [] # true labels

    for train_index, test_index in kf.split(train_q4[x_var], train_q4[y_var]):
        curr_train, curr_test = train_q4.iloc[train_index], train_q4.iloc[test_index]
        X_train = curr_train[x_var]
        y_train = curr_train[y_var]

        X_test = curr_test[x_var]
        y_test = curr_test[y_var]

        XG = i #assign the classifier here


        XG.fit(X_train, y_train)
        y_preds = XG.predict(X_test)
        y_proba = XG.predict_proba(X_test)
        all_y_preds.extend(y_preds)
        all_y_proba.extend(y_proba[:, 1]) #use this to get predict probabilities (not for this tutorial)
                #Question: Why did I specifically pull out the 2nd column (1) instead of (0)?
        all_y_true.extend(y_test)
        
        
    return(average_precision_score(all_y_true, all_y_proba))
    

In [24]:
# This function evaluates the PRAUC scores for SMOTE rebalancing method
def evaluate_prauc_smote(i):
    kf = StratifiedKFold(n_splits=5, shuffle=False)

    all_y_preds = [] # predictions
    all_y_proba = [] # probabilities
    all_y_true = [] # true labels

    for train_index, test_index in kf.split(train_q4[x_var], train_q4[y_var]):
        curr_train, curr_test = train_q4.iloc[train_index], train_q4.iloc[test_index]
        X_train = curr_train[x_var]
        y_train = curr_train[y_var]

        # SMOTE oversampling
        #USE RANDOM STATE 1 FOR ASSIGNMENT
        smote_only = SMOTE(random_state=1, sampling_strategy=0.1)
        X_res_smote, y_res_smote = smote_only.fit_resample(X_train, y_train)
        print(f'SMOTE resampled dataset shape: {Counter(y_res_smote)}')
        
        X_test = curr_test[x_var]
        y_test = curr_test[y_var]
        
        XG = i
        XG.fit(X_res_smote,y_res_smote)

        # predict on validation set
        y_preds = XG.predict(X_test)
        y_proba = XG.predict_proba(X_test)
        all_y_preds.extend(y_preds)
        all_y_proba.extend(y_proba[:, 1])
        all_y_true.extend(y_test)
        
        
    return(average_precision_score(all_y_true, all_y_proba))

In [25]:
# This function evaluates the PRAUC scores for SMOTE + ENN reablancing method
def evaluate_prauc_smote_enn(i):
    kf = StratifiedKFold(n_splits=5, shuffle=False)

    all_y_preds = [] # predictions
    all_y_proba = [] # probabilities
    all_y_true = [] # true labels

    for train_index, test_index in kf.split(train_q4[x_var], train_q4[y_var]):
        curr_train, curr_test = train_q4.iloc[train_index], train_q4.iloc[test_index]
        X_train = curr_train[x_var]
        y_train = curr_train[y_var]

        # SMOTE oversampling
        smote_only = SMOTE(random_state=1, sampling_strategy=0.1)

        # Hybrid
        ennObj = EditedNearestNeighbours(n_neighbors=5)
        smote_enn = SMOTEENN(smote=smote_only , enn= ennObj)
        X_res_smoteENN, y_res_smoteENN = smote_enn.fit_resample(X_train, y_train)
        print(f'SMOTEENN Resampled dataset shape: {Counter(y_res_smoteENN)}')

        
        X_test = curr_test[x_var]
        y_test = curr_test[y_var]
        
        XG = i 
        XG.fit(X_res_smoteENN,y_res_smoteENN)

        # predict on validation set
        y_preds = XG.predict(X_test)
        y_proba = XG.predict_proba(X_test)
        all_y_preds.extend(y_preds)
        all_y_proba.extend(y_proba[:, 1])
        all_y_true.extend(y_test)
        
    return(average_precision_score(all_y_true, all_y_proba))

In [26]:
# This function returns the xgboost models with the best parameters that produce the highest PRAUC scores using overweighting method
def hyperopt(param_space, num_eval, classifier, use_scaling = False):  
    start = time()
    def objective_function(params):
        clf = classifier(**params)
        score = evaluate_prauc(clf)
        return {'loss': -score, 'status': STATUS_OK}
    trials = Trials()
    best_param = fmin(objective_function, param_space, algo=tpe.suggest, max_evals=num_eval, 
                      trials=trials, rstate= np.random.RandomState(1))
    loss = [x['result']['loss'] for x in trials.trials] 
    best_param_values = best_param
    print(loss)
    return trials, best_param_values

In [27]:
# This function returns the xgboost models with the best parameters that produce the highest PRAUC scores using SMOTE
def hyperopt_smote(param_space, num_eval, classifier, use_scaling = False):  
    start = time()
    def objective_function(params):
        clf = classifier(**params)
        score = evaluate_prauc_smote(clf) 
        return {'loss': -score, 'status': STATUS_OK}
    trials = Trials()
    best_param = fmin(objective_function, param_space, algo=tpe.suggest, max_evals=num_eval, 
                      trials=trials, rstate= np.random.RandomState(1))
    loss = [x['result']['loss'] for x in trials.trials] 
    best_param_values = best_param
    print(loss)
    return trials, best_param_values

In [28]:
# This function returns the xgboost models with the best parameters that produce the highest PRAUC scores using SMOTE + ENN 
def hyperopt_smote_enn(param_space, num_eval, classifier, use_scaling = False):  
    start = time()
    def objective_function(params):
        clf = classifier(**params)
        score = evaluate_prauc_smote_enn(clf) 
        return {'loss': -score, 'status': STATUS_OK}
    trials = Trials()
    best_param = fmin(objective_function, param_space, algo=tpe.suggest, max_evals=num_eval, 
                      trials=trials, rstate= np.random.RandomState(1))
    loss = [x['result']['loss'] for x in trials.trials] 
    best_param_values = best_param
    print(loss)
    return trials, best_param_values

### First Iteration (Weighting method) ###

In [29]:
# Initial parameters to be tested
num_eval = 15

XGB_param_hyperopt = {
    'booster': hp.choice('booster', ['dart','gbtree']), #not trying gblinear
    'learning_rate': hp.uniform('learning_rate', 0.05, 0.8),
    'max_depth': scope.int(hp.quniform('max_depth', 1, 30, 1)),
    'subsample': hp.uniform('subsample', 0.1, 0.5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 0.7),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.1, 0.8),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.1, 0.4),
    'gamma': hp.uniform('gamma', 1, 30),
    'min_child_weight': hp.uniform('min_child_weight', 1.5, 2.5),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 600, 20)), #not tuning this due to lack of time for training, but you should tune this impt variable
    'reg_lambda': hp.uniform('reg_lambda', 1, 6),
    'reg_alpha':hp.uniform('reg_alpha', 0.02, 0.1),
    'scale_pos_weight':hp.uniform('scale_pos_weight', 1, 15)
}

In [30]:
# Generating results
load = False #put load = True if you don't want to load the whole thing again
if(load == False):
    XGB_hyperopt = hyperopt(XGB_param_hyperopt,  num_eval, XGBClassifier)
    save_file('Tuning/XGB_hyperopt_weight(original).pkl', XGB_hyperopt)
else:
    try:
        XGB_hyperopt = load_file('Tuning/XGB_hyperopt_weight(original).pkl')
    except Exception as e:
        print(e)

100%|██████████| 15/15 [48:36<00:00, 194.44s/trial, best loss: -0.27122402027634807]
[-0.2009914202618387, -0.19920530803766084, -0.17785474044796154, -0.23156412915558067, -0.14161483521841695, -0.17500387348268626, -0.19542758620276013, -0.27122402027634807, -0.22606640852147536, -0.14179442513708304, -0.08176872510654358, -0.13328498464676775, -0.1816120528230295, -0.22452842811085494, -0.2207680866115485]


In [31]:
# View trial results
trials = XGB_hyperopt[0].trials
record = []
losses = []
for trial in trials:
    loss = trial["result"]["loss"]
    values = trial["misc"]['vals']
    #print(loss,values)
    losses.append(loss)
    record.append(values)
    
bayesTrialsData = pd.DataFrame(record)
bayesTrialsData.insert(0, 'loss', losses)

In [32]:
first_iteration_sorted_results_weights = bayesTrialsData.sort_values(by='loss')
first_iteration_sorted_results_weights

Unnamed: 0,loss,booster,colsample_bylevel,colsample_bynode,colsample_bytree,gamma,learning_rate,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,scale_pos_weight,subsample
7,-0.271224,[1],[0.3856407823645719],[0.6417905844125836],[0.2074268839767509],[26.745692745181636],[0.4022934313448799],[17.0],[2.316621449648167],[180.0],[0.05032317761342595],[4.008955998790208],[2.3540665574607695],[0.3598871712814795]
3,-0.231564,[1],[0.2787822061354191],[0.7862095581586269],[0.40437522388849034],[29.447031283553354],[0.3146564842262579],[21.0],[2.1897676461199844],[300.0],[0.04240964497438046],[4.849339446151476],[1.188286707127822],[0.43215960488291505]
8,-0.226066,[0],[0.10968167760779302],[0.6825243168070361],[0.4870849295764946],[4.813632749213815],[0.054134200770739674],[15.0],[1.5365444900620422],[400.0],[0.06367494230036537],[5.125173322422995],[11.344311792853544],[0.22645818119417976]
13,-0.224528,[1],[0.3161690485427503],[0.35551032320574927],[0.4667765590139416],[4.716579606980421],[0.12963618664595838],[2.0],[1.8320627316568583],[120.0],[0.06443297994104237],[4.148598254092472],[5.433579369702904],[0.1818833599029262]
14,-0.220768,[1],[0.18302530347889107],[0.6536158445783201],[0.5342806678291363],[3.572486402755124],[0.05933558120801584],[29.0],[2.138803382687602],[340.0],[0.026367593781803915],[1.9398907142939572],[7.689032542376289],[0.28479244210278815]
0,-0.200991,[0],[0.12542969047995745],[0.755502345614914],[0.6124939150624013],[14.949271229785204],[0.22270998380403612],[24.0],[1.5905701992657186],[360.0],[0.05065689544080341],[5.745967098842719],[10.479602602240838],[0.1958202509624458]
1,-0.199205,[1],[0.38948720826229033],[0.32094969887812186],[0.5380437488771675],[14.889740750380586],[0.24177111337053386],[3.0],[2.3889254049717006],[320.0],[0.03165147163180139],[2.8844593808692833],[1.8631242453991252],[0.38105799697774356]
6,-0.195428,[0],[0.2902087005990216],[0.4932501486288169],[0.4334407340772909],[21.777433662690104],[0.2799997879454427],[21.0],[1.5092900418299524],[400.0],[0.03584155959638893],[3.9892220316586773],[7.040422912921564],[0.4870263185713366]
12,-0.181612,[0],[0.39894374212003514],[0.4840156551884841],[0.47996769225376323],[1.490355858334281],[0.24407632160862425],[4.0],[1.8959592944989605],[80.0],[0.02025724261590553],[1.3533593664132613],[13.148902837641984],[0.41788154006066625]
2,-0.177855,[1],[0.21129150327761534],[0.7054203875793286],[0.6736165635080298],[25.865988697503074],[0.32623559217170445],[2.0],[2.2464378847238287],[280.0],[0.054782000495695635],[1.1470565335016216],[5.684491632815158],[0.37342297594971663]


### First iteration (SMOTE)

In [33]:
# Initial parameters to be tested for SMOTE, without scale_pos_weight
num_eval = 15

XGB_param_hyperopt = {
    'booster': hp.choice('booster', ['dart','gbtree']), #not trying gblinear
    'learning_rate': hp.uniform('learning_rate', 0.05, 0.8),
    'max_depth': scope.int(hp.quniform('max_depth', 1, 30, 1)),
    'subsample': hp.uniform('subsample', 0.1, 0.5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 0.7),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.1, 0.8),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.1, 0.4),
    'gamma': hp.uniform('gamma', 1, 30),
    'min_child_weight': hp.uniform('min_child_weight', 1.5, 2.5),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 600, 20)), #not tuning this due to lack of time for training, but you should tune this impt variable
    'reg_lambda': hp.uniform('reg_lambda', 1, 6),
    'reg_alpha':hp.uniform('reg_alpha', 0.02, 0.1)
}

In [34]:
# Generating results
load = False #put load = True if you don't want to load the whole thing again
if(load == False):
    XGB_hyperopt = hyperopt_smote(XGB_param_hyperopt,  num_eval, XGBClassifier)
    save_file('Tuning/XGB_hyperopt_smote(original).pkl', XGB_hyperopt)
else:
    try:
        XGB_hyperopt = load_file('Tuning/XGB_hyperopt_smote(original).pkl')
    except Exception as e:
        print(e)

SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21311, 1: 2131})
SMOTE resampled dataset shape: Counter({0: 21310, 1: 2131})
SMOTE resampled dataset shape: Counter({

### First iteration (SMOTE-ENN)

In [35]:
# Initial parameters to be tested for SMOTE, without scale_pos_weight
num_eval = 15

XGB_param_hyperopt = {
    'booster': hp.choice('booster', ['dart','gbtree']), #not trying gblinear
    'learning_rate': hp.uniform('learning_rate', 0.05, 0.8),
    'max_depth': scope.int(hp.quniform('max_depth', 1, 30, 1)),
    'subsample': hp.uniform('subsample', 0.1, 0.5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 0.7),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.1, 0.8),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.1, 0.4),
    'gamma': hp.uniform('gamma', 1, 30),
    'min_child_weight': hp.uniform('min_child_weight', 1.5, 2.5),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 600, 20)), #not tuning this due to lack of time for training, but you should tune this impt variable
    'reg_lambda': hp.uniform('reg_lambda', 1, 6),
    'reg_alpha':hp.uniform('reg_alpha', 0.02, 0.1)
}

In [36]:
# Generating results
load = False #put load = True if you don't want to load the whole thing again
if(load == False):
    XGB_hyperopt = hyperopt_smote_enn(XGB_param_hyperopt,  num_eval, XGBClassifier)
    save_file('Tuning/XGB_hyperopt_smote_enn(original).pkl', XGB_hyperopt)
else:
    try:
        XGB_hyperopt = load_file('Tuning/XGB_hyperopt_smote_enn(original).pkl')
    except Exception as e:
        print(e)

SMOTEENN Resampled dataset shape: Counter({0: 20372, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20244, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20387, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20310, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20285, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20372, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20244, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20387, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20310, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20285, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20372, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20244, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20387, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20310, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20285, 1: 2131})
SMOTEENN Resampled dataset shape: Counter({0: 20372, 1:

### Conclusion of first iteration for all three sampling methods ###
The weighting method gives the maximum PRAUC, so we will proceed with weighting method for the second iteration to further finetune the model.  

### Second iteration (Weighting method) ###

#### Obtaining hyperparameters for second iteration, using first iteration

In [33]:
# Get top 10 results of first iteration
top10 = first_iteration_sorted_results_weights.iloc[:10,1:]
top10

Unnamed: 0,booster,colsample_bylevel,colsample_bynode,colsample_bytree,gamma,learning_rate,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,scale_pos_weight,subsample
7,[1],[0.3856407823645719],[0.6417905844125836],[0.2074268839767509],[26.745692745181636],[0.4022934313448799],[17.0],[2.316621449648167],[180.0],[0.05032317761342595],[4.008955998790208],[2.3540665574607695],[0.3598871712814795]
3,[1],[0.2787822061354191],[0.7862095581586269],[0.40437522388849034],[29.447031283553354],[0.3146564842262579],[21.0],[2.1897676461199844],[300.0],[0.04240964497438046],[4.849339446151476],[1.188286707127822],[0.43215960488291505]
8,[0],[0.10968167760779302],[0.6825243168070361],[0.4870849295764946],[4.813632749213815],[0.054134200770739674],[15.0],[1.5365444900620422],[400.0],[0.06367494230036537],[5.125173322422995],[11.344311792853544],[0.22645818119417976]
13,[1],[0.3161690485427503],[0.35551032320574927],[0.4667765590139416],[4.716579606980421],[0.12963618664595838],[2.0],[1.8320627316568583],[120.0],[0.06443297994104237],[4.148598254092472],[5.433579369702904],[0.1818833599029262]
14,[1],[0.18302530347889107],[0.6536158445783201],[0.5342806678291363],[3.572486402755124],[0.05933558120801584],[29.0],[2.138803382687602],[340.0],[0.026367593781803915],[1.9398907142939572],[7.689032542376289],[0.28479244210278815]
0,[0],[0.12542969047995745],[0.755502345614914],[0.6124939150624013],[14.949271229785204],[0.22270998380403612],[24.0],[1.5905701992657186],[360.0],[0.05065689544080341],[5.745967098842719],[10.479602602240838],[0.1958202509624458]
1,[1],[0.38948720826229033],[0.32094969887812186],[0.5380437488771675],[14.889740750380586],[0.24177111337053386],[3.0],[2.3889254049717006],[320.0],[0.03165147163180139],[2.8844593808692833],[1.8631242453991252],[0.38105799697774356]
6,[0],[0.2902087005990216],[0.4932501486288169],[0.4334407340772909],[21.777433662690104],[0.2799997879454427],[21.0],[1.5092900418299524],[400.0],[0.03584155959638893],[3.9892220316586773],[7.040422912921564],[0.4870263185713366]
12,[0],[0.39894374212003514],[0.4840156551884841],[0.47996769225376323],[1.490355858334281],[0.24407632160862425],[4.0],[1.8959592944989605],[80.0],[0.02025724261590553],[1.3533593664132613],[13.148902837641984],[0.41788154006066625]
2,[1],[0.21129150327761534],[0.7054203875793286],[0.6736165635080298],[25.865988697503074],[0.32623559217170445],[2.0],[2.2464378847238287],[280.0],[0.054782000495695635],[1.1470565335016216],[5.684491632815158],[0.37342297594971663]


In [34]:
# Cleaning the top 10 results and putting it into a dataframe
result = []
for col in top10.columns:
    result.append(list(top10[col].apply(lambda x: x[0])))

top10_cleaned = pd.DataFrame(result)
top10_cleaned = top10_cleaned.transpose()
top10_cleaned.columns = top10.columns
    

In [35]:
# Obtaining max, min for top 10 results
print('Max/Min Statistics from first iteration')
top10_stats = top10_cleaned.describe()
top10_stats

Max/Min Statistics from first iteration


Unnamed: 0,booster,colsample_bylevel,colsample_bynode,colsample_bytree,gamma,learning_rate,max_depth,min_child_weight,n_estimators,reg_alpha,reg_lambda,scale_pos_weight,subsample
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.6,0.268866,0.587879,0.483751,14.826821,0.227485,13.8,1.964498,278.0,0.04404,3.519202,6.622582,0.334039
std,0.516398,0.107752,0.164121,0.126326,10.713468,0.115021,10.2285,0.336182,113.705272,0.01528,1.611724,4.133523,0.105802
min,0.0,0.109682,0.32095,0.207427,1.490356,0.054134,2.0,1.50929,80.0,0.020257,1.147057,1.188287,0.181883
25%,0.0,0.190092,0.486324,0.441775,4.740843,0.152905,3.25,1.650943,205.0,0.032699,2.176033,3.123945,0.241042
50%,1.0,0.284495,0.647703,0.483526,14.919506,0.242924,16.0,2.017381,310.0,0.046366,3.999089,6.362457,0.366655
75%,1.0,0.368273,0.699696,0.537103,24.84385,0.305992,21.0,2.23227,355.0,0.053751,4.674154,9.78196,0.408676
max,1.0,0.398944,0.78621,0.673617,29.447031,0.402293,29.0,2.388925,400.0,0.064433,5.745967,13.148903,0.487026


In [36]:
# Results statistics for booster method

print('Booster Method Results (from first iteration)')
top10_cleaned['booster'].value_counts()

Booster Method Results (from first iteration)


1.0    6
0.0    4
Name: booster, dtype: int64

In [37]:
# Specifying hyperparamters to be tested for second iteration
XGB_param_hyperopt = {
    'booster': hp.choice('booster', ['dart','gbtree']), #not trying gblinear
    'learning_rate': hp.uniform('learning_rate', 0.05, top10_stats['learning_rate']['max']),
    'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
    'subsample': hp.uniform('subsample', 0.1, 0.5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 0.7),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.1, 0.8),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.1, 0.4),
    'gamma': hp.uniform('gamma', 1, 20),
    'min_child_weight': hp.uniform('min_child_weight', 1.5, 2.5),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 550, 20)), 
    'reg_lambda': hp.uniform('reg_lambda', 1, 6),
    'reg_alpha':hp.uniform('reg_alpha', 0.02, 0.1),
    'scale_pos_weight':hp.uniform('scale_pos_weight', 1, 10)
}

In [42]:
# Generating results
load = False #put load = True if you don't want to load the whole thing again

if(load == False):
    XGB_hyperopt = hyperopt(XGB_param_hyperopt,  num_eval, XGBClassifier)
    save_file('Tuning/XGB_hyperopt_weight(second).pkl', XGB_hyperopt)
else:
    try:
        XGB_hyperopt = load_file('Tuning/XGB_hyperopt_weight(second).pkl') 
    except Exception as e:
        print(e)

100%|██████████| 15/15 [38:45<00:00, 155.01s/trial, best loss: -0.27794155871879117]
[-0.20794943761286924, -0.27794155871879117, -0.21910372901979994, -0.25148491223318714, -0.1745298890559774, -0.19678183382747796, -0.21910174792507528, -0.2453127846656343, -0.23989897442950595, -0.16024436735711545, -0.14245039950504787, -0.16848766404192703, -0.221509439343657, -0.2490687202940899, -0.21993984767856106]


### Final Best Model ###

In [39]:
xg_chosen = XGB_hyperopt[1]

xgboost_model   =   XGBClassifier(booster=['dart','gbtree'][xg_chosen['booster']], 
                                  colsample_bytree=xg_chosen['colsample_bytree'],
                                  colsample_bynode = xg_chosen['colsample_bynode'], 
                                  colsample_bylevel = xg_chosen['colsample_bylevel'],
                                  n_estimators = int(xg_chosen['n_estimators']),
                                  reg_alpha = xg_chosen['reg_alpha'], 
                                  reg_lambda = xg_chosen['reg_lambda'],
                                  max_depth=int(xg_chosen['max_depth']),
                                  subsample = xg_chosen['subsample'], 
                                  gamma = xg_chosen['gamma'], 
                                  min_child_weight = xg_chosen['min_child_weight'],
                                  scale_pos_weight = xg_chosen['scale_pos_weight'],
                                  learning_rate=xg_chosen['learning_rate'],
                                  random_state=1)


xg_chosen

{'booster': 1,
 'colsample_bylevel': 0.38948720826229033,
 'colsample_bynode': 0.32094969887812186,
 'colsample_bytree': 0.5380437488771675,
 'gamma': 10.10017497438728,
 'learning_rate': 0.14007960474951114,
 'max_depth': 2.0,
 'min_child_weight': 2.3889254049717006,
 'n_estimators': 300.0,
 'reg_alpha': 0.03165147163180139,
 'reg_lambda': 2.8844593808692833,
 'scale_pos_weight': 1.5548655863280092,
 'subsample': 0.38105799697774356}

In [53]:
# training dataset using the optimal xg boost hyperparameters

# define empty lists to store results
auc_scores = [] # AUC scores
all_y_preds = [] # predictions
all_y_proba = [] # probabilities
all_y_true = [] # true labels

# define stratified KFold object and split train data
# Use shuffle = false for assignment
kf = StratifiedKFold(n_splits=5, shuffle=False)
kf.get_n_splits(train_q4[x_var], train_q4[y_var])

for train_index, test_index in kf.split(train_q4[x_var], train_q4[y_var]):
    # get the current train and test sets (for this particular fold)
    curr_train, curr_test = train_q4.iloc[train_index], train_q4.iloc[test_index]
    X_train = curr_train[x_var]
    y_train = curr_train[y_var]
    
    X_test = curr_test[x_var]
    y_test = curr_test[y_var]
    
    # build XGBoost model on train set
    XG = xgboost_model
    XG.fit(X_train, y_train)
    
    # predict on validation set
    y_preds = XG.predict(X_test)
    y_proba = XG.predict_proba(X_test)
    all_y_preds.extend(y_preds)
    all_y_proba.extend(y_proba[:, 1])
    all_y_true.extend(y_test)

# Evaluate results
print('Final XGBoost Model Results')
print(f'Average Precision Score:', average_precision_score(all_y_true, all_y_proba))

Final XGBoost Model Results
Average Precision Score: 0.23179873435521017


In [42]:
# predict on test set
X_real_test = test[x_var]
y_proba = XG.predict_proba(X_real_test)

In [44]:
# Saving results
result = test[['id']]
result['predict_prob'] = y_proba[:,1]

print(result)
result.to_csv('submission_A0204036W.csv')

          id  predict_prob
0        0.0      0.005935
1        1.0      0.000851
2        2.0      0.012514
3        3.0      0.001051
4        4.0      0.000849
...      ...           ...
7706  7706.0      0.006766
7707  7707.0      0.002338
7708  7708.0      0.004436
7709  7709.0      0.001608
7710  7710.0      0.002096

[7711 rows x 2 columns]


# Q5

In [45]:
# read in classified train dataset
train_classified_q5 = pd.read_excel('train.xlsx', sheet_name= 'Q5', engine='openpyxl')

# make copy of q1 train dataset (with z scores) for q5
train_q5 = train_q1.copy(deep=True)

# View train classified dataset
train_classified_q5.head(5)

Unnamed: 0,GVKey,Datadate,Name,Rating,Unnamed: 4
0,11649,2011-12-31,YRC WORLDWIDE INC,SD,
1,120301,2011-12-31,URBAN ONE INC,SD,
2,163586,2011-12-31,HORIZON LINES INC,SD,
3,29453,2011-12-31,CHENIERE ENERGY INC,CCC+,
4,181904,2011-12-31,CLEARWIRE CORP,CCC+,


### Cleaning train_classified_q5 and train datasets

In [46]:
# Renaming gvkey for merge later on
train_classified_q5['gvkey']= train_classified_q5['GVKey']

# dropping duplicates
train_classified_q5 = train_classified_q5.drop_duplicates(subset =['gvkey'])

# View train_classified
train_classified_q5.head(5)

Unnamed: 0,GVKey,Datadate,Name,Rating,Unnamed: 4,gvkey
0,11649,2011-12-31,YRC WORLDWIDE INC,SD,,11649
1,120301,2011-12-31,URBAN ONE INC,SD,,120301
2,163586,2011-12-31,HORIZON LINES INC,SD,,163586
3,29453,2011-12-31,CHENIERE ENERGY INC,CCC+,,29453
4,181904,2011-12-31,CLEARWIRE CORP,CCC+,,181904


### Merging train and train_classified

In [47]:
train_combined_q5 = train_q5.merge(train_classified_q5, on ='gvkey', how='left')
train_combined_q5 

Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_8744,sic_9995,sic_9997,Altman_Z_score,altman_bankrupt_classification,GVKey,Datadate,Name,Rating,Unnamed: 4
0,1240,2002-01-31,ALBERTSON'S INC,15967.000,26094.000,407.000,0.0,1787.000,11837.000,501.000,...,0,0,0,3.995249,0,,NaT,,,
1,1655,2002-01-31,ANGELICA CORP,290.865,246.466,8.608,0.0,12.856,103.597,1.629,...,0,0,0,2.635540,1,,NaT,,,
2,1864,2002-01-31,REX AMERICAN RESOURCES CORP,307.329,325.912,8.163,0.0,22.236,138.591,22.554,...,0,0,0,3.629197,0,1864.0,2011-01-31,REX AMERICAN RESOURCES CORP,,
3,1878,2002-01-31,AUTODESK INC,902.444,124.903,55.644,0.0,131.804,822.588,90.313,...,0,0,0,5.568886,0,1878.0,2011-01-31,AUTODESK INC,,
4,2436,2002-01-31,CALERES INC,700.898,1044.110,17.484,0.0,54.842,711.738,0.949,...,0,0,0,3.815497,0,2436.0,2011-01-31,CALERES INC,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26814,287462,2011-12-31,FUTUREFUEL CORP,385.244,239.011,41.308,0.0,51.615,70.874,34.509,...,0,0,0,5.189941,0,287462.0,2011-12-31,FUTUREFUEL CORP,,
26815,293884,2011-12-31,INTERXION HOLDING NV,965.556,132.021,66.129,0.0,76.970,184.922,33.175,...,0,0,0,1.441835,1,293884.0,2011-12-31,INTERXION HOLDING NV,B+,
26816,294524,2011-12-31,LYONDELLBASELL INDUSTRIES NV,22839.000,44889.000,569.340,0.0,4212.000,6146.000,2147.000,...,0,0,0,4.116011,0,294524.0,2011-12-31,LYONDELLBASELL INDUSTRIES NV,BB+,
26817,296318,2011-12-31,OCEAN RIG UDW INC,6015.355,284.000,131.697,0.0,217.645,415.649,95.298,...,0,0,0,0.461944,1,296318.0,2011-12-31,OCEAN RIG UDW INC,,


### Cleaning the ratings 

In [48]:
cleaned_ratings = []
for idx, row in train_combined_q5.iterrows():
    try:
        if 'A' in row['Rating']:
            cleaned_ratings.append('A')
        elif 'BBB' in row['Rating']:
            cleaned_ratings.append('BBB')
        elif 'BB' in row['Rating']:
            cleaned_ratings.append('BB')
        elif 'B' in row['Rating']:
            cleaned_ratings.append('B')
        elif 'C' in row['Rating']:
            cleaned_ratings.append('C')
        elif 'SD' in row['Rating']:
            cleaned_ratings.append('SD')
    except:
        cleaned_ratings.append('None')

In [49]:
train_combined_q5['cleaned_ratings'] = cleaned_ratings

# Check ratings
train_combined_q5[train_combined_q5['cleaned_ratings'] != 'None']

Unnamed: 0,gvkey,datadate,conm,at,cogs,csho,dvp,ebit,gp,ib,...,sic_9995,sic_9997,Altman_Z_score,altman_bankrupt_classification,GVKey,Datadate,Name,Rating,Unnamed: 4,cleaned_ratings
14,3964,2002-01-31,DILLARDS INC -CL A,7074.559,5507.702,83.888,0.000,305.711,2880.637,65.786,...,0,0,2.330559,1,3964.0,2011-01-31,DILLARDS INC -CL A,B+,,B
15,4016,2002-01-31,DOLLAR GENERAL CORP,2552.385,3691.183,332.606,0.000,402.011,1631.712,207.513,...,0,0,5.207046,0,4016.0,2011-01-31,DOLLAR GENERAL CORP,BB+,,BB
17,4523,2002-01-31,JO-ANN STORES INC,693.700,874.600,18.632,0.000,36.800,695.700,-14.300,...,0,0,3.425504,0,4523.0,2011-01-31,JO-ANN STORES INC,B,,B
35,8551,2002-01-31,PVH CORP,708.933,894.528,27.622,0.000,62.139,537.364,10.680,...,0,0,3.492006,0,8551.0,2011-01-31,PVH CORP,BB+,,BB
47,12123,2002-01-31,BIG LOTS INC,1533.209,1960.797,114.398,0.000,70.041,1472.524,-28.714,...,0,0,4.578833,0,12123.0,2011-01-31,BIG LOTS INC,BBB,,BBB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26721,185318,2011-12-31,SWIFT TRANSPORTATION CO,2638.665,2813.938,139.498,0.000,297.539,519.970,90.550,...,0,0,1.636323,1,185318.0,2011-12-31,SWIFT TRANSPORTATION CO,B,,B
26781,187961,2011-12-31,LAREDO PETROLEUM INC,1627.652,80.325,127.617,0.000,201.899,429.945,105.554,...,0,0,2.457752,1,187961.0,2011-12-31,LAREDO PETROLEUM INC,B,,B
26802,264388,2011-12-31,WCA WASTE CORP,496.206,199.750,23.699,4.724,25.704,74.064,-2.397,...,0,0,0.923015,1,264388.0,2011-12-31,WCA WASTE CORP,B,,B
26815,293884,2011-12-31,INTERXION HOLDING NV,965.556,132.021,66.129,0.000,76.970,184.922,33.175,...,0,0,1.441835,1,293884.0,2011-12-31,INTERXION HOLDING NV,B+,,B


### Getting bankruptcy predictions using Q4's XG parameters

In [50]:
# define empty lists to store results
auc_scores = [] # AUC scores
all_y_preds = [] # predictions
all_y_proba = [] # probabilities
all_y_true = [] # true labels

# define stratified KFold object and split train data
# Use shuffle = false for assignment
kf = StratifiedKFold(n_splits=5, shuffle=False)
kf.get_n_splits(train_combined_q5[x_var], train_combined_q5[y_var])

for train_index, test_index in kf.split(train_combined_q5[x_var], train_combined_q5[y_var]):
    # get the current train and test sets (for this particular fold)
    curr_train, curr_test = train_combined_q5.iloc[train_index], train_combined_q5.iloc[test_index]
    X_train = curr_train[x_var]
    y_train = curr_train[y_var]
    
    X_test = curr_test[x_var]
    y_test = curr_test[y_var]
    
    # build XGBoost model on train set
    XG = xgboost_model
    XG.fit(X_train, y_train)
    
    # predict on validation set
    y_preds = XG.predict(X_test)
    y_proba = XG.predict_proba(X_test)
    all_y_preds.extend(y_preds)
    all_y_proba.extend(y_proba[:, 1])
    all_y_true.extend(y_test)

# Evaluate results
print('Q5 XGBoost Model Results')
print(f'Average Precision Score:', average_precision_score(all_y_true, all_y_proba))

Q5 XGBoost Model Results
Average Precision Score: 0.23179873435521017


In [51]:
# Obtaining bankruptcy Probabilities
train_combined_q5['CV Bankruptcy Probability'] = all_y_proba

### Dataset statistics

In [52]:
results = pd.DataFrame(train_combined_q5.groupby('cleaned_ratings').median()[['Altman_X1','Altman_X2','Altman_X3','Altman_X4','Altman_X5','Altman_Z_score','CV Bankruptcy Probability']]).reset_index()

# view results
print('Q5 Table')
results

Q5 Table


Unnamed: 0,cleaned_ratings,Altman_X1,Altman_X2,Altman_X3,Altman_X4,Altman_X5,Altman_Z_score,CV Bankruptcy Probability
0,A,0.058037,0.285203,0.098699,1.812067,0.772909,3.041495,0.001522
1,B,0.103114,0.015612,0.05535,0.660988,0.721074,1.613678,0.003596
2,BB,0.157023,0.09591,0.082734,1.261687,0.856552,2.325841,0.001465
3,BBB,0.100148,0.219728,0.088047,1.678763,0.762043,2.855746,0.001328
4,C,0.080532,-0.133184,0.024599,0.255402,0.448808,0.590272,0.005575
5,,0.209235,0.127989,0.072929,2.162328,0.846996,3.183581,0.001463
6,SD,0.006133,-0.030374,0.056371,0.280904,1.302312,1.559271,0.002968
