## Import necessary libraries

In [24]:
%reset -f

import pickle
import pandas as pd
import numpy
import time
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
# ! pip install hyperopt
# from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import xgboost as xgb

In [25]:
environment_variables = dir()

## Definition of model related constants

In [26]:
# Input parameters
INPUT_TRAINING_FILE = "../data/original/training_dataset.csv"
INPUT_EVALUATION_FILE = "../data/original/evaluation_dataset.csv"

# Output parameters
METHOD_NAME = "xgboost"
TIMESTAMP = time.strftime("%d_%m_%Y-%H_%M_%S")
OUTPUT_MODEL = f"../data/models/{METHOD_NAME}_model_{TIMESTAMP}.pkl"
OUTPUT_RESULTS = f"../data/results/{METHOD_NAME}_model_{TIMESTAMP}.txt"

# Hyper parameter alternatives
HYPER_PARAMETER_KERNEL = ["rbf"]
HYPER_PARAMETER_GAMMA = ["scale"]
# Bankrupt companies must have higher weight, because there are fewer samples
# HYPER_PARAMETER_CLASS_WEIGHT = {0: 1.0, 1: 100.0}
HYPER_PARAMETER_CLASS_WEIGHT = "balanced"

# Hyper parameter optimization parameters
HYPER_PARAMETER_OPTIMIZATION_SCORING = "accuracy"
HYPER_PARAMETER_OPTIMIZATION_CV = 2

# Cross validation parameters
CROSS_VALIDATION_CV = 5

# Cost parameters
PUNISHMENT_FOR_FALSE_BANKRUPT_PREDICTION = 100

# Other constants
LABELS = ["Operational", "Bankrupt"]
RANDOM_SEED = 42

In [27]:
training_dataset = pd.read_csv(INPUT_TRAINING_FILE, engine="python", delimiter=",")
training_features = training_dataset.loc[:, training_dataset.columns != "Bankrupt?"]
training_targets = training_dataset["Bankrupt?"]

evaluation_dataset = pd.read_csv(INPUT_EVALUATION_FILE, engine="python", delimiter=",")
evaluation_features = evaluation_dataset.loc[:, evaluation_dataset.columns != "Bankrupt?"]
evaluation_targets = evaluation_dataset["Bankrupt?"]

## Define pipeline

In [28]:
# Creating XGB clasifier with internal pipeline
xgb_classifier = xgb.XGBClassifier(eval_metric='logloss',seed=RANDOM_SEED)
xgb_pipeline = Pipeline(steps=[("clasification",xgb_classifier)])


In [29]:
# Getting the default hyper parameters of XGBoost
xgb_pipeline

Pipeline(steps=[('clasification',
                 XGBClassifier(base_score=None, booster=None,
                               colsample_bylevel=None, colsample_bynode=None,
                               colsample_bytree=None, enable_categorical=False,
                               eval_metric='logloss', gamma=None, gpu_id=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_delta_step=None, max_depth=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, n_estimators=100,
                               n_jobs=None, num_parallel_tree=None,
                               predictor=None, random_state=None,
                               reg_alpha=None, reg_lambda=None,
                               scale_pos_weight=None, seed=42, subsample=None,
                               tree_method=No

In [30]:
# Setting tunable XBG parameters
parameters = {"clasification__subsample": [0.8]
            , "clasification__colsample_bytree": [0.8]
            , "clasification__learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
            , "clasification__n_estimators": range(50,500,50)
              }

# K-fold parameters
cv = StratifiedKFold(n_splits=HYPER_PARAMETER_OPTIMIZATION_CV, shuffle=True, random_state=RANDOM_SEED)

grid_estimator = GridSearchCV(xgb_pipeline
                            , parameters
                            , scoring=HYPER_PARAMETER_OPTIMIZATION_SCORING
                            , cv=cv
                            )


In [31]:
grid_estimator.fit(training_features, training_targets)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('clasification',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      enable_categorical=False,
                                                      eval_metric='logloss',
                                                      gamma=None, gpu_id=None,
                                                      importance_type=None,
                                                      interaction_constraints=None,
                                                      lear...
                                      

In [36]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(xgb_pipeline, training_features, training_targets, cv=cv, scoring = 'roc_auc')
print("roc_auc = %f (%f)" % (scores.mean(), scores.std()))

evaluation_estimates = grid_estimator.predict(evaluation_features)
scores = cross_val_score(xgb_pipeline, evaluation_targets, evaluation_estimates, cv=cv, scoring = 'roc_auc')
print("roc_auc of evaluation %f (%f)" % (scores.mean(), scores.std()))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


roc_auc = 0.930740 (0.003726)
roc_auc of evaluation 0.846020 (0.004949)


In [32]:
def confusion_matrix_to_string(cm):
    return f"True operational={cm[0][0]}, True bankrupt={cm[1][1]}, False operational={cm[0][1]}, False bankrupt={cm[1][0]}"

# Use all training data to calculate confusion matrix for training data
training_estimates = grid_estimator.predict(training_features)
training_accuracy = balanced_accuracy_score(training_targets, training_estimates)
training_confusion_matrix = confusion_matrix(training_targets, training_estimates)
training_confusion_matrix = confusion_matrix_to_string(training_confusion_matrix)

# Use model to estimate manually labeled evaluations
evaluation_estimates = grid_estimator.predict(evaluation_features)
evaluation_accuracy = balanced_accuracy_score(evaluation_targets, evaluation_estimates)
evaluation_confusion_matrix = confusion_matrix(evaluation_targets, evaluation_estimates)
evaluation_confusion_matrix = confusion_matrix_to_string(evaluation_confusion_matrix)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [33]:
with open(OUTPUT_MODEL, "wb") as handle:
    pickle.dump(grid_estimator, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
current_variables = dir()

results_string = ""

for variable in current_variables:
    # Skip environment variables and their container variable
    # Ignore also underscore variables
    if variable in environment_variables or variable == "environment_variables" or variable.startswith("_"):
        continue

    # Get variables value
    variable_value = globals()[variable]

    # If variable is numerical or string, append it to results
    if type(variable_value) is str or type(variable_value) is int or \
        type(variable_value) is float or type(variable_value) is list or \
        type(variable_value) is numpy.float64 or type(variable_value) is dict:
        results_string += f"{variable}: {variable_value}\n"

# Print results to screen
print(results_string)

# Save results to file
with open(OUTPUT_RESULTS, "w") as file:
    file.write(results_string)

CROSS_VALIDATION_CV: 5
HYPER_PARAMETER_CLASS_WEIGHT: balanced
HYPER_PARAMETER_GAMMA: ['scale']
HYPER_PARAMETER_KERNEL: ['rbf']
HYPER_PARAMETER_OPTIMIZATION_CV: 2
HYPER_PARAMETER_OPTIMIZATION_SCORING: accuracy
INPUT_EVALUATION_FILE: ../data/original/evaluation_dataset.csv
INPUT_TRAINING_FILE: ../data/original/training_dataset.csv
LABELS: ['Operational', 'Bankrupt']
METHOD_NAME: xgboost
OUTPUT_MODEL: ../data/models/xgboost_model_09_05_2022-10_45_39.pkl
OUTPUT_RESULTS: ../data/results/xgboost_model_09_05_2022-10_45_39.txt
PUNISHMENT_FOR_FALSE_BANKRUPT_PREDICTION: 100
RANDOM_SEED: 42
TIMESTAMP: 09_05_2022-10_45_39
evaluation_accuracy: 0.6681818181818182
evaluation_confusion_matrix: True operational=1314, True bankrupt=15, False operational=6, False bankrupt=29
parameters: {'clasification__subsample': [0.8], 'clasification__colsample_bytree': [0.8], 'clasification__learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3], 'clasification__n_estimators': range(50, 500, 50)}
training_accuracy: 0.9