## Import necessary libraries

In [1]:
%reset -f

import pickle
import pandas as pd
import numpy
import time
from sklearn.naive_bayes import GaussianNB
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer

## Snapshot the environment's variable names so they are excluded from the report file

In [2]:
# Snapshot of every name defined so far (imports and IPython built-ins).
# The report cell at the end of the notebook skips these names so that only
# variables defined after this point are written to the results file.
environment_variables = dir()

## Define model related constants

In [3]:
# NOTE: every str/int/float/list/dict constant defined in this cell is written
# to the results file by the report cell, keyed by its variable name.

# Input parameters: CSV files holding the training and evaluation datasets
INPUT_TRAINING_FILE = "../data/original/training_dataset.csv"
INPUT_EVALUATION_FILE = "../data/original/evaluation_dataset.csv"

# Output parameters: the model pickle and the results text file share a stem
# built from the method name and the local time at which this cell was run
# (day_month_year-hour_minute_second).
METHOD_NAME = "naive_bayes"
TIMESTAMP = time.strftime("%d_%m_%Y-%H_%M_%S")
OUTPUT_MODEL = f"../data/models/{METHOD_NAME}_model_{TIMESTAMP}.pkl"
OUTPUT_RESULTS = f"../data/results/{METHOD_NAME}_model_{TIMESTAMP}.txt"

# Hyper parameter alternatives explored by the grid search
# Number of features kept by SelectKBest: 10, 20, ..., 80
HYPER_PARAMETER_K_FEATURES = list(range(10, 90, 10))
# Candidate scoring functions for SelectKBest
HYPER_PARAMETER_SCORE_FUNC = [chi2, f_classif, mutual_info_classif]
# Disabled alternative: PCA component counts (PCA is not part of the pipeline)
# HYPER_PARAMETER_N_COMPONENTS = [3, 5, 7, 10, 15]
# GaussianNB var_smoothing candidates: 20 values from 1e-9 in steps of 0.05
HYPER_PARAMETER_VAR_SMOOTHING = list(numpy.arange(1e-9, 1, 0.05))

# Hyper parameter optimization parameters
# Disabled alternative: select models by balanced accuracy instead of F-beta
# HYPER_PARAMETER_OPTIMIZATION_SCORING = "balanced_accuracy"
# Beta of the F-beta score used for model selection and for the reported metrics
HYPER_PARAMETER_OPTIMIZATION_BETA = 2
# Number of stratified cross-validation folds
HYPER_PARAMETER_OPTIMIZATION_CV = 10

# Other constants
# Display names for the classification report
# (assumes label 0 = operational and 1 = bankrupt; confirm against the dataset)
LABELS = ["Operational", "Bankrupt"]
# Seed shared by SMOTE and the cross-validation shuffling
RANDOM_SEED = 42
# Verbosity level passed to GridSearchCV
VERBOSITY = 3

## Load the datasets
These datasets should contain the "Bankrupt?" label and the features that should be considered during learning.

In [4]:
# Training data: the "Bankrupt?" column is the target, every other column is a feature
training_dataset = pd.read_csv(INPUT_TRAINING_FILE, engine="python", delimiter=",")
training_targets = training_dataset["Bankrupt?"]
training_features = training_dataset.drop(columns=["Bankrupt?"])

# Evaluation data: split the same way as the training data
evaluation_dataset = pd.read_csv(INPUT_EVALUATION_FILE, engine="python", delimiter=",")
evaluation_targets = evaluation_dataset["Bankrupt?"]
evaluation_features = evaluation_dataset.drop(columns=["Bankrupt?"])

## Minor preprocessing
Remove the constant Net Income Flag and Liability-Assets Flag features.

In [5]:
# Drop both flag columns from each feature frame in a single call.
# Column names carry a leading space, exactly as they appear in the CSV header.
training_features = training_features.drop(
    columns=[" Net Income Flag", " Liability-Assets Flag"]
)
evaluation_features = evaluation_features.drop(
    columns=[" Net Income Flag", " Liability-Assets Flag"]
)

## Define the training pipeline

In [6]:
# Class priors, taken from the label distribution of the training targets.
# Summing the targets counts the bankrupt rows (assumes labels are 0/1 - confirm).
all_samples_count = len(training_targets)
bankrupt_samples_count = sum(training_targets)
operational_samples_count = all_samples_count - bankrupt_samples_count
operational_prior_propability = operational_samples_count / all_samples_count
bankrupt_prior_propability = bankrupt_samples_count / all_samples_count
prior_propabilities = [operational_prior_propability, bankrupt_prior_propability]

# Pipeline stages, listed in the order they are applied
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()
feature_selection = SelectKBest()
# pca = PCA(random_state=RANDOM_SEED)
smote = SMOTE(sampling_strategy="minority", random_state=RANDOM_SEED)
classifier = GaussianNB(priors=prior_propabilities)

pipeline = Pipeline(
    steps=[
        ("standard_scaler", standard_scaler),
        ("min_max_scaler", min_max_scaler),
        ("selection", feature_selection),
        ("smote", smote),
        ("classification", classifier),
    ]
)

# Grid of tunable hyper parameters, addressed as "<step name>__<parameter>"
parameters = dict(
    selection__k=HYPER_PARAMETER_K_FEATURES,
    selection__score_func=HYPER_PARAMETER_SCORE_FUNC,
    classification__var_smoothing=HYPER_PARAMETER_VAR_SMOOTHING,
)

# Shuffled, seeded stratified K-fold splitter used for cross-validation
cv = StratifiedKFold(
    n_splits=HYPER_PARAMETER_OPTIMIZATION_CV,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# F-beta metric used for model selection; the configured beta puts emphasis on recall
def custom_scorer(y_true, y_pred):
    """Return the F-beta score of the predictions ``y_pred`` against ``y_true``.

    The beta value is read from ``HYPER_PARAMETER_OPTIMIZATION_BETA``.
    """
    beta = HYPER_PARAMETER_OPTIMIZATION_BETA
    return fbeta_score(y_true, y_pred, beta=beta)

# Grid search over `parameters`, scored with the custom F-beta scorer and
# cross-validated with `cv`; n_jobs=-1 requests all available processors.
estimator = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=make_scorer(custom_scorer),
    cv=cv,
    n_jobs=-1,
    verbose=VERBOSITY,
)

## Do training

In [7]:
# Run the grid search on the training data. This is the expensive cell:
# every parameter combination is fitted once per cross-validation fold.
# The fitted search object is the cell's last expression, so its repr is displayed.
estimator.fit(training_features, training_targets)

Fitting 10 folds for each of 480 candidates, totalling 4800 fits
[CV 1/10] END classification__var_smoothing=1e-09, selection__k=10, selection__score_func=<function chi2 at 0x7fc859ff7040>;, score=0.062 total time=   0.1s
[CV 6/10] END classification__var_smoothing=1e-09, selection__k=10, selection__score_func=<function chi2 at 0x7f4320c8a040>;, score=0.000 total time=   0.1s
[CV 4/10] END classification__var_smoothing=1e-09, selection__k=10, selection__score_func=<function chi2 at 0x7f03b519c040>;, score=0.000 total time=   0.1s
[CV 2/10] END classification__var_smoothing=1e-09, selection__k=10, selection__score_func=<function chi2 at 0x7f8da9b84040>;, score=0.000 total time=   0.1s
[CV 5/10] END classification__var_smoothing=1e-09, selection__k=10, selection__score_func=<function chi2 at 0x7f21f7fd1040>;, score=0.067 total time=   0.1s
[CV 3/10] END classification__var_smoothing=1e-09, selection__k=10, selection__score_func=<function chi2 at 0x7f253ec58040>;, score=0.000 total time= 

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('standard_scaler', StandardScaler()),
                                       ('min_max_scaler', MinMaxScaler()),
                                       ('selection', SelectKBest()),
                                       ('smote',
                                        SMOTE(random_state=42,
                                              sampling_strategy='minority')),
                                       ('classification',
                                        GaussianNB(priors=[0.9677360219981668,
                                                           0.03226397800183318]))]),
             n_jobs=-...
                                                           0.500000001,
                                                           0.550000001,
                                                           0.6000000010000001,
                                         

## Calculate metric values

In [9]:
def confusion_matrix_to_string(cm):
    """Format a 2x2 confusion matrix as a one-line, human-readable summary.

    ``cm`` is indexed as ``cm[actual][predicted]`` (the scikit-learn
    ``confusion_matrix`` layout), with class 0 = operational and
    class 1 = bankrupt.

    "True X" counts samples correctly predicted as X. "False X" counts samples
    that were predicted as X but actually belong to the other class.

    Returns the summary string.
    """
    true_operational = cm[0][0]
    false_bankrupt = cm[0][1]  # actually operational, predicted bankrupt
    false_operational = cm[1][0]  # actually bankrupt, predicted operational
    true_bankrupt = cm[1][1]
    return (
        f"True operational={true_operational}, True bankrupt={true_bankrupt}, "
        f"False operational={false_operational}, False bankrupt={false_bankrupt}"
    )

# Metrics on the full training set (optimistic: the model was fitted on this data)
training_estimates = estimator.predict(training_features)
training_accuracy = balanced_accuracy_score(training_targets, training_estimates)
training_confusion_matrix = confusion_matrix_to_string(
    confusion_matrix(training_targets, training_estimates)
)
training_classification_report = classification_report(
    training_targets, training_estimates, target_names=LABELS, output_dict=True
)
training_f_beta_score = fbeta_score(
    training_targets, training_estimates, beta=HYPER_PARAMETER_OPTIMIZATION_BETA
)

# The same metrics on the evaluation dataset
evaluation_estimates = estimator.predict(evaluation_features)
evaluation_accuracy = balanced_accuracy_score(evaluation_targets, evaluation_estimates)
evaluation_confusion_matrix = confusion_matrix_to_string(
    confusion_matrix(evaluation_targets, evaluation_estimates)
)
evaluation_classification_report = classification_report(
    evaluation_targets, evaluation_estimates, target_names=LABELS, output_dict=True
)
evaluation_f_beta_score = fbeta_score(
    evaluation_targets, evaluation_estimates, beta=HYPER_PARAMETER_OPTIMIZATION_BETA
)

# Keep the winning hyper parameter combination so it ends up in the report
best_parameters = estimator.best_params_
print(best_parameters)

{'classification__var_smoothing': 0.050000001, 'selection__k': 40, 'selection__score_func': <function chi2 at 0x7ff9f4048a60>}


## Save trained model for future reference

In [10]:
# Serialize the whole fitted grid-search object to OUTPUT_MODEL using the
# highest pickle protocol available.
# NOTE: only unpickle this file from a trusted source - loading a pickle can
# execute arbitrary code.
with open(OUTPUT_MODEL, "wb") as handle:
    pickle.dump(estimator, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Save metadata and calculated statistics
This is done by writing every string, numeric, list, and dict variable defined in this notebook to the results file.

In [11]:
# Names this cell creates for its own bookkeeping. They are excluded from the
# report so that re-running the cell does not write its own state (including
# the text of the previous report) into the new report.
_report_cell_names = {"current_variables", "results_string", "variable", "variable_value"}

# Exact types that are written to the report. The check is on the exact type,
# so subclasses (for example bool) are not reported.
_reportable_types = (str, int, float, list, numpy.float64, dict)

current_variables = dir()

results_string = ""

for variable in current_variables:
    # Skip names that already existed in the environment snapshot, the snapshot
    # itself, underscore-prefixed names and this cell's own working names
    if (
        variable in environment_variables
        or variable == "environment_variables"
        or variable.startswith("_")
        or variable in _report_cell_names
    ):
        continue

    # Look up the variable's current value
    variable_value = globals()[variable]

    # Only plain textual / numerical / list / dict values go into the report
    if type(variable_value) in _reportable_types:
        results_string += f"{variable}: {variable_value}\n"

# Print results to screen
print(results_string)

# Save results to file
with open(OUTPUT_RESULTS, "w") as file:
    file.write(results_string)

HYPER_PARAMETER_K_FEATURES: [10, 20, 30, 40, 50, 60, 70, 80]
HYPER_PARAMETER_OPTIMIZATION_BETA: 2
HYPER_PARAMETER_OPTIMIZATION_CV: 10
HYPER_PARAMETER_SCORE_FUNC: [<function chi2 at 0x7ff9f4048a60>, <function f_classif at 0x7ff9f4048940>, <function mutual_info_classif at 0x7ff9f4060550>]
HYPER_PARAMETER_VAR_SMOOTHING: [1e-09, 0.050000001, 0.100000001, 0.15000000100000002, 0.200000001, 0.250000001, 0.30000000100000007, 0.35000000100000006, 0.40000000100000005, 0.45000000100000004, 0.500000001, 0.550000001, 0.6000000010000001, 0.650000001, 0.700000001, 0.750000001, 0.800000001, 0.8500000010000001, 0.900000001, 0.950000001]
INPUT_EVALUATION_FILE: ../data/original/evaluation_dataset.csv
INPUT_TRAINING_FILE: ../data/original/training_dataset.csv
LABELS: ['Operational', 'Bankrupt']
METHOD_NAME: naive_bayes
OUTPUT_MODEL: ../data/models/naive_bayes_model_11_05_2022-15_34_03.pkl
OUTPUT_RESULTS: ../data/results/naive_bayes_model_11_05_2022-15_34_03.txt
RANDOM_SEED: 42
TIMESTAMP: 11_05_2022-15_34_