## Import necessary libraries

In [29]:
%reset -f

import pickle
import pandas as pd
import numpy
import time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

## Save environment parameters, so those won't be saved to report file

In [30]:
environment_variables = dir()

## Define model related constants

In [31]:
# Input parameters
INPUT_TRAINING_FILE = "../data/preprocessed/difference_preprocessed_training_dataset.csv"
INPUT_EVALUATION_FILE = "../data/preprocessed/difference_preprocessed_evaluation_dataset.csv"

# Output parameters
METHOD_NAME = "svm"
TIMESTAMP = time.strftime("%d_%m_%Y-%H_%M_%S")
OUTPUT_MODEL = f"../data/models/{METHOD_NAME}_model_{TIMESTAMP}.pkl"
OUTPUT_RESULTS = f"../data/results/{METHOD_NAME}_model_{TIMESTAMP}.txt"

# Hyper parameter alternatives
HYPER_PARAMETER_C = list(numpy.arange(0.2, 2, 0.2))
HYPER_PARAMETER_KERNEL = ["linear", "poly", "rbf", "sigmoid"]
HYPER_PARAMETER_GAMMA = ["scale", "auto"]
HYPER_PARAMETER_COEF0 = list(numpy.arange(0.1, 1, 0.1))
HYPER_PARAMETER_CLASS_WEIGHT = "balanced"

# Hyper parameter optimization parameters
HYPER_PARAMETER_OPTIMIZATION_SCORING = "accuracy"
HYPER_PARAMETER_OPTIMIZATION_CV = 10

# Other constants
LABELS = ["Operational", "Bankrupt"]
RANDOM_SEED = 42
VERBOSITY = 2

## Load the datasets
These dataset should contain "Bankrupt?" label and features that should be considered during learning.

In [32]:
training_dataset = pd.read_csv(INPUT_TRAINING_FILE, engine="python", delimiter=",")
training_features = training_dataset.loc[:, training_dataset.columns != "Bankrupt?"]
training_targets = training_dataset["Bankrupt?"]

evaluation_dataset = pd.read_csv(INPUT_EVALUATION_FILE, engine="python", delimiter=",")
evaluation_features = evaluation_dataset.loc[:, evaluation_dataset.columns != "Bankrupt?"]
evaluation_targets = evaluation_dataset["Bankrupt?"]

## Define the training pipeline

In [33]:
# Create internal pipeline
classifier = SVC(class_weight=HYPER_PARAMETER_CLASS_WEIGHT, random_state=RANDOM_SEED)
pipeline = Pipeline(steps=[("classification", classifier)])

# Specify the tunable hyper parameters
parameters = {
    "classification__C": HYPER_PARAMETER_C,
    "classification__kernel": HYPER_PARAMETER_KERNEL,
    "classification__gamma": HYPER_PARAMETER_GAMMA,
    "classification__coef0": HYPER_PARAMETER_COEF0
}

# Define KFold parameters
cv = StratifiedKFold(n_splits=HYPER_PARAMETER_OPTIMIZATION_CV, shuffle=True, random_state=42)

estimator = GridSearchCV(pipeline, parameters, verbose=VERBOSITY,
    scoring=HYPER_PARAMETER_OPTIMIZATION_SCORING, cv=cv, n_jobs=-1)

## Do training

In [34]:
estimator.fit(training_features, training_targets)

Fitting 10 folds for each of 648 candidates, totalling 6480 fits
[CV] END classification__C=0.2, classification__coef0=0.1, classification__gamma=scale, classification__kernel=linear; total time=   1.5s
[CV] END classification__C=0.2, classification__coef0=0.1, classification__gamma=scale, classification__kernel=linear; total time=   1.6s
[CV] END classification__C=0.2, classification__coef0=0.1, classification__gamma=scale, classification__kernel=linear; total time=   1.6s
[CV] END classification__C=0.2, classification__coef0=0.1, classification__gamma=scale, classification__kernel=linear; total time=   1.7s
[CV] END classification__C=0.2, classification__coef0=0.1, classification__gamma=scale, classification__kernel=linear; total time=   1.8s
[CV] END classification__C=0.2, classification__coef0=0.1, classification__gamma=scale, classification__kernel=linear; total time=   1.9s
[CV] END classification__C=0.2, classification__coef0=0.1, classification__gamma=scale, classification__ker

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('classification',
                                        SVC(class_weight='balanced',
                                            random_state=42))]),
             n_jobs=-1,
             param_grid={'classification__C': [0.2, 0.4, 0.6000000000000001,
                                               0.8, 1.0, 1.2,
                                               1.4000000000000001, 1.6, 1.8],
                         'classification__coef0': [0.1, 0.2,
                                                   0.30000000000000004, 0.4,
                                                   0.5, 0.6, 0.7000000000000001,
                                                   0.8, 0.9],
                         'classification__gamma': ['scale', 'auto'],
                         'classification__kernel': ['linear', 'poly', 'rbf',
                                                    'sigmoid']}

## Calculate metric values

In [35]:
def confusion_matrix_to_string(cm):
    return f"True operational={cm[0][0]}, True bankrupt={cm[1][1]}, False operational={cm[0][1]}, False bankrupt={cm[1][0]}"

# Use all training data to calculate confusion matrix for training data
training_estimates = estimator.predict(training_features)
training_accuracy = balanced_accuracy_score(training_targets, training_estimates)
training_confusion_matrix = confusion_matrix(training_targets, training_estimates)
training_confusion_matrix = confusion_matrix_to_string(training_confusion_matrix)

# Use model to estimate manually labeled evaluation Tweets
evaluation_estimates = estimator.predict(evaluation_features)
evaluation_accuracy = balanced_accuracy_score(evaluation_targets, evaluation_estimates)
evaluation_confusion_matrix = confusion_matrix(evaluation_targets, evaluation_estimates)
evaluation_confusion_matrix = confusion_matrix_to_string(evaluation_confusion_matrix)

## Save trained model for future reference

In [36]:
with open(OUTPUT_MODEL, "wb") as handle:
    pickle.dump(estimator, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Save metadata and calculated statistics
This is done by saving every textual and numerical variable to file.

In [37]:
current_variables = dir()

results_string = ""

for variable in current_variables:
    # Skip environment variables and their container variable
    # Ignore also underscore variables
    if variable in environment_variables or variable == "environment_variables" or variable.startswith("_"):
        continue

    # Get variables value
    variable_value = globals()[variable]

    # If variable is numerical or string, append it to results
    if type(variable_value) is str or type(variable_value) is int or \
        type(variable_value) is float or type(variable_value) is list or \
        type(variable_value) is numpy.float64 or type(variable_value) is dict:
        results_string += f"{variable}: {variable_value}\n"

# Print results to screen
print(results_string)

# Save results to file
with open(OUTPUT_RESULTS, "w") as file:
    file.write(results_string)

HYPER_PARAMETER_C: [0.2, 0.4, 0.6000000000000001, 0.8, 1.0, 1.2, 1.4000000000000001, 1.6, 1.8]
HYPER_PARAMETER_CLASS_WEIGHT: balanced
HYPER_PARAMETER_COEF0: [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9]
HYPER_PARAMETER_GAMMA: ['scale', 'auto']
HYPER_PARAMETER_KERNEL: ['linear', 'poly', 'rbf', 'sigmoid']
HYPER_PARAMETER_OPTIMIZATION_CV: 10
HYPER_PARAMETER_OPTIMIZATION_SCORING: accuracy
INPUT_EVALUATION_FILE: ../data/preprocessed/difference_preprocessed_evaluation_dataset.csv
INPUT_TRAINING_FILE: ../data/preprocessed/difference_preprocessed_training_dataset.csv
LABELS: ['Operational', 'Bankrupt']
METHOD_NAME: svm
OUTPUT_MODEL: ../data/models/svm_model_09_05_2022-15_20_52.pkl
OUTPUT_RESULTS: ../data/results/svm_model_09_05_2022-15_20_52.txt
RANDOM_SEED: 42
TIMESTAMP: 09_05_2022-15_20_52
VERBOSITY: 2
evaluation_accuracy: 0.8200757575757576
evaluation_confusion_matrix: True operational=1115, True bankrupt=35, False operational=205, False bankrupt=9
parameters: