In [None]:
import csv
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Ask the user where the dataset lives.
input_data = input("Enter the path to the dataset CSV file: ")

# Positional index of the severity (target) column in the CSV.
# Known values: 7 - stats19, 14 - crss, 0 - BAAC
si = int(input("Enter the severity column index: "))

# Load the CSV with pandas, then separate the target column from the features.
df = pd.read_csv(input_data)
target_col_name = df.columns[si]
severity = df[target_col_name]
data = df.drop(columns=[target_col_name])
header = data.columns

# Hold out 25% of the rows as the test split. The split is stratified on the
# severity column and uses a fixed seed so re-runs give the same partition.
split_parts = train_test_split(
    data,
    severity,
    test_size=0.25,
    stratify=severity,
    random_state=42,
)
data_train, data_test, severity_train, severity_test = split_parts

# Report the partition sizes (test first, then train) and the class counts.
print(len(data_test), len(data_train))
print(len(severity_test), len(severity_train))
for partition_name, partition_labels in (("train", severity_train), ("test", severity_test)):
    print(f"Severity distribution in {partition_name}:")
    print(pd.Series(partition_labels).value_counts())

In [None]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation
from imblearn.over_sampling import SMOTE

# Show the menu of model / balancing combinations. The option number shown to
# the user is the position of the entry in this tuple.
print("Please select a boosting model and dataset balancing option:")
menu_entries = (
    "Catboost with no balancing",
    "Catboost weighted balancing",
    "Catboost oversampled balancing",
    "Catboost undersampled balancing",
    "Catboost SMOTE balancing",
    "XGBoost with no balancing",
    "XGBoost weighted balancing",
    "XGBoost oversampled balancing",
    "XGBoost undersampled balancing",
    "XGBoost SMOTE balancing",
    "LGBM with no balancing",
    "LGBM weighted balancing",
    "LGBM oversampled balancing",
    "LGBM undersampled balancing",
)
for option_number, description in enumerate(menu_entries):
    print(f"Type {option_number} for {description}")

# Read the selection; it stays a string and is compared as a string later on.
user_input = input("Enter your choice: ")

# Train the model / balancing combination selected in `user_input`.
#
# Every option follows the same recipe:
#   1. (XGBoost, and the resampled LGBM options) re-encode the labels as
#      0..n_classes-1,
#   2. optionally rebalance the *training* split (resampling or weights),
#   3. fit with early stopping evaluated on the held-out split,
#   4. save the fitted model under `model_name` plus a framework extension.
#
# The cell leaves `model` and `model_name` defined for the cells below.

# Menu key -> (framework, balancing strategy).
MODEL_CHOICES = {
    "0": ("catboost", "none"),
    "1": ("catboost", "weighted"),
    "2": ("catboost", "oversampled"),
    "3": ("catboost", "undersampled"),
    "4": ("catboost", "smote"),
    "5": ("xgboost", "none"),
    "6": ("xgboost", "weighted"),
    "7": ("xgboost", "oversampled"),
    "8": ("xgboost", "undersampled"),
    "9": ("xgboost", "smote"),
    "10": ("lgbm", "none"),
    "11": ("lgbm", "weighted"),
    "12": ("lgbm", "oversampled"),
    "13": ("lgbm", "undersampled"),
}

# Human-readable fragments used in the confirmation message.
FRAMEWORK_LABELS = {"catboost": "Catboost", "xgboost": "XGBoost", "lgbm": "LGBM"}
BALANCING_LABELS = {
    "none": "with no balancing",
    "weighted": "weighted balancing",
    "oversampled": "oversampled balancing",
    "undersampled": "undersampled balancing",
    "smote": "SMOTE balancing",
}

# File-name prefixes. "weighed" (sic) is kept so that the names of the files
# written by this notebook do not change.
MODEL_NAME_PREFIXES = {
    "none": "no_balance",
    "weighted": "weighed",
    "oversampled": "oversampled",
    "undersampled": "undersampled",
    "smote": "smote",
}

# Balancing strategies that work by resampling the training split.
RESAMPLERS = {
    "oversampled": RandomOverSampler,
    "undersampled": RandomUnderSampler,
    "smote": SMOTE,
}


def _encode_labels(train_labels, test_labels):
    """Re-encode class labels as consecutive integers 0..n_classes-1.

    The mapping is built from the sorted unique values of `train_labels`.
    When the labels already are exactly 0..n_classes-1 both inputs are
    returned unchanged; otherwise two plain lists of encoded labels are
    returned.

    Raises:
        KeyError: if `test_labels` contains a class absent from `train_labels`.
    """
    classes = np.unique(train_labels)
    # Compare against the full expected range, not just the first element,
    # so label sets such as {0, 2, 3} are re-encoded as well.
    if classes.tolist() == list(range(len(classes))):
        return train_labels, test_labels
    class_map = {cls: idx for idx, cls in enumerate(classes)}
    return (
        [class_map[cls] for cls in train_labels],
        [class_map[cls] for cls in test_labels],
    )


choice = user_input.strip()

if choice not in MODEL_CHOICES:
    print(f"Invalid choice. Please select a number between 0 and {len(MODEL_CHOICES) - 1}.")
else:
    framework, balancing = MODEL_CHOICES[choice]
    print(f"You have selected {FRAMEWORK_LABELS[framework]} {BALANCING_LABELS[balancing]}")
    model_name = f"{MODEL_NAME_PREFIXES[balancing]}_{framework}_model"

    # NOTE: this rebinds the notebook-level severity_train / severity_test.
    # The evaluation cells compare the model's predictions with severity_test,
    # so the held-out labels must use the same encoding the model was fit on.
    # Re-running is safe: labels that are already 0..n-1 are left untouched.
    if framework == "xgboost" or (framework == "lgbm" and balancing in RESAMPLERS):
        severity_train, severity_test = _encode_labels(severity_train, severity_test)

    # Only the training split is ever rebalanced; the held-out split keeps
    # its original class distribution.
    if balancing in RESAMPLERS:
        resampler = RESAMPLERS[balancing](random_state=42)
        data_train_resampled, severity_train_resampled = resampler.fit_resample(
            data_train, severity_train)
        fit_features, fit_labels = data_train_resampled, severity_train_resampled
    else:
        fit_features, fit_labels = data_train, severity_train

    if framework == "catboost":
        model_params = {}
        if balancing == "weighted":
            # Inverse-frequency weight per class, ordered like np.unique().
            class_counts = pd.Series(severity_train).value_counts()
            class_weights = [1 / class_counts.loc[cls] for cls in np.unique(severity_train)]
            model_params["class_weights"] = class_weights
        model = CatBoostClassifier(iterations=300, early_stopping_rounds=10,
                                   random_state=42, **model_params)
        model.fit(fit_features, fit_labels, eval_set=(data_test, severity_test), plot=True)
        model.save_model(model_name + '.cbm')

    elif framework == "xgboost":
        model_params = {}
        fit_params = {}
        if balancing == "weighted":
            model_params.update(objective='multi:softmax', eval_metric='mlogloss')
            # Per-sample weights that balance the classes.
            sample_weights = compute_sample_weight(class_weight='balanced', y=severity_train)
            fit_params["sample_weight"] = sample_weights
        model = XGBClassifier(n_estimators=300, early_stopping_rounds=10,
                              random_state=42, **model_params)
        model.fit(fit_features, fit_labels, eval_set=[(data_test, severity_test)], **fit_params)
        model.save_model(model_name + '.json')

    else:  # framework == "lgbm"
        fit_params = {}
        if balancing == "weighted":
            # Inverse-frequency weight per training sample. Wrapping in
            # pd.Series keeps this working when severity_train is a list
            # (it is after an option that re-encodes the labels has run).
            label_series = pd.Series(severity_train)
            weights = 1 / label_series.map(label_series.value_counts())
            fit_params["sample_weight"] = weights
        # Computed here for every LGBM option instead of relying on a value
        # left behind by another branch.
        unique_classes = np.unique(severity_train)
        model = LGBMClassifier(objective='multiclass',
                               num_class=len(unique_classes),
                               metric='multi_logloss',
                               boosting_type='gbdt',
                               learning_rate=0.1,
                               feature_fraction=0.9,
                               bagging_fraction=0.8,
                               bagging_freq=5,
                               n_estimators=1000,
                               random_state=42)
        model.fit(X=fit_features,
                  y=fit_labels,
                  eval_set=[(data_test, severity_test)],
                  callbacks=[early_stopping(10), log_evaluation(period=100)],  # log every 100 iterations
                  **fit_params)
        model.booster_.save_model(model_name + '.txt')

    # Printed only when a model was actually trained.
    print("Training and Fitting is Done")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Pairwise correlations between the feature columns, drawn as a heatmap.
correlation_matrix = data.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="YlGnBu",
    annot_kws={"fontsize": 6},
)
plt.title("Correlation Matrix")
plt.show()

# Feature names are the feature-column labels captured when the data was loaded.
feature_names = header

# Scale the fitted model's importances so they sum to 1, rounded to 3 decimals.
raw_importances = model.feature_importances_
feature_importances = np.round(raw_importances / np.sum(raw_importances), 3)
print(feature_importances)

# Rank the features from most to least important.
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = [feature_names[position] for position in sorted_indices]

# Persist the ranking as "name,importance" rows without a header line.
df_feat_imp = pd.DataFrame(sorted_feature_importances, index=sorted_feature_names)
df_feat_imp.to_csv(f"fi_{model_name}.csv", index=True, header=False)

# Bar chart of the ranked importances.
plt.figure(figsize=(22, 6))
plt.bar(sorted_feature_names, sorted_feature_importances)
plt.xticks(fontsize=7)  # small labels so many feature names fit on the axis
plt.xlabel('Features', fontsize=12)
plt.ylabel('Importance', fontsize=12)
plt.title('Feature Importances')

plt.show()

In [None]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
import numpy as np

# Evaluate the fitted model on the held-out split and export the metrics.
y_pred = model.predict(data_test)
y_proba = model.predict_proba(data_test)  # class probabilities, needed for ROC AUC
accuracy = accuracy_score(severity_test, y_pred)

# These names used to be filled by a second predict / predict_proba call on
# the same data; they now alias the results above so inference runs only once.
test_result = y_pred
proba_test_result = y_proba
n_classes = len(model.classes_)

# Confusion matrix: rows are the actual classes, columns the predicted
# classes. Entry [i, j] counts samples of actual class i predicted as class j,
# so the diagonal holds the correct predictions. For a given class, the other
# entries in its row are its false negatives and the other entries in its
# column are its false positives.
cm = confusion_matrix(severity_test, y_pred)

# Share of all samples on or below / on or above the diagonal. Both figures
# include the diagonal (the correct predictions), so they sum to 1 + accuracy.
acc_cons_dwn = np.sum(np.tril(cm)/np.sum(cm))
acc_cons_up = np.sum(np.triu(cm)/np.sum(cm))
print(f'confusion_matrix_lower_triangle: {acc_cons_dwn}')
print(f'confusion_matrix_upper_triangle: {acc_cons_up}')

# ROC AUC: area under the curve of true-positive rate against false-positive
# rate; higher means the classes are separated better. With more than two
# classes it is computed one-vs-rest and macro-averaged. With exactly two
# classes scikit-learn expects a 1-D score, i.e. the probability of the class
# with the greater label (the second probability column).
if y_proba.shape[1] == 2:
    roc_auc = roc_auc_score(severity_test, y_proba[:, 1])
else:
    roc_auc = roc_auc_score(severity_test, y_proba, multi_class='ovr', average='macro')
print(f'AUC_ROC: {roc_auc}')

# Per-class precision / recall / f1, extended with the scalar metrics above.
# Scalars are wrapped in one-entry dicts so they flatten like the other rows.
report = classification_report(severity_test, y_pred, output_dict=True)
report['accuracy'] = {" ": accuracy}
report['ROC AUC'] = {" ": roc_auc}
report['acc_cons_up'] = {" ": acc_cons_up}
report['acc_cons_dwn'] = {" ": acc_cons_dwn}

print(report)

# Flatten the nested report into (model name, class, metric) -> score and
# drop the 'support' counts. The key uses `model_name` (a string); using the
# estimator object would write its repr into the CSV's Model column.
flat_report = {}
for class_label, class_report in report.items():
    if not isinstance(class_report, dict):
        continue
    for metric, score in class_report.items():
        if metric != 'support':
            flat_report[(model_name, class_label, metric)] = score

# One row per metric, with columns in the order written to disk.
df_report = pd.DataFrame(
    [(*key, score) for key, score in flat_report.items()],
    columns=['Model', 'Class', 'Metric', 'Score'],
)

# Save the DataFrame to a CSV file.
df_report.to_csv('cr_' + model_name + '.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
import numpy as np

# Function to plot confusion matrix heatmap
def plot_confusion_matrix(cm, class_labels):
    """Draw a confusion matrix as an annotated heatmap on a new 8x6 figure.

    Args:
        cm: the confusion matrix to draw; cells are annotated as integers.
        class_labels: tick labels used on both the x and the y axis.
    """
    plt.figure(figsize=(8, 6))
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Reds',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    heatmap_axes = sns.heatmap(cm, **heatmap_options)
    heatmap_axes.set_xlabel('Predicted Label')
    heatmap_axes.set_ylabel('True Label')
    heatmap_axes.set_title('Confusion Matrix')

# Draw the heatmap for the evaluated model, labelling both axes with the
# class labels the fitted model reports.
plot_confusion_matrix(cm=cm, class_labels=model.classes_)

In [None]:
# Function to plot ROC AUC curves for each class
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize

# One-vs-rest ROC curve for every class of the fitted model.

# One indicator column per class, in the order of model.classes_.
y_test_bin = label_binarize(severity_test, classes=model.classes_)
n_classes = len(model.classes_)

# For a two-class problem label_binarize returns a single column (the
# indicator of the second class). Expand it to two columns so that column i
# lines up with class i and with column i of predict_proba below.
if n_classes == 2 and y_test_bin.shape[1] == 1:
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

# Predicted class probabilities, one column per class.
y_probs = model.predict_proba(data_test)

# ROC curve points and area under the curve, keyed by class position.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all curves on one figure, plus the diagonal of a random classifier.
plt.figure(figsize=(8, 6))
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {model.classes_[i]} (AUC = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC AUC')
plt.legend(loc="lower right")
plt.show()
