# Importing necessary libraries

In [1]:
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN


In [2]:
# Loading the encoded train and test data
train_data = joblib.load('encoded_train_data.joblib', mmap_mode='r')
test_data = joblib.load('encoded_test_data.joblib', mmap_mode='r')

# Separateing features (X) and target variable (y)
X = train_data.drop(columns=['IncidentGrade'])
y = train_data['IncidentGrade']

# Spliting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Comparing Machine Learning Models

In [11]:

X_train_subsample = X_train.sample(frac=0.1, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(n_jobs=-1, random_state=42),
    'LightGBM': LGBMClassifier(n_jobs=-1, random_state=42),
}

for model_name, model in models.items():
    print(f'Model: {model_name}')
    
    model.fit(X_train_subsample, y_train_subsample)
    
    y_pred = model.predict(X_val)
    
    # Evaluateing the models
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    # Displaying the results of the modles
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('Confusion Matrix:')
    print(cm)
    print('-' * 50)

Model: Logistic Regression
Accuracy: 0.6334398207738485
Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.87      0.71    765560
           1       0.64      0.16      0.26    390976
           2       0.70      0.64      0.67    628025

    accuracy                           0.63   1784561
   macro avg       0.65      0.56      0.55   1784561
weighted avg       0.64      0.63      0.60   1784561

Confusion Matrix:
[[667388  21971  76201]
 [234097  63266  93613]
 [214156  14111 399758]]
--------------------------------------------------
Model: Random Forest
Accuracy: 0.701326544735652
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.84      0.75    765560
           1       0.67      0.44      0.53    390976
           2       0.75      0.70      0.72    628025

    accuracy                           0.70   1784561
   macro avg       0.70      0.66      0.67   1784561


In [12]:

# Createing a report data
report = {
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM', 'Gradient Boosting'],
    'Accuracy': [0.6318, 0.7005, 0.7011, 0.6777, 0.6756, 0.6414],
    'Macro-F1 Score': [0.54, 0.67, 0.67, 0.62, 0.61, 0.55],
    'Precision': [0.64, 0.70, 0.70, 0.71, 0.72, 0.69],
    'Recall': [0.55, 0.66, 0.66, 0.61, 0.61, 0.56]
}

df = pd.DataFrame(report)

print("Comparison Table:")
print(df.to_string(index=False))

best_models_with_max_f1 = df[df['Macro-F1 Score'] == df['Macro-F1 Score'].max()]

if len(best_models_with_max_f1) > 1:
    best_model = best_models_with_max_f1.loc[best_models_with_max_f1['Accuracy'].idxmax()]
else:
    best_model = df.loc[df['Macro-F1 Score'].idxmax()]

print("\nBest Model Based on Macro-F1 Score (and Accuracy in case of a tie):")
print(best_model)

Comparison Table:
              Model  Accuracy  Macro-F1 Score  Precision  Recall
Logistic Regression    0.6318            0.54       0.64    0.55
      Decision Tree    0.7005            0.67       0.70    0.66
      Random Forest    0.7011            0.67       0.70    0.66
            XGBoost    0.6777            0.62       0.71    0.61
           LightGBM    0.6756            0.61       0.72    0.61
  Gradient Boosting    0.6414            0.55       0.69    0.56

Best Model Based on Macro-F1 Score (and Accuracy in case of a tie):
Model             Random Forest
Accuracy                 0.7011
Macro-F1 Score             0.67
Precision                   0.7
Recall                     0.66
Name: 2, dtype: object


## Applying SMOTE to the training data for class imbalance and doing hyperparameter tuning for best result

In [13]:

# Loading the encoded train data
train_data = joblib.load('encoded_train_data.joblib', mmap_mode='r')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1)

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

if X_train_sampled.select_dtypes(include=['bool']).shape[1] > 0:
    X_train_sampled = X_train_sampled.astype(int)

# Applying SMOTE for multi-class classification (default strategy balances all classes equally)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sampled, y_train_sampled)

# Hyperparameters for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 75],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5,
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fitting the Randomized Search with resampled training data
random_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and model
best_rf = random_search.best_estimator_

# Evaluating on validation data
y_pred = best_rf.predict(X_val)

# Printing the results
print("Best Hyperparameters:", random_search.best_params_)
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Saving the tuned model
joblib.dump(best_rf, "rf_smote_tuned_model.joblib")
print("Model saved as rf_smote_tuned_model.joblib")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Hyperparameters: {'n_estimators': 75, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.71      0.70    765560
           1       0.50      0.54      0.52    390976
           2       0.73      0.66      0.69    628025

    accuracy                           0.66   1784561
   macro avg       0.64      0.64      0.64   1784561
weighted avg       0.66      0.66      0.66   1784561

Confusion Matrix:
[[546173 130013  89374]
 [115108 211526  64342]
 [129133  83129 415763]]
Model saved as rf_smote_tuned_model.joblib


In [14]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data.joblib', mmap_mode='r')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

# Random Forest without SMOTE
rf_no_smote = RandomForestClassifier(random_state=42, n_jobs=-1)

# Training the model
rf_no_smote.fit(X_train_sampled, y_train_sampled)
y_pred_no_smote = rf_no_smote.predict(X_val)

print("Classification Report Without SMOTE:")
print(classification_report(y_val, y_pred_no_smote))

print("Confusion Matrix Without SMOTE:")
print(confusion_matrix(y_val, y_pred_no_smote))

import joblib
joblib.dump(rf_no_smote, "rf_no_smote_model.joblib")

Classification Report Without SMOTE:
              precision    recall  f1-score   support

           0       0.67      0.79      0.73    765560
           1       0.61      0.43      0.50    390976
           2       0.72      0.70      0.71    628025

    accuracy                           0.68   1784561
   macro avg       0.67      0.64      0.65   1784561
weighted avg       0.67      0.68      0.67   1784561

Confusion Matrix Without SMOTE:
[[605585  65963  94012]
 [145512 167813  77651]
 [147899  43079 437047]]


['rf_no_smote_model.joblib']

# Evaluation of Best Random Forest Model on Test Data

In [15]:
# Loading the saved Random Forest model
best_rf = joblib.load("rf_smote_tuned_model.joblib")

# Loading the test dataset
test_data = joblib.load('encoded_test_data.joblib', mmap_mode='r')

# Separateing the features and target from test data
X_test = test_data.drop('IncidentGrade', axis=1)  
y_test = test_data['IncidentGrade']

# Makeing predictions on the test data
y_test_pred = best_rf.predict(X_test)

# Evaluateing the saved model on the test data
print("\nClassification Report on Test Data:")
report = classification_report(y_test, y_test_pred, output_dict=True)
print(classification_report(y_test, y_test_pred))

macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']

print("\nMacro-F1 Score: {:.2f}".format(macro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("\nConfusion Matrix on Test Data:")
print(confusion_matrix(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.64      0.67      0.66   1630942
           1       0.43      0.47      0.45    868897
           2       0.71      0.63      0.67   1422856

    accuracy                           0.61   3922695
   macro avg       0.59      0.59      0.59   3922695
weighted avg       0.62      0.61      0.61   3922695


Macro-F1 Score: 0.59
Macro Precision: 0.59
Macro Recall: 0.59

Confusion Matrix on Test Data:
[[1098474  316803  215665]
 [ 309293  405018  154586]
 [ 305012  221387  896457]]


In [16]:
# Loading the saved Random Forest model
best_rf = joblib.load("rf_no_smote_model.joblib")

# Loading the test dataset
test_data = joblib.load('encoded_test_data.joblib', mmap_mode='r')

# Separateing the features and target from test data
X_test = test_data.drop('IncidentGrade', axis=1)  
y_test = test_data['IncidentGrade']

# Makeing predictions on the test data
y_test_pred = best_rf.predict(X_test)

# Evaluateing the saved model on the test data
print("\nClassification Report on Test Data:")
report = classification_report(y_test, y_test_pred, output_dict=True)
print(classification_report(y_test, y_test_pred))

macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']

print("\nMacro-F1 Score: {:.2f}".format(macro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("\nConfusion Matrix on Test Data:")
print(confusion_matrix(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.63      0.75      0.69   1630942
           1       0.51      0.36      0.42    868897
           2       0.70      0.66      0.68   1422856

    accuracy                           0.63   3922695
   macro avg       0.61      0.59      0.60   3922695
weighted avg       0.63      0.63      0.62   3922695


Macro-F1 Score: 0.60
Macro Precision: 0.61
Macro Recall: 0.59

Confusion Matrix on Test Data:
[[1225931  173642  231369]
 [ 372911  314848  181138]
 [ 348671  131592  942593]]


## Applying SMOTE-ENN to the training data for class imbalance and doing hyperparameter tuning for best result
## (SMOTE + Edited Nearest Neighbors)

* SMOTE: Adds synthetic samples to balance the classes.
* SMOTE-ENN: Adds synthetic samples and then removes noisy or ambiguous samples for better data quality.

In [17]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data.joblib', mmap_mode='r')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1)

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

if X_train_sampled.select_dtypes(include=['bool']).shape[1] > 0:
    X_train_sampled = X_train_sampled.astype(int)

# Applying SMOTE for multi-class classification (default strategy balances all classes equally)
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_sampled, y_train_sampled)

# Hyperparameters for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 75],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5,
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fitting the Randomized Search with resampled training data
random_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and model
best_rf = random_search.best_estimator_

# Evaluating on validation data
y_pred = best_rf.predict(X_val)

# Printing the results
print("Best Hyperparameters:", random_search.best_params_)
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Saving the tuned model
joblib.dump(best_rf, "rf_smote_enn_tuned_model.joblib")
print("Model saved as rf_smote_enn_tuned_model.joblib")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Hyperparameters: {'n_estimators': 75, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.58      0.63    765560
           1       0.40      0.58      0.47    390976
           2       0.71      0.65      0.68    628025

    accuracy                           0.60   1784561
   macro avg       0.60      0.60      0.59   1784561
weighted avg       0.63      0.60      0.61   1784561

Confusion Matrix:
[[442274 222094 101192]
 [ 98760 224981  67235]
 [ 98375 120051 409599]]
Model saved as rf_smote_enn_tuned_model.joblib


In [18]:
# Loading the saved Random Forest model
best_rf = joblib.load("rf_smote_tuned_model.joblib")

# Loading the test dataset
test_data = joblib.load('encoded_test_data.joblib', mmap_mode='r')

# Separateing the features and target from test data
X_test = test_data.drop('IncidentGrade', axis=1)  
y_test = test_data['IncidentGrade']

# Makeing predictions on the test data
y_test_pred = best_rf.predict(X_test)

# Evaluateing the saved model on the test data
print("\nClassification Report on Test Data:")
report = classification_report(y_test, y_test_pred, output_dict=True)
print(classification_report(y_test, y_test_pred))

macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']

print("\nMacro-F1 Score: {:.2f}".format(macro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("\nConfusion Matrix on Test Data:")
print(confusion_matrix(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.64      0.67      0.66   1630942
           1       0.43      0.47      0.45    868897
           2       0.71      0.63      0.67   1422856

    accuracy                           0.61   3922695
   macro avg       0.59      0.59      0.59   3922695
weighted avg       0.62      0.61      0.61   3922695


Macro-F1 Score: 0.59
Macro Precision: 0.59
Macro Recall: 0.59

Confusion Matrix on Test Data:
[[1098477  316800  215665]
 [ 309293  405018  154586]
 [ 305012  221387  896457]]
