In [18]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from xgboost import XGBClassifier
from sklearn.ensemble import (
    RandomForestClassifier, 
    AdaBoostClassifier,
    GradientBoostingClassifier
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, auc, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

# 5. Model Training

In [19]:
# Load individual train and test dataset arrays
train_array = np.load('train_array.npy')
test_array = np.load('test_array.npy')

In [20]:
X_train, y_train, X_test, y_test = (
    train_array[:, :-1],
    train_array[:, -1],
    test_array[:, :-1],
    test_array[:, -1]
)

In [21]:
print(X_train[:5])
print(y_train[:5])

[[-1.13072593e-01  2.50000000e-05  2.50000000e-05  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 7.32170486e+00  2.50000000e-05  2.50000000e-05  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 6.10589071e-02  2.50000000e-05  2.50000000e-05  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 8.68951820e+00  2.50000000e-05  1.00000000e-04  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  

### Base Model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

param_grid = {
    'C': [0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'lbfgs']  # Solvers compatible with smaller datasets
}

log_reg = LogisticRegression(random_state=42, max_iter=1000)

grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=2,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)

best_model = grid_search.best_estimator_

# Predict on Train data
y_train_pred = best_model.predict(X_train)
# Predict on Test data
y_test_pred = best_model.predict(X_test)

print(f"Precision: {precision_score(y_test_pred, y_test, average='weighted')}")
print(f"Recall: {recall_score(y_test_pred, y_test, average='weighted')}")
print(f"F1 score: {f1_score(y_test_pred, y_test, average='weighted')}")
# Compute confusion matrix
print(f"Confusion Matrix: \n{confusion_matrix(y_test_pred, y_test)}")

   

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] END ............................C=0.1, solver=liblinear; total time=   0.0s
[CV] END ............................C=0.1, solver=liblinear; total time=   0.0s
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.0s
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.0s
[CV] END ..............................C=1, solver=liblinear; total time=   0.0s
[CV] END ..............................C=1, solver=liblinear; total time=   0.0s
[CV] END ..................................C=1, solver=lbfgs; total time=   0.0s
[CV] END ..................................C=1, solver=lbfgs; total time=   0.0s
[CV] END .............................C=10, solver=liblinear; total time=   0.0s
[CV] END .............................C=10, solver=liblinear; total time=   0.0s
[CV] END .................................C=10, solver=lbfgs; total time=   0.0s
[CV] END .................................C=10, s

#### Oversamplig the data and cross validating by using different models


In [23]:
"""  Imbalance dataset - upsampling """
def upsampling_data(X, y):
    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_sm, y_sm = sm.fit_resample(X, y)   
    print(f"\n====== Upsampled the minority class data ======") 
    print("Before SMOTE: ", Counter(y))
    print("After SMOTE: ", Counter(y_sm))
    return X_sm, y_sm


""" Evaluating Models """
def model_evaluation(model, X, y, num_procs):
    # parallel cross-validate models
    print("\n===================== Beginning cross validation ========================== ")
    # Define cross-validation method
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # Define metrics
    metrics = {
        'Precision': make_scorer(precision_score, average='weighted'),
        'Recall': make_scorer(recall_score, average='weighted'),
        'F1 score': make_scorer(f1_score, average='weighted')
    }
    # Perform cross-validation and evaluate using different metrics
    metrics_results = {}
    for name, metric in metrics.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring=metric).mean()
        metrics_results[name] = scores
    return metrics_results


""" Calculate Metrics """
def model_metrics(y_pred, y_test):    
    precision = precision_score(y_pred, y_test, average='weighted')
    recall = recall_score(y_pred, y_test, average='weighted')
    f1 = f1_score(y_pred, y_test, average='weighted')
    # Compute confusion matrix
    cm = confusion_matrix(y_pred, y_test) 
    return precision, recall, f1, cm

##### Models evaluation 

In [24]:
"""
    Evaluating Models
    - Random_Forest
    - AdaBoost
    - XGBoost
"""

# Initialize the classifiers
models = {
    "Random_Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    # "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier()
}


# Define the hyperparameters and their possible values
params = {
    "Random_Forest": {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        # 'max_depth': [None, 10, 20, 30],
        # 'min_samples_split': [2, 5, 10],
        # 'min_samples_leaf': [1, 2, 4],
        # 'max_features': ['sqrt', 'log2', None]
    },
    "AdaBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5, 1.0],
        # 'algorithm': ['SAMME', 'SAMME.R']
    },
    # "Gradient Boosting": {
    #     # 'n_estimators': [50, 100, 200],
    #     # 'learning_rate': [0.01, 0.1, 0.05, 0.001],
    #     # 'subsample': [0.6, 0.7, 0.8, 0.9],
    #     # 'max_depth': [3, 5, 7, 9],
    #     # 'min_samples_split': [2, 5, 10],
    #     # 'min_samples_leaf': [1, 2, 4]
    # },
    "XGBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.05, 0.001],
        # 'max_depth': [3, 5, 7, 9],
        # 'min_child_weight': [1, 3, 5],
        # 'gamma': [0, 0.1, 0.2],
        # 'subsample': [0.6, 0.7, 0.8, 0.9],
        # 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]
    }
}


# Initialize StratifiedKFold with 5 folds
# Stratified K-Fold ensures that each fold has the same proportion of classes as the entire dataset. 
skf = StratifiedKFold(n_splits=5)

# Upsampling train data
X_train_sm, y_train_sm = upsampling_data(X_train, y_train)

train_report = {}
test_report = {}
for i in range(len(models)):
    model = list(models.values())[i]
    param = params[list(models.keys())[i]]
    print(f"\n================ {model} ==================================")
    # Initalize GridSearchCV with 5-fold stratified cross validation
    # n_jobs is set to -1, to use all available CPU cores on the machine
    # verbose=2 to track the progress grid search or model training.
    grid_search = GridSearchCV(estimator=model, 
                               param_grid=param, 
                               cv=skf,
                               n_jobs=-1,
                               verbose=2)

    # Fit the grid search model
    grid_search.fit(X_train_sm, y_train_sm)
    print(f"Best parameters: {grid_search.best_params_} for {model}")

    # Set the model with best hyperparameters
    model.set_params(**grid_search.best_params_)
    # Fit the model
    model.fit(X_train_sm, y_train_sm)

    # Predict on Train data
    y_train_pred = model.predict(X_train_sm)
    # Predict on Test data
    y_test_pred = model.predict(X_test)

    # Get evaluation metrics for train and test data using best hyperparametrs
    print(f"Obtaining evaluation metrics for {model} by using best hyperparameters")
    precision_train, recall_train, f1_train, cm_train = model_metrics(y_train_pred, y_train_sm)
    train_model_score = []
    train_model_score.append({
        "Precision" : precision_train,
        "Recall" : recall_train,
        "F1 score": f1_train,
        "Confusion Matrix": cm_train
    })
    train_report[list(models.keys())[i]] = train_model_score

    precision_test, recall_test, f1_test, cm_test = model_metrics(y_test_pred, y_test)
    test_model_score = []
    test_model_score.append({
        "Precision" : precision_test,
        "Recall" : recall_test,
        "F1 score": f1_test,
        "Confusion Matrix": cm_test
    })
    test_report[list(models.keys())[i]] = test_model_score

    print("=============================================================\n\n")

print(f"Model performance on Training Data: \n{train_report}")
print(f"-----------------------------------------------------")
print(f"Model performance on Test Data: \n{test_report}")



Before SMOTE:  Counter({np.float64(0.0): 39959, np.float64(1.0): 41})
After SMOTE:  Counter({np.float64(0.0): 39959, np.float64(1.0): 39959})

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ....................criterion=gini, n_estimators=50; total time=   2.0s
[CV] END ....................criterion=gini, n_estimators=50; total time=   2.1s
[CV] END ....................criterion=gini, n_estimators=50; total time=   2.2s
[CV] END ....................criterion=gini, n_estimators=50; total time=   2.2s
[CV] END ....................criterion=gini, n_estimators=50; total time=   2.2s
[CV] END ...................criterion=gini, n_estimators=100; total time=   4.1s
[CV] END ...................criterion=gini, n_estimators=100; total time=   4.2s
[CV] END ...................criterion=gini, n_estimators=100; total time=   4.2s
[CV] END ...................criterion=gini, n_estimators=100; total time=   4.2s
[CV] END ...................criterion=gini, n_estimators=100; total

#### Models evaluation results on training data and test data

In [25]:
train_report_df = pd.DataFrame({
    'Model': list(train_report.keys()),
    'Precision': [metrics[0]['Precision'] for metrics in train_report.values()],
    'Recall': [metrics[0]['Recall'] for metrics in train_report.values()],
    'F1 Score': [metrics[0]['F1 score'] for metrics in train_report.values()],
    'Confusion Matrix': [metrics[0]['Confusion Matrix'] for metrics in train_report.values()]
})
print(f"Model performance on Train data: \n{train_report_df}\n")

test_report_df = pd.DataFrame({
    'Model': list(test_report.keys()),
    'Precision': [metrics[0]['Precision'] for metrics in test_report.values()],
    'Recall': [metrics[0]['Recall'] for metrics in test_report.values()],
    'F1 Score': [metrics[0]['F1 score'] for metrics in test_report.values()],
    'Confusion Matrix': [metrics[0]['Confusion Matrix'] for metrics in test_report.values()]
})
print(f"Model performance on Test data: \n{test_report_df}")


Model performance on Train data: 
           Model  Precision    Recall  F1 Score               Confusion Matrix
0  Random_Forest   0.999900  0.999900  0.999900       [[39952, 1], [7, 39958]]
1       AdaBoost   0.968477  0.967404  0.967422  [[37731, 377], [2228, 39582]]
2        XGBoost   0.990638  0.990503  0.990503    [[39251, 51], [708, 39908]]

Model performance on Test data: 
           Model  Precision  Recall  F1 Score        Confusion Matrix
0  Random_Forest   0.961823  0.9788  0.969589  [[9787, 12], [200, 1]]
1       AdaBoost   0.921879  0.9371  0.907923   [[9362, 4], [625, 9]]
2        XGBoost   0.957494  0.9765  0.966164  [[9764, 12], [223, 1]]


In [26]:
test_report

{'Random_Forest': [{'Precision': 0.9618226432824211,
   'Recall': 0.9788,
   'F1 score': 0.9695885681464568,
   'Confusion Matrix': array([[9787,   12],
          [ 200,    1]])}],
 'AdaBoost': [{'Precision': 0.9218786098851584,
   'Recall': 0.9371,
   'F1 score': 0.9079230021968008,
   'Confusion Matrix': array([[9362,    4],
          [ 625,    9]])}],
 'XGBoost': [{'Precision': 0.9574942194083078,
   'Recall': 0.9765,
   'F1 score': 0.9661644786073622,
   'Confusion Matrix': array([[9764,   12],
          [ 223,    1]])}]}

#### Models and its corresponding Recall score from test training report

In [27]:
models_recall_score = {model: recall_result[0]["Recall"] for model, recall_result in test_report.items()}
print(f"The models and their corresponding Recall score: \n{models_recall_score}")

The models and their corresponding Recall score: 
{'Random_Forest': 0.9788, 'AdaBoost': 0.9371, 'XGBoost': 0.9765}


#### Finding the best model and its score

In [28]:
best_model_name, best_score = max(models_recall_score.items(), key=lambda item: item[1])
print(f"Best Model name is {best_model_name} with Recall score: {best_score}")

# Get best model
best_model = models[best_model_name]
print(f"Best Model: {best_model}")

Best Model name is Random_Forest with Recall score: 0.9788
Best Model: RandomForestClassifier(n_estimators=50)


In [29]:
# Initialize the the best model 
best_model = RandomForestClassifier(n_estimators=200)
# Train the model on the training data
rfc_best = best_model.fit(X_train_sm, y_train_sm)

In [30]:
rfc_best.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(n_estimators=200)>

In [31]:
# Predict the labels for the test set
y_pred = rfc_best.predict(X_test)
print(y_pred)

[0. 0. 0. ... 0. 0. 0.]


In [32]:
precision_train, recall_train, f1_train, cm_train = model_metrics(y_pred, y_test)
result_dict = {"Final Model": best_model_name,
               "Precision" : precision_test,
               "Recall" : recall_test,
               "F1 score": f1_test,
               "Confusion Matrix": cm_test}
final_results = pd.DataFrame([result_dict])
final_results

Unnamed: 0,Final Model,Precision,Recall,F1 score,Confusion Matrix
0,Random_Forest,0.957494,0.9765,0.966164,"[[9764, 12], [223, 1]]"


In [33]:
# # Predict the probabilities for the test set
# test_probabilities = rfc_best.predict_proba(X_test)[:, 1]
# print(f"Probabilities prediction of 'Is Laundering':\n {test_probabilities}")

# # Compute the ROC curve and AUC
# fpr, tpr, thresholds = roc_curve(test_probabilities, y_test)
# roc_auc = auc(fpr, tpr)

# # Plot the ROC curve
# plt.figure(figsize=(4,4))
# RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=rfc_best).plot()
# plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
# plt.title('ROC Curve')
# plt.show()

In [34]:
# Compute the True Positive Rate (TPR) and False Positive Rate (FPR) for a specific threshold value
# desired_tpr = 0.88
# closest_threshold = thresholds[np.argmin(np.abs(tpr - desired_tpr))]
# print(f"Closet threshold to get Desired TPR of around {desired_tpr*100}%: {closest_threshold}")

# y_pred = (test_probabilities >= closest_threshold).astype(int)
# tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()

# fpr_cm = fp / (fp + tn)
# tpr_cm = tp / (tp + fn)
# print(f"False Positive Rate (FPR): {fpr_cm:.3f}")
# print(f"True Positive Rate (TPR): {tpr_cm:.3f}")

# disp = ConfusionMatrixDisplay.from_predictions(y_pred, y_test, cmap="Blues")
# plt.show()

# print(f"Classification report: \n {classification_report(y_pred, y_test)}")

#### Best model is Random Forest with Recall (True Postive Rate) of ~99%