In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, average_precision_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Global seaborn configuration for the notebook: select the "whitegrid" style.
sns.set(style="whitegrid")


In [3]:
# Load the preprocessed credit-card transactions and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column that looks like a row
# index saved together with the CSV -- confirm against the preprocessing step.
DATA_PATH = "../data/creditcard_preprocessed.csv"

df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V23,V24,V25,V26,V27,V28,Amount,Class,Amount_log,Hour
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,5.01476,0.0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0,1.305626,0.0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,5.939276,0.0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0,4.824306,0.0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0,4.262539,0.0


In [4]:
# Name of the label column (0 / 1 in the preview above).
TARGET_COL = "Class"

# The loaded frame carries an "Unnamed: 0" column (visible in the df.head()
# preview) whose values are a running row counter -- presumably the index that
# was written out with the CSV. It describes the row's position in the file,
# not the transaction, so it must not be handed to the models as a feature.
index_artifact_cols = [
    col for col in df.columns if str(col).startswith("Unnamed:")
]

# Feature matrix: every column except the label and the index artifact(s).
X = df.drop(columns=[TARGET_COL, *index_artifact_cols])
# Target vector.
y = df[TARGET_COL]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so that re-running the notebook yields the same
# partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [5]:
# Logistic Regression + SMOTE pipeline
# Steps run in order: standardise the features, oversample with SMOTE, then
# fit the linear classifier.
lr_clf = LogisticRegression(max_iter=1000, n_jobs=-1)

pipeline_lr = ImbPipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        ("clf", lr_clf),
    ]
)
pipeline_lr.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_lr = pipeline_lr.predict(X_test)

# Keep only the second predict_proba column; it is used below as the score
# for the positive class (Class == 1).
lr_class_probs = pipeline_lr.predict_proba(X_test)
y_proba_lr = lr_class_probs[:, 1]


In [7]:
# Evaluate the logistic-regression pipeline on the held-out test split.
lr_conf_mat = confusion_matrix(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr, digits=4)

print("Logistic Regression Metrics")
print(lr_conf_mat)
print(lr_report)

# Score-based metrics are computed from the predicted probabilities rather
# than from the hard labels.
lr_roc_auc = roc_auc_score(y_test, y_proba_lr)
lr_pr_auc = average_precision_score(y_test, y_proba_lr)

print("ROC-AUC:", lr_roc_auc)
print("PR-AUC:", lr_pr_auc)

Logistic Regression Metrics
[[55311  1553]
 [    8    90]]
              precision    recall  f1-score   support

           0     0.9999    0.9727    0.9861     56864
           1     0.0548    0.9184    0.1034        98

    accuracy                         0.9726     56962
   macro avg     0.5273    0.9455    0.5447     56962
weighted avg     0.9982    0.9726    0.9846     56962

ROC-AUC: 0.9735408794919206
PR-AUC: 0.7315787895461129


In [8]:
# Random Forest + SMOTE pipeline
# Same preprocessing steps as the logistic-regression pipeline (scaler, then
# SMOTE); only the final estimator differs.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

rf_steps = [
    ("scaler", StandardScaler()),
    ("smote", SMOTE(random_state=42)),
    ("clf", rf),
]
pipeline_rf = ImbPipeline(steps=rf_steps)
pipeline_rf.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_rf = pipeline_rf.predict(X_test)

# Keep only the second predict_proba column; it is used below as the score
# for the positive class (Class == 1).
rf_class_probs = pipeline_rf.predict_proba(X_test)
y_proba_rf = rf_class_probs[:, 1]

In [9]:
# Evaluate the random-forest pipeline on the held-out test split.
rf_conf_mat = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, digits=4)

print("Random Forest Metrics")
print(rf_conf_mat)
print(rf_report)

# Score-based metrics are computed from the predicted probabilities rather
# than from the hard labels.
rf_roc_auc = roc_auc_score(y_test, y_proba_rf)
rf_pr_auc = average_precision_score(y_test, y_proba_rf)

print("ROC-AUC:", rf_roc_auc)
print("PR-AUC:", rf_pr_auc)


Random Forest Metrics
[[56852    12]
 [   18    80]]
              precision    recall  f1-score   support

           0     0.9997    0.9998    0.9997     56864
           1     0.8696    0.8163    0.8421        98

    accuracy                         0.9995     56962
   macro avg     0.9346    0.9081    0.9209     56962
weighted avg     0.9995    0.9995    0.9995     56962

ROC-AUC: 0.9823955187026977
PR-AUC: 0.8739995740241422


In [10]:
# Compare logistic regression vs random forest
# One line per (metric, model) pair, grouped by metric: LR first, then RF.
# The padding inside each label keeps the printed values aligned.
print("\nModel Comparison")
for metric_label, metric_fn in (
    ("ROC-AUC:  ", roc_auc_score),
    ("PR-AUC:   ", average_precision_score),
):
    for model_tag, model_scores in (("LR", y_proba_lr), ("RF", y_proba_rf)):
        print(f"{model_tag} {metric_label}{metric_fn(y_test, model_scores):.4f}")



Model Comparison
LR ROC-AUC:  0.9735
RF ROC-AUC:  0.9824
LR PR-AUC:   0.7316
RF PR-AUC:   0.8740
