<a href="https://colab.research.google.com/github/roboy88/Machine-Learning-for-Imbalanced-Fraud-Data/blob/main/Machine_Learning_for_Imbalanced_Fraud_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Credit Card Fraud Detection with Resampling Techniques

# === SETUP: LIBRARIES ===
# Fetch creditcard.csv through the Colab upload widget, but only when it is not
# already in the working directory. This keeps "Restart & Run All" from blocking
# on an upload prompt every time and lets the cell run wherever the file is
# already present. Delete the local file first to force a fresh upload.
import os

uploaded = {}  # stays empty when the upload step is skipped
if not os.path.exists("creditcard.csv"):
    from google.colab import files
    uploaded = files.upload()

import pandas as pd
df = pd.read_csv("creditcard.csv")
print(f"Dataset shape: {df.shape}")
# Count fraud rows with a boolean mask rather than value_counts()[1], which
# raises KeyError when the file contains no rows with Class == 1.
print(f"Fraudulent transactions: {int((df['Class'] == 1).sum())}")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

# === STEP 1: LOAD DATA ===
# Relative path, matching the earlier read of "creditcard.csv" in this cell, so
# the load does not depend on the working directory being /content.
df = pd.read_csv("creditcard.csv")
# Boolean-mask count: unlike value_counts()[1] this cannot raise KeyError when
# no row has Class == 1.
fraud_count = int((df["Class"] == 1).sum())
print(f"Dataset shape: {df.shape}")
print(f"Fraudulent transactions: {fraud_count}")
print(f"Percentage of fraud: {round(fraud_count / len(df) * 100, 4)}%\n")

# === STEP 2: PREPROCESS ===
X = df.drop("Class", axis=1)
y = df["Class"]

# Train-test split FIRST, on the unscaled features. Splitting before scaling
# keeps the test rows out of the scaler's mean/variance estimates; fitting the
# scaler on the full dataset leaks test-set statistics into training.
# Same random_state/stratify as before, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

# Scale features: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later code that still references X_scaled keeps working; it is the
# full feature matrix transformed with the training-split statistics.
X_scaled = scaler.transform(X)

print(f"Train class distribution: {Counter(y_train)}")
print(f"Test class distribution: {Counter(y_test)}\n")

# === HELPER FUNCTION ===
def evaluate_model(model, X_test, y_test, label):
    """Print an evaluation report for ``model`` and return its balanced accuracy.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X_test)``.
    X_test : array-like
        Features passed to ``model.predict``.
    y_test : array-like
        True labels the predictions are compared against.
    label : str
        Name shown in the report header.

    Returns
    -------
    The value returned by ``balanced_accuracy_score(y_test, predictions)``.
    """
    predictions = model.predict(X_test)
    score = balanced_accuracy_score(y_test, predictions)

    # Header, rounded score, confusion matrix, then the imbalanced-class report.
    print(f"===== {label} =====")
    print("Balanced Accuracy Score:", round(score, 4))
    print(confusion_matrix(y_test, predictions))
    print(classification_report_imbalanced(y_test, predictions))

    return score

# === HELPER: resample -> fit LogisticRegression -> evaluate ===
def _fit_logreg_on_resampled(resampler, label):
    """Fit a LogisticRegression on a resampled copy of the training split.

    Replaces the four copy-pasted resample/fit/evaluate sequences with one
    parameterized step. Reads the module-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` produced by the train-test split.

    Parameters
    ----------
    resampler : object
        Anything exposing ``fit_resample(X, y)`` and returning ``(X_res, y_res)``.
    label : str
        Name passed to ``evaluate_model`` for the report header.

    Returns
    -------
    tuple
        ``(fitted_model, X_resampled, y_resampled, balanced_accuracy)``.
    """
    X_res, y_res = resampler.fit_resample(X_train, y_train)
    model = LogisticRegression()
    model.fit(X_res, y_res)
    score = evaluate_model(model, X_test, y_test, label)
    return model, X_res, y_res, score


# evaluate_model returns the balanced accuracy; collect it per approach instead
# of discarding it, so the approaches can be compared side by side at the end.
balanced_accuracy_by_model = {}

# === BASELINE LOGISTIC REGRESSION ===
print("\n=== BASELINE Logistic Regression ===")
model_base = LogisticRegression()
model_base.fit(X_train, y_train)
balanced_accuracy_by_model["Baseline Logistic Regression"] = evaluate_model(
    model_base, X_test, y_test, "Baseline Logistic Regression"
)

# === RESAMPLING: RandomOverSampler ===
ros = RandomOverSampler(random_state=1)
model_ros, X_ros, y_ros, balanced_accuracy_by_model["RandomOverSampler"] = (
    _fit_logreg_on_resampled(ros, "RandomOverSampler")
)

# === RESAMPLING: SMOTE ===
sm = SMOTE(random_state=1)
model_sm, X_sm, y_sm, balanced_accuracy_by_model["SMOTE"] = (
    _fit_logreg_on_resampled(sm, "SMOTE")
)

# === UNDERSAMPLING: ClusterCentroids ===
cc = ClusterCentroids(random_state=1)
model_cc, X_cc, y_cc, balanced_accuracy_by_model["ClusterCentroids"] = (
    _fit_logreg_on_resampled(cc, "ClusterCentroids")
)

# === COMBINATION: SMOTEENN ===
smote_enn = SMOTEENN(random_state=1)
model_se, X_se, y_se, balanced_accuracy_by_model["SMOTEENN"] = (
    _fit_logreg_on_resampled(smote_enn, "SMOTEENN")
)

# === ENSEMBLE: BalancedRandomForest ===
# Fitted on the original (non-resampled) training split.
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)
balanced_accuracy_by_model["BalancedRandomForestClassifier"] = evaluate_model(
    brf, X_test, y_test, "BalancedRandomForestClassifier"
)

# === ENSEMBLE: EasyEnsembleClassifier ===
# Fitted on the original (non-resampled) training split.
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)
balanced_accuracy_by_model["EasyEnsembleClassifier"] = evaluate_model(
    eec, X_test, y_test, "EasyEnsembleClassifier"
)

# === SUMMARY ===
# Best balanced accuracy first.
print("\n=== Balanced accuracy summary ===")
for model_name, model_score in sorted(
    balanced_accuracy_by_model.items(), key=lambda item: item[1], reverse=True
):
    print(f"{model_name:<32}{model_score:.4f}")


Saving creditcard.csv to creditcard.csv
Dataset shape: (284807, 31)
Fraudulent transactions: 492
Dataset shape: (284807, 31)
Fraudulent transactions: 492
Percentage of fraud: 0.1727%

Train class distribution: Counter({0: 213236, 1: 369})
Test class distribution: Counter({0: 71079, 1: 123})


=== BASELINE Logistic Regression ===
===== Baseline Logistic Regression =====
Balanced Accuracy Score: 0.8495
[[71070     9]
 [   37    86]]
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      0.70      1.00      0.84      0.72     71079
          1       0.91      0.70      1.00      0.79      0.84      0.68       123

avg / total       1.00      1.00      0.70      1.00      0.84      0.72     71202

===== RandomOverSampler =====
Balanced Accuracy Score: 0.9515
[[69386  1693]
 [    9   114]]
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.98      0.93      0.99 