In [None]:
# ============================================================
# NOTEBOOK 4: MODEL TRAINING & COMPARISON
# ============================================================
# Goal: Train and evaluate multiple models to detect ATO fraud.
# Models:
#   1. Logistic Regression (Baseline)
#   2. Random Forest (Robust Ensemble)
#   3. XGBoost (Gradient Boosting - Industry Standard)
#   4. Isolation Forest (Unsupervised Anomaly Detection)
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             roc_auc_score, confusion_matrix, roc_curve, precision_recall_curve)
from imblearn.over_sampling import SMOTE

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from xgboost import XGBClassifier

# Configuration
sns.set_theme(style="whitegrid")
import warnings
warnings.filterwarnings('ignore')

# Read the engineered feature table from disk.
df = pd.read_csv('../data/processed/features_engineered.csv')

# Quick sanity summary: table dimensions and the mean of the is_fraud column.
fraud_share = df['is_fraud'].mean()
print("Data Loaded")
print(f"Shape: {df.shape}")
print(f"Fraud Rate: {fraud_share:.4f}")

In [None]:
# ============================================================
# DATA PREPARATION
# ============================================================

# Everything except these columns is treated as a model input; is_fraud is the target.
non_feature_cols = ['transaction_id', 'user_id', 'timestamp', 'fraud_type', 'is_fraud']
X = df.drop(columns=non_feature_cols)
y = df['is_fraud']

# 80/20 hold-out split. Stratifying on y keeps the fraud ratio the same in
# both partitions, which matters for an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# RobustScaler centres/scales with median and IQR, so it is less sensitive to
# outliers than StandardScaler. It is fit on the training partition only and
# the same fitted transform is then applied to the test partition.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train, n_test = len(X_train), len(X_test)
print("Data Split & Scaled")
print(f"Train Set: {n_train} samples ({y_train.sum()} frauds)")
print(f"Test Set:  {n_test} samples ({y_test.sum()} frauds)")

In [None]:
# ============================================================
# EVALUATION HELPER FUNCTION
# ============================================================

def evaluate_model(name, model, X_test, y_test, y_pred, y_prob=None):
    """
    Print and return the headline classification metrics for one model.

    Parameters
    ----------
    name : label used in the printed header and stored under 'Model'.
    model, X_test : part of the call signature but not read by this function.
    y_test : ground-truth labels.
    y_pred : hard class predictions, scored with accuracy/precision/recall/F1.
    y_prob : optional continuous scores; when given, ROC-AUC is computed from them.

    Returns
    -------
    dict with keys 'Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC'
    ('AUC' is None when y_prob is not supplied).
    """
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),  # catch rate for the positive class
        'F1': f1_score(y_test, y_pred),
    }

    print(f"----- {name} Results -----")
    print(f"Accuracy:  {scores['Accuracy']:.4f}")
    print(f"Precision: {scores['Precision']:.4f} (Low false positives)")
    print(f"Recall:    {scores['Recall']:.4f} (High detection rate)")
    print(f"F1-Score:  {scores['F1']:.4f}")

    # ROC-AUC needs ranking scores, so it is only available when y_prob is passed.
    scores['AUC'] = None
    if y_prob is not None:
        scores['AUC'] = roc_auc_score(y_test, y_prob)
        print(f"ROC-AUC:   {scores['AUC']:.4f}")

    return {'Model': name, **scores}

print("Helper function ready")

In [None]:
# ============================================================
# SUPERVISED LEARNING MODELS
# ============================================================
def _fit_and_score(estimator, X_fit, y_fit, X_eval):
    """Fit `estimator`, then return (hard predictions, second predict_proba column) for X_eval."""
    estimator.fit(X_fit, y_fit)
    return estimator.predict(X_eval), estimator.predict_proba(X_eval)[:, 1]


# Fresh containers: one metrics dict per model, and the fitted estimators by name.
results_list = []
models = {}

# 1. Logistic Regression (baseline), trained on the scaled matrices.
# class_weight='balanced' re-weights classes to counter the low fraud rate.
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
y_pred_lr, y_prob_lr = _fit_and_score(lr, X_train_scaled, y_train, X_test_scaled)
results_list.append(
    evaluate_model("Logistic Regression", lr, X_test_scaled, y_test, y_pred_lr, y_prob_lr)
)
models['Logistic Regression'] = lr
print("\n")

# 2. Random Forest, trained on the unscaled features (tree splits do not
# strictly need scaling).
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1, random_state=42)
y_pred_rf, y_prob_rf = _fit_and_score(rf, X_train, y_train, X_test)
results_list.append(
    evaluate_model("Random Forest", rf, X_test, y_test, y_pred_rf, y_prob_rf)
)
models['Random Forest'] = rf
print("\n")

# 3. XGBoost, also on the unscaled features.
# scale_pos_weight is set to the negative/positive count ratio of the training labels.
n_negative = np.sum(y_train == 0)
n_positive = np.sum(y_train == 1)
ratio = float(n_negative) / n_positive
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    scale_pos_weight=ratio,
    random_state=42,
    n_jobs=-1,
)
y_pred_xgb, y_prob_xgb = _fit_and_score(xgb, X_train, y_train, X_test)
results_list.append(
    evaluate_model("XGBoost", xgb, X_test, y_test, y_pred_xgb, y_prob_xgb)
)
models['XGBoost'] = xgb

In [None]:
# ============================================================
# UNSUPERVISED ANOMALY DETECTION
# ============================================================
# Isolation Forest is fit on the features alone: y_train is never passed in.
# contamination=0.03 tells it what share of rows to flag as outliers.
iso_forest = IsolationForest(contamination=0.03, n_jobs=-1, random_state=42)
iso_forest.fit(X_train_scaled)

# predict() labels anomalies as -1 and inliers as 1; remap to 1 = fraud, 0 = normal.
y_pred_iso_raw = iso_forest.predict(X_test_scaled)
y_pred_iso = (y_pred_iso_raw == -1).astype(int).tolist()

# There is no predict_proba here. decision_function() is lower for more
# anomalous rows, so it is negated to get a score where higher = more suspicious.
y_scores_iso = -iso_forest.decision_function(X_test_scaled)

iso_metrics = evaluate_model(
    "Isolation Forest", iso_forest, X_test_scaled, y_test, y_pred_iso, y_scores_iso
)
results_list.append(iso_metrics)
models['Isolation Forest'] = iso_forest

In [None]:
# ============================================================
# MODEL COMPARISON DASHBOARD
# ============================================================

# One row per model, columns as returned by evaluate_model().
results_df = pd.DataFrame(results_list)
print("Summary Table:")
display(results_df.sort_values('AUC', ascending=False))

# Long format: one row per (Model, Metric) pair so seaborn can group bars by metric.
results_melt = results_df.melt(id_vars='Model', value_vars=['Precision', 'Recall', 'AUC'],
                               var_name='Metric', value_name='Score')

# Visualization
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(x='Metric', y='Score', hue='Model', data=results_melt, palette='viridis', ax=ax)
ax.set_title('Model Performance Comparison', fontsize=14)
# Show the full 0-1 range. Starting the axis at 0.5 clips every bar whose
# score is below 0.5 (they simply vanish) and exaggerates the gaps between
# the bars that remain, which misrepresents weaker models.
ax.set_ylim(0.0, 1.0)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2)
plt.show()

In [None]:
# ============================================================
# ROC CURVES (Visualizing Trade-offs)
# ============================================================

def _plot_roc(ax, label, scores, **line_kwargs):
    """Draw one ROC curve on `ax` against y_test and return its (fpr, tpr) arrays.

    The AUC shown in the legend is computed from the same `scores` that
    produce the curve, rather than looked up by row position in results_df,
    so the label cannot drift out of sync with the line if the results table
    changes order or a training cell is re-run.
    """
    fpr, tpr, _ = roc_curve(y_test, scores)
    auc_value = roc_auc_score(y_test, scores)
    ax.plot(fpr, tpr, label=f'{label} (AUC = {auc_value:.2f})', **line_kwargs)
    return fpr, tpr


fig, ax = plt.subplots(figsize=(10, 8))

# Supervised models: scores are the second predict_proba column.
fpr_lr, tpr_lr = _plot_roc(ax, 'Logistic Regression', y_prob_lr)
fpr_rf, tpr_rf = _plot_roc(ax, 'Random Forest', y_prob_rf)
fpr_xgb, tpr_xgb = _plot_roc(ax, 'XGBoost', y_prob_xgb)

# Unsupervised model: scores are the negated decision_function values; dashed
# to set it apart from the supervised curves.
fpr_iso, tpr_iso = _plot_roc(ax, 'Isolation Forest', y_scores_iso, linestyle='--')

# Random Guess line
ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (Recall)')
ax.set_title('ROC Curves: Supervised vs Unsupervised')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# ============================================================
# CONFUSION MATRIX (Best Model: XGBoost)
# ============================================================

# The model shown here is fixed by hand; the matrix always uses the XGBoost predictions.
best_model_name = "XGBoost"
cm = confusion_matrix(y_test, y_pred_xgb)

# Rows are the true labels and columns the predicted labels.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=['Predicted Legit', 'Predicted Fraud'],
    yticklabels=['Actual Legit', 'Actual Fraud'],
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
# ============================================================
# FEATURE IMPORTANCE (What drives fraud?)
# ============================================================

# Pair every input column with the importance XGBoost assigned to it and keep
# the ten largest values.
feature_names = X.columns
importances = xgb.feature_importances_
feat_imp = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values('Importance', ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feat_imp, palette='magma', ax=ax)
ax.set_title('Top 10 Features for Detecting ATO Fraud (XGBoost)')
plt.show()

# Save best model
import os
os.makedirs('../models', exist_ok=True)
# NOTE(review): xgb was fit on the unscaled X_train, so the scaler saved next
# to it must not be applied to inputs before calling this model. The scaler is
# only what the models trained on the scaled matrices expect.
for artifact, target_path in (
    (xgb, '../models/best_model_xgboost.pkl'),
    (scaler, '../models/scaler.pkl'),
):
    joblib.dump(artifact, target_path)
print("Best model and scaler saved to 'models/' folder")