<a href="https://colab.research.google.com/github/sr6awi/ieee_fraud_detection/blob/main/notebooks/04_modeling_dual_path_benchmark_stratified_fold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import os
import pickle
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import LambdaCallback
from tqdm import tqdm
# NOTE: this silences every warning category for the rest of the session,
# so deprecation / numerical warnings from the libraries will not be shown.
warnings.filterwarnings('ignore')

print("=" * 70)
print("🚀 DUAL-PATH FRAUD DETECTION - COMPLETE TRAINING PIPELINE")
print("Node Classification with Full Dataset Support")
print("=" * 70)

# GPU configuration: enable on-demand memory growth for every visible GPU.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"✅ GPU: {len(gpus)} device(s) available")
    except RuntimeError as e:
        # Best effort: report the problem and carry on with default GPU settings.
        print(e)
else:
    print("⚠️ No GPU - using CPU (slower)")

# Reproducibility: keras.utils.set_random_seed seeds Python's `random`, NumPy
# and TensorFlow in one call. The previous code seeded only NumPy and
# TensorFlow, leaving Python's `random` module unseeded.
SEED = 42
keras.utils.set_random_seed(SEED)

🚀 DUAL-PATH FRAUD DETECTION - COMPLETE TRAINING PIPELINE
Node Classification with Full Dataset Support
✅ GPU: 1 device(s) available


In [2]:
# Project root. Defaults to the Colab Google Drive location used by this
# project; set the FRAUD_BASE_DIR environment variable to run the notebook
# against a different location without editing this cell.
BASE_DIR = Path(
    os.environ.get(
        "FRAUD_BASE_DIR",
        "/content/drive/MyDrive/ML_Projects/ieee-fraud-detection",
    )
)
PROCESSED_DIR = BASE_DIR / "processed"   # feature-engineered CSV inputs
MODEL_DIR = BASE_DIR / "models"          # scalers and per-fold checkpoints
RESULTS_DIR = BASE_DIR / "results"       # cross-validation reports

# Output directories are created up front; PROCESSED_DIR must already exist.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Training configuration (also written into cv_results.json by the CV cell)
CONFIG = {
    'batch_size': 1024,
    'epochs': 50,
    'learning_rate': 0.001,
    'patience': 10,          # early-stopping patience, in epochs
    'n_folds': 5,
    'use_class_weights': True,
    'save_best_only': True
}

print(f"\n📁 Directories:")
print(f"   Data: {PROCESSED_DIR}")
print(f"   Models: {MODEL_DIR}")
print(f"   Results: {RESULTS_DIR}")


📁 Directories:
   Data: /content/drive/MyDrive/ML_Projects/ieee-fraud-detection/processed
   Models: /content/drive/MyDrive/ML_Projects/ieee-fraud-detection/models
   Results: /content/drive/MyDrive/ML_Projects/ieee-fraud-detection/results


In [3]:
print("\n" + "="*70)
print("📥 LOADING FEATURE-ENGINEERED DATA")
print("="*70)


def _read_split(csv_name, start_message, split_label):
    """Read one CSV from PROCESSED_DIR, printing a start message and the loaded shape."""
    print(start_message)
    frame = pd.read_csv(PROCESSED_DIR / csv_name)
    print(f"✅ {split_label} loaded: {frame.shape}")
    return frame


train = _read_split("train_feature_engineered.csv", "\nLoading training data...", "Train")
test = _read_split("test_feature_engineered.csv", "Loading test data...", "Test")

# The target column is mandatory for everything that follows.
if 'isFraud' not in train.columns:
    raise ValueError("Target 'isFraud' not found in training data!")

# Headline statistics; the fraud share is computed once and reused.
_fraud_share = train['isFraud'].mean()
_n_train_rows, _n_train_cols = train.shape

print(f"\n📊 Data Statistics:")
print(f"   Training samples: {_n_train_rows:,}")
print(f"   Test samples: {len(test):,}")
print(f"   Features: {_n_train_cols - 2}")  # Exclude isFraud and TransactionID
print(f"   Fraud rate: {_fraud_share*100:.2f}%")
print(f"   Class imbalance: {(1-_fraud_share)/_fraud_share:.1f}:1")


📥 LOADING FEATURE-ENGINEERED DATA

Loading training data...
✅ Train loaded: (590540, 262)
Loading test data...
✅ Test loaded: (506691, 261)

📊 Data Statistics:
   Training samples: 590,540
   Test samples: 506,691
   Features: 260
   Fraud rate: 3.50%
   Class imbalance: 27.6:1


In [4]:
print("\n" + "="*70)
print("🎯 PREPARING FEATURES FOR DUAL PATHS")
print("="*70)

# Hand-picked candidate columns for the "GNN" branch of the model.
GNN_FEATURE_CANDIDATES = [
    'TransactionAmt', 'TransactionDT',
    'card1', 'card2', 'card1_count',
    'card1_fraud_rate', 'card2_fraud_rate',
    'P_emaildomain_fraud_rate', 'R_emaildomain_fraud_rate',
    'addr1', 'dist1', 'ProductCD', 'ProductCD_fraud_rate',
    'transaction_hour', 'is_weekend', 'is_night',
    'TransactionAmt_log', 'email_domains_match', 'addr_match'
]

# Keep only the candidates present in the training frame (best effort), but
# report anything dropped so a renamed/missing column does not go unnoticed.
GNN_FEATURES = [f for f in GNN_FEATURE_CANDIDATES if f in train.columns]
_skipped_gnn = [f for f in GNN_FEATURE_CANDIDATES if f not in train.columns]
if _skipped_gnn:
    print(f"⚠️ GNN features not found in training data (skipped): {_skipped_gnn}")

print(f"✅ GNN Path: {len(GNN_FEATURES)} smart features selected")
print(f"   Focus: Graph-beneficial features")

# All features for TabNet (exclude ID and target)
exclude_cols = ['isFraud', 'TransactionID']
TABNET_FEATURES = [c for c in train.columns if c not in exclude_cols]

# Fail fast with a readable message: the next cell indexes `test` with these
# same column lists, which would otherwise surface as a bare KeyError there.
_absent_in_test = [
    c for c in dict.fromkeys(GNN_FEATURES + TABNET_FEATURES)
    if c not in test.columns
]
if _absent_in_test:
    raise ValueError(
        f"{len(_absent_in_test)} selected feature(s) are missing from the test data: "
        f"{_absent_in_test[:10]}"
    )

print(f"✅ TabNet Path: {len(TABNET_FEATURES)} features (all available)")
print(f"   Focus: Rich feature learning")


🎯 PREPARING FEATURES FOR DUAL PATHS
✅ GNN Path: 19 smart features selected
   Focus: Graph-beneficial features
✅ TabNet Path: 260 features (all available)
   Focus: Rich feature learning


In [5]:
print("\n" + "="*70)
print("📊 PREPARING TRAINING DATA (SIMPLIFIED FOR MEMORY)")
print("="*70)

# SIMPLIFIED APPROACH:
# - Skip graph construction for now (memory intensive)
# - Use GNN features as regular neural network
# - Focus on TabNet path primarily
# - Treat as multi-modal learning (two feature sets)

# Prepare features: missing values are imputed with 0 in every column.
X_gnn = train[GNN_FEATURES].fillna(0).values
X_tabnet = train[TABNET_FEATURES].fillna(0).values
y = train['isFraud'].values

X_gnn_test = test[GNN_FEATURES].fillna(0).values
X_tabnet_test = test[TABNET_FEATURES].fillna(0).values

print(f"✅ Training data prepared:")
print(f"   GNN input shape: {X_gnn.shape}")
print(f"   TabNet input shape: {X_tabnet.shape}")
print(f"   Labels shape: {y.shape}")

# Normalize features.
# NOTE(review): the scalers are fit on the full training matrix, and the
# cross-validation cell later splits the already-scaled arrays, so validation
# folds contribute to the normalisation statistics (mild leakage in CV scores).
print("\n📏 Normalizing features...")
scaler_gnn = StandardScaler()
scaler_tabnet = StandardScaler()

X_gnn_scaled = scaler_gnn.fit_transform(X_gnn)
X_tabnet_scaled = scaler_tabnet.fit_transform(X_tabnet)

# Test data is transformed with the statistics learned from the training data.
X_gnn_test_scaled = scaler_gnn.transform(X_gnn_test)
X_tabnet_test_scaled = scaler_tabnet.transform(X_tabnet_test)

# Save scalers so inference can reuse the exact same normalisation
# (joblib is imported with the other dependencies in the first cell).
joblib.dump(scaler_gnn, MODEL_DIR / 'scaler_gnn.pkl')
joblib.dump(scaler_tabnet, MODEL_DIR / 'scaler_tabnet.pkl')

print("✅ Features normalized and scalers saved!")


📊 PREPARING TRAINING DATA (SIMPLIFIED FOR MEMORY)
✅ Training data prepared:
   GNN input shape: (590540, 19)
   TabNet input shape: (590540, 260)
   Labels shape: (590540,)

📏 Normalizing features...
✅ Features normalized and scalers saved!


In [6]:
print("\n" + "="*70)
print("🏗️ BUILDING SIMPLIFIED DUAL-PATH MODEL")
print("="*70)

def build_dual_path_model(gnn_features, tabnet_features):
    """
    Build the uncompiled two-input binary classifier used by this notebook.

    Both branches are plain fully connected stacks (Dense -> BatchNorm ->
    Dropout). The "gnn_" / "tabnet_" prefixes only identify which feature set
    a branch consumes: no graph convolution and no TabNet-style feature
    masking is performed in this simplified version.

    Args:
        gnn_features: Number of columns fed to the 'gnn_input' branch.
        tabnet_features: Number of columns fed to the 'tabnet_input' branch.

    Returns:
        A keras.Model named 'DualPathFraudDetector' taking
        [gnn_input, tabnet_input] and producing one sigmoid probability.

    Raises:
        ValueError: If either branch would receive zero (or fewer) features.
    """
    if gnn_features < 1 or tabnet_features < 1:
        raise ValueError(
            "Both branches need at least one input feature "
            f"(got gnn_features={gnn_features}, tabnet_features={tabnet_features})."
        )

    # INPUT LAYERS
    input_gnn = layers.Input(shape=(gnn_features,), name='gnn_input')
    input_tabnet = layers.Input(shape=(tabnet_features,), name='tabnet_input')

    # PATH 1: dense stack over the hand-picked "graph-beneficial" features
    x1 = layers.Dense(128, activation='relu', name='gnn_dense1')(input_gnn)
    x1 = layers.BatchNormalization(name='gnn_bn1')(x1)
    x1 = layers.Dropout(0.3, name='gnn_dropout1')(x1)

    x1 = layers.Dense(64, activation='relu', name='gnn_dense2')(x1)
    x1 = layers.BatchNormalization(name='gnn_bn2')(x1)
    x1 = layers.Dropout(0.2, name='gnn_dropout2')(x1)

    gnn_output = layers.Dense(32, activation='relu', name='gnn_output')(x1)

    # PATH 2: wider dense stack (GELU activations) over the full feature set
    # Step 1
    x2 = layers.Dense(256, activation='gelu', name='tabnet_step1')(input_tabnet)
    x2 = layers.BatchNormalization(name='tabnet_bn1')(x2)
    x2 = layers.Dropout(0.3, name='tabnet_dropout1')(x2)

    # Step 2
    x2 = layers.Dense(128, activation='gelu', name='tabnet_step2')(x2)
    x2 = layers.BatchNormalization(name='tabnet_bn2')(x2)
    x2 = layers.Dropout(0.2, name='tabnet_dropout2')(x2)

    # Step 3
    tabnet_output = layers.Dense(32, activation='relu', name='tabnet_output')(x2)

    # FUSION: Concatenate both paths
    merged = layers.Concatenate(name='fusion_concat')([gnn_output, tabnet_output])

    # Attention-based fusion: a softmax gate with one weight per fused feature.
    # The gate width is read from the concatenated tensor (64 = 32 + 32 today)
    # so it cannot drift out of sync with the branch output widths, which the
    # element-wise Multiply below requires to match.
    fusion_dim = merged.shape[-1]
    attention_weights = layers.Dense(
        fusion_dim, activation='softmax', name='attention_weights'
    )(merged)
    attended = layers.Multiply(name='attended_features')([merged, attention_weights])

    # Final classification layers
    x = layers.Dense(64, activation='relu', name='classifier_dense1')(attended)
    x = layers.BatchNormalization(name='classifier_bn')(x)
    x = layers.Dropout(0.2, name='classifier_dropout')(x)

    x = layers.Dense(32, activation='relu', name='classifier_dense2')(x)

    # Output: fraud probability
    output = layers.Dense(1, activation='sigmoid', name='output')(x)

    # Create model
    model = keras.Model(
        inputs=[input_gnn, input_tabnet],
        outputs=output,
        name='DualPathFraudDetector'
    )

    return model

# Build model
model = build_dual_path_model(len(GNN_FEATURES), len(TABNET_FEATURES))

print("✅ Model architecture built!")
print(f"\n📋 Model Summary:")
model.summary()


🏗️ BUILDING SIMPLIFIED DUAL-PATH MODEL
✅ Model architecture built!

📋 Model Summary:


In [7]:
print("\n" + "="*70)
print("⚙️ COMPILING MODEL")
print("="*70)

# Calculate class weights for imbalanced data: the fraud class is up-weighted
# by the legit:fraud ratio so both classes carry equal total weight.
fraud_rate = y.mean()
if not 0.0 < fraud_rate < 1.0:
    # A single-class (or NaN) label vector would otherwise yield an infinite
    # or NaN class weight and a silently broken loss.
    raise ValueError(
        f"Cannot derive class weights: fraud rate is {fraud_rate}; "
        "labels must contain both classes."
    )
class_weight = {
    0: 1.0,
    1: (1 - fraud_rate) / fraud_rate
}

print(f"📊 Class weights (for imbalance):")
print(f"   Class 0 (Legit): {class_weight[0]:.2f}")
print(f"   Class 1 (Fraud): {class_weight[1]:.2f}")

# Compile model.
# NOTE: the cross-validation cell rebuilds and recompiles a fresh model for
# every fold, so this compiled instance is not the one trained there.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=CONFIG['learning_rate']),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        keras.metrics.AUC(name='auc'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall')
    ]
)

print("\n✅ Model compiled successfully!")


⚙️ COMPILING MODEL
📊 Class weights (for imbalance):
   Class 0 (Legit): 1.00
   Class 1 (Fraud): 27.58

✅ Model compiled successfully!


In [9]:
!pip install keras-tqdm



In [8]:


print("\n" + "="*70)
print("🎓 TRAINING WITH STRATIFIED K-FOLD CROSS-VALIDATION")
print("="*70)


def make_base_callbacks():
    """Return brand-new EarlyStopping and ReduceLROnPlateau callbacks, both tracking val_auc."""
    return [
        keras.callbacks.EarlyStopping(
            monitor='val_auc',
            patience=CONFIG['patience'],
            restore_best_weights=True,
            mode='max',
            verbose=1
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_auc',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            mode='max',
            verbose=1
        )
    ]


def build_compiled_model():
    """Build a freshly initialised dual-path model and compile it with the notebook's training settings."""
    fresh_model = build_dual_path_model(len(GNN_FEATURES), len(TABNET_FEATURES))
    fresh_model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=CONFIG['learning_rate']),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            keras.metrics.AUC(name='auc'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall')
        ]
    )
    return fresh_model


def log_epoch_summary(epoch, logs):
    """Print a compact one-line loss / AUC summary at the end of an epoch."""
    print(f"Epoch {epoch+1:02d} | "
          f"loss={logs['loss']:.4f} | val_loss={logs['val_loss']:.4f} | "
          f"auc={logs['auc']:.4f} | val_auc={logs['val_auc']:.4f}")


# Base callbacks. The module-level list is kept in case other cells reference
# `callbacks`; each fold below gets its own fresh instances instead of sharing
# these stateful objects through a shallow list copy.
callbacks = make_base_callbacks()

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=SEED)

cv_scores = []
fold_histories = []

print(f"\n🔄 Starting {CONFIG['n_folds']}-Fold Cross-Validation...")
print("="*70)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_gnn_scaled, y), 1):
    print(f"\n{'='*70}")
    print(f"📊 FOLD {fold}/{CONFIG['n_folds']}")
    print(f"{'='*70}")

    # Split data (fancy indexing copies the selected rows)
    X_gnn_train, X_gnn_val = X_gnn_scaled[train_idx], X_gnn_scaled[val_idx]
    X_tabnet_train, X_tabnet_val = X_tabnet_scaled[train_idx], X_tabnet_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    print(f"   Train: {len(y_train):,} samples ({y_train.mean()*100:.2f}% fraud)")
    print(f"   Val:   {len(y_val):,} samples ({y_val.mean()*100:.2f}% fraud)")

    # Drop Keras state left over from the previous model so memory does not
    # accumulate across folds, then rebuild with fresh weights.
    keras.backend.clear_session()
    model = build_compiled_model()

    # Per-fold callbacks: fresh early-stopping / LR schedule, a checkpoint
    # with a fold-specific path, and the short epoch logger.
    fold_callbacks = make_base_callbacks() + [
        keras.callbacks.ModelCheckpoint(
            filepath=str(MODEL_DIR / f'best_model_fold_{fold}.keras'),
            monitor='val_auc',
            save_best_only=CONFIG['save_best_only'],  # honour the config flag
            mode='max',
            verbose=1
        ),
        keras.callbacks.LambdaCallback(on_epoch_end=log_epoch_summary)
    ]

    # Train
    print(f"\n   🚀 Training...")
    history = model.fit(
        [X_gnn_train, X_tabnet_train],
        y_train,
        validation_data=([X_gnn_val, X_tabnet_val], y_val),
        epochs=CONFIG['epochs'],
        batch_size=CONFIG['batch_size'],
        class_weight=class_weight if CONFIG['use_class_weights'] else None,
        callbacks=fold_callbacks,
        verbose=1      # ✅ live progress bar
    )

    # Evaluate. Predict with the training batch size (the default of 32 means
    # thousands of tiny steps) and flatten the (n, 1) output for scikit-learn.
    val_preds = model.predict(
        [X_gnn_val, X_tabnet_val],
        batch_size=CONFIG['batch_size'],
        verbose=0
    )
    val_auc = roc_auc_score(y_val, val_preds.ravel())
    cv_scores.append(val_auc)
    fold_histories.append(history.history)

    print(f"\n   ✅ Fold {fold} Complete!")
    print(f"   Val AUC: {val_auc:.6f}")
    print(f"   Best epoch: {np.argmax(history.history['val_auc']) + 1}")

# ----------------------------------------------------------------------------
# Cross-validation report: per-fold AUC, aggregate statistics, JSON export
# ----------------------------------------------------------------------------

print("\n" + "="*70)
print("📊 CROSS-VALIDATION RESULTS")
print("="*70)

# One line per fold, numbered from 1.
print(f"\nFold AUC Scores:")
for fold_number, fold_auc in zip(range(1, len(cv_scores) + 1), cv_scores):
    print(f"   Fold {fold_number}: {fold_auc:.6f}")

# Aggregate statistics, printed from a (label, reducer) table; the labels
# carry their own padding so the values line up.
summary_rows = (
    ("Mean AUC: ", np.mean),
    ("Std AUC:  ", np.std),
    ("Min AUC:  ", np.min),
    ("Max AUC:  ", np.max),
)
print(f"\n🏆 Summary:")
for row_label, reducer in summary_rows:
    print(f"   {row_label}{reducer(cv_scores):.6f}")

# Persist the scores together with the configuration that produced them.
cv_results_path = RESULTS_DIR / 'cv_results.json'
cv_results = dict(
    fold_scores=cv_scores,
    mean_auc=float(np.mean(cv_scores)),
    std_auc=float(np.std(cv_scores)),
    config=CONFIG,
)

with cv_results_path.open('w') as results_file:
    json.dump(cv_results, results_file, indent=2)

print(f"\n✅ CV results saved to: {cv_results_path}")


🎓 TRAINING WITH STRATIFIED K-FOLD CROSS-VALIDATION

🔄 Starting 5-Fold Cross-Validation...

📊 FOLD 1/5
   Train: 472,432 samples (3.50% fraud)
   Val:   118,108 samples (3.50% fraud)

   🚀 Training...
Epoch 1/50
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.6845 - auc: 0.8222 - loss: 0.9941 - precision: 0.0934 - recall: 0.7882
Epoch 1: val_auc improved from -inf to 0.91795, saving model to /content/drive/MyDrive/ML_Projects/ieee-fraud-detection/models/best_model_fold_1.keras
Epoch 01 | loss=0.8427 | val_loss=0.2499 | auc=0.8777 | val_auc=0.9180
[1m462/462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 31ms/step - accuracy: 0.6848 - auc: 0.8223 - loss: 0.9938 - precision: 0.0935 - recall: 0.7883 - val_accuracy: 0.9209 - val_auc: 0.9180 - val_loss: 0.2499 - val_precision: 0.2670 - val_recall: 0.7227 - learning_rate: 0.0010
Epoch 2/50
[1m456/462[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - accuracy: 0.8446 - auc: 0.