# SynFinance Fraud Detection ML Tutorial

This notebook demonstrates the complete ML workflow for fraud detection using SynFinance:

1. Generate synthetic transaction data with fraud patterns
2. Engineer ML features from transaction history
3. Create balanced training dataset
4. Train fraud detection models (Random Forest & XGBoost)
5. Evaluate model performance
6. Analyze feature importance

**Author**: SynFinance Development Team  
**Version**: 0.5.0  
**Date**: October 26, 2025

## Setup and Imports

Import the required libraries and add the SynFinance package to the import path.

In [None]:
# Standard library imports
import sys
import os
from datetime import datetime, timedelta
import json

# Add SynFinance to path
sys.path.insert(0, os.path.abspath('..'))

# SynFinance imports
from src.data_generator import DataGenerator
from src.generators.fraud_patterns import FraudPatternGenerator
from src.generators.ml_features import MLFeatureEngineer
from src.generators.ml_dataset_generator import MLDatasetGenerator

# ML library imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# XGBoost is an optional dependency (pip install xgboost).
# XGBOOST_AVAILABLE records whether the import succeeded.
try:
    import xgboost as xgb
except ImportError:
    print("XGBoost not available. Install with: pip install xgboost")
    XGBOOST_AVAILABLE = False
else:
    XGBOOST_AVAILABLE = True

# Seed NumPy's global random number generator so NumPy-driven randomness
# repeats from run to run
np.random.seed(42)

# Configure matplotlib
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release (3.6+) — confirm against the installed version
plt.style.use('seaborn-v0_8-darkgrid')
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook
%matplotlib inline

# Report that setup finished and whether the optional XGBoost import succeeded
print("✓ Imports complete")
print(f"✓ XGBoost available: {XGBOOST_AVAILABLE}")

## Step 1: Generate Synthetic Transaction Data

Create 1,000 transactions and inject fraud at a 10% target rate using three fraud patterns: card cloning, velocity abuse, and geographically impossible transactions.

In [None]:
# Data generator: 100 customers over a 30-day window starting 2025-01-01
generator = DataGenerator(num_customers=100, start_date=datetime(2025, 1, 1), num_days=30)

# Customer profiles come first; fraud injection below needs them
print("Generating customer profiles...")
customers = generator.generate_customers()
print(f"✓ Generated {len(customers)} customers")

# Base (pre-injection) transactions
print("\nGenerating transactions...")
transactions = generator.generate_transactions(num_transactions=1000)
print(f"✓ Generated {len(transactions)} transactions")

# Inject the selected fraud patterns at a 10% target rate
print("\nInjecting fraud patterns...")
fraud_patterns = ['card_cloning', 'velocity_abuse', 'geographic_impossible']
fraud_gen = FraudPatternGenerator(seed=42)
transactions = fraud_gen.inject_fraud_patterns(
    transactions, customers, fraud_rate=0.10, patterns=fraud_patterns
)

# Summarise how many transactions ended up labelled as fraud
total_transactions = len(transactions)
fraud_count = len([t for t in transactions if t.get('is_fraud', 0) == 1])
fraud_rate = fraud_count / total_transactions

print(f"✓ Fraud injection complete")
print(f"  - Total transactions: {len(transactions)}")
print(f"  - Fraud transactions: {fraud_count}")
print(f"  - Fraud rate: {fraud_rate:.1%}")

## Step 2: Engineer ML Features

Extract 32 features from transaction data across 6 categories

In [None]:
# Initialize feature engineer
feature_engineer = MLFeatureEngineer()

# Group transactions by customer so each transaction's prior history can be
# sliced out of a per-customer, time-ordered list
print("Building transaction history...")
history_lookup = {}
for txn in transactions:
    history_lookup.setdefault(txn['customer_id'], []).append(txn)

# Sort each customer's history chronologically
for customer_history in history_lookup.values():
    customer_history.sort(key=lambda x: x['timestamp'])

print(f"✓ Built history for {len(history_lookup)} customers")

# Index customers by ID once instead of scanning the customer list for every
# transaction. setdefault keeps the first record for an ID, matching the
# first-match behaviour of a linear search.
customer_lookup = {}
for customer in customers:
    customer_lookup.setdefault(customer['customer_id'], customer)

# Record each transaction's position inside its customer's sorted history.
# Keying by object identity avoids an O(n) list.index() call per transaction
# and cannot confuse two distinct transactions that happen to compare equal.
history_position = {}
for customer_history in history_lookup.values():
    for position, past_txn in enumerate(customer_history):
        history_position.setdefault(id(past_txn), position)

# Engineer features for each transaction
print("\nEngineering features...")
features_list = []

for i, txn in enumerate(transactions):
    # Customer profile for this transaction (KeyError if the ID is unknown)
    customer = customer_lookup[txn['customer_id']]

    # History = this customer's transactions that sort before this one
    customer_history = history_lookup[txn['customer_id']]
    history = customer_history[:history_position[id(txn)]]

    # Engineer features
    ml_features = feature_engineer.engineer_features(
        transaction=txn,
        customer=customer,
        transaction_history=history
    )

    features_list.append(ml_features.to_dict())

    # Progress report every 200 transactions
    if (i + 1) % 200 == 0:
        print(f"  Processed {i + 1}/{len(transactions)} transactions")

print(f"\n✓ Engineered features for {len(features_list)} transactions")
if features_list:  # nothing to index when no transactions were generated
    print(f"  - Features per transaction: {len(features_list[0]) - 3}")  # Exclude id, is_fraud, fraud_type

# Display feature metadata
metadata = feature_engineer.get_feature_metadata()
print(f"\nFeature Categories:")
for category, category_features in metadata['features'].items():
    print(f"  - {category}: {len(category_features)} features")

## Step 3: Create ML-Ready Dataset

Balance the dataset by undersampling to a 50% fraud rate, split it into train/validation/test sets, normalize the features, and encode categorical values.

In [None]:
# Dataset generator with a fixed seed
dataset_gen = MLDatasetGenerator(seed=42)

# Balance by undersampling to a 50% fraud rate, normalise, and encode categoricals
print("Creating ML-ready dataset...\n")
dataset_options = {
    'balance_strategy': 'undersample',
    'target_fraud_rate': 0.5,
    'normalize': True,
    'encode_categorical': True,
}
split, metadata = dataset_gen.create_ml_ready_dataset(features_list, **dataset_options)

print("\n✓ Dataset creation complete")
print(f"\nDataset Statistics:")
stats = split.get_stats()
# One summary line per split, in train / validation / test order
for split_label, stats_prefix in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    split_size = stats[stats_prefix + '_size']
    split_fraud_rate = stats[stats_prefix + '_fraud_rate']
    print(f"  {split_label} set: {split_size} samples (fraud rate: {split_fraud_rate:.1%})")

## Step 4: Prepare Data for ML Models

Convert to numpy arrays for scikit-learn and XGBoost

In [None]:
def prepare_ml_data(dataset):
    """Convert a list of feature dicts into X, y arrays plus the column order.

    Args:
        dataset: Sequence of dict-like samples. The feature columns are taken
            from the keys of the first sample, minus the ID/label columns.

    Returns:
        A 3-tuple ``(X, y, feature_cols)``:
        X is an array with one row per sample and one column per feature
        (a value missing from a sample is filled with 0); y holds each
        sample's ``is_fraud`` value (0 when absent); feature_cols is the
        alphabetically sorted list of feature names matching X's columns.
        For an empty dataset, two empty arrays and an empty list are
        returned so callers can always unpack three values.
    """
    # Define feature columns (exclude ID and labels)
    exclude_cols = {'transaction_id', 'is_fraud', 'fraud_type', 'fraud_type_encoded'}

    if len(dataset) == 0:
        # Keep the same 3-tuple shape as the non-empty path
        return np.array([]), np.array([]), []

    # Sorted so the column order is deterministic across runs
    feature_cols = sorted(set(dataset[0].keys()) - exclude_cols)

    # Extract features and labels
    X = np.array([[sample.get(col, 0) for col in feature_cols] for sample in dataset])
    y = np.array([sample.get('is_fraud', 0) for sample in dataset])

    return X, y, feature_cols

# Convert the three splits to arrays; the feature-name list from the training
# split is the one kept for later use
print("Preparing data for ML models...")
(X_train, y_train, feature_names), (X_val, y_val, _), (X_test, y_test, _) = (
    prepare_ml_data(part) for part in (split.train, split.validation, split.test)
)

print(f"✓ Data preparation complete")
for split_label, feature_matrix in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"  - {split_label} features shape: {feature_matrix.shape}")
print(f"  - Number of features: {len(feature_names)}")

## Step 5: Train Random Forest Classifier

Baseline model using Random Forest

In [None]:
print("Training Random Forest Classifier...\n")

# Baseline Random Forest with fixed hyperparameters and a fixed seed
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Hard class predictions for every split
rf_train_pred, rf_val_pred, rf_test_pred = (
    rf_model.predict(features) for features in (X_train, X_val, X_test)
)

# Positive-class (fraud) probability on the test split, reused by later plots
rf_test_proba = rf_model.predict_proba(X_test)[:, 1]

print("✓ Random Forest training complete\n")

# F1 and ROC-AUC on train, validation, then test
rf_reports = (
    ("Training Set Performance:", y_train, rf_train_pred, rf_model.predict_proba(X_train)[:, 1]),
    ("\nValidation Set Performance:", y_val, rf_val_pred, rf_model.predict_proba(X_val)[:, 1]),
    ("\nTest Set Performance:", y_test, rf_test_pred, rf_test_proba),
)
for heading, y_true, y_pred, y_score in rf_reports:
    print(heading)
    print(f"  F1-Score: {f1_score(y_true, y_pred):.3f}")
    print(f"  ROC-AUC: {roc_auc_score(y_true, y_score):.3f}")

print("\nDetailed Classification Report (Test Set):")
rf_report = classification_report(y_test, rf_test_pred, target_names=['Normal', 'Fraud'])
print(rf_report)

## Step 6: Train XGBoost Classifier

Advanced gradient boosting model

In [None]:
if not XGBOOST_AVAILABLE:
    print("XGBoost not available. Skipping XGBoost training.")
else:
    print("Training XGBoost Classifier...\n")

    # Gradient-boosted trees with fixed hyperparameters and a fixed seed
    xgb_params = {
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'eval_metric': 'logloss',
    }
    xgb_model = xgb.XGBClassifier(**xgb_params)

    # The validation split is passed as the evaluation set during fitting
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Hard class predictions for every split
    xgb_train_pred, xgb_val_pred, xgb_test_pred = (
        xgb_model.predict(features) for features in (X_train, X_val, X_test)
    )

    # Positive-class (fraud) probability on the test split, reused by later plots
    xgb_test_proba = xgb_model.predict_proba(X_test)[:, 1]

    print("✓ XGBoost training complete\n")

    # F1 and ROC-AUC on train, validation, then test
    xgb_reports = (
        ("Training Set Performance:", y_train, xgb_train_pred, xgb_model.predict_proba(X_train)[:, 1]),
        ("\nValidation Set Performance:", y_val, xgb_val_pred, xgb_model.predict_proba(X_val)[:, 1]),
        ("\nTest Set Performance:", y_test, xgb_test_pred, xgb_test_proba),
    )
    for heading, y_true, y_pred, y_score in xgb_reports:
        print(heading)
        print(f"  F1-Score: {f1_score(y_true, y_pred):.3f}")
        print(f"  ROC-AUC: {roc_auc_score(y_true, y_score):.3f}")

    print("\nDetailed Classification Report (Test Set):")
    xgb_report = classification_report(y_test, xgb_test_pred, target_names=['Normal', 'Fraud'])
    print(xgb_report)

## Step 7: Confusion Matrix Visualization

In [None]:
# savefig does not create missing directories, so make sure 'output/' exists
os.makedirs('output', exist_ok=True)

# One panel per trained model. squeeze=False always returns a 2-D array of
# axes, so the single-panel case needs no special handling.
n_panels = 2 if XGBOOST_AVAILABLE else 1
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 5), squeeze=False)
axes = axes[0]

# Random Forest confusion matrix.
# labels=[0, 1] forces a 2x2 matrix even if the test split contains only one
# class, so the two tick labels below always line up with the cells.
cm_rf = confusion_matrix(y_test, rf_test_pred, labels=[0, 1])
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set_title('Random Forest Confusion Matrix')
axes[0].set_ylabel('True Label')
axes[0].set_xlabel('Predicted Label')
axes[0].set_xticklabels(['Normal', 'Fraud'])
axes[0].set_yticklabels(['Normal', 'Fraud'])

# XGBoost confusion matrix (only when the optional model was trained)
if XGBOOST_AVAILABLE:
    cm_xgb = confusion_matrix(y_test, xgb_test_pred, labels=[0, 1])
    sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Greens', ax=axes[1])
    axes[1].set_title('XGBoost Confusion Matrix')
    axes[1].set_ylabel('True Label')
    axes[1].set_xlabel('Predicted Label')
    axes[1].set_xticklabels(['Normal', 'Fraud'])
    axes[1].set_yticklabels(['Normal', 'Fraud'])

plt.tight_layout()
plt.savefig('output/confusion_matrices.png', dpi=150, bbox_inches='tight')
print("✓ Saved confusion matrices to output/confusion_matrices.png")
plt.show()

## Step 8: ROC Curves

In [None]:
# savefig does not create missing directories, so make sure 'output/' exists
os.makedirs('output', exist_ok=True)

# ROC curve and AUC for the Random Forest on the test split
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_test_proba)
roc_auc_rf = roc_auc_score(y_test, rf_test_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {roc_auc_rf:.3f})', linewidth=2)

# Overlay XGBoost only when the optional model was trained
if XGBOOST_AVAILABLE:
    fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_test_proba)
    roc_auc_xgb = roc_auc_score(y_test, xgb_test_proba)
    plt.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {roc_auc_xgb:.3f})', linewidth=2)

# Diagonal reference line for a classifier that guesses at random
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves - Fraud Detection Models')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.savefig('output/roc_curves.png', dpi=150, bbox_inches='tight')
print("✓ Saved ROC curves to output/roc_curves.png")
plt.show()

## Step 9: Feature Importance Analysis

In [None]:
# savefig does not create missing directories, so make sure 'output/' exists
os.makedirs('output', exist_ok=True)

# Pair each feature name with the Random Forest's importance score and rank
# them from most to least important
rf_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_importances
}).sort_values('importance', ascending=False)

# Plot top 15 features
plt.figure(figsize=(12, 8))
top_features = feature_importance_df.head(15)
bar_positions = range(len(top_features))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 15 Most Important Features (Random Forest)')
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('output/feature_importance.png', dpi=150, bbox_inches='tight')
print("✓ Saved feature importance plot to output/feature_importance.png")
plt.show()

# Display top 10
print("\nTop 10 Most Important Features:")
for row in feature_importance_df.head(10).itertuples(index=False):
    print(f"  {row.feature}: {row.importance:.4f}")

## Step 10: Export Dataset to Multiple Formats

In [None]:
import os

# Every export lands in this directory; create it (and parents) if needed
export_dir = 'output/ml_exports'
os.makedirs(export_dir, exist_ok=True)

# 1. CSV export of the train and test splits
for part in ('train', 'test'):
    dataset_gen.export_to_csv(getattr(split, part), f'{export_dir}/{part}.csv')
print("✓ Exported to CSV format")

# 2. JSON export of the same two splits
for part in ('train', 'test'):
    dataset_gen.export_to_json(getattr(split, part), f'{export_dir}/{part}.json')
print("✓ Exported to JSON format")

# 3. Dataset metadata
dataset_gen.export_metadata(f'{export_dir}/dataset_metadata.json', metadata)
print("✓ Exported metadata")

# 4. NumPy arrays, one .npy file per array
numpy_exports = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}
for stem, array in numpy_exports.items():
    np.save(f'{export_dir}/{stem}.npy', array)
print("✓ Exported to NumPy format")

# 5. Feature names, in the same order as the columns of X_*
with open(f'{export_dir}/feature_names.json', 'w') as names_file:
    names_file.write(json.dumps(feature_names, indent=2))
print("✓ Exported feature names")

print("\n✓ All exports complete in output/ml_exports/")

## Summary

This notebook demonstrated the complete ML workflow for fraud detection:

**Key Results**:
- Generated 1000 transactions with realistic fraud patterns
- Engineered 32 features across 6 categories
- Trained Random Forest and XGBoost models
- Achieved strong fraud detection performance
- Identified most important features for fraud prediction

**Next Steps**:
- Experiment with different fraud patterns
- Tune model hyperparameters
- Try ensemble methods
- Deploy model for real-time fraud detection

For more information, see the SynFinance documentation.