# Fraud Detection Model Analysis

This notebook demonstrates the fraud detection ensemble and analyzes model performance.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
# These two calls change global matplotlib/seaborn defaults, so they affect
# every figure created later in the notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names are presumably only available
# in newer matplotlib releases — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Simple confirmation that the import/config cell ran to completion.
print('Libraries loaded successfully!')

## 1. Generate Synthetic Data

For demonstration, we generate synthetic transaction data with realistic fraud patterns.

In [None]:
def generate_synthetic_data(n_samples=10000, fraud_rate=0.02, random_state=42):
    """Generate a shuffled, labelled synthetic transaction dataset.

    Legitimate and fraudulent rows are drawn from different per-feature
    distributions (fraud rows have higher velocity, larger amounts, shorter
    gaps between transactions and higher risk scores), concatenated, and
    shuffled.

    Parameters
    ----------
    n_samples : int
        Total number of rows to generate. Must be non-negative.
    fraud_rate : float
        Fraction of rows labelled as fraud, in ``[0, 1]``. The fraud count is
        ``int(n_samples * fraud_rate)`` (truncated); the remainder is legitimate.
    random_state : int or None
        Seed for both the feature draws and the final shuffle. ``None`` gives
        a non-reproducible dataset.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows with 17 feature columns plus a binary ``is_fraud``
        column (0 = legitimate, 1 = fraud), with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative or ``fraud_rate`` is outside ``[0, 1]``.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if not 0.0 <= fraud_rate <= 1.0:
        raise ValueError(f"fraud_rate must be between 0 and 1, got {fraud_rate}")

    # Use a private generator instead of np.random.seed() so that calling this
    # function does not reset the process-wide NumPy RNG for other code.
    # RandomState(seed) produces the same stream as seeding the global RNG,
    # so the generated values are unchanged for a given random_state.
    rng = np.random.RandomState(random_state)

    n_fraud = int(n_samples * fraud_rate)
    n_legit = n_samples - n_fraud

    # Legitimate transactions
    legit_data = {
        'txn_count_1h': rng.poisson(2, n_legit),
        'txn_count_6h': rng.poisson(5, n_legit),
        'txn_count_24h': rng.poisson(10, n_legit),
        'txn_count_7d': rng.poisson(30, n_legit),
        'amount_sum_1h': rng.exponential(100, n_legit),
        'amount_sum_24h': rng.exponential(500, n_legit),
        'amount_avg_30d': rng.normal(150, 50, n_legit),
        'amount_std_30d': rng.exponential(50, n_legit),
        'time_since_last_txn': rng.exponential(7200, n_legit),
        'unique_merchants_24h': rng.poisson(3, n_legit),
        'unique_channels_24h': rng.choice([1, 2], n_legit, p=[0.8, 0.2]),
        'is_first_transaction': rng.choice([0, 1], n_legit, p=[0.99, 0.01]),
        'is_new_merchant': rng.choice([0, 1], n_legit, p=[0.9, 0.1]),
        'is_new_device': rng.choice([0, 1], n_legit, p=[0.95, 0.05]),
        'deviation_from_avg': rng.normal(0, 1, n_legit),
        'merchant_risk_score': rng.beta(2, 10, n_legit),
        'device_risk_score': rng.beta(2, 10, n_legit),
    }

    # Fraudulent transactions (different patterns). The keys must stay in the
    # same order as legit_data so the two frames share a column layout.
    fraud_data = {
        'txn_count_1h': rng.poisson(8, n_fraud),
        'txn_count_6h': rng.poisson(15, n_fraud),
        'txn_count_24h': rng.poisson(25, n_fraud),
        'txn_count_7d': rng.poisson(40, n_fraud),
        'amount_sum_1h': rng.exponential(500, n_fraud),
        'amount_sum_24h': rng.exponential(2000, n_fraud),
        'amount_avg_30d': rng.normal(150, 50, n_fraud),
        'amount_std_30d': rng.exponential(50, n_fraud),
        'time_since_last_txn': rng.exponential(600, n_fraud),
        'unique_merchants_24h': rng.poisson(8, n_fraud),
        'unique_channels_24h': rng.choice([1, 2, 3], n_fraud, p=[0.3, 0.4, 0.3]),
        'is_first_transaction': rng.choice([0, 1], n_fraud, p=[0.7, 0.3]),
        'is_new_merchant': rng.choice([0, 1], n_fraud, p=[0.4, 0.6]),
        'is_new_device': rng.choice([0, 1], n_fraud, p=[0.5, 0.5]),
        'deviation_from_avg': rng.normal(3, 2, n_fraud),
        'merchant_risk_score': rng.beta(5, 5, n_fraud),
        'device_risk_score': rng.beta(5, 5, n_fraud),
    }

    # Combine
    df_legit = pd.DataFrame(legit_data)
    df_legit['is_fraud'] = 0

    df_fraud = pd.DataFrame(fraud_data)
    df_fraud['is_fraud'] = 1

    df = pd.concat([df_legit, df_fraud], ignore_index=True)
    # Shuffle so the classes are interleaved rather than stacked legit-then-fraud.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df

# Generate data
# 10,000 rows at a 2% fraud rate; random_state is left at the function's
# default so the dataset is the same on every run.
df = generate_synthetic_data(n_samples=10000, fraud_rate=0.02)
# Sanity-check the size and the realised class balance before modelling.
print(f"Dataset shape: {df.shape}")
print(f"Fraud rate: {df['is_fraud'].mean():.2%}")
# Last expression of the cell: rendered as a preview of the first rows.
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Overlay per-class histograms for nine selected features on a 3x3 grid
features_to_plot = [
    'txn_count_1h', 'amount_sum_1h', 'time_since_last_txn',
    'unique_merchants_24h', 'deviation_from_avg', 'merchant_risk_score',
    'is_new_merchant', 'is_new_device', 'device_risk_score'
]

fig, axes = plt.subplots(3, 3, figsize=(15, 12))

# Colour used for each value of the is_fraud label
class_colors = {0: 'green', 1: 'red'}

# axes.flat walks the grid row by row, matching the order of features_to_plot
for ax, feature in zip(axes.flat, features_to_plot):
    for label, color in class_colors.items():
        values = df.loc[df['is_fraud'] == label, feature]
        # density=True so the rare fraud class is comparable to the majority class
        ax.hist(values, bins=30, alpha=0.5, label=f'Fraud={label}', color=color, density=True)

    ax.set_title(feature)
    ax.legend()

fig.tight_layout()
# y > 1 places the overall title just above the top row of subplots
fig.suptitle('Feature Distributions by Fraud Label', y=1.02, fontsize=14)
plt.show()

## 3. Train Ensemble Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, roc_curve
)

# Separate the predictors from the target column
label_col = 'is_fraud'
feature_cols = df.columns[df.columns != label_col].tolist()
X = df[feature_cols].to_numpy()
y = df[label_col].to_numpy()

# Hold out 20% for testing; stratify so both splits keep the same fraud rate
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")
for split_name, split_labels in (('Train', y_train), ('Test', y_test)):
    print(f"{split_name} fraud rate: {split_labels.mean():.2%}")

In [None]:
# Settings shared by both tree ensembles
shared_params = dict(n_estimators=100, random_state=42)

# Supervised member: Random Forest (simulating XGBoost).
# class_weight='balanced' compensates for the heavy class imbalance.
rf_model = RandomForestClassifier(
    max_depth=8,
    class_weight='balanced',
    **shared_params,
).fit(X_train, y_train)

# Unsupervised member: Isolation Forest, fitted on the features only.
# contamination is set to the same 2% used as the fraud rate when generating the data.
iso_model = IsolationForest(contamination=0.02, **shared_params).fit(X_train)

print('Models trained successfully!')

In [None]:
# Ensemble configuration
RF_WEIGHT = 0.7            # weight of the supervised Random Forest probability
ISO_WEIGHT = 0.3           # weight of the Isolation Forest anomaly probability
ISO_STEEPNESS = 5          # slope of the logistic used to squash anomaly scores
CLASSIFY_THRESHOLD = 0.5   # ensemble score at or above which we predict fraud

# Get predictions
rf_proba = rf_model.predict_proba(X_test)[:, 1]

# decision_function is positive for inliers and negative for outliers, with
# the inlier/outlier boundary at 0. Negate so higher = more anomalous.
iso_scores = -iso_model.decision_function(X_test)

# Convert to a pseudo-probability with a logistic centred on that boundary (0).
# The previous version centred the logistic at 0.5, a value the negated
# decision function is not expected to reach, so iso_proba stayed below ~0.5
# and the ensemble score could not reach the 0.9 "Decline" band.
iso_proba = 1 / (1 + np.exp(-ISO_STEEPNESS * iso_scores))

# Ensemble (weighted average)
ensemble_proba = RF_WEIGHT * rf_proba + ISO_WEIGHT * iso_proba
y_pred = (ensemble_proba >= CLASSIFY_THRESHOLD).astype(int)

# Evaluate
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraud']))

print(f"\nAUC-ROC: {roc_auc_score(y_test, ensemble_proba):.4f}")

## 4. Model Performance Visualization

In [None]:
# Three side-by-side diagnostics for the ensemble on the held-out test set
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax_cm, ax_roc, ax_pr = axes

# Panel 1: confusion matrix at the 0/1 predictions in y_pred
class_names = ['Legit', 'Fraud']
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax_cm,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax_cm.set(title='Confusion Matrix', xlabel='Predicted', ylabel='Actual')

# Panel 2: ROC curve, with the chance diagonal for reference
auc = roc_auc_score(y_test, ensemble_proba)
fpr, tpr, _ = roc_curve(y_test, ensemble_proba)
ax_roc.plot(fpr, tpr, 'b-', linewidth=2, label=f'Ensemble (AUC={auc:.3f})')
ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax_roc.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
ax_roc.legend()

# Panel 3: precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, ensemble_proba)
ax_pr.plot(recall, precision, 'g-', linewidth=2)
ax_pr.set(xlabel='Recall', ylabel='Precision', title='Precision-Recall Curve')

fig.tight_layout()
plt.show()

## 5. Feature Importance

In [None]:
# Get feature importance
# Sorted ascending because barh draws the first row at the bottom, which puts
# the most important feature at the top of the chart.
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=True)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance['feature'], importance['importance'], color='steelblue')
ax.set_xlabel('Importance')
ax.set_title('Feature Importance (Random Forest)')
fig.tight_layout()
plt.show()

print("\nTop 5 Features:")
# The frame is sorted ascending, so the five largest are the last five rows;
# reverse them so the most important feature is listed first.
print(importance.tail(5).iloc[::-1].to_string(index=False))

## 6. Score Distribution Analysis

In [None]:
# Risk-score cut points that separate the four decision bands:
#   score < 0.3 -> Approve, < 0.7 -> Step-up, < 0.9 -> Review, otherwise Decline
APPROVE_BELOW = 0.3
REVIEW_FROM = 0.7
DECLINE_FROM = 0.9

# Decisions in order of increasing severity, each with a fixed pie colour
decision_colors = {
    'Approve': 'green',
    'Step-up': 'orange',
    'Review': 'yellow',
    'Decline': 'red',
}

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Score distribution by label
for label, color, name in [(0, 'green', 'Legitimate'), (1, 'red', 'Fraud')]:
    scores = ensemble_proba[y_test == label]
    axes[0].hist(scores, bins=50, alpha=0.6, label=name, color=color, density=True)

axes[0].axvline(x=APPROVE_BELOW, color='orange', linestyle='--', label='Approve Threshold')
axes[0].axvline(x=REVIEW_FROM, color='yellow', linestyle='--', label='Review Threshold')
axes[0].axvline(x=DECLINE_FROM, color='red', linestyle='--', label='Decline Threshold')
axes[0].set_xlabel('Risk Score')
axes[0].set_ylabel('Density')
axes[0].set_title('Risk Score Distribution')
axes[0].legend()

# Decision breakdown
# np.digitize returns, for each score, the index of the band it falls in
# (each band includes its lower edge and excludes its upper edge), which
# matches the if / elif chain this replaces.
band_names = np.array(list(decision_colors))
band_index = np.digitize(ensemble_proba, [APPROVE_BELOW, REVIEW_FROM, DECLINE_FROM])
decisions = band_names[band_index].tolist()

# value_counts() orders by frequency, so put the counts back into severity
# order and look each colour up by label. Assigning colours by position could
# paint a decision with another decision's colour.
decision_counts = (
    pd.Series(decisions)
    .value_counts()
    .reindex(list(decision_colors), fill_value=0)
)
decision_counts = decision_counts[decision_counts > 0]  # skip empty bands in the pie
colors = [decision_colors[name] for name in decision_counts.index]
axes[1].pie(decision_counts, labels=decision_counts.index, autopct='%1.1f%%', colors=colors)
axes[1].set_title('Decision Distribution')

plt.tight_layout()
plt.show()

## 7. Summary

This notebook demonstrated:
- Synthetic fraud data generation with realistic patterns
- Ensemble model training (RF + Isolation Forest)
- Performance evaluation (AUC, Precision, Recall)
- Feature importance analysis
- Score distribution and decision breakdown

The production system uses XGBoost + Neural Network + Isolation Forest for even better performance; in this notebook, a Random Forest stands in for XGBoost and no neural network is included.