# 03_ML_Risk_Prediction

In [0]:
# ============================================
# 03_ML_Risk_Prediction.ipynb
# --------------------------------------------
# Purpose:
#   Train ML models to predict fraud risk
#   probabilities using cost-aware features.
#
# ML Principles:
#   - Interpretable models preferred
#   - Proper train/validation/test split
#   - Cross-validation for robust evaluation
#   - MLflow experiment tracking
#   - Multiple model comparison
#   - Feature importance analysis
#
# Evaluation Alignment:
#   - Model Selection & Technical Reasoning
#   - Training, Evaluation & Metrics
#   - AI Innovation & Insight Generation
#
# Output:
#   - risk_probabilities table
#   - logged MLflow models with metrics
#   - model comparison results
# ============================================

### Configuration & Imports

In [0]:
import matplotlib.pyplot as plt
import mlflow
import mlflow.spark
import numpy as np
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    GBTClassifier
)
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator
)
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.functions import vector_to_array
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, round as spark_round

In [0]:
# Reuse the session attached to this notebook. getActiveSession() returns None
# when no session is active on the current thread, which would only surface
# later as an AttributeError on `spark`; fall back to getOrCreate() instead.
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Unity Catalog namespace used by the tables this notebook reads and writes.
CATALOG = "cost_aware_capstone"
SCHEMA = "risk_decisioning"

# Fully qualified table names are derived from CATALOG/SCHEMA so the namespace
# is spelled out in exactly one place.

# Input: feature table read at the start of the notebook.
SILVER_TABLE = f"{CATALOG}.{SCHEMA}.silver_cost_aware_features"

# Output: per-case risk probabilities written at the end of the notebook.
RISK_TABLE = f"{CATALOG}.{SCHEMA}.ml_risk_predictions"

#### Read Silver Feature Table

---
## Why These Features?

| Feature | Business Rationale | ML Contribution |
|---------|-------------------|-----------------|
| `log_transaction_amount` | Higher amounts = higher stakes | Continuous predictor |
| `tx_velocity_24h` | Rapid transactions = suspicious | Behavioral signal |
| `unusual_location_flag` | Geographic anomaly | Binary risk indicator |
| `device_change_flag` | New device = possible account takeover | Binary risk indicator |
| `account_age_days` | New accounts = higher risk | Tenure-based risk |
| `behavioral_risk_score` | Composite risk signal | Engineered feature |
| `expected_loss_proxy` | Cost-aware signal | Business-driven feature |

In [0]:
# Load the cost-aware feature table that every later cell builds on.
silver_df = spark.table(SILVER_TABLE)

# Quick look at a few rows: identifier, target, and two engineered signals.
preview_cols = [
    "case_id",
    "label",
    "behavioral_risk_score",
    "expected_loss_proxy",
]
silver_df.select(*preview_cols).show(5)

### Feature Vector Assembly

In [0]:
# Model inputs, in the order they are packed into the feature vector.
# Later cells zip this list against the model coefficients, so the order matters.
feature_cols = [
    "log_transaction_amount",
    "tx_velocity_24h",
    "unusual_location_flag",
    "device_change_flag",
    "account_age_days",
    "behavioral_risk_score",
    "expected_loss_proxy",
]

# Pack the individual feature columns into the single vector column Spark ML expects.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_df = assembler.transform(silver_df)

# Keep only what modelling needs: the key, the feature vector, and an integer label.
data = assembled_df.select(
    "case_id",
    "features",
    col("label").cast("int"),
)

#### Train / Validation / Test Split

In [0]:
# Random (NOT stratified) 70-15-15 split: Train / Validation / Test.
# randomSplit() samples rows without looking at the label, so class balance is
# only approximately preserved. The class-distribution report below is how we
# check that each split ended up with a comparable fraud rate.
train_df, temp_df = data.randomSplit([0.7, 0.3], seed=42)
val_df, test_df = temp_df.randomSplit([0.5, 0.5], seed=42)

# One aggregation per split gives both the row count and the fraud count,
# instead of re-scanning every split three times with separate count() calls.
split_label_counts = {}
for split_name, split_df in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
    split_label_counts[split_name] = {
        row["label"]: row["count"]
        for row in split_df.groupBy("label").count().collect()
    }
split_totals = {
    split_name: sum(label_counts.values())
    for split_name, label_counts in split_label_counts.items()
}

print("=" * 50)
print("DATA SPLIT SUMMARY")
print("=" * 50)
print(f"Training set:   {split_totals['Train']:,} records")
print(f"Validation set: {split_totals['Val']:,} records")
print(f"Test set:       {split_totals['Test']:,} records")

# Check class distribution
print("\n" + "=" * 50)
print("CLASS DISTRIBUTION (Label = 1 is Fraud)")
print("=" * 50)
for split_name, total in split_totals.items():
    if total == 0:
        # Avoid a ZeroDivisionError when a split comes back empty (tiny inputs).
        print(f"{split_name}: n/a (empty split)")
        continue
    fraud_rate = split_label_counts[split_name].get(1, 0) / total * 100
    print(f"{split_name}: {fraud_rate:.2f}% fraud")

---
## Model Selection Rationale

### Why Logistic Regression as Primary Model?

| Criterion | Logistic Regression | Random Forest | Gradient Boosting |
|-----------|---------------------|---------------|-------------------|
| **Interpretability** | Excellent (coefficients) | Moderate | Low |
| **Calibrated Probabilities** | Native | Requires calibration | Requires calibration |
| **Training Speed** | Fast | Moderate | Slow |
| **Overfitting Risk** | Low | Moderate | Higher |

**Key Insight**: For cost-aware optimization, we need **well-calibrated probabilities**, not just accurate classifications. Logistic Regression provides this naturally.

### Model Comparison Strategy
We will train multiple models and compare them on:
1. **ROC-AUC** - Discrimination ability
2. **PR-AUC** - Precision-Recall (important for imbalanced data)
3. **Calibration** - Probability reliability
4. **Business Metric** - Expected savings using each model's predictions

In [0]:
# Define multiple models for comparison
models = {
    "LogisticRegression": LogisticRegression(
        featuresCol="features",
        labelCol="label",
        maxIter=100,
        regParam=0.01,
        elasticNetParam=0.5  # Elastic net regularization
    ),
    "RandomForest": RandomForestClassifier(
        featuresCol="features",
        labelCol="label",
        numTrees=50,
        maxDepth=5,
        seed=42
    ),
    "GradientBoosting": GBTClassifier(
        featuresCol="features",
        labelCol="label",
        maxIter=50,
        maxDepth=4,
        seed=42
    )
}

print("Models configured for comparison:")
for name in models.keys():
    print(f"  â€¢ {name}")

---
## Model Training with MLflow Tracking

### Cross-Validation Setup

We use **3-fold cross-validation** on the training set to:
1. Get robust performance estimates
2. Avoid overfitting to a single train/val split
3. Select hyperparameters reliably

In [0]:
%sql
-- Make the capstone catalog and schema the defaults for this session.
USE CATALOG cost_aware_capstone;
USE SCHEMA risk_decisioning;

In [0]:
%sql
-- Volume used as the temporary directory when logging Spark models to MLflow.
CREATE VOLUME IF NOT EXISTS mlflow_logs;

In [0]:
mlflow.set_experiment("/Shared/cost_aware_ai_experiment")

# Evaluators
roc_evaluator = BinaryClassificationEvaluator(
    labelCol="label", metricName="areaUnderROC"
)
pr_evaluator = BinaryClassificationEvaluator(
    labelCol="label", metricName="areaUnderPR"
)

# 3-fold cross-validation on the training set, as described in the section
# above. The grid is intentionally empty (one param map = each estimator's
# configured hyperparameters): CV is used here for a robust performance
# estimate, not a hyperparameter search. After scoring the folds,
# CrossValidator refits the estimator on the full training set and exposes
# that model as `bestModel`.
CV_FOLDS = 3
cv_param_grid = ParamGridBuilder().build()

# Store results for comparison
model_results = []   # one metrics row per model, turned into a DataFrame later
trained_models = {}  # display name -> model fitted on the full training set

print("=" * 70)
print("MODEL TRAINING & EVALUATION")
print("=" * 70)

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    with mlflow.start_run(run_name=f"{model_name}_RiskModel"):
        # Cross-validate on the training set, then keep the full-train refit
        cross_validator = CrossValidator(
            estimator=model,
            estimatorParamMaps=cv_param_grid,
            evaluator=roc_evaluator,
            numFolds=CV_FOLDS,
            seed=42,
        )
        cv_model = cross_validator.fit(train_df)
        cv_roc_auc = cv_model.avgMetrics[0]  # single param map -> single average
        fitted_model = cv_model.bestModel
        trained_models[model_name] = fitted_model

        # Score the held-out splits. Test metrics are reported for reference
        # only; model selection downstream is based on validation metrics.
        val_predictions = fitted_model.transform(val_df)
        test_predictions = fitted_model.transform(test_df)

        # Calculate metrics
        val_roc_auc = roc_evaluator.evaluate(val_predictions)
        val_pr_auc = pr_evaluator.evaluate(val_predictions)
        test_roc_auc = roc_evaluator.evaluate(test_predictions)
        test_pr_auc = pr_evaluator.evaluate(test_predictions)

        # Log parameters
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("features", feature_cols)
        mlflow.log_param("cv_folds", CV_FOLDS)

        # Log metrics
        mlflow.log_metric("cv_roc_auc", cv_roc_auc)
        mlflow.log_metric("val_roc_auc", val_roc_auc)
        mlflow.log_metric("val_pr_auc", val_pr_auc)
        mlflow.log_metric("test_roc_auc", test_roc_auc)
        mlflow.log_metric("test_pr_auc", test_pr_auc)

        # Log model. The temp dir is the `mlflow_logs` volume created by the
        # %sql cell above, so the path is kept literal to match that cell.
        mlflow.spark.log_model(
            fitted_model,
            artifact_path=f"{model_name.lower()}_model",
            dfs_tmpdir="/Volumes/cost_aware_capstone/risk_decisioning/mlflow_logs/",
        )

        # Store results
        model_results.append({
            "Model": model_name,
            "CV ROC-AUC": round(cv_roc_auc, 4),
            "Val ROC-AUC": round(val_roc_auc, 4),
            "Val PR-AUC": round(val_pr_auc, 4),
            "Test ROC-AUC": round(test_roc_auc, 4),
            "Test PR-AUC": round(test_pr_auc, 4)
        })

        print(
            f"   {model_name}: CV ROC-AUC = {cv_roc_auc:.4f}, "
            f"Val ROC-AUC = {val_roc_auc:.4f}, Test ROC-AUC = {test_roc_auc:.4f}"
        )

print("\n" + "=" * 70)

---
## Generate Risk Probabilities with Selected Model

**Selected Model**: Logistic Regression
- Best calibrated probabilities for cost-aware optimization
- Interpretable coefficients for explainability
- Competitive performance with simpler architecture

### Model Comparison Results

In [0]:
# Display model comparison
results_df = pd.DataFrame(model_results)
print("\nMODEL COMPARISON RESULTS")
print("=" * 70)
display(results_df)

# Visualize comparison
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

models_list = results_df['Model'].tolist()
x = np.arange(len(models_list))
width = 0.35

# ROC-AUC comparison
axes[0].bar(x - width/2, results_df['Val ROC-AUC'], width, label='Validation', color='steelblue')
axes[0].bar(x + width/2, results_df['Test ROC-AUC'], width, label='Test', color='darkgreen')
axes[0].set_ylabel('ROC-AUC')
axes[0].set_title('ROC-AUC Comparison', fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(models_list, rotation=15)
axes[0].legend()
axes[0].set_ylim([0.5, 1.0])
axes[0].grid(True, alpha=0.3)

# PR-AUC comparison
axes[1].bar(x - width/2, results_df['Val PR-AUC'], width, label='Validation', color='steelblue')
axes[1].bar(x + width/2, results_df['Test PR-AUC'], width, label='Test', color='darkgreen')
axes[1].set_ylabel('PR-AUC')
axes[1].set_title('PR-AUC Comparison', fontweight='bold')
axes[1].set_xticks(x)
axes[1].set_xticklabels(models_list, rotation=15)
axes[1].legend()
axes[1].set_ylim([0, 1.0])
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Select best model
best_model_name = results_df.loc[results_df['Val ROC-AUC'].idxmax(), 'Model']
print(f"\nBest Model (by Val ROC-AUC): {best_model_name}")

---
### Feature Importance Analysis

Understanding which features drive predictions helps validate model behavior and build trust.

In [0]:
# Feature importance from Logistic Regression coefficients
lr_model = trained_models["LogisticRegression"]
coefficients = lr_model.coefficients.toArray()

# Spark reports coefficients on the ORIGINAL feature scale, so raw magnitudes
# are not comparable across features with different units (e.g. a day count
# vs. a 0/1 flag). Multiplying each coefficient by its feature's standard
# deviation on the training set gives the change in log-odds per one standard
# deviation, which is comparable across features.
feature_std = (
    StandardScaler(
        inputCol="features",
        outputCol="scaled_features",
        withMean=False,
        withStd=True,
    )
    .fit(train_df)
    .std.toArray()
)
standardized_coefficients = coefficients * feature_std

# Create feature importance dataframe (raw coefficient kept for reference;
# the ranking uses the standardized value)
importance_df = pd.DataFrame({
    'Feature': feature_cols,
    'Coefficient': coefficients,
    'Std_Coefficient': standardized_coefficients,
    'Abs_Importance': np.abs(standardized_coefficients)
}).sort_values('Abs_Importance', ascending=True)

# Visualize: bar direction and colour encode the sign of the effect
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['green' if c > 0 else 'red' for c in importance_df['Std_Coefficient']]
ax.barh(importance_df['Feature'], importance_df['Std_Coefficient'], color=colors, alpha=0.7)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Standardized Coefficient (per 1 SD; + increases fraud risk)', fontsize=11)
ax.set_title('Feature Importance (Standardized Logistic Regression Coefficients)', fontsize=13, fontweight='bold')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [0]:
# `vector_to_array` and `col` come from the notebook's top import cell;
# imports are kept in one place so a fresh Run All has a single dependency list.

# Use Logistic Regression for final predictions (best calibration)
final_model = trained_models["LogisticRegression"]

# Score every case (train, validation and test rows alike) and keep element 1
# of the `probability` vector as the risk score, i.e. the probability assigned
# to label 1 (fraud).
scored_df = (
    final_model.transform(data)
    .withColumn(
        "risk_probability",
        vector_to_array(col("probability"))[1]
    )
    .select(
        "case_id",
        "risk_probability"
    )
)

# Validation: check the probability distribution, bucketed to one decimal place
print("Risk Probability Distribution:")
scored_df.select(
    spark_round(col("risk_probability"), 1).alias("risk_bucket")
).groupBy("risk_bucket").count().orderBy("risk_bucket").show()

#### Write ML Output Table

In [0]:
# Persist the scored cases as a Delta table, replacing any previous contents.
risk_writer = scored_df.write.format("delta").mode("overwrite")
risk_writer.saveAsTable(RISK_TABLE)

#### Preview ML Output

In [0]:
# Read the table back and show the ten highest-risk cases as a sanity check.
top_risk_query = f"""
    SELECT *
    FROM {RISK_TABLE}
    ORDER BY risk_probability DESC
    LIMIT 10
"""
spark.sql(top_risk_query).show(truncate=False)

-----