# 03_ML_Risk_Prediction

In [0]:
# ============================================
# 03_ML_Risk_Prediction.ipynb
# --------------------------------------------
# Purpose:
#   Train an ML model to predict fraud risk
#   probabilities using cost-aware features.
#
# ML Principles:
#   - Interpretable model
#   - Proper validation
#   - MLflow tracking
#
# Output:
#   - ml_risk_predictions table (case_id, risk_probability)
#   - logged MLflow model
# ============================================

### Configuration & Imports

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.functions import vector_to_array
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import mlflow
import mlflow.spark

In [0]:
# Reuse the session that is already active in the notebook; if none is
# active, getActiveSession() returns None, so fall back to getOrCreate()
# instead of failing later with an AttributeError on `spark.table(...)`.
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Unity Catalog location shared by every table this notebook touches.
CATALOG = "cost_aware_capstone"
SCHEMA = "risk_decisioning"

# Fully qualified table names are derived from CATALOG / SCHEMA so that the
# location is defined in exactly one place.
# Input: feature table read by this notebook.
SILVER_TABLE = f"{CATALOG}.{SCHEMA}.silver_cost_aware_features"

# Output: per-case risk probabilities written by this notebook.
RISK_TABLE = f"{CATALOG}.{SCHEMA}.ml_risk_predictions"

#### Read Silver Feature Table

In [0]:
# Load the feature table that this notebook trains and scores on.
silver_df = spark.table(SILVER_TABLE)

# Quick sanity peek at the identifier, the label and two of the features.
preview_cols = [
    "case_id",
    "label",
    "behavioral_risk_score",
    "expected_loss_proxy",
]
silver_df.select(*preview_cols).show(5)

### Feature Vector Assembly

In [0]:
# Columns of the silver table that are fed to the model as predictors.
feature_cols = [
    "log_transaction_amount",
    "tx_velocity_24h",
    "unusual_location_flag",
    "device_change_flag",
    "account_age_days",
    "behavioral_risk_score",
    "expected_loss_proxy"
]

# Packs the predictor columns into the single vector column Spark ML expects.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

# Keep only what training and scoring need. The label is cast to int and
# explicitly aliased back to "label" so the column name does not depend on
# how Spark names a cast expression (the estimator looks it up by name).
data = assembler.transform(silver_df).select(
    "case_id",
    "features",
    col("label").cast("int").alias("label")
)

#### Train / Test Split

In [0]:
# 70/30 random split; the fixed seed is passed so the split request is repeatable.
train_df, test_df = data.randomSplit(weights=[0.7, 0.3], seed=42)

# Report the size of each partition.
train_rows = train_df.count()
test_rows = test_df.count()
print("Train count:", train_rows)
print("Test count:", test_rows)

### Train Interpretable ML Model

In [0]:
# Logistic regression estimator (not fitted yet): reads the assembled
# "features" vector, learns against "label", and is capped at 20 iterations.
lr = LogisticRegression(maxIter=20, labelCol="label", featuresCol="features")

### MLflow Tracking

In [0]:
%sql
-- Make the project catalog and schema the session defaults.
USE CATALOG cost_aware_capstone;
USE SCHEMA risk_decisioning;

In [0]:
%sql
-- Volume used further down as the temporary directory for MLflow model logging.
CREATE VOLUME IF NOT EXISTS mlflow_logs;

In [0]:
# All runs of this notebook are grouped under one shared experiment.
mlflow.set_experiment("/Shared/cost_aware_ai_experiment")

with mlflow.start_run(run_name="LogisticRegression_RiskModel"):

    # Record the estimator settings and the feature list up front so every
    # run (even one that fails during fitting) can be compared and reproduced.
    mlflow.log_params({
        "model_type": "LogisticRegression",
        "max_iter": lr.getMaxIter(),
        "reg_param": lr.getRegParam(),
        "elastic_net_param": lr.getElasticNetParam(),
        "features": ",".join(feature_cols),
    })

    # Fit on the training partition only.
    model = lr.fit(train_df)

    # Evaluate on the held-out partition.
    predictions = model.transform(test_df)

    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        metricName="areaUnderROC"
    )

    auc = evaluator.evaluate(predictions)

    mlflow.log_metric("roc_auc", auc)

    # Persist the fitted Spark model as a run artifact; the volume path is
    # used as the temporary directory while the model is being logged.
    mlflow.spark.log_model(
        model,
        artifact_path="risk_model",
        dfs_tmpdir="/Volumes/cost_aware_capstone/risk_decisioning/mlflow_logs/",
    )

    print("ROC-AUC:", auc)

#### Generate Risk Probabilities

In [0]:
# Score every case in `data` (training and test rows alike) with the fitted
# model and keep one probability per case. The model's "probability" column
# is an ML vector; it is converted to an array and element 1 is extracted.
# NOTE(review): element 1 is taken to be the probability of label 1
# (presumably the fraud class) — confirm the label encoding upstream.
scored_df = model.transform(data).select(
    "case_id",
    vector_to_array(col("probability")).getItem(1).alias("risk_probability"),
)

#### Write ML Output Table

In [0]:
# Replace the contents of the output table with the freshly scored cases,
# stored in Delta format.
scored_df.write.saveAsTable(RISK_TABLE, format="delta", mode="overwrite")

#### Preview ML Output

In [0]:
# Show the ten highest-risk cases from the table that was just written.
top_risk_query = f"""
    SELECT *
    FROM {RISK_TABLE}
    ORDER BY risk_probability DESC
    LIMIT 10
"""
top_risk_df = spark.sql(top_risk_query)
top_risk_df.show(truncate=False)

-----