### 📦 Load Customer Feature Table with Fraud Labels

In [None]:
from pyspark.sql.functions import col

data = spark.read.format("delta").load("dbfs:/tmp/customer_features_table")

# Every column other than the row id, the label and the amount is a model input.
features = [name for name in data.columns if name not in {"customer_id", "fraud", "amount"}]

# Keep id / label / amount up front, then discard any row with a missing value.
df = data.select("customer_id", "fraud", "amount", *features).dropna()

df.display()

### 📊 Visual: Fraud Distribution

In [None]:
import matplotlib.pyplot as plt

# Take a seeded 50% sample (without replacement) and convert it to pandas.
df_sample = df.sample(withReplacement=False, fraction=0.5, seed=42).toPandas()

# Split the sample into the transaction amount, the label and the model inputs.
amounts = df_sample["amount"]
y = df_sample["fraud"]
X = df_sample[features]

# Bar chart of how many sampled rows carry each label value.
y.value_counts().plot.bar(title='Fraud Distribution (0=Genuine, 1=Fraud)', figsize=(5, 3))
plt.xlabel("Label")
plt.ylabel("Count")
plt.show()

### 🔀 Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split on the label. Splitting `amounts` in the same call
# keeps each row's amount aligned with its features and label.
(X_train, X_test,
 y_train, y_test,
 amt_train, amt_test) = train_test_split(
    X, y, amounts,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

### ⚙️ Define Evaluation Metric at 5% Genuine Decline Rate

In [None]:
import numpy as np

def evaluate_at_decline(y_true, y_pred_proba, amounts, decline_rate=0.05):
    """Measure how much fraud is caught when the highest-scored rows are declined.

    Rows are ranked by ``y_pred_proba`` (highest first) and the top
    ``decline_rate`` fraction of *all* rows is treated as declined. Note that
    the cutoff is a share of every row passed in, not of genuine rows only.

    Args:
        y_true: Array-like of labels, expected to be 0/1 with 1 meaning fraud.
        y_pred_proba: Array-like of scores, one per row; higher means the row
            is declined earlier.
        amounts: Array-like of per-row amounts, positionally aligned with
            ``y_true``.
        decline_rate: Fraction of rows to decline, between 0 and 1 inclusive.

    Returns:
        A dict with two keys:
        ``"fraud_detection_rate"`` - declined fraud rows / all fraud rows;
        ``"fraud_value_detection_rate"`` - declined fraud amount / all fraud
        amount. Either value is 0 when its denominator is 0.

    Raises:
        ValueError: If ``decline_rate`` is outside [0, 1] or the three inputs
            do not have the same length.
    """
    if not 0 <= decline_rate <= 1:
        # A negative rate would otherwise slice from the end and silently
        # decline almost every row.
        raise ValueError(f"decline_rate must be between 0 and 1, got {decline_rate!r}")

    # Work on plain arrays so that everything is positional: pandas index
    # labels (e.g. after a shuffled train/test split) cannot misalign rows,
    # and lists / NumPy arrays are accepted as well as Series.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)
    values = np.asarray(amounts)

    if not (len(labels) == len(scores) == len(values)):
        raise ValueError("y_true, y_pred_proba and amounts must have the same length")

    # Positions of the rows to decline: highest scores first, truncated to
    # the requested share of all rows.
    num_to_decline = int(len(labels) * decline_rate)
    declined = np.argsort(scores)[::-1][:num_to_decline]

    declined_labels = labels[declined]
    declined_values = values[declined]

    fraud_detected = declined_labels.sum()
    value_detected = declined_values[declined_labels == 1].sum()
    total_fraud = labels.sum()
    total_fraud_value = values[labels == 1].sum()

    return {
        "fraud_detection_rate": float(fraud_detected / total_fraud) if total_fraud else 0.0,
        "fraud_value_detection_rate": (
            float(value_detected / total_fraud_value) if total_fraud_value else 0.0
        ),
    }

### 🎯 Hyperparameter Tuning with LightGBM + Hyperopt + MLflow

In [None]:
import mlflow
import mlflow.sklearn
from hyperopt import fmin, tpe, hp, STATUS_OK
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# Make this the active MLflow experiment; runs started below are recorded under it.
mlflow.set_experiment("/Shared/fraud_model_lgbm")

def objective(params):
    """Hyperopt objective: fit one LightGBM candidate and report its loss.

    Trains an ``LGBMClassifier`` built from ``params`` on
    ``X_train``/``y_train``, scores it on ``X_test`` and logs the parameters,
    ROC AUC and both decline-based detection rates to a nested MLflow run.

    Note: candidates are scored on ``X_test``, the same split this notebook
    uses to evaluate the final model, so the final metrics are optimistic.

    Args:
        params: Keyword arguments for ``LGBMClassifier``, as sampled by
            Hyperopt from the search space.

    Returns:
        A Hyperopt result dict. The loss is the negated fraud value detection
        rate, because Hyperopt minimises the loss.
    """
    with mlflow.start_run(nested=True):
        candidate = LGBMClassifier(**params)
        candidate.fit(X_train, y_train)

        # Column 1 of predict_proba is the score for the positive (fraud) class.
        fraud_scores = candidate.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, fraud_scores)
        decline_metrics = evaluate_at_decline(y_test, fraud_scores, amt_test)
        value_rate = decline_metrics["fraud_value_detection_rate"]

        mlflow.log_params(params)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("fraud_detection_rate", decline_metrics["fraud_detection_rate"])
        mlflow.log_metric("fraud_value_detection_rate", value_rate)

    return {"loss": -value_rate, "status": STATUS_OK}

# Tuning ranges: a continuous learning rate plus discrete tree-size options.
search_space = {
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.2),
    "num_leaves": hp.choice("num_leaves", [15, 31, 63]),
    "max_depth": hp.choice("max_depth", [4, 6, 8, 10]),
}

# Run 10 TPE-guided trials. For hp.choice parameters the returned dict holds
# the index of the selected option, not the option value itself.
best_result = fmin(objective, search_space, algo=tpe.suggest, max_evals=10)

### ✅ Train Final Model with Best Hyperparameters and Register

In [None]:
from hyperopt import space_eval
from mlflow.models import infer_signature

# fmin reports hp.choice parameters as indices into their option lists.
# space_eval maps the result back through search_space to the real values,
# so the option lists do not have to be repeated (and kept in sync) here.
best_params = space_eval(search_space, best_result)

with mlflow.start_run(run_name="final_model") as run:
    final_model = LGBMClassifier(**best_params)
    final_model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score for the positive (fraud) class.
    preds = final_model.predict_proba(X_test)[:, 1]
    metrics = evaluate_at_decline(y_test, preds, amt_test)

    mlflow.log_params(best_params)
    mlflow.log_metrics(metrics)

    # Log the model with an explicit input/output signature. Unity Catalog
    # refuses to register model versions that carry no signature.
    signature = infer_signature(X_test, final_model.predict(X_test))
    mlflow.sklearn.log_model(final_model, artifact_path="model", signature=signature)

    # register_model takes the *source* as a URI and the *target* as a plain
    # registered-model name; a "models:/" prefix is not part of the name.
    # NOTE(review): the three-level name targets Unity Catalog - confirm the
    # registry URI is "databricks-uc" in this workspace.
    model_uri = f"runs:/{run.info.run_id}/model"
    mlflow.register_model(model_uri, "main.fraud_demo.fraud_detector")

### 📈 Visual: Feature Importances

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Label each importance score from the fitted model with its feature name,
# then order the features from most to least important.
importances = final_model.feature_importances_
feat_imp = pd.Series(data=importances, index=X.columns)
feat_imp = feat_imp.sort_values(ascending=False)

# Horizontal bars are drawn bottom-up, so flip the y axis to put the most
# important feature at the top.
feat_imp.iloc[:15].plot.barh(title='Top 15 Feature Importances', figsize=(8, 6))
plt.gca().invert_yaxis()
plt.show()

### 📘 Why MLflow Is a Game Changer

In [None]:
"""
✅ MLflow Experiment Tracking:
- Every model trial is recorded with parameters and performance
- Reproducible and comparable across runs

✅ Model Registry:
- Versioned, staged, production-ready models
- Easy promotion (e.g., Staging ➡️ Production)

✅ Governance & Reproducibility:
- You can always trace back to the exact data, code, and model used
- Essential for regulated domains like fraud detection

✅ Summary:
MLflow eliminates the guesswork, version confusion, and manual tracking that slow down trustworthy ML development.
"""