### Imports

In [0]:
import mlflow
import mlflow.sklearn

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from mlflow.models.signature import infer_signature

### Read fact table

In [0]:
# Read the `olist_ecommerce.gold.fact_orders` table through the notebook's
# `spark` session and convert the full result into a pandas DataFrame, which
# is what the scikit-learn code further down operates on.
fact_orders_df = (
    spark
    .table("olist_ecommerce.gold.fact_orders")
    .toPandas()
)

### Select features & target

In [0]:
# Columns of the fact table used as model inputs.
FEATURES = [
    "order_value",
    "total_freight",
    "total_items",
    "approval_delay_hours",
    "estimated_delivery_days",
    "installments_count",
]

# Column the classifiers are trained to predict.
TARGET = "is_late_delivery"

# Discard any row with a missing value in a feature or in the target, so the
# estimators never receive NaNs.
required_columns = [*FEATURES, TARGET]
df = fact_orders_df.dropna(subset=required_columns)

# Design matrix and label vector; both keep the index of `df`.
X = df.loc[:, FEATURES]
y = df.loc[:, TARGET]

### Train-test split

In [0]:
# Hold out 20% of the rows for evaluation. Stratifying on `y` keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Define models

In [0]:
# Seed shared by every estimator that accepts one, so a Restart & Run All
# reproduces the same fitted models and the same logged metrics.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name used for the MLflow run and in the
# results table. All use class_weight="balanced" so that errors on each class
# are weighted inversely to that class's frequency in the training labels.
models = {
    "logistic_regression": LogisticRegression(
        max_iter=500,
        class_weight="balanced",
    ),
    # random_state was previously missing here: the tree permutes features at
    # each split, so ties between equally good splits could resolve differently
    # from run to run.
    "decision_tree": DecisionTreeClassifier(
        max_depth=8,
        class_weight="balanced",
        random_state=RANDOM_STATE,
    ),
    "random_forest": RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        random_state=RANDOM_STATE,
        class_weight="balanced",
    ),
}

# Cut-off applied to the predicted positive-class probability when turning
# scores into 0/1 labels for the F1 computation in the training loop. It is
# deliberately below the conventional 0.5.
# NOTE(review): 0.30 is hand-picked; confirm it against a validation curve.
THRESHOLD = 0.30

### Train, Evaluate, Log to MLflow

In [0]:
# `infer_signature` comes from the notebook's import cell; it is not
# re-imported here so that all dependencies stay declared in one place.

mlflow.set_experiment("/delivery_prediction_model_comparison")

# One summary record per trained model; collected as a list of dicts and
# converted to a DataFrame once, after the loop.
results = []

for name, model in models.items():

    # Bind the run object so the run id can be read from it directly instead
    # of going back through mlflow.active_run().
    with mlflow.start_run(run_name=f"delivery_{name}") as run:

        mlflow.log_param("model_type", name)
        mlflow.log_param("threshold", float(THRESHOLD))

        model.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the second entry in
        # model.classes_, used here as the positive class.
        probs = model.predict_proba(X_test)[:, 1]
        preds = (probs >= THRESHOLD).astype(int)

        # AUC is threshold-free (computed on the scores); F1 is computed on
        # the labels obtained with the custom THRESHOLD.
        auc = roc_auc_score(y_test, probs)
        f1 = f1_score(y_test, preds)

        mlflow.log_metric("auc", auc)
        mlflow.log_metric("f1_score", f1)

        # Small sample of real inputs plus the matching probability output,
        # used to record the model's input/output schema.
        input_example = X_train.iloc[:5]
        output_example = model.predict_proba(input_example)
        signature = infer_signature(input_example, output_example)

        # The signature above describes predict_proba output. By default the
        # logged pyfunc model would call `predict` (class labels), which does
        # not match that schema; serving predict_proba keeps the artifact and
        # its signature consistent and lets consumers apply THRESHOLD.
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            signature=signature,
            input_example=input_example,
            pyfunc_predict_fn="predict_proba",
        )

        results.append({
            "model": name,
            "auc": auc,
            "f1": f1,
            "run_id": run.info.run_id
        })

        print(f"{name} → AUC: {auc:.3f}, F1: {f1:.3f}")



logistic_regression → AUC: 0.581, F1: 0.150




decision_tree → AUC: 0.629, F1: 0.163




random_forest → AUC: 0.640, F1: 0.166


### Decide Best Model

In [0]:
# Turn the per-model records collected during training into a table, then
# display it ranked by F1 (highest first). The sorted frame is the cell's
# output only; `results_df` itself keeps the original training order.
results_df = pd.DataFrame(data=results)
results_df.sort_values("f1", ascending=False)

Unnamed: 0,model,auc,f1,run_id
2,random_forest,0.640072,0.165695,0325bc1675d0486f89fb1a0547691f1f
1,decision_tree,0.629311,0.163424,49b8ad65cae74e0c95f50368ed465134
0,logistic_regression,0.58074,0.14961,fb37fd6bcb0f4a7a99594539aa5edd49
