### Imports & Constants

In [0]:
import mlflow
import mlflow.pyfunc
import pandas as pd
from pyspark.sql.functions import col

# Catalog and schema that prefix the three-part names built below.
CATALOG = "olist_ecommerce"
GOLD_SCHEMA = "gold"

# MLflow run whose logged "model" artifact is registered further down.
BEST_RUN_ID = "0325bc1675d0486f89fb1a0547691f1f"

# Fully qualified registered-model name (catalog.schema.model).
MODEL_NAME = ".".join([CATALOG, GOLD_SCHEMA, "delivery_prediction_model"])
# Alias name intended to mark the serving version of MODEL_NAME.
MODEL_ALIAS = "production"

# Features are read from FEATURE_TABLE; scored rows are written to TARGET_TABLE.
FEATURE_TABLE = ".".join([CATALOG, GOLD_SCHEMA, "fact_orders"])
TARGET_TABLE = ".".join([CATALOG, GOLD_SCHEMA, "delivery_predictions"])

# Cut-off on the model output: values >= THRESHOLD are flagged as late (1).
THRESHOLD = 0.30

INFO:py4j.clientserver:Python Server ready to receive messages
INFO:py4j.clientserver:Received command c on object id p0


### Register Best Model

In [0]:
# Source artifact: the "model" path logged under the BEST_RUN_ID run.
model_uri = "runs:/{}/model".format(BEST_RUN_ID)

# Register that artifact under MODEL_NAME. When the registered model already
# exists, every execution of this cell creates an additional version.
registered_model = mlflow.register_model(model_uri=model_uri, name=MODEL_NAME)

Registered model 'olist_ecommerce.gold.delivery_prediction_model' already exists. Creating a new version of this model...
Created version '4' of model 'olist_ecommerce.gold.delivery_prediction_model'.


### Load Production Model

In [0]:
# Resolve the model through its registry alias rather than a pinned version
# number, so this cell always loads whichever version MODEL_ALIAS points to.
# Requires MODEL_ALIAS to be assigned to a version of MODEL_NAME in the registry;
# load_model raises if the alias does not exist.
model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{MODEL_NAME}@{MODEL_ALIAS}"
)

### Load Features from Gold

In [0]:
# Read the feature table as a Spark DataFrame, then collect it into pandas
# so the pyfunc model can score it.
# NOTE(review): toPandas() pulls every row onto the driver — confirm the table
# fits in driver memory.
features_df = spark.read.table(FEATURE_TABLE)
pdf = features_df.toPandas()

### Generate Predictions

In [0]:
# Score on the feature columns only. The two output columns are stripped from
# the model input first, so re-running this cell does not feed the previous
# run's predictions back into the model as if they were features. On a first
# run neither column exists and the drop is a no-op.
output_columns = ["late_delivery_probability", "predicted_late_delivery"]
model_input = pdf.drop(columns=output_columns, errors="ignore")

# NOTE(review): assumes model.predict returns one probability per input row —
# confirm against how the model was logged.
probs = model.predict(model_input)

# Add prediction columns: the raw score and the 0/1 flag at THRESHOLD.
pdf["late_delivery_probability"] = probs
pdf["predicted_late_delivery"] = (probs >= THRESHOLD).astype(int)



### Convert Back to Spark

In [0]:
# Convert the scored pandas frame back into a Spark DataFrame for the table write.
predictions_spark = spark.createDataFrame(data=pdf)

### Write Predictions to Gold

In [0]:
# Shape the frame for the target table: the probability column is dropped, and
# the 0/1 flag is renamed to late_delivery_prediction and cast to int.
predictions_spark_fixed = (
    predictions_spark
    .drop("late_delivery_probability")
    .withColumnRenamed("predicted_late_delivery", "late_delivery_prediction")
    .withColumn("late_delivery_prediction", col("late_delivery_prediction").cast("int"))
)

# Overwrite mode replaces the existing contents of TARGET_TABLE with this run's output.
delta_writer = predictions_spark_fixed.write.format("delta").mode("overwrite")
delta_writer.saveAsTable(TARGET_TABLE)