#### Booking Cancellation Prediction Model Inference

#### 01. Generate Synthetic Data for Inference

In [0]:
# Please help generate synthetic data for testing inference based on the following table:
# workspace.booking.mlops_booking_training

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, rand

def generate_synthetic_data(df: DataFrame, num_rows: int) -> DataFrame:
    """Return a random sample of distinct rows from ``df`` with ``Booking_ID`` nulled.

    The "synthetic" rows are existing rows of ``df``: they are de-duplicated,
    shuffled with ``rand()`` and truncated with ``limit``. No new feature
    values are fabricated.

    Args:
        df: Source DataFrame to sample from.
        num_rows: Maximum number of rows to return. Fewer rows come back when
            ``df`` contains fewer distinct rows.

    Returns:
        A DataFrame of at most ``num_rows`` rows whose ``Booking_ID`` column
        is NULL in every row. The column keeps the type it has in ``df``; if
        ``df`` has no such column, a NULL string column is added.
    """
    # NOTE(review): distinct() compares every column, so if Booking_ID is
    # unique per row nothing is de-duplicated here -- confirm whether the id
    # should be excluded from the comparison.
    shuffled = df.distinct().orderBy(rand())
    sample = shuffled.limit(num_rows)

    # lit(None) on its own produces a NullType (void) column. Cast it so the
    # id column keeps the same type as in the source table.
    if "Booking_ID" in df.columns:
        id_type = df.schema["Booking_ID"].dataType
    else:
        id_type = "string"
    return sample.withColumn("Booking_ID", lit(None).cast(id_type))

# Table the inference sample is drawn from
training_table = "workspace.booking.mlops_booking_training"

# Read the training rows and draw a 100-row random sample from them
original_data = spark.table(training_table)
synthetic_data = generate_synthetic_data(df=original_data, num_rows=100)

# Render the sample in the notebook for a quick visual check
display(synthetic_data)

In [0]:
# Persist the sample as the inference input table, replacing any existing contents
inference_writer = synthetic_data.write.mode("overwrite")
inference_writer.saveAsTable("workspace.booking.mlops_booking_inference")

#### 02. Batch inference on the Champion model

In [0]:
import mlflow

# Catalog and schema that hold both the inference table and the registered model
catalog = "workspace"
db = "booking"

# Registry alias of the model version to score with. This section performs
# batch inference on the Champion model, so the Champion alias is loaded.
model_alias = "Champion"

# Load the booking rows to be scored
inference_df = spark.read.table(f"{catalog}.{db}.mlops_booking_inference")

# Load the champion model as a Spark UDF so it can be applied to DataFrame columns
champion_model = mlflow.pyfunc.spark_udf(
    spark,
    model_uri=f"models:/{catalog}.{db}.mlops_booking@{model_alias}",
)

# Keep only the columns named in the model's input schema and pass them to
# the UDF in that same order
input_columns = champion_model.metadata.get_input_schema().input_names()
preds_df = inference_df.select(*input_columns).withColumn(
    "predictions",
    champion_model(*input_columns),
)

display(preds_df)