#PROBLEM STATEMENT
##Predict daily product purchases using product engagement metrics.

#ML Use Case & Dataset Understanding

In [0]:
# Show the column names and types of the gold products table.
spark.read.table("gold.products").printSchema()


In [0]:
# Preview the first five rows of the gold products table.
spark.table("gold.products").show(n=5)


###Confirm Features / Target
Features: views, conversion_rate

Target: purchases

Note: product_id and event_date excluded for baseline model

#Create MLflow Experiment
Purpose:

Prepare an MLflow experiment to track all model runs

Central point to log parameters, metrics, and models

Aligns with MLflow components and experiment tracking

In [0]:
import mlflow
import mlflow.sklearn


In [0]:
# Workspace path of the MLflow experiment that collects the runs of this notebook.
experiment_name = "/Users/ram.katneni@gmail.com/day11_purchases_regression"

# Make that experiment the active one (MLflow creates it when it does not exist yet).
mlflow.set_experiment(experiment_name=experiment_name)


In [0]:
# Look the experiment up by name and print its metadata as a sanity check.
exp = mlflow.get_experiment_by_name(name=experiment_name)
print(exp)


#Data Preparation for Model Training

Purpose:

Prepare ML-ready features (views, conversion_rate) and target (purchases)

Split data into train and test sets

Ensure everything is ready for baseline regression training

Learn Alignment:

MLflow Models (conceptual prep — ready for logging)

Task Alignment:

Prepares for Task 1: Train simple regression model

In [0]:
from sklearn.model_selection import train_test_split


In [0]:
# Select only the two features and the target, then collect them into a pandas DataFrame.
df = (
    spark.table("gold.products")
    .select("views", "conversion_rate", "purchases")
    .toPandas()
)


In [0]:
# Print a preview of the first rows, followed by the summary statistics.
for frame_view in (df.head(), df.describe()):
    print(frame_view)


In [0]:
# Feature matrix and target vector for the baseline model.
y = df["purchases"]
X = df[["views", "conversion_rate"]]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [0]:
from sklearn.linear_model import LinearRegression


In [0]:
# Baseline estimator: ordinary linear regression with default settings.
lr_model = LinearRegression()

# Fit on the training split; the fitted estimator is the cell's displayed output.
lr_model.fit(X=X_train, y=y_train)


In [0]:
# Score the baseline model on the held-out test split.
# NOTE: later cells log this value under the name `r2_score`, so the variable name is
# kept even though it matches the name of sklearn.metrics.r2_score.
r2_score = lr_model.score(X_test, y_test)

# The superscript in the message was mis-encoded ("RÂ²"); it is restored to "R²".
print(f"Baseline Linear Regression R² score: {r2_score:.4f}")


#Interpretation

The model explains ~59% of the variance in product purchases using engagement features. This establishes a strong baseline suitable for experiment tracking and comparison in MLflow.

#MLflow Run & Logging (Parameters, Metrics, Model)
Purpose:

Start an MLflow run

Log parameters, metrics, and the trained model

Make the run visible and auditable in MLflow UI

In [0]:
# Open a run to confirm that tracking works; leaving the `with` block ends the run.
# NOTE(review): this records a separate, otherwise empty run that shares its name
# with the fully logged baseline run created further down in this notebook.
smoke_run = mlflow.start_run(run_name="baseline_linear_regression")
with smoke_run:
    print("MLflow run started")


In [0]:
# Demonstrates parameter logging on its own.
# NOTE(review): each `start_run` call creates a new run, so this produces a
# params-only run named like the complete baseline run logged later.
baseline_params = {
    "model_type": "LinearRegression",
    "features": "views,conversion_rate",
    "target": "purchases",
    "test_size": 0.2,
}
with mlflow.start_run(run_name="baseline_linear_regression"):
    mlflow.log_params(baseline_params)


Parameters explain how the model was trained

Enables reproducibility and comparison

In [0]:
# Demonstrates metric logging on its own.
# NOTE(review): this is again a new run, holding only the metric.
with mlflow.start_run(run_name="baseline_linear_regression"):
    mlflow.log_metric(key="r2_score", value=r2_score)


Model performance

Time-stamped evaluation

In [0]:
# Demonstrates model logging on its own: the fitted estimator is stored under "model".
# NOTE(review): this is again a new run, holding only the model artifact.
with mlflow.start_run(run_name="baseline_linear_regression"):
    mlflow.sklearn.log_model(sk_model=lr_model, artifact_path="model")


This stores:

Serialized model

Conda / environment metadata

Makes the model deployable

In [0]:
# Complete baseline run: parameters, metric and model are recorded in ONE run.
with mlflow.start_run(run_name="baseline_linear_regression"):
    # How the model was trained.
    mlflow.log_params(
        {
            "model_type": "LinearRegression",
            "features": "views,conversion_rate",
            "target": "purchases",
            "test_size": 0.2,
        }
    )

    # How well it performed on the test split.
    mlflow.log_metric(key="r2_score", value=r2_score)

    # The fitted estimator itself, stored under the "model" artifact path.
    mlflow.sklearn.log_model(sk_model=lr_model, artifact_path="model")


# First Run

Existing Run (Already Done)

Item       	Value

Run name	-> baseline_linear_regression

Features	-> views, conversion_rate

Target	  -> purchases

R²	      -> 0.5909

#New Run (We Will Create)

Item	    Value

Run name	-> views_only_linear_regression

Features	-> views

Target	  -> purchases

Purpose	-> Compare feature impact

This demonstrates: Compare runs in a real, production-style way.

#Prepare Data for Second Run (Views Only)

In [0]:
# Views-only feature set
X_views_only = df[["views"]]
y = df["purchases"]


In [0]:
from sklearn.model_selection import train_test_split

# Same test fraction and seed as the baseline split.
X_train_v, X_test_v, y_train_v, y_test_v = train_test_split(X_views_only, y, random_state=42, test_size=0.2)


#Train & Log Second MLflow Run (Views Only)

Goal:
Create a second MLflow run inside the same experiment, using:

Same target (purchases)

Same train/test split

Different feature set (views only)

This unlocks true MLflow run comparison.

In [0]:
from sklearn.linear_model import LinearRegression

# Views-only estimator, created with default settings.
lr_views_only = LinearRegression()

# Fit on the views-only training split; the fitted estimator is the cell's output.
lr_views_only.fit(X=X_train_v, y=y_train_v)


In [0]:
# Score the views-only model on its held-out test split.
r2_views_only = lr_views_only.score(X_test_v, y_test_v)

# The superscript in the message was mis-encoded ("RÂ²"); it is restored to "R²".
print(f"Views-only Linear Regression R² score: {r2_views_only:.4f}")


In [0]:
import mlflow
import mlflow.sklearn

# Second run in the active experiment: same target and test size, but only `views`
# as a feature.
with mlflow.start_run(run_name="views_only_linear_regression"):
    mlflow.log_params(
        {
            "model_type": "LinearRegression",
            "features": "views",
            "target": "purchases",
            "test_size": 0.2,
        }
    )

    mlflow.log_metric(key="r2_score", value=r2_views_only)

    mlflow.sklearn.log_model(sk_model=lr_views_only, artifact_path="model")


#MLflow Run Comparison 

Go to Experiments

Open:

/Users/ram.katneni@gmail.com/day11_purchases_regression


Select both runs:

✅ baseline_linear_regression

✅ views_only_linear_regression

Click Compare

#Which model should we register?

From comparison:

Run	R²
baseline_linear_regression	0.5909 ✅
views_only_linear_regression	0.5894

👉 We will register:
baseline_linear_regression

#Register MLflow Model to Unity Catalog

Goal

Take the model artifact from your existing MLflow run

Register it as a Unity Catalog–managed model

Create Version 1 of the model

In [0]:
%sql
-- Create the schema used for the registered model, unless it already exists.
CREATE SCHEMA IF NOT EXISTS workspace.ml_models;


In [0]:
%sql
-- List the available catalogs.
SHOW CATALOGS;


In [0]:
%sql
-- List the schemas of the `workspace` catalog (presumably to confirm `ml_models` is there).
SHOW SCHEMAS IN workspace;


In [0]:
# MLFlow Run id : 07b8a7d31f774975aa7135227ad49679

In [0]:

# Point the MLflow model registry at Unity Catalog.
mlflow.set_registry_uri(uri="databricks-uc")


In [0]:
from mlflow.models import infer_signature

# Alias of the baseline estimator; the logging cell that follows refers to `model`.
model = lr_model

# The first five training rows serve as the example input stored with the model.
input_example = X_train.head(5)

# Predictions on the example give MLflow the output side of the schema.
predictions = model.predict(input_example)

# Derive the input/output schema from the example and its predictions.
signature = infer_signature(model_input=input_example, model_output=predictions)

# Display the inferred signature as the cell output.
signature


In [0]:
import mlflow
import mlflow.sklearn

# Log the baseline model again in a fresh run, this time with a signature and an
# input example. The run handle is kept so its ID can be reported below.
with mlflow.start_run(run_name="baseline_lr_with_signature_uc") as run:
    # Log metric (R² score)
    mlflow.log_metric("r2_score", r2_score)

    # Log the model with signature and input example
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
    )

    print("Model re-logged with signature ✅")
    # Print the run ID so it does not have to be looked up in the MLflow UI
    # before the model is registered.
    print(f"Run ID: {run.info.run_id}")

In [0]:
# Example: replace with your actual run ID
new_run_id = "5c448cc1f96d494b9dab21e3314c5d8c"

model_uri = f"runs:/{new_run_id}/model"

registered_model = mlflow.register_model(
    model_uri=model_uri,
    name="workspace.ml_models.purchases_lr_baseline"
)

print(f"Registered model version: {registered_model.version} âœ…")
