## Imports

In [31]:
# python imports
import os

# third-party imports
import mlflow

from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd

## Config Variables

In [32]:
# --- MLflow server and experiment ---
# Address of the MLflow tracking server used throughout the notebook
TRACKING_URI = 'http://127.0.0.1:8080'
# Experiment that groups the runs, and the name given to the training run
EXPERIMENT_NAME = 'Housing_Models'
RUN_NAME = 'housing_gb_test'

# --- Model artifacts and registry ---
# Path under the run's artifacts where the fitted model is stored
ARTIFACT_PATH = 'gb_housing'
# Registered model name and the alias assigned to its newest version
REGISTERED_MODEL_NAME = 'housing_prod'
MODEL_ALIAS = 'prod'

## Data Preparation

In [33]:
# Load California housing dataset
data = datasets.fetch_california_housing()

# Turn the continuous target into 3 classes so this becomes a classification task:
#   0 -> target < 1.5 (low), 1 -> 1.5 <= target < 3.0 (medium), 2 -> target >= 3.0 (high)
y = np.digitize(data.target, bins=[1.5, 3.0])

# Split the raw features BEFORE scaling so that no statistic of the test rows
# leaks into preprocessing. The fixed random_state keeps the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data.data, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to both subsets
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling, kept for any later cell that reads `X`
X = scaler.transform(data.data)

# Create Pandas DataFrames for logging (features plus the class label)
train_df = pd.DataFrame(X_train, columns=data.feature_names)
train_df['label'] = y_train

eval_df = pd.DataFrame(X_test, columns=data.feature_names)
eval_df['label'] = y_test

## Connecting to MLflow

In [35]:
# Point the MLflow client library at the tracking server
mlflow.set_tracking_uri(uri=TRACKING_URI)

# Make EXPERIMENT_NAME the active experiment and keep the Experiment metadata it returns
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

2024/09/24 16:21:45 INFO mlflow.tracking.fluent: Experiment with name 'Housing_Models' does not exist. Creating a new experiment.


## Running Experiment

In [36]:
# Train, log, and evaluate the classifier inside a single MLflow run
with mlflow.start_run(run_name=RUN_NAME, log_system_metrics=True) as run:

    # Log the training dataset
    mlflow.log_input(mlflow.data.from_pandas(train_df), context='training')

    params = {
        'n_estimators': 500,
        'learning_rate': 0.1,
        'max_depth': 5,
        # Fixed seed so repeated runs of this cell train the same model
        'random_state': 42,
    }

    # Log the parameters up front so they are recorded even if training fails
    mlflow.log_params(params)

    # Initialize the Gradient Boosting Classifier
    clf = GradientBoostingClassifier(**params)

    # Fit the classifier on the training subset
    clf.fit(X_train, y_train)

    # Predict the price class for each row of the test subset
    y_pred = clf.predict(X_test)

    # Add predictions to the evaluation DataFrame
    eval_df['predictions'] = y_pred

    # Create the PandasDataset for use in mlflow evaluate
    pd_dataset = mlflow.data.from_pandas(eval_df, predictions='predictions', targets='label')

    # Log the trained model for later use; a few rows are enough as the input
    # example, so the whole test matrix is not stored alongside the model
    mlflow.sklearn.log_model(
        sk_model=clf, input_example=X_test[:5], artifact_path=ARTIFACT_PATH
    )

    # Execute evaluation on the precomputed predictions
    mlflow.evaluate(data=pd_dataset, model_type='classifier')

2024/09/24 16:21:45 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2024/09/24 16:22:55 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as multiclass dataset, number of classes is inferred as 3. If this is incorrect, please specify the `label_list` parameter in `evaluator_config`.
2024/09/24 16:22:55 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/09/24 16:22:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run housing_gb_test at: http://127.0.0.1:8080/#/experiments/942542597364770131/runs/4f9bbef2dd7a414cb3f6bb790e913b59.
2024/09/24 16:22:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:8080/#/experiments/942542597364770131.
2024/09/24 16:22:55 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/09/24 16:22:55 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!

## Creating a New Model and Registering the Run

In [37]:
# Low-level client bound to the same MLflow tracking server
client = mlflow.MlflowClient(tracking_uri=TRACKING_URI)

In [38]:
# Look the experiment up by name to obtain its ID
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = experiment.experiment_id

# Search that experiment for runs whose run-name tag equals RUN_NAME
run_name_filter = f"tags.mlflow.runName='{RUN_NAME}'"
filtered_runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string=run_name_filter,
)

In [49]:
# Take the first matching run and build the URI of the model logged under it.
# NOTE(review): presumably the most recent match, since search_runs is documented
# to sort by start time descending by default — confirm if several runs share the name.
run = filtered_runs[0]
run_id = run.info.run_id
model_uri = 'runs:/{}/{}'.format(run_id, ARTIFACT_PATH)

In [50]:
# Create the registered model, tolerating the case where it already exists
# so that this cell can be re-run without failing
try:
    client.create_registered_model(REGISTERED_MODEL_NAME)
except mlflow.exceptions.MlflowException as exc:
    # Only "name already registered" is expected here; surface anything else
    if exc.error_code != 'RESOURCE_ALREADY_EXISTS':
        raise

# Register the run's logged model as a new version of the registered model
model_version = mlflow.register_model(model_uri, REGISTERED_MODEL_NAME)

# Display the resulting ModelVersion as the cell output
model_version

Registered model 'housing_prod' already exists. Creating a new version of this model...
2024/09/24 16:28:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: housing_prod, version 1
Created version '1' of model 'housing_prod'.


<ModelVersion: aliases=[], creation_timestamp=1727206092405, current_stage='None', description='', last_updated_timestamp=1727206092405, name='housing_prod', run_id='4f9bbef2dd7a414cb3f6bb790e913b59', run_link='', source='mlflow-artifacts:/942542597364770131/4f9bbef2dd7a414cb3f6bb790e913b59/artifacts/gb_housing', status='READY', status_message='', tags={}, user_id='', version='1'>

## Set an Alias to the Registered Model

In [60]:
# Get the registered model details
registered_model = client.get_registered_model(REGISTERED_MODEL_NAME)

# Pick the newest version. Version numbers are returned as strings, so they are
# compared as integers; a plain string comparison would rank '9' above '10'.
latest_version = max(registered_model.latest_versions, key=lambda mv: int(mv.version))

# Point the alias at that version
client.set_registered_model_alias(
    name=REGISTERED_MODEL_NAME,
    alias=MODEL_ALIAS,
    version=latest_version.version,
)