### Q1. Install MLflow
What's the version that you have?

In [1]:
!mlflow --version

mlflow, version 2.13.0


### Q2. Download and preprocess the data
How many files were saved to OUTPUT_FOLDER?

In [2]:
!python preprocess_data.py --raw_data_path Data_Folder --dest_path ./output

In [3]:
import os
# Count the entries in the folder that preprocess_data.py was told to write to.
output_entries = os.listdir('output')
len(output_entries)

4

### Q3. Train a model with autolog
What is the value of the `min_samples_split` parameter?

In [4]:
%%writefile train_new.py
import os
import pickle
import click
import mlflow
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Module-level setup, executed when the script is loaded: point MLflow at the
# tracking server on the loopback address and select the experiment that the
# training run is recorded under.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-train")

def load_pickle(filename: str):
    """Return the object deserialized from the pickle file at ``filename``.

    The file is opened in binary mode and closed before the result is returned.
    """
    with open(filename, mode="rb") as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object


# CLI entry point: fit a RandomForestRegressor on the preprocessed data inside
# a single MLflow run, with scikit-learn autologging switched on.
#
# data_path: directory containing train.pkl and val.pkl; each file is unpacked
#            as a (features, target) pair.
#
# Documented with comments rather than a docstring so that the text click
# prints for --help stays exactly as before.
@click.command()
@click.option(
    "--data_path",
    default="./output",
    help="Location where the processed NYC taxi trip data was saved"
)
def run_train(data_path: str):
    mlflow.sklearn.autolog()
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    with mlflow.start_run():
        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # NOTE(review): the `squared` argument is deprecated in recent
        # scikit-learn releases -- confirm the installed version accepts it.
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        # The value used to be assigned and then dropped. Log it explicitly
        # under a fixed name so the validation score can be queried from the
        # run without depending on autolog's own metric naming.
        mlflow.log_metric("rmse", rmse)


if __name__ == '__main__':
    run_train()

Overwriting train_new.py


In [5]:
!python train_new.py



In [14]:
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
# Fetch the most recently started run of the autolog training experiment and
# read the min_samples_split value recorded among its parameters.
experiment = client.get_experiment_by_name('random-forest-train')
run = client.search_runs(
    # The API documents a list of experiment ids, so pass one explicitly.
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    # Only the first row is used; make the "latest run" ordering explicit
    # instead of relying on the server's default sort.
    max_results=1,
    order_by=["attributes.start_time DESC"],
)[0]
run.data.params['min_samples_split']

'2'

### Q4. Launch the tracking server locally
In addition to backend-store-uri, what else do you need to pass to properly configure the server?

default-artifact-root

### Q5. Tune model hyperparameters
What's the best validation RMSE that you got?

In [7]:
%%writefile hpo_train.py
import os
import pickle
import click
import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Module-level setup, executed when the script is loaded: point MLflow at the
# tracking server on the loopback address and select the experiment that
# collects one run per hyperparameter trial.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename: str):
    """Read the pickle file at ``filename`` and return the unpickled object."""
    with open(filename, mode="rb") as source:
        unpickled = pickle.load(source)
    return unpickled


# CLI entry point: search RandomForestRegressor hyperparameters with hyperopt's
# TPE algorithm. Every evaluated configuration is recorded as its own MLflow
# run holding the sampled parameters and the validation RMSE.
#
# data_path:  directory containing train.pkl and val.pkl; each file is unpacked
#             as a (features, target) pair.
# num_trials: passed to hyperopt as max_evals.
#
# Documented with comments rather than a docstring so that the text click
# prints for --help stays exactly as before.
@click.command()
@click.option(
    "--data_path",
    default="./output",
    help="Location where the processed NYC taxi trip data was saved"
)
@click.option(
    "--num_trials",
    default=15,
    help="The number of parameter evaluations for the optimizer to explore"
)
def run_optimization(data_path: str, num_trials: int):
    train_features, train_target = load_pickle(os.path.join(data_path, "train.pkl"))
    val_features, val_target = load_pickle(os.path.join(data_path, "val.pkl"))

    def int_quniform(label, low, high):
        # hp.quniform with a step of 1, wrapped in scope.int so the sampled
        # value reaches the estimator as an int.
        return scope.int(hp.quniform(label, low, high, 1))

    space = {
        'max_depth': int_quniform('max_depth', 1, 20),
        'n_estimators': int_quniform('n_estimators', 10, 50),
        'min_samples_split': int_quniform('min_samples_split', 2, 10),
        'min_samples_leaf': int_quniform('min_samples_leaf', 1, 4),
        'random_state': 42
    }

    def objective(params):
        # One trial == one MLflow run: log the sampled parameters, fit, score
        # on the validation split, and hand the RMSE back to hyperopt as loss.
        with mlflow.start_run():
            mlflow.log_params(params)
            forest = RandomForestRegressor(**params)
            forest.fit(train_features, train_target)
            val_predictions = forest.predict(val_features)
            val_rmse = mean_squared_error(val_target, val_predictions, squared=False)
            mlflow.log_metric("rmse", val_rmse)
        return {'loss': val_rmse, 'status': STATUS_OK}

    fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=np.random.default_rng(42),  # fixed seed for reproducible results
    )

if __name__ == '__main__':
    run_optimization()

Overwriting hpo_train.py


In [8]:
!python hpo_train.py


  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]
  7%|6         | 1/15 [00:44<10:26, 44.76s/trial, best loss: 5.370086069268862]
 13%|#3        | 2/15 [00:47<04:18, 19.86s/trial, best loss: 5.370086069268862]
 20%|##        | 3/15 [00:50<02:28, 12.38s/trial, best loss: 5.370086069268862]
 27%|##6       | 4/15 [01:21<03:37, 19.74s/trial, best loss: 5.357490752366866]
 33%|###3      | 5/15 [01:37<03:04, 18.43s/trial, best loss: 5.357490752366866]
 40%|####      | 6/15 [02:31<04:34, 30.46s/trial, best loss: 5.354695072530291]
 47%|####6     | 7/15 [03:25<05:05, 38.23s/trial, best loss: 5.354695072530291]
 53%|#####3    | 8/15 [03:31<03:15, 27.90s/trial, best loss: 5.354695072530291]
 60%|######    | 9/15 [04:03<02:55, 29.22s/trial, best loss: 5.354695072530291]
 67%|######6   | 10/15 [04:30<02:22, 28.44s/trial, best loss: 5.354695072530291]
 73%|#######3  | 11/15 [04:50<01:43, 25.84s/trial, best loss: 5.335419588556921]
 80%|########  | 12/15 [05:07<01:09, 23.21s/trial, best loss: 

In [15]:
# Pull the hyperopt trials sorted by ascending validation RMSE and keep the
# first one, i.e. the trial with the lowest logged "rmse" metric.
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
experiment = client.get_experiment_by_name('random-forest-hyperopt')
hpo_runs_by_rmse = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=15,
    order_by=["metrics.rmse ASC"],
)
val_rmse_run = hpo_runs_by_rmse[0]

In [16]:
val_rmse_run.data.metrics['rmse']

5.335419588556921

### Q6. Promote the best model to the model registry
What is the test RMSE of the best model?

In [10]:
%%writefile register_model_updated.py
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Experiment whose runs are read to find the best hyperparameter sets.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
# Experiment under which the retrained candidate models are logged.
EXPERIMENT_NAME = "random-forest-best-models-2"
# Parameter names that train_and_log_model casts to int before building the estimator.
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

# Module-level setup, executed when the script is loaded: select the tracking
# server and target experiment, and turn on MLflow's scikit-learn autologging.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


def load_pickle(filename):
    """Open ``filename`` in binary mode and return the object pickled inside it."""
    with open(filename, mode="rb") as handle:
        contents = pickle.load(handle)
    return contents


def train_and_log_model(data_path, params):
    """Retrain a random forest with the given hyperparameters and log its scores.

    The model is fitted inside a new MLflow run, and its RMSE on the
    validation and test splits is logged as ``val_rmse`` and ``test_rmse``.

    Args:
        data_path: Directory containing ``train.pkl``, ``val.pkl`` and
            ``test.pkl``; each file is unpacked as a ``(features, target)`` pair.
        params: Mapping that holds every name listed in ``RF_PARAMS`` with a
            value ``int()`` accepts. The mapping is left unmodified and any
            other keys in it are ignored.

    Raises:
        KeyError: If ``params`` lacks one of the ``RF_PARAMS`` names.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        # Build a fresh dict instead of overwriting entries of the caller's
        # mapping, and forward only the known estimator arguments so stray
        # keys in `params` cannot reach the constructor.
        rf_params = {name: int(params[name]) for name in RF_PARAMS}

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
        mlflow.log_metric("test_rmse", test_rmse)


# CLI entry point: retrain the top_n hyperparameter-search runs (lowest "rmse"
# first), then register the run with the lowest "test_rmse" in the model
# registry under the name "random-forest-model-1".
#
# data_path: directory with the pickled train/val/test splits, forwarded to
#            train_and_log_model.
# top_n:     number of hyperparameter-search runs to retrain and compare.
#
# Fails with a click error message when the hyperparameter-search experiment
# does not exist or when there is no run to register.
#
# Documented with comments rather than a docstring so that the text click
# prints for --help stays exactly as before.
@click.command()
@click.option(
    "--data_path",
    default="./output",
    help="Location where the processed NYC taxi trip data was saved"
)
@click.option(
    "--top_n",
    default=5,
    type=int,
    help="Number of top models that need to be evaluated to decide which one to promote"
)
def run_register_model(data_path: str, top_n: int):

    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    hpo_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    if hpo_experiment is None:
        # get_experiment_by_name returns None for an unknown name; report that
        # directly instead of failing later on `.experiment_id`.
        raise click.ClickException(
            f"Experiment '{HPO_EXPERIMENT_NAME}' was not found on the tracking server."
        )
    runs = client.search_runs(
        experiment_ids=[hpo_experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE. The search has no filter, so
    # it covers every active run in the experiment, not only the ones created
    # by this invocation. Only the first row is needed.
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    best_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.test_rmse ASC"]
    )
    if not best_runs:
        raise click.ClickException(
            f"Experiment '{EXPERIMENT_NAME}' contains no runs to register."
        )
    best_run = best_runs[0]

    # Register the best model
    mlflow.register_model(model_uri=f"runs:/{best_run.info.run_id}/model", name="random-forest-model-1")

if __name__ == '__main__':
    run_register_model()

Overwriting register_model_updated.py


In [11]:
!python register_model_updated.py

Registered model 'random-forest-model-1' already exists. Creating a new version of this model...
2024/05/27 23:42:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-model-1, version 3
Created version '3' of model 'random-forest-model-1'.


In [17]:
# Sort the retrained candidate runs by ascending test RMSE and show the
# "test_rmse" metric of the first one.
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")
experiment = client.get_experiment_by_name("random-forest-best-models-2")
candidate_runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=15,
    order_by=["metrics.test_rmse ASC"],
)
best_run = candidate_runs[0]
best_run.data.metrics['test_rmse']

5.567408012462019