# Homework 2

In [1]:
!mlflow --version

mlflow, version 1.26.1


In [2]:
import pickle
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Part 1: Train a random forest with MLflow autologging

In [7]:
import mlflow
 

# Address of the MLflow tracking server (loopback interface, port 5000).
TRACKING_URI = "http://127.0.0.1:5000"

mlflow.set_tracking_uri(TRACKING_URI)
# Runs started after this call are recorded under this experiment.
mlflow.set_experiment("green-taxi-experiment")

2022/06/02 13:51:45 INFO mlflow.tracking.fluent: Experiment with name 'green-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='green-taxi-experiment', tags={}>

In [3]:
def load_pickle(filename: str):
    """Read the pickle file at ``filename`` and return the unpickled object.

    The file is opened in binary mode and is closed before the function
    returns.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files from a trusted source.
    """
    with open(filename, "rb") as pickle_file:
        loaded_object = pickle.load(pickle_file)
    return loaded_object

In [9]:
# Enable scikit-learn autologging before any estimator is fitted.
mlflow.sklearn.autolog()

data_path = "output/"
train_file = os.path.join(data_path, "train.pkl")
valid_file = os.path.join(data_path, "valid.pkl")

with mlflow.start_run():
    # Each pickle holds a (features, target) pair.
    X_train, y_train = load_pickle(train_file)
    X_valid, y_valid = load_pickle(valid_file)

    # Fixed depth and seed, so repeated executions fit the same forest.
    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)

    # squared=False turns the mean squared error into its square root (RMSE).
    y_pred = rf.predict(X_valid)
    rmse = mean_squared_error(y_valid, y_pred, squared=False)
    print(rmse)


  return pickle.load(f_in)


6.729070933590364


## Tune the hyperparameters of the model

In [4]:
import mlflow
 
# Address of the MLflow tracking server (loopback interface, port 5000).
TRACKING_URI = "http://127.0.0.1:5000"

mlflow.set_tracking_uri(TRACKING_URI)
# Hyperparameter-search trials are recorded under their own experiment.
mlflow.set_experiment("hpo-green-taxi-experiment")

2022/06/02 14:17:29 INFO mlflow.tracking.fluent: Experiment with name 'hpo-green-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='./artifacts_local/1', experiment_id='1', lifecycle_stage='active', name='hpo-green-taxi-experiment', tags={}>

In [5]:
import numpy as np 

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
def run(data_path, num_trials=50):
    """Search random-forest hyperparameters with hyperopt, logging each trial.

    Args:
        data_path: Directory that contains ``train.pkl`` and ``valid.pkl``.
        num_trials: Maximum number of evaluations handed to ``fmin``.

    Returns:
        None. Every trial is recorded as its own MLflow run with the sampled
        parameters and the validation ``rmse``.
    """
    train_file = os.path.join(data_path, "train.pkl")
    valid_file = os.path.join(data_path, "valid.pkl")
    X_train, y_train = load_pickle(train_file)
    X_valid, y_valid = load_pickle(valid_file)

    def objective(params):
        """Fit one forest with ``params`` and report its validation RMSE."""
        with mlflow.start_run():
            mlflow.set_tag("model", "random_forest_reg")
            mlflow.log_params(params)

            forest = RandomForestRegressor(**params)
            forest.fit(X_train, y_train)

            predictions = forest.predict(X_valid)
            rmse = mean_squared_error(y_valid, predictions, squared=False)
            mlflow.log_metric("rmse", rmse)

        # hyperopt minimises the value stored under 'loss'.
        return dict(loss=rmse, status=STATUS_OK)

    def int_uniform(label, low, high):
        """Integer-valued hyperparameter sampled in steps of 1 from [low, high]."""
        return scope.int(hp.quniform(label, low, high, 1))

    search_space = {
        'max_depth': int_uniform('max_depth', 1, 20),
        'n_estimators': int_uniform('n_estimators', 10, 50),
        'min_samples_split': int_uniform('min_samples_split', 2, 10),
        'min_samples_leaf': int_uniform('min_samples_leaf', 1, 4),
        'random_state': 42,
    }

    # Seeded generator so the sequence of sampled trials is reproducible.
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=np.random.default_rng(42),
    )

In [9]:
# Directory holding the pickled datasets. It stays a notebook-level variable
# because a later cell passes it on to train_and_log_model.
data_path = "output/"
# Launch the hyperparameter search with the default number of trials (50).
run(data_path)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

  return pickle.load(f_in)


100%|██████████| 50/50 [08:52<00:00, 10.65s/trial, best loss: 6.6284257482044735]


## Promote the best model to the model registry

In [10]:
import os
import pickle

import mlflow
from hyperopt import hp, space_eval
from hyperopt.pyll import scope
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Experiment that is searched for the best hyperparameter run.
HPO_EXPERIMENT_NAME = "hpo-green-taxi-experiment"
# Experiment that receives the runs created from this point on.
EXPERIMENT_NAME = "random-forest-best-models"

# Address of the MLflow tracking server (loopback interface, port 5000).
TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()

# (low, high) bounds of the integer hyperparameters, each sampled in steps of 1.
_INT_HYPERPARAMETER_BOUNDS = {
    'max_depth': (1, 20),
    'n_estimators': (10, 50),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4),
}

# Hyperopt search space that space_eval uses to turn logged parameters back
# into constructor arguments; the seed is a fixed value, not a sampled one.
SPACE = {
    name: scope.int(hp.quniform(name, low, high, 1))
    for name, (low, high) in _INT_HYPERPARAMETER_BOUNDS.items()
}
SPACE['random_state'] = 42

2022/06/02 14:52:05 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.


In [11]:
def train_and_log_model(data_path, params):
    """Retrain a random forest with ``params`` and log validation/test RMSE.

    Args:
        data_path: Directory that contains ``train.pkl``, ``valid.pkl`` and
            ``test.pkl``.
        params: Hyperparameter mapping that is resolved against ``SPACE``
            with ``space_eval`` before the estimator is built.

    Returns:
        None. The metrics ``valid_rmse`` and ``test_rmse`` are logged to a
        new MLflow run.
    """
    train_file = os.path.join(data_path, "train.pkl")
    valid_file = os.path.join(data_path, "valid.pkl")
    test_file = os.path.join(data_path, "test.pkl")

    X_train, y_train = load_pickle(train_file)
    X_valid, y_valid = load_pickle(valid_file)
    X_test, y_test = load_pickle(test_file)

    with mlflow.start_run():
        resolved_params = space_eval(SPACE, params)
        model = RandomForestRegressor(**resolved_params)
        model.fit(X_train, y_train)

        # Score the validation split first, then the test split.
        valid_predictions = model.predict(X_valid)
        valid_rmse = mean_squared_error(y_valid, valid_predictions, squared=False)
        mlflow.log_metric("valid_rmse", valid_rmse)

        test_predictions = model.predict(X_test)
        test_rmse = mean_squared_error(y_test, test_predictions, squared=False)
        mlflow.log_metric("test_rmse", test_rmse)

In [14]:
client = MlflowClient()

# Look up the hyperparameter-search experiment and fail with a clear message
# if it does not exist on this tracking server.
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(f"MLflow experiment {HPO_EXPERIMENT_NAME!r} was not found")

# Retrieve the single best HPO run (lowest logged "rmse") so that it can be
# retrained and logged to the currently active experiment.
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"]
)

# The loop variable is deliberately not named `run`: that name belongs to the
# hyperparameter-search function defined earlier in this notebook, and
# rebinding it here would break any later call to that function.
for hpo_run in runs:
    train_and_log_model(data_path=data_path, params=hpo_run.data.params)

  return pickle.load(f_in)


In [16]:
# Pick the run to register from its logged test RMSE instead of a hard-coded
# run id, which goes stale as soon as the training runs are re-created.
best_models_experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if best_models_experiment is None:
    raise ValueError(f"MLflow experiment {EXPERIMENT_NAME!r} was not found")

candidate_runs = client.search_runs(
    experiment_ids=best_models_experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.test_rmse ASC"]
)
if not candidate_runs:
    raise RuntimeError(
        f"No active runs found in experiment {EXPERIMENT_NAME!r}; "
        "train and log a model before registering one"
    )

run_id = candidate_runs[0].info.run_id
model_uri = f"runs:/{run_id}/model"
# Last expression of the cell, so the resulting ModelVersion is displayed.
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

Successfully registered model 'nyc-taxi-regressor'.
2022/06/02 15:05:01 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: nyc-taxi-regressor, version 1
Created version '1' of model 'nyc-taxi-regressor'.


<ModelVersion: creation_timestamp=1654182301918, current_stage='None', description='', last_updated_timestamp=1654182301918, name='nyc-taxi-regressor', run_id='efda5354f9c34539ac5a8bd4a41a8b75', run_link='', source='./artifacts_local/2/efda5354f9c34539ac5a8bd4a41a8b75/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>