In [4]:
import os
import pickle
import click
import mlflow

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs']


MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("random-forest-hyperopt")
mlflow.sklearn.autolog()

def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)
    

def train_and_log_model(data_path, params):
    xtr, ytr = load_pickle(os.path.join(data_path, "train.pkl"))
    xvl, yvl = load_pickle(os.path.join(data_path, "val.pkl"))
    xts, yts = load_pickle(os.path.join(data_path, "test.pkl"))
    
    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])

        rf = RandomForestRegressor(**params)
        rf.fit(xtr, ytr)
        
        val_rmse = mean_squared_error(yvl, rf.predict(xvl), squared=False)
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(yts, rf.predict(xts), squared=False) 
        mlflow.log_metric("test_rmse", test_rmse)

In [34]:
top_n = 5
data_path = "output/"

client = MlflowClient()
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=top_n,
    order_by=["metrics.rmse ASC"]
)
for run in runs:
    train_and_log_model(data_path, run.data.params)

experiment = client.get_experiment_by_name(EXPERIMENT_NAME)

best_run = runs[0]
run_id = best_run.info.run_id
model_uri = f"runs:/{run_id}/model"
registered_model = mlflow.register_model(model_uri, "BestModel")
print(f"Registered model: {registered_model.name} (version {registered_model.version})")


Successfully registered model 'BestModel'.
2023/05/19 15:42:09 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: BestModel, version 1


Registered model: BestModel (version 1)


Created version '1' of model 'BestModel'.


{'rmse': 2.449827329704216}

In [None]:
best_run = client.search_runs( ...  )[0]

In [29]:
client.search_runs(experiment_ids="ff69fbe331b545da8fabe6e6a01e3d52")

[]

In [28]:
runs

[<Run: data=<RunData: metrics={'rmse': 2.449827329704216}, params={'max_depth': 15,
  'min_samples_leaf': 4,
  'min_samples_split': 2,
  'n_estimators': 34,
  'n_jobs': -1,
  'random_state': 42}, tags={'mlflow.parentRunId': 'ff69fbe331b545da8fabe6e6a01e3d52',
  'mlflow.runName': 'enchanting-goat-162',
  'mlflow.source.git.commit': 'b9c7dce564cafb5a24e39eb21f5b9c048a6e0834',
  'mlflow.source.name': 'hpo.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'hilbert'}>, info=<RunInfo: artifact_uri='/home/hilbert/zoomcamps/experiment-tracking/mlruns/2/112d51d2f5904aac9642105ca7d7b1b1/artifacts', end_time=1684438918860, experiment_id='2', lifecycle_stage='active', run_id='112d51d2f5904aac9642105ca7d7b1b1', run_name='enchanting-goat-162', run_uuid='112d51d2f5904aac9642105ca7d7b1b1', start_time=1684438918114, status='FINISHED', user_id='hilbert'>>,
 <Run: data=<RunData: metrics={'rmse': 2.451379690825458}, params={'max_depth': 20,
  'min_samples_leaf': 3,
  'min_samples_split': 8,
  'n_esti

In [27]:
runs[2].to_dictionary()["info"]["run_id"]

'7f96f97ce25a42efa16393bb70a1d5dd'