In [1]:
# This block of code is used to add the root folder of the project to the path so that src can be imported.
import os
import sys

# Resolve the parent of the current working directory and make it importable,
# adding it to sys.path only if it is not already there.
root_folder = os.path.dirname(os.path.abspath(""))
if root_folder not in sys.path:
    sys.path.append(root_folder)

In [2]:
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import optuna
from optuna.integration.mlflow import MLflowCallback
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

from src import read_trips, process_trips

  from .autonotebook import tqdm as notebook_tqdm


This notebook shows how to track experiments and register a model with MLflow. The notebook consists of the following parts:

1. Tracking the experiment from a Lasso model,
2. Tuning hyperparameters with optuna MLflowCallback,
3. Extending the model to XGBoost,
4. Registering the best model in model registry.

# Load data

In [4]:
# Relative locations of the trip data and of the persisted models.
DATA_DIR = Path("../data")
MODEL_DIR = Path("../models")

# January 2021 is used for training, February 2021 for validation.
trips_train = read_trips(DATA_DIR, color="green", year="2021", month="1")
trips_val = read_trips(DATA_DIR, color="green", year="2021", month="2")
trips_train, trips_val = process_trips(trips_train), process_trips(trips_val)

target = "duration"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = [*categorical_cols, *numerical_cols]

# Feature rows become a list of dicts (one per trip), the input form that
# DictVectorizer consumes; the target is taken as a single column.
X_train, y_train = trips_train[used_cols].to_dict(orient="records"), trips_train[target]
X_val, y_val = trips_val[used_cols].to_dict(orient="records"), trips_val[target]

Standard deviation of duration: 59.34
Fraction of the records left after dropping the outliers: 0.9658903787344154
Standard deviation of duration: 53.17
Fraction of the records left after dropping the outliers: 0.9589450535835966


# Using MLflow to track experiments

A typical way to track experiments with MLflow is to wrap the training code inside an MLflow context manager.

Pseudocode:
```python
with mlflow.start_run():
    # your training code
    # parameters, metrics, artifacts you want to log by mlflow.log_<...>()
```

In the code below, we use MLflow to track the experiment from a Lasso model.

In [37]:
# Tracking store (a local SQLite file) and the experiment the runs are filed under.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Everything logged inside the context manager belongs to one MLflow run.
with mlflow.start_run():
    mlflow.set_tag("model", "lasso")
    # Turn on scikit-learn autologging (hyperparameters, metrics, model artifacts).
    mlflow.sklearn.autolog()

    alpha = 0.01
    pipe = Pipeline(
        steps=[
            ("vectorizer", DictVectorizer()),
            ("predictor", Lasso(alpha=alpha)),
        ]
    )
    pipe.fit(X_train, y_train)

    # squared=False makes mean_squared_error return the RMSE.
    y_pred = pipe.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)

    mlflow.log_metric("rmse_val", rmse)



# Tuning hyperparameters with optuna MLflowCallback

Optuna offers an MLflowCallback that can be used to track the hyperparameter tuning process with MLflow.

Pseudocode:
```python
mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name= ...,
)

@mlflc.track_in_mlflow()
def objective(trial):
    # parameters to tune
    params = {param_name: trial.suggest_...}
    # your training code
    # objective function to minimize or maximize

study = optuna.create_study(study_name=EXPERIMENT_NAME, direction= ...)
study.optimize(objective, n_trials= ..., callbacks=[mlflc])
```

In the example, we tune the hyperparameter `alpha` of the Lasso model.

In [7]:
# Optuna callback that reports trials to the MLflow tracking store; the
# objective value is recorded under the metric name "rmse_val".
mlflc = MLflowCallback(metric_name="rmse_val", tracking_uri=MLFLOW_TRACKING_URI)

  mlflc = MLflowCallback(


In [8]:
@mlflc.track_in_mlflow()
def objective(trial: optuna.Trial) -> float:
    """Fit a Lasso pipeline for one Optuna trial and return its validation RMSE.

    ``alpha`` is sampled on a log scale from [0.001, 1.0]. The sampled
    parameters and the resulting ``rmse_val`` are logged to MLflow.
    """
    params = {"alpha": trial.suggest_float("alpha", 0.001, 1.0, log=True)}

    steps = [
        ("vectorizer", DictVectorizer()),
        ("predictor", Lasso(params["alpha"])),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)

    # squared=False makes mean_squared_error return the RMSE.
    predictions = model.predict(X_val)
    rmse = mean_squared_error(y_val, predictions, squared=False)

    mlflow.log_params(params)
    mlflow.log_metric("rmse_val", rmse)
    return rmse


# Seed the sampler so that re-running the notebook explores the same sequence
# of hyperparameters (the homework study further down is seeded the same way).
study = optuna.create_study(
    study_name=EXPERIMENT_NAME,
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 10 trials; the callback reports each finished trial to MLflow.
study.optimize(objective, n_trials=10, gc_after_trial=True, callbacks=[mlflc])

  @mlflc.track_in_mlflow()
[32m[I 2023-05-31 10:35:24,896][0m A new study created in memory with name: nyc-taxi-experiment[0m
[32m[I 2023-05-31 10:35:31,792][0m Trial 0 finished with value: 11.14187904852327 and parameters: {'alpha': 0.009607348023831131}. Best is trial 0 with value: 11.14187904852327.[0m
[32m[I 2023-05-31 10:35:38,595][0m Trial 1 finished with value: 11.220389039674325 and parameters: {'alpha': 0.010914376419154865}. Best is trial 0 with value: 11.14187904852327.[0m
[32m[I 2023-05-31 10:35:47,125][0m Trial 2 finished with value: 9.96275277305971 and parameters: {'alpha': 0.002110970838651835}. Best is trial 2 with value: 9.96275277305971.[0m
[32m[I 2023-05-31 10:35:54,078][0m Trial 3 finished with value: 11.294400030590431 and parameters: {'alpha': 0.012431185972786082}. Best is trial 2 with value: 9.96275277305971.[0m
[32m[I 2023-05-31 10:36:01,689][0m Trial 4 finished with value: 10.51171541101469 and parameters: {'alpha': 0.003969955200542646}. Bes

After the training, we can use `study.best_params` to reproduce the best model.

In [20]:
# Rebuild the pipeline with the best hyperparameters found by the study,
# refit it on the training data and display its validation RMSE.
best_model = Pipeline(
    steps=[
        ("vectorizer", DictVectorizer()),
        ("predictor", Lasso(**study.best_params)),
    ]
)
best_model.fit(X_train, y_train)
mean_squared_error(y_val, best_model.predict(X_val), squared=False)

2023/05/31 10:28:17 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1bc9f293ad1b40d8bac8321930690c5a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


9.721854747360974

# Use xgboost as predictor

We use XGBoost as the predictor to show how to define a complex hyperparameter search space `params`.

In [11]:
@mlflc.track_in_mlflow()
def objective(trial):
    """Fit an XGBoost pipeline for one Optuna trial and return its validation RMSE.

    Five hyperparameters are sampled per trial: ``max_depth``, ``eta``,
    ``alpha``, ``lambda`` and ``min_child_weight``. The sampled parameters and
    the resulting ``rmse_val`` are logged to MLflow.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_float("eta", 0.01, 0.4),
        "alpha": trial.suggest_float("alpha", 0, 5),
        "lambda": trial.suggest_float("lambda", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    steps = [
        ("vectorizer", DictVectorizer()),
        ("predictor", XGBRegressor(**params, random_state=42)),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)

    # squared=False makes mean_squared_error return the RMSE.
    predictions = model.predict(X_val)
    rmse = mean_squared_error(y_val, predictions, squared=False)

    mlflow.log_params(params)
    mlflow.log_metric("rmse_val", rmse)
    return rmse


# Seed the sampler so that re-running the notebook explores the same sequence
# of hyperparameters (the homework study further down is seeded the same way).
study = optuna.create_study(
    study_name=EXPERIMENT_NAME,
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 10 trials; the callback reports each finished trial to MLflow.
study.optimize(objective, n_trials=10, gc_after_trial=True, callbacks=[mlflc])

  @mlflc.track_in_mlflow()
[32m[I 2023-05-31 10:40:45,890][0m A new study created in memory with name: nyc-taxi-experiment[0m
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1194864592193448, eval_metric=None,
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             lamb...`
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1194864592193448, eval_metric=None,
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             lambda=0.27635683234561115, learning_rate=None, max_b...`
[32m[I 2023-05-31 10:40:48,529][0m Trial 0 finished 

In [12]:
# Refit an XGBoost pipeline with the study's best hyperparameters and display
# its validation RMSE.
best_model = Pipeline(
    steps=[
        ("vectorizer", DictVectorizer()),
        ("predictor", XGBRegressor(random_state=42, **study.best_params)),
    ]
)
best_model.fit(X_train, y_train)
mean_squared_error(y_val, best_model.predict(X_val), squared=False)

2023/05/31 10:41:19 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f47bcd3bb74b46e89dc4f52466a69858', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.36541733187362263,
             eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, lam...`
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.36541733187362263,
             eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
             grow_policy=None, importance_type=None,
             interacti

6.51201813135006

#  Register the best model to the model registry

1. We can use `search_runs()` from `MlflowClient` to get runs from an experiment by conditionally filtering on the metrics. 
2. Once we have a run with the desired performance, we can use `create_model_version()` to register the model in the model registry.
3. We can transition the model to a different stage (Staging or Production) with `transition_model_version_stage()`.
4. The model can be loaded by `mlflow.pyfunc.load_model()` from the model registry.

In [13]:
# Fetch up to five active runs of the experiment whose validation RMSE is
# below 7, ordered from the lowest RMSE upwards.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
experiment_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
runs = client.search_runs(
    # `experiment_ids` is documented as a list of experiment IDs, so wrap the
    # single ID instead of passing a bare string.
    experiment_ids=[experiment_id],
    filter_string="metrics.rmse_val < 7",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse_val ASC"],
)

In [14]:
# `runs` is ordered by ascending rmse_val, so the first entry is the best run.
best_run_summary = (
    f"Best run id: {runs[0].info.run_id}\nValid RMSE: {runs[0].data.metrics['rmse_val']}"
)
print(best_run_summary)

Best run id: 654b2a6ecc1446628c120c7764f23945
Valid RMSE: 6.51201813135006


In [15]:
best_run_id = runs[0].info.run_id
# The run's model artifact is stored under "model" (the same directory the
# registration source path in the next cell points at), not "models".
model_uri = f"runs:/{best_run_id}/model"
model_name = "nyc-taxi-regressor"

In [16]:
# Create the registry entry, then attach the best run's model artifact to it
# as a new model version (displayed as the cell output).
client.create_registered_model(name=model_name)
client.create_model_version(
    run_id=best_run_id,
    source=f"mlruns/{experiment_id}/{best_run_id}/artifacts/model",
    name=model_name,
)

2023/05/31 10:41:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1


<ModelVersion: aliases=[], creation_timestamp=1685522494986, current_stage='None', description=None, last_updated_timestamp=1685522494986, name='nyc-taxi-regressor', run_id='654b2a6ecc1446628c120c7764f23945', run_link=None, source='mlruns/1/654b2a6ecc1446628c120c7764f23945/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [17]:
# Move version 1 of the registered model to the "Staging" stage, archiving
# whatever version currently occupies that stage.
version = 1
client.transition_model_version_stage(
    stage="Staging",
    archive_existing_versions=True,
    name=model_name,
    version=version,
)

<ModelVersion: aliases=[], creation_timestamp=1685522494986, current_stage='Staging', description=None, last_updated_timestamp=1685522496130, name='nyc-taxi-regressor', run_id='654b2a6ecc1446628c120c7764f23945', run_link=None, source='mlruns/1/654b2a6ecc1446628c120c7764f23945/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [18]:
# Load the model currently in the "Staging" stage from the registry and
# predict on the first five validation records.
model = mlflow.pyfunc.load_model(f"models:/{model_name}/Staging")
model.predict(X_val[:5])

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


array([16.529718 ,  7.2243066, 19.936237 , 24.505241 ,  9.864914 ],
      dtype=float32)

In [17]:
# Move version 1 of the registered model to the "Production" stage, archiving
# whatever version currently occupies that stage.
version = 1
client.transition_model_version_stage(
    stage="Production",
    archive_existing_versions=True,
    name=model_name,
    version=version,
)

<ModelVersion: aliases=[], creation_timestamp=1685519853910, current_stage='Production', description=None, last_updated_timestamp=1685519853982, name='nyc-taxi-regressor', run_id='fae3ee0586ac44b59ee7cd27fc84865b', run_link=None, source='mlruns/1/fae3ee0586ac44b59ee7cd27fc84865b/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

# Homework

In [19]:
import os
from pathlib import Path

import mlflow
import mlflow.pyfunc
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import optuna
from optuna.samplers import TPESampler
from optuna.integration.mlflow import MLflowCallback
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from src import read_trips, process_trips, save_model

In [20]:
# Relative locations of the trip data and of the persisted models.
DATA_DIR = Path("../data")
MODEL_DIR = Path("../models")

# MLflow tracking store (local SQLite file) and the experiment name used for
# the homework runs.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-tips-experiment"

In [21]:
# Display the installed MLflow version.
mlflow.__version__

'2.3.2'

In [22]:
# Green-taxi trips for 2022: January (train), February (validation), March (test).
trips_train = read_trips(DATA_DIR, color="green", year="2022", month="1")
trips_val = read_trips(DATA_DIR, color="green", year="2022", month="2")
trips_test = read_trips(DATA_DIR, color="green", year="2022", month="3")

# Process the three splits in the same order as they were read.
trips_train, trips_val, trips_test = (
    process_trips(trips_train),
    process_trips(trips_val),
    process_trips(trips_test),
)

Standard deviation of duration: 78.22
Fraction of the records left after dropping the outliers: 0.9537242979438355
Standard deviation of duration: 78.88
Fraction of the records left after dropping the outliers: 0.9524200636896786
Standard deviation of duration: 78.87
Fraction of the records left after dropping the outliers: 0.948686606312948


In [23]:
# In the homework, we need to predict the tips amount.
target = "tip_amount"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = categorical_cols + numerical_cols

dv = DictVectorizer()
X_hw = dv.fit_transform(trips_train[used_cols].to_dict(orient="records"))

save_model(MODEL_DIR, "dv.pkl", dv)
os.path.getsize(MODEL_DIR / "dv.pkl")

153660

In [24]:
# Each split becomes a list of feature dicts (one per trip) plus the matching
# target column.
X_train, y_train = trips_train[used_cols].to_dict(orient="records"), trips_train[target]
X_val, y_val = trips_val[used_cols].to_dict(orient="records"), trips_val[target]
X_test, y_test = trips_test[used_cols].to_dict(orient="records"), trips_test[target]

In [25]:
# Optuna callback that reports trials to the MLflow tracking store; the
# objective value is recorded under the metric name "rmse_val".
mlflc = MLflowCallback(metric_name="rmse_val", tracking_uri=MLFLOW_TRACKING_URI)


@mlflc.track_in_mlflow()
def objective(trial: optuna.Trial) -> float:
    """Fit a random-forest pipeline for one Optuna trial and return its validation RMSE.

    Four integer hyperparameters are sampled per trial (``n_estimators``,
    ``max_depth``, ``min_samples_split``, ``min_samples_leaf``);
    ``random_state`` and ``n_jobs`` are fixed. All six values and the resulting
    ``rmse_val`` are logged to MLflow.
    """
    params = {
        # `step` is keyword-only in the current suggest_int signature; passing
        # it positionally is deprecated.
        "n_estimators": trial.suggest_int("n_estimators", 10, 50, step=1),
        "max_depth": trial.suggest_int("max_depth", 1, 20, step=1),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10, step=1),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4, step=1),
        "random_state": 42,
        "n_jobs": -1,
    }
    pipe = Pipeline(
        [
            ("vectorizer", DictVectorizer()),
            ("predictor", RandomForestRegressor(**params)),
        ]
    )
    pipe.fit(X_train, y_train)
    # squared=False makes mean_squared_error return the RMSE.
    rmse = mean_squared_error(y_val, pipe.predict(X_val), squared=False)

    mlflow.log_params(params)
    mlflow.log_metric("rmse_val", rmse)
    return rmse


# Seeded TPE sampler, so the sequence of sampled hyperparameters is repeatable.
study = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction="minimize",
    study_name=EXPERIMENT_NAME,
)
# Run 10 trials; the callback reports each finished trial to MLflow.
study.optimize(objective, callbacks=[mlflc], n_trials=10, gc_after_trial=True)

  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow()
[32m[I 2023-05-31 10:44:46,294][0m A new study created in memory with name: nyc-taxi-tips-experiment[0m
2023/05/31 10:44:46 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-tips-experiment' does not exist. Creating a new experiment.
[32m[I 2023-05-31 10:44:48,297][0m Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-31 10:44:49,879][0m Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-31 10:44:51,897][0m Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.4498273297

In [26]:
# Fetch the five active runs of the homework experiment with the lowest
# validation RMSE, best first.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
experiment_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
runs = client.search_runs(
    # `experiment_ids` is documented as a list of experiment IDs, so wrap the
    # single ID instead of passing a bare string.
    experiment_ids=[experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse_val ASC"],
)

In [35]:
# Names of the RandomForestRegressor arguments to recover from each run's
# logged parameters. Defined once, outside the loop, because it never changes.
rf_params = [
    "n_estimators",
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "random_state",
    "n_jobs",
]

# Retrain a model with each selected run's hyperparameters and report its RMSE
# on the test split.
# NOTE(review): the cell output shows an "MLflow autologging run" being created
# for these fits, so they add extra runs to the tracking store; confirm whether
# that is intended.
for run in runs:
    # Keep only the forest arguments and convert the logged values to int.
    params = {k: int(v) for k, v in run.data.params.items() if k in rf_params}
    pipe = Pipeline(
        [
            ("vectorizer", DictVectorizer()),
            ("predictor", RandomForestRegressor(**params)),
        ]
    )
    pipe.fit(X_train, y_train)
    rmse_test = mean_squared_error(y_test, pipe.predict(X_test), squared=False)

    print(f"Run id: {run.info.run_id}. Test RMSE: {rmse_test:.3f}")

2023/05/31 10:52:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'dc8b32cd83cf4e1ea41c6b58f2af9a2f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/05/31 10:52:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '561b059dc91548dda1f09cc2598f1c53', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: 83d021c801a64742aa49d68719d239c6. Test RMSE: 2.285


2023/05/31 10:52:47 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'fd537c0b93f84fcba5f3aa6161e9547d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: 3373d0cf4d72403ab8308af5c242612b. Test RMSE: 2.295


2023/05/31 10:52:49 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2c0061dcfdc243889a11d36c89c3afe7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: bd41940d56a64118bc2d12b73bdf409e. Test RMSE: 2.292


2023/05/31 10:52:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'feb886f44686446390db77b6e3588c80', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: 25b4e5b8fd1442b3b9ca3a6a7c03b632. Test RMSE: 2.299




Run id: b99be5b02ec94530b38add40d4849ca6. Test RMSE: 2.291
