In [1]:
# This block of code is used to add the root folder of the project to the path so that src can be imported.
import os
import sys

root_folder = os.path.dirname(os.path.abspath(""))
if not root_folder in sys.path:
    sys.path.append(root_folder)

In [2]:
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import optuna
from optuna.integration.mlflow import MLflowCallback
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

from src import create_pipeline, read_trips, process_trips

The notebooks shows how to track experiements and register model with MLflow. The noteboook consists of the following parts:

1. Tracking the experiment from a Lasso model,
2. Tuning hyperparameters with optuna MLflowCallback,
3. Extending the model to XGBoost,
4. Registering the best model in model registry.

# Load data

In [3]:
DATA_DIR = Path("../data")
MODEL_DIR = Path("../models")

trips_train = read_trips(DATA_DIR, color="green", year="2021", month="1")
trips_val = read_trips(DATA_DIR, color="green", year="2021", month="2")

trips_train = process_trips(trips_train)
trips_val = process_trips(trips_val)

target = "duration"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = categorical_cols + numerical_cols

X_train = trips_train[used_cols]
y_train = trips_train[target]

X_val = trips_val[used_cols]
y_val = trips_val[target]

Standard deviation of duration: 59.34
Fraction of the records left after dropping the outliers: 0.9658903787344154
Standard deviation of duration: 53.17
Fraction of the records left after dropping the outliers: 0.9589450535835966


# Using MLflow to track experiments

A typical way to track experiments with MLflow is to wrap training code inside a MLflow context manager. 

Psuedo code:
```python
with mlflow.start_run():
    # your training code
    # parameters, metrics, artifacts you want to log by mlflow.log_<...>()
```

In the code below, we use MLflow to track the experiment from a Lasso model.

In [5]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run():
    mlflow.set_tag("model", "lasso")
    mlflow.sklearn.autolog()

    alpha = 0.01
    pipe = create_pipeline(Lasso(alpha=alpha))
    pipe.fit(X_train, y_train)

    rmse = mean_squared_error(y_val, pipe.predict(X_val), squared=False)

    mlflow.log_metric("rmse_val", rmse)

# Tuning hyperparameters with optuna MLflowCallback

Optuna offers a MLflowCallback that can be used to track the hyperparameter tuning process with MLflow. 

Psuedo code:
```python
mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name= ...,
)

@mlflc.track_in_mlflow()
def objective(trial):
    # parameters to tune
    params = {param_name: trial.suggest_...}
    # your training code
    # objective function to minimize or maximize

study = optuna.create_study(study_name=EXPERIMENT_NAME, direction= ...)
study.optimize(objective, n_trials= ..., callbacks=[mlflc])
```

In the example, we fine-tune the hyperparameters alpha of the Lasso model.

In [7]:
mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name="rmse_val",
)

  mlflc = MLflowCallback(


In [9]:
@mlflc.track_in_mlflow()
def objective(trial: optuna.Trial) -> float:
    params = {"alpha": trial.suggest_float("alpha", 0.001, 1.0, log=True)}
    pipe = create_pipeline(Lasso(**params))
    pipe.fit(X_train, y_train)

    rmse = mean_squared_error(y_val, pipe.predict(X_val), squared=False)

    mlflow.log_params(params)
    mlflow.log_metric("rmse_val", rmse)
    return rmse


study = optuna.create_study(study_name=EXPERIMENT_NAME, direction="minimize")
study.optimize(objective, n_trials=10, gc_after_trial=True, callbacks=[mlflc])

  @mlflc.track_in_mlflow()
[32m[I 2023-06-03 12:44:42,373][0m A new study created in memory with name: nyc-taxi-experiment[0m
[32m[I 2023-06-03 12:44:45,714][0m Trial 0 finished with value: 12.212582895862711 and parameters: {'alpha': 0.7284515113077665}. Best is trial 0 with value: 12.212582895862711.[0m
[32m[I 2023-06-03 12:44:49,223][0m Trial 1 finished with value: 12.212582630408178 and parameters: {'alpha': 0.5088262461021089}. Best is trial 1 with value: 12.212582630408178.[0m
[32m[I 2023-06-03 12:44:58,127][0m Trial 2 finished with value: 10.1899072091622 and parameters: {'alpha': 0.0027013408039320433}. Best is trial 2 with value: 10.1899072091622.[0m
[32m[I 2023-06-03 12:45:04,227][0m Trial 3 finished with value: 11.43042026181242 and parameters: {'alpha': 0.016178469825871802}. Best is trial 2 with value: 10.1899072091622.[0m
[32m[I 2023-06-03 12:45:10,246][0m Trial 4 finished with value: 11.638993968649391 and parameters: {'alpha': 0.026365426607397546}. Bes

After the training, we can use `study.best_params` to reproduce the best model. The mean squared error of the best model is the smallest value in the log above.

In [10]:
# Get the best model by study.trial
best_model = create_pipeline(Lasso(**study.best_params))
best_model.fit(X_train, y_train)
mean_squared_error(y_val, best_model.predict(X_val), squared=False)

2023/06/03 12:46:26 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ae9592374f154178963e6aee05fc0cb6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


10.1899072091622

# Use xgboost as predictor

We use XGBoost as the predictor to show how to define a complex hyperparameter search space `params`.

In [22]:
@mlflc.track_in_mlflow()
def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_float("eta", 0.01, 0.4, log=True),
        "alpha": trial.suggest_float("alpha", 0.01, 5, log=True),
        "lambda": trial.suggest_float("lambda", 0.01, 5, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }
    pipe = create_pipeline(XGBRegressor(**params, random_state=42)) 
    pipe.fit(X_train, y_train)

    rmse = mean_squared_error(y_val, pipe.predict(X_val), squared=False)

    mlflow.log_params(params)
    mlflow.log_metric("rmse_val", rmse)
    return rmse


study = optuna.create_study(study_name=EXPERIMENT_NAME, direction="minimize")
study.optimize(objective, n_trials=10, gc_after_trial=True, callbacks=[mlflc])

  @mlflc.track_in_mlflow()
[32m[I 2023-06-03 13:00:28,430][0m A new study created in memory with name: nyc-taxi-experiment[0m
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1723126708454834, eval_metric=None,
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,...`
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1723126708454834, eval_metric=None,
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             lambda=4.586198333655434, learning_rate=None, max_...`
[32m[I 2023-06-03 13:00:31,221][0m Trial 0 finished with value: 6.55527410586997 and parameters: {'max_depth': 10, 'eta': 0.17231267084548

In [23]:
best_model = create_pipeline(XGBRegressor(**study.best_params, random_state=42)) 
best_model.fit(X_train, y_train)
mean_squared_error(y_val, best_model.predict(X_val), squared=False)

2023/06/03 13:01:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4f4e8b085d014fa5b7ac136e0f3d667a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1723126708454834, eval_metric=None,
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,...`
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1723126708454834, eval_metric=None,
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             lambda=4.586198333655434, learning_rate=None, max_...`


6.55527410586997

#  Register the best model to the model registry

1. We can use `search_runs()` from `MlflowClient` to get runs from an experiment by conditionally filtering on the metrics. 
2. When having the run with desired performance, we can use `create_model_version()` to register the model to the model registry.
3. We can transit the model into different stages (Staging or Production) by `transition_model_version_stage()`.
4. The model can be loaded by `mlflow.pyfunc.load_model()` from the model registry.

In [24]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
experiment_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
runs = client.search_runs(
    experiment_ids=experiment_id,
    filter_string="metrics.rmse_val < 7",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse_val ASC"],
)

In [25]:
print(
    f"Best run id: {runs[0].info.run_id}\nValid RMSE: {runs[0].data.metrics['rmse_val']}"
)

Best run id: e677ee00a99d40dea94eaee4a2ec4d6a
Valid RMSE: 6.503431247273923


In [26]:
best_run_id = runs[0].info.run_id
model_uri = f"runs:/{best_run_id}/models"
model_name = "nyc-taxi-regressor"

In [27]:
client.create_registered_model(model_name)
client.create_model_version(
    name=model_name,
    source=f"mlruns/{experiment_id}/{best_run_id}/artifacts/model",
    run_id=best_run_id,
)

2023/06/03 13:01:25 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1


<ModelVersion: aliases=[], creation_timestamp=1685790085313, current_stage='None', description=None, last_updated_timestamp=1685790085313, name='nyc-taxi-regressor', run_id='e677ee00a99d40dea94eaee4a2ec4d6a', run_link=None, source='mlruns/1/e677ee00a99d40dea94eaee4a2ec4d6a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [28]:
version = 1
client.transition_model_version_stage(
    name=model_name, version=version, stage="Staging", archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1685790085313, current_stage='Staging', description=None, last_updated_timestamp=1685790089209, name='nyc-taxi-regressor', run_id='e677ee00a99d40dea94eaee4a2ec4d6a', run_link=None, source='mlruns/1/e677ee00a99d40dea94eaee4a2ec4d6a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [29]:
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/Staging")
model.predict(X_val[:5])

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


array([17.151312 ,  7.1795797, 18.21909  , 24.356016 ,  9.369879 ],
      dtype=float32)

In [30]:
version = 1
client.transition_model_version_stage(
    name=model_name, version=version, stage="Production", archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1685790085313, current_stage='Production', description=None, last_updated_timestamp=1685790118728, name='nyc-taxi-regressor', run_id='e677ee00a99d40dea94eaee4a2ec4d6a', run_link=None, source='mlruns/1/e677ee00a99d40dea94eaee4a2ec4d6a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

# Homework

In [31]:
import os
from pathlib import Path

import mlflow
import mlflow.pyfunc
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
import optuna
from optuna.samplers import TPESampler
from optuna.integration.mlflow import MLflowCallback
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from src import read_trips, process_trips, save_model

In [32]:
DATA_DIR = Path("../data")
MODEL_DIR = Path("../models")

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-tips-experiment"

In [33]:
mlflow.__version__

'2.3.2'

In [34]:
trips_train = read_trips(DATA_DIR, color="green", year="2022", month="1")
trips_val = read_trips(DATA_DIR, color="green", year="2022", month="2")
trips_test = read_trips(DATA_DIR, color="green", year="2022", month="3")

trips_train = process_trips(trips_train)
trips_val = process_trips(trips_val)
trips_test = process_trips(trips_test)

Standard deviation of duration: 78.22
Fraction of the records left after dropping the outliers: 0.9537242979438355
Standard deviation of duration: 78.88
Fraction of the records left after dropping the outliers: 0.9524200636896786
Standard deviation of duration: 78.87
Fraction of the records left after dropping the outliers: 0.948686606312948


In [35]:
# In the homework, we need to predict the tips amount.
target = "tip_amount"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = categorical_cols + numerical_cols

dv = DictVectorizer()
X_hw = dv.fit_transform(trips_train[used_cols].to_dict(orient="records"))

save_model(MODEL_DIR, "dv.pkl", dv)
os.path.getsize(MODEL_DIR / "dv.pkl")

153660

In [39]:
X_train, y_train = trips_train[used_cols], trips_train[target]
X_val, y_val = trips_val[used_cols], trips_val[target]
X_test, y_test = trips_test[used_cols], trips_test[target]

In [40]:
mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name="rmse_val",
)


@mlflc.track_in_mlflow()
def objective(trial: optuna.Trial) -> float:
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 50, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 20, 1),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10, 1),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 4, 1),
        "random_state": 42,
        "n_jobs": -1,
    }
    pipe = create_pipeline(RandomForestRegressor(**params))
    pipe.fit(X_train, y_train)
    rmse = mean_squared_error(y_val, pipe.predict(X_val), squared=False)

    mlflow.log_params(params)
    mlflow.log_metric("rmse_val", rmse)
    return rmse


study = optuna.create_study(
    study_name=EXPERIMENT_NAME, direction="minimize", sampler=TPESampler(seed=42)
)
study.optimize(objective, n_trials=10, gc_after_trial=True, callbacks=[mlflc])

  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow()
[32m[I 2023-06-03 13:04:45,760][0m A new study created in memory with name: nyc-taxi-tips-experiment[0m
[32m[I 2023-06-03 13:04:47,934][0m Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-06-03 13:04:49,642][0m Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-06-03 13:04:51,863][0m Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.[0m
[32m[I 2023-06-03 13:04:53,727][0m Trial 3 finished with value: 2.460983516558473 and parameters: {'n_estimators': 44, 'max_dep

In [41]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
experiment_id = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
runs = client.search_runs(
    experiment_ids=experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse_val ASC"],
)

In [42]:
for run in runs:
    rf_params = [
        "n_estimators",
        "max_depth",
        "min_samples_split",
        "min_samples_leaf",
        "random_state",
        "n_jobs",
    ]
    params = {k: int(v) for k, v in run.data.params.items() if k in rf_params}
    pipe = create_pipeline(RandomForestRegressor(**params))
    pipe.fit(X_train, y_train)
    rmse_test = mean_squared_error(y_test, pipe.predict(X_test), squared=False)

    print(f"Run id: {run.info.run_id}. Test RMSE: {rmse_test:.3f}")

2023/06/03 13:10:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6db9398c5bd445e381f54e02858cff62', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/06/03 13:10:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd3bd7eb7523442088f666388651d650d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: 04376d6f198d46b387ba3e488696ede4. Test RMSE: 2.285


2023/06/03 13:10:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '168c5210f50f4188b31ba305c603d437', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: 8919149a24f946fdb27758e108f7a35a. Test RMSE: 2.295


2023/06/03 13:10:40 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4e79d1c853d041c8bae17bccf03a16f4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: cfecc5956d3a4ed396f01484a80123f4. Test RMSE: 2.292


2023/06/03 13:10:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a58a82cc66da4f36b3232f0ea078669f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Run id: b6a1eb99cdf3429f89d03c29011bb58f. Test RMSE: 2.299
Run id: 1f9550f9bcb94d7e8654f321b4f39ae9. Test RMSE: 2.291
