In [None]:
import os
import sys

root_folder = os.path.dirname(os.path.abspath(""))
if not root_folder in sys.path:
    sys.path.append(root_folder)

In [None]:
from pathlib import Path

import mlflow
import optuna
from optuna.integration.mlflow import MLflowCallback
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

from src import read_trips, process_trips

In [None]:
DATA_DIR = Path("../data")
MODEL_DIR = Path("../models")

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

In [None]:
trips_train = read_trips(DATA_DIR, color="green", year="2021", month="1")
trips_val = read_trips(DATA_DIR, color="green", year="2021", month="2")

trips_train = process_trips(trips_train)
trips_val = process_trips(trips_val)

target = "duration"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = categorical_cols + numerical_cols

X_train = trips_train[used_cols].to_dict(orient="records")
y_train = trips_train[target]

X_val = trips_val[used_cols].to_dict(orient="records")
y_val = trips_val[target]

# Use MLflow to track the experiment

In [None]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run():
    mlflow.set_tag("model", "lasso")
    mlflow.sklearn.autolog()

    pipe = Pipeline([("vectorizer", DictVectorizer()), ("predictor", Lasso(0.01))])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse_val", rmse)

# Hyperparamer Tuning

In [None]:
mlflc = MLflowCallback(
    tracking_uri=MLFLOW_TRACKING_URI,
    metric_name="rmse_val",
)

In [None]:
@mlflc.track_in_mlflow()
def objective(trial: optuna.Trial) -> float:
    params = {"alpha": trial.suggest_float("alpha", 0.001, 1.0, log=True)}
    pipe = Pipeline(
        [("vectorizer", DictVectorizer()), ("predictor", Lasso(params["alpha"]))]
    )
    pipe.fit(X_train, y_train)

    rmse = mean_squared_error(y_val, pipe.predict(X_val), squared=False)

    mlflow.sklearn.autolog()
    return rmse


study = optuna.create_study(study_name=EXPERIMENT_NAME, direction="minimize")
study.optimize(objective, n_trials=10, gc_after_trial=True, callbacks=[mlflc])

In [None]:
# Get the best model by study.trial
best_model = Pipeline(
    [("vectorizer", DictVectorizer()), ("predictor", Lasso(**study.best_params))]
)
best_model.fit(X_train, y_train)
mean_squared_error(y_val, best_model.predict(X_val), squared=False)

# Use xgboost as predictor

In [32]:
@mlflc.track_in_mlflow()
def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_float("eta", 0.01, 0.4),
        "alpha": trial.suggest_float("alpha", 0, 5),
        "lambda": trial.suggest_float("lambda", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }
    pipe = Pipeline(
        [
            ("vectorizer", DictVectorizer()),
            ("predictor", XGBRegressor(**params, random_state=42)),
        ]
    )
    pipe.fit(X_train, y_train)

    rmse = mean_squared_error(y_val, pipe.predict(X_val), squared=False)
    mlflow.sklearn.autolog()
    return rmse


study = optuna.create_study(study_name=EXPERIMENT_NAME, direction="minimize")
study.optimize(objective, n_trials=10, gc_after_trial=True, callbacks=[mlflc])

  @mlflc.track_in_mlflow()
[32m[I 2023-05-29 13:54:41,352][0m A new study created in memory with name: nyc-taxi-experiment[0m
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.08915661915738007,
             eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, la...`
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.08915661915738007,
             eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=2.515744074947076,
             learning_rat...`
[32m[I 2023-05-29 13:54:43,868][0m Trial 0 finished wi

In [33]:
best_model = Pipeline(
    [
        ("vectorizer", DictVectorizer()),
        ("predictor", XGBRegressor(**study.best_params, random_state=42)),
    ]
)
best_model.fit(X_train, y_train)
mean_squared_error(y_val, best_model.predict(X_val), squared=False)

2023/05/29 13:55:18 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b6a578a5603b4a3b9c79345e33fbfb44', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.39471646237738456,
             eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, la...`
             callbacks=None, colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eta=0.39471646237738456,
             eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
             grow_policy=None, importance_type=None,
             interactio

6.523958692127635

#  Link the best model to the model registry

In [34]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [35]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [36]:
runs = client.search_runs(
    experiment_ids="1",  # Experiment ID we want
    filter_string="metrics.rmse_val < 7",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse_val ASC"],
)

In [37]:
print(
    f"Best run id: {runs[0].info.run_id}\nValid RMSE: {runs[0].data.metrics['rmse_val']}"
)

Best run id: 6be2d1a5de274006804b2c8b6ee0d92d
Valid RMSE: 6.523958692127635


In [38]:
best_run_id = runs[0].info.run_id
model_uri = f"runs:/{best_run_id}/models"
model_name = "nyc-taxi-regressor"

In [39]:
client.create_registered_model(model_name)
client.create_model_version(
    name=model_name,
    source=f"mlruns/1/{best_run_id}/artifacts/model",
    run_id=best_run_id,
)

2023/05/29 13:55:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1


<ModelVersion: aliases=[], creation_timestamp=1685361345259, current_stage='None', description=None, last_updated_timestamp=1685361345259, name='nyc-taxi-regressor', run_id='6be2d1a5de274006804b2c8b6ee0d92d', run_link=None, source='mlruns/1/6be2d1a5de274006804b2c8b6ee0d92d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [40]:
version = 1
client.transition_model_version_stage(
    name=model_name, version=version, stage="Staging", archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1685361345259, current_stage='Staging', description=None, last_updated_timestamp=1685361358270, name='nyc-taxi-regressor', run_id='6be2d1a5de274006804b2c8b6ee0d92d', run_link=None, source='mlruns/1/6be2d1a5de274006804b2c8b6ee0d92d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [44]:
import mlflow.pyfunc

model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/Staging")
model.predict(X_val[:5])

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


array([16.63945  ,  7.2041225, 21.48723  , 24.47869  ,  8.892417 ],
      dtype=float32)

In [None]:
version = 1
client.transition_model_version_stage(
    name=model_name, version=version, stage="Production", archive_existing_versions=True
)