In [7]:
from mlflow.client import MlflowClient

# Tracking store is the SQLite database file `mlflow.db`; the same URI is
# handed to the fluent `mlflow` API further down in the notebook.
TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=TRACKING_URI)

In [10]:
# List the experiments known to the tracking store; the returned list is
# displayed as the cell output.
client.search_experiments()

[<Experiment: artifact_location='/home/ubuntu/mlops_zoomcamp_sam/02-mlflow/mlruns/1', creation_time=1684758245252, experiment_id='1', last_update_time=1684758245252, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='/home/ubuntu/mlops_zoomcamp_sam/02-mlflow/mlruns/0', creation_time=1684758245241, experiment_id='0', last_update_time=1684758245241, lifecycle_stage='active', name='Default', tags={}>]

In [11]:
# Experiments can also be created through the client API; the cell displays
# the experiment ID.
# create_experiment raises when the name is already taken, so look the name up
# first to keep this cell safe to re-run (Restart & Run All).
existing_experiment = client.get_experiment_by_name('api_created_experiment')
(
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment(name='api_created_experiment')
)

'2'

In [21]:
# Find the best runs in a given experiment: active runs with RMSE below 10,
# lowest RMSE first, capped at five results.

from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids=['1'],  # the API documents a list of experiment IDs, not a bare string
    filter_string='metrics.rmse < 10',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC'],
)

In [84]:
# Print the run ID and the logged `rmse` metric for every run in `runs`.
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']}")

run id: 752fd132d7544919987e96ce765d95c9, rmse: 7.216312938895791
run id: dc40f8f62a1b49b2b6f657fea8757b5a, rmse: 7.289268481569861
run id: 8f27c2299ef1470c8045db1b6bf6d837, rmse: 8.116069832221214
run id: 240ee977bb9a4ee0b05b4c98b35589ae, rmse: 8.116069832221214
run id: aa4d4650f4794677937f468dffa129ed, rmse: 8.116069832221214


In [23]:
# Next step: promote a model to the registry.
# Point the fluent `mlflow` API at the same tracking URI the client was built with.
import mlflow
mlflow.set_tracking_uri(TRACKING_URI)

In [85]:
# Register the `model` artifact of one specific run under the registry name
# 'nyc_taxi_regressor'. The run ID is hard-coded; it matches the second entry
# of the RMSE-sorted run listing printed earlier in the notebook.
run_id = 'dc40f8f62a1b49b2b6f657fea8757b5a'
model_uri = f'runs:/{run_id}/model'
mlflow.register_model(model_uri = model_uri, name = 'nyc_taxi_regressor')

Registered model 'nyc_taxi_regressor' already exists. Creating a new version of this model...
2023/05/24 14:43:11 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc_taxi_regressor, version 3
Created version '3' of model 'nyc_taxi_regressor'.


<ModelVersion: aliases=[], creation_timestamp=1684939391630, current_stage='None', description=None, last_updated_timestamp=1684939391630, name='nyc_taxi_regressor', run_id='dc40f8f62a1b49b2b6f657fea8757b5a', run_link=None, source='/home/ubuntu/mlops_zoomcamp_sam/02-mlflow/mlruns/1/dc40f8f62a1b49b2b6f657fea8757b5a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=3>

In [86]:
# Display the URI that was passed to register_model.
model_uri

'runs:/dc40f8f62a1b49b2b6f657fea8757b5a/model'

In [87]:
# How to transition a model from one stage to another.
# Start by listing the latest registered version in each stage.

model_name = 'nyc_taxi_regressor'
latest_versions = client.get_latest_versions(name=model_name)

# The loop variable is deliberately not named `version`: that name holds the
# integer version number in the cells that follow.
for model_version in latest_versions:
    print(f'Version: {model_version.version} Stage: {model_version.current_stage}')

Version: 1 Stage: Staging
Version: 2 Stage: Production
Version: 3 Stage: None


In [88]:
# Move version 2 of the registered model into the 'Production' stage.
# `version` and `stage` are kept as notebook-level variables because the
# annotation cell later in the notebook reads them.
version = 2
stage = 'Production'
client.transition_model_version_stage(
    name = model_name,
    version = version,
    stage = stage,
    # Leave any version already sitting in the target stage where it is.
    archive_existing_versions=False)

<ModelVersion: aliases=[], creation_timestamp=1684936393160, current_stage='Production', description='THe model version 2 was transitioned to Production on 2023-05-24', last_updated_timestamp=1684939400751, name='nyc_taxi_regressor', run_id='dc40f8f62a1b49b2b6f657fea8757b5a', run_link=None, source='/home/ubuntu/mlops_zoomcamp_sam/02-mlflow/mlruns/1/dc40f8f62a1b49b2b6f657fea8757b5a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [99]:
# Re-list the latest version per stage to confirm the registry state.
latest_versions = client.get_latest_versions(name=model_name)
# Use a dedicated loop variable so the integer `version` set in the transition
# cell is not overwritten with a ModelVersion object.
for model_version in latest_versions:
    print(f'Version: {model_version.version} Stage: {model_version.current_stage}')

Version: 1 Stage: Staging
Version: 2 Stage: Production
Version: 3 Stage: None


In [90]:
from datetime import datetime

# Attach a human-readable description to the model version.
# `stage` is the value assigned in the stage-transition cell of this notebook.
# NOTE(review): `date` is the day this cell runs, which is not necessarily the
# day the transition actually happened - confirm that is acceptable.
date = datetime.today().date()
version = 2
# We can annotate as well
client.update_model_version(
    name = model_name,
    version = version,
    description = f"THe model version {version} was transitioned to {stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1684936393160, current_stage='Production', description='THe model version 2 was transitioned to Production on 2023-05-24', last_updated_timestamp=1684939404850, name='nyc_taxi_regressor', run_id='dc40f8f62a1b49b2b6f657fea8757b5a', run_link=None, source='/home/ubuntu/mlops_zoomcamp_sam/02-mlflow/mlruns/1/dc40f8f62a1b49b2b6f657fea8757b5a/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [48]:
# Which do we want to promote into production and archive one in prod, updated code

from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    """Load a taxi-trip parquet file and prepare it for evaluation.

    Steps:
      1. Read the parquet file into a DataFrame.
      2. Add a ``duration`` column: minutes between
         ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``.
      3. Keep only trips lasting between 1 and 60 minutes (inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        A new DataFrame holding the filtered rows, with the extra
        ``duration`` column and string-typed location IDs.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the original, so the
    # column assignment below never writes into a slice (SettingWithCopy).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a prepared trip DataFrame into the feature matrix for the model.

    A combined ``PU_DO`` column (pickup ID, underscore, drop-off ID) is added
    to ``df`` in place; that column and ``trip_distance`` are then exported as
    one dict per row and passed through the already-fitted ``dv.transform``.

    Args:
        df: DataFrame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns.
        dv: Object exposing ``transform(list_of_dicts)``.

    Returns:
        Whatever ``dv.transform`` returns for the row dictionaries.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)


def test_model(name, stage, X_test, y_test):
    """Evaluate the registered model found at ``models:/{name}/{stage}``.

    Args:
        name: Registered model name.
        stage: Registry stage used in the model URI (e.g. 'Production').
        X_test: Feature matrix passed to the model's ``predict``.
        y_test: True target values (single-output) to compare against.

    Returns:
        A dict ``{"rmse": <root mean squared error>}``.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # `squared=False` is deprecated in newer scikit-learn releases and later
    # removed; taking the square root of the MSE gives the same RMSE for a
    # single-output target and works on old and new versions alike.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return {"rmse": rmse}

In [91]:
# Load the March 2022 file to serve as validation data.
import pathlib
data_path = pathlib.Path('../data')
df = read_dataframe(data_path / 'yellow_tripdata_2022-03.parquet')

In [92]:
import pickle
run_id = 'dc40f8f62a1b49b2b6f657fea8757b5a'
# MlflowClient.download_artifacts is deprecated (it is the call echoed in this
# cell's warning output); mlflow.artifacts.download_artifacts is the
# replacement and returns the local path of what it downloaded. It uses the
# tracking URI set via mlflow.set_tracking_uri earlier in the notebook.
preprocessor_dir = mlflow.artifacts.download_artifacts(
    run_id=run_id, artifact_path='preprocessor', dst_path='.')
# NOTE: pickle.load can execute arbitrary code - only unpickle artifacts that
# come from runs you trust.
with open(pathlib.Path(preprocessor_dir, 'preprocessor.b'), 'rb') as f_in:
    dv = pickle.load(f_in)
X_test = preprocess(df, dv)


  client.download_artifacts(run_id = run_id, path = 'preprocessor', dst_path='.')


In [93]:
# Ground-truth values for evaluation: the `duration` column of the validation
# frame, extracted via `.values`.
target = 'duration'
y_test = df[target].values

In [94]:
# Time and score the model currently in the 'Production' stage on the validation data.
%time test_model(name = model_name, stage = 'Production', X_test=X_test, y_test = y_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: user 10.4 s, sys: 1.82 ms, total: 10.4 s
Wall time: 3.51 s


{'rmse': 10.564157543930355}

In [101]:
# Time and score the model in stage 'None' on the same validation data.
# NOTE(review): the registry outputs earlier in this notebook show version 2
# (Production) and version 3 (None) with the same run_id, which would explain
# an identical RMSE here - confirm this comparison is the intended one.
%time test_model(name = model_name, stage = 'None', X_test=X_test, y_test = y_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: user 10.3 s, sys: 3.15 ms, total: 10.3 s
Wall time: 3.5 s


{'rmse': 10.564157543930355}