<h1>MLflow Tutorial</h1>

In [17]:
import logging

import numpy as np
import pandas as pd

#For Modeling
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor

#MLflow imports
import mlflow
from mlflow.tracking import MlflowClient

<h3>MLflow Cheat Sheet</h3>

In [3]:
# Show the installed MLflow version; the outputs in this notebook were produced with the version printed below.
mlflow.__version__

'1.11.0'

**Let's dive in**

In [9]:
# Create a client with default settings; the following cells call its tracking and model-registry methods.
mlflow_client = MlflowClient()

In [10]:
# Show the tracking URI MLflow is currently using (a local file store in the output below).
mlflow.get_tracking_uri()

'file:///Users/shahwaiz/Desktop/mlruns'

In [11]:
# List the experiments the client can see; the output below shows only the "Default" experiment.
mlflow_client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>]

In [None]:
# Fetch the metadata of one registered model version
# (important information such as creation_timestamp, current_stage, run_id, and more).
# NOTE(review): `model_name` is not assigned in any cell shown in this notebook —
# set it to the name of a registered model before running this cell.
version = 1
mlflow_client.get_model_version(name = model_name, version = version)

In [None]:
# Look up a run by its ID and read the "MAPE" entry from that run's logged metrics.
# NOTE(review): the run ID below looks like a placeholder — replace it with a real run ID (TODO confirm).
run_id = '9842hweure2bjhdiwh'
mlflow_client.get_run(run_id).data.metrics["MAPE"]

In [None]:
# Move a registered model version into a stage ("None", "Staging", "Production" or "Archived").
# archive_existing_versions=True archives whatever versions currently occupy the target stage.
# `model_name`, `version` and the target stage are placeholders — substitute your own values.
mlflow_client.transition_model_version_stage(
    name=model_name,
    version=version,
    stage="Production",
    archive_existing_versions=True,
)

**Logging Model to MLflow**

In [None]:
# create_experiment returns the ID of the new experiment; keep it so the run can be
# attached to that experiment (create_run expects an experiment ID, not a run ID).
# `exp_name` is a placeholder — set it to the experiment name you want to create.
experiment_id = mlflow_client.create_experiment(name=exp_name)

# Last expression of the cell, so the created Run object is displayed.
mlflow_client.create_run(experiment_id=experiment_id)

<h2> Let's use what we learned </h2>

<h3>Read dummy dataset</h3>

In [26]:
# Load the dummy dataset and turn its "date" column into datetime values in one chain,
# so the later `.dt` accessors work.
orders = (
    pd.read_csv("dummy_data.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

In [37]:
# Preview the first rows of the loaded data.
# NOTE(review): this cell's execution count (37) is higher than the feature-engineering
# cell's (35), which is likely why the output already shows the derived columns — re-run top to bottom to confirm.
orders.head()

Unnamed: 0,date,count,day_of_week,day_of_year,month,year,quarter,lag_1
0,2019-01-01,5723,1,1,1,2019,1,0
1,2019-01-02,599,2,2,1,2019,1,5723
2,2019-01-03,7410,3,3,1,2019,1,599
3,2019-01-04,8435,4,4,1,2019,1,7410
4,2019-01-05,5053,5,5,1,2019,1,8435


<h3>Feature Engineering</h3>

In [35]:
# Calendar features: each new column is read from the matching attribute of the
# datetime accessor on the "date" column.
date_parts = orders["date"].dt
calendar_features = (
    ("day_of_week", "dayofweek"),
    ("day_of_year", "dayofyear"),
    ("month", "month"),
    ("year", "year"),
    ("quarter", "quarter"),
)
for column_name, dt_attribute in calendar_features:
    orders[column_name] = getattr(date_parts, dt_attribute)

# Previous row's count as a feature; the first row has no predecessor and is filled with 0.
orders["lag_1"] = orders["count"].shift(1, fill_value=0)

In [36]:
# Show the first 10 rows to check the derived calendar columns and the lag_1 column.
orders.head(10)

Unnamed: 0,date,count,day_of_week,day_of_year,month,year,quarter,lag_1
0,2019-01-01,5723,1,1,1,2019,1,0
1,2019-01-02,599,2,2,1,2019,1,5723
2,2019-01-03,7410,3,3,1,2019,1,599
3,2019-01-04,8435,4,4,1,2019,1,7410
4,2019-01-05,5053,5,5,1,2019,1,8435
5,2019-01-06,2779,6,6,1,2019,1,5053
6,2019-01-07,8323,0,7,1,2019,1,2779
7,2019-01-08,5501,1,8,1,2019,1,8323
8,2019-01-09,4850,2,9,1,2019,1,5501
9,2019-01-10,1862,3,10,1,2019,1,4850


<h3>Modeling</h3>

In [48]:
def mape(y_true, y_pred):
    """Mean absolute percentage error of y_pred relative to y_true, expressed in percent."""
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


# make_scorer is used with its defaults (greater_is_better=True), so the scores below are
# the raw MAPE values. That is fine for reporting, but the scorer should not be handed
# to a model-selection routine as-is, because it would then favour larger errors.
mape_scorer = make_scorer(mape)

hyperparameters = {'n_estimators':10, 'max_depth':3}

feature_columns = ["day_of_week", "day_of_year", "month", "year", "quarter", "lag_1"]

# TimeSeriesSplit keeps every validation fold after its training fold in row order.
fold_scores = cross_val_score(
    estimator=RandomForestRegressor(random_state=0, **hyperparameters),
    X=orders[feature_columns],
    y=orders["count"],
    scoring=mape_scorer,
    cv=TimeSeriesSplit(n_splits=4),
)

# Average MAPE across the folds (displayed as the cell output).
np.mean(fold_scores)

148.39531555194822

<h3>Log to MLflow</h3>

Run `mlflow ui` to launch the MLflow UI.

One very useful application is automating which model gets promoted to production based on some criterion.
For instance, we can train a model daily and compare it to the model that is currently in Production: if the error of the newly trained model is lower than the Production model's error by more than some threshold, we promote the new model to Production and archive the previous one; otherwise, the current Production model stays in place.

Suppose we have a model that is tagged `Production` in MLflow and we just trained a new model. Using MLflow, automating model management is extremely easy.

In [None]:
def get_production_metric_minus_model_metric(
    mlflow_client: "MlflowClient",
    model_name: str,
    model_version: int,
    metric: str) -> float:
    """
    Calculates the difference between current production metric and given model version metric.

    For an error metric such as MAPE, a positive result means the given model version
    has a lower error than the version currently in Production.

    :param mlflow_client: MLflow Client object to interact with API.
    :param model_name: Name of the registered model.
    :param model_version: Version of the registered model to compare against Production.
    :param metric: Name of the logged metric to compare (a key of the run's metrics).
    :return: Production version's metric minus the given model version's metric.
    :raises ValueError: If no version of the model is in the Production stage.
    """

    production_model_metadata = mlflow_client.get_latest_versions(
        name=model_name, stages=["Production"]
    )
    if not production_model_metadata:
        # Without this guard the lookup below would fail with an opaque IndexError.
        raise ValueError(
            f"No version of model '{model_name}' is in the Production stage."
        )
    # Metrics live on the run that produced the model version, not on the version itself.
    production_run = mlflow_client.get_run(production_model_metadata[0].run_id)
    production_model_metric = production_run.data.metrics[metric]

    latest_model_metadata = mlflow_client.get_model_version(
        name=model_name, version=model_version
    )
    latest_run = mlflow_client.get_run(latest_model_metadata.run_id)
    current_model_metric = latest_run.data.metrics[metric]

    return production_model_metric - current_model_metric

In [None]:
# Logger for the promotion message below; INFO must be enabled or the message is dropped.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE: `registered_model_name` must hold the name of the registered model before this cell runs.
# The most recent version that has not been assigned a stage yet (stage "None")
# is treated as the newly trained model.
newly_trained_model_version = int(
    mlflow_client.get_latest_versions(
        name=registered_model_name, stages=["None"]
    )[0].version
)

# The helper is defined in this notebook, so it is called directly.
metric_difference = get_production_metric_minus_model_metric(
    mlflow_client=mlflow_client,
    model_name=registered_model_name,
    model_version=newly_trained_model_version,
    metric="MAPE",
)

# Promote only when the new version's MAPE is lower than Production's by more than THRESHOLD.
THRESHOLD = 1
if metric_difference > THRESHOLD:

    # archive_existing_versions=True archives the version currently in Production.
    mlflow_client.transition_model_version_stage(
        name=registered_model_name,
        version=newly_trained_model_version,
        stage="Production",
        archive_existing_versions=True,
    )

    logger.info(
        "Model %s has been set to Production for %s in MLflow",
        str(newly_trained_model_version),
        registered_model_name,
    )
