## Experiment Tracking 

When developing machine learning models, we need to keep track of every experiment, so a proper process is required. This notebook showcases how to do that with one such tool, MLflow.



1. Train a linear regression model (scikit-learn's `ElasticNet`)

2. Package the code that trains the model in a reusable and reproducible model format


This tutorial uses a dataset to predict the quality of wine based on quantitative features like the wine’s “fixed acidity”, “pH”, “residual sugar”, and so on. The dataset is from UCI’s machine learning repository. 

In [2]:
!pip install mlflow -q
!pip install pyngrok -q

In [4]:
import os
import warnings
import sys
from argparse import Namespace
import mlflow
from pathlib import Path
import json
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn

def eval_metrics(actual, pred):
    """Score predictions against the ground truth.

    Args:
        actual: true target values.
        pred: predicted target values.

    Returns:
        A ``(rmse, mae, r2)`` tuple.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),  # RMSE is the square root of the MSE
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

def load_data():
    """Download the red-wine quality CSV and return it as a DataFrame.

    The file is read from the UCI machine learning repository and uses ``;``
    as its field separator.

    Returns:
        pandas.DataFrame: the parsed CSV.

    Raises:
        Exception: whatever ``pd.read_csv`` raised; it is re-raised after a
            hint has been printed, so the real cause is not hidden.
    """
    # Read the wine-quality csv file from the URL
    csv_url = (
        "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    )
    try:
        return pd.read_csv(csv_url, sep=";")
    except Exception as e:
        print(
            "Unable to download training & test CSV, check your internet connection. Error: %s" % e
        )
        # Without re-raising, the function would fall through with no data to
        # return and fail with an unrelated UnboundLocalError.
        raise


In [5]:
# Shell escape: print the current working directory of the notebook
!pwd

/content


In [6]:
# Shell escape: list the working directory before the tracking folder is created
!ls

sample_data


In [7]:
# Point MLflow at a local, file-based store kept in ./experiments
MODEL_REGISTRY = Path("experiments")
MODEL_REGISTRY.mkdir(exist_ok=True)  # no error if the directory is already there
mlflow.set_tracking_uri(f"file://{MODEL_REGISTRY.absolute()}")

In [8]:
# Shell escape: list the working directory again; `experiments` should now appear
!ls

experiments  sample_data


In [9]:
# Shell escape: list the tracking folder's contents (nothing is printed at this point)
!ls /content/experiments

In [10]:
def train_lr(args, df):
    """Train an ElasticNet regressor on the wine data and evaluate it.

    Args:
        args: object exposing ``alpha`` and ``l1_ratio`` attributes (for
            example an ``argparse.Namespace``); both are passed to ``ElasticNet``.
        df: DataFrame holding the feature columns plus a ``quality`` column,
            which is used as the prediction target.

    Returns:
        dict with three keys: ``"args"`` (the ``args`` object that was passed
        in), ``"model"`` (the fitted ``ElasticNet``) and ``"performance"``
        (a dict with ``"RMSE"``, ``"MAE"`` and ``"R2"`` measured on the test
        split).
    """
    # Split the data into training and test sets. (0.75, 0.25) split.
    # The seed is handed to the splitter directly so that calling this function
    # does not reseed NumPy's global random state as a side effect.
    train, test = train_test_split(df, random_state=40)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(columns=["quality"])
    test_x = test.drop(columns=["quality"])
    # Use a 1-D target (a Series) rather than a single-column DataFrame.
    train_y = train["quality"]
    test_y = test["quality"]

    # Initialize model
    alpha = args.alpha
    l1_ratio = args.l1_ratio
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)

    # Train the model
    lr.fit(train_x, train_y)

    # Test the model on the held-out split
    predicted_qualities = lr.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)
    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Evaluate (simple)
    performance = {"RMSE": rmse, "MAE": mae, "R2": r2}
    return {
        "args": args,
        "model": lr,
        "performance": performance
    }

In [11]:
def mlexp_track(args, df=None):
    """Run one MLflow-tracked experiment: train, evaluate and log the results.

    Args:
        args: hyperparameter object; it is passed to ``train_lr`` and its
            attributes are logged as the run's parameters.
        df: dataset to train on. When ``None`` the data is fetched with
            ``load_data()``; otherwise the given frame is used as is, so the
            caller can download it once and reuse it across runs.
    """
    if df is None:
        df = load_data()

    # Tracking
    with mlflow.start_run():
        # Train & evaluate
        artifacts = train_lr(args=args, df=df)

        # Log key metrics (RMSE, MAE and R2) in a single call
        mlflow.log_metrics(artifacts["performance"])

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store
        if tracking_url_type_store != "file":
            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.sklearn.log_model(
                artifacts["model"], "model", registered_model_name="ElasticnetWineModel"
            )
        else:
            mlflow.sklearn.log_model(artifacts["model"], "model")

        # Log parameters
        mlflow.log_params(vars(artifacts["args"]))

In [12]:
# Hyperparameter settings to try, as (alpha, l1_ratio) pairs
args_list = [
    Namespace(alpha=alpha, l1_ratio=l1_ratio)
    for alpha, l1_ratio in ((1.5, 0.9), (0.5, 0.02), (0.01, 0.5))
]



In [13]:
# Make "baselines" the active experiment; the returned Experiment object is
# the cell's displayed value.
mlflow.set_experiment(experiment_name="baselines")

2022/03/18 09:37:56 INFO mlflow.tracking.fluent: Experiment with name 'baselines' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/experiments/0', experiment_id='0', lifecycle_stage='active', name='baselines', tags={}>

In [14]:
# Load the dataset, then launch one tracked run per entry in args_list
df = load_data()
for args in args_list:
  mlexp_track(args, df)

Elasticnet model (alpha=1.500000, l1_ratio=0.900000):
  RMSE: 0.8327481314145982
  MAE: 0.6751289812215555
  R2: 0.017435513620481347
Elasticnet model (alpha=0.500000, l1_ratio=0.020000):
  RMSE: 0.7364106074415193
  MAE: 0.5673052761841408
  R2: 0.23162398391500494
Elasticnet model (alpha=0.010000, l1_ratio=0.500000):
  RMSE: 0.6778557583356976
  MAE: 0.5190564939146215
  R2: 0.3489590462840657


In [15]:
from pyngrok import ngrok

### Check the experiment records in the MLflow UI

In [17]:
# https://stackoverflow.com/questions/61615818/setting-up-mlflow-on-google-colab
# Start the MLflow server in the background, backed by the local experiments folder
get_ipython().system_raw("mlflow server -h 0.0.0.0 -p 5000 --backend-store-uri $PWD/experiments/ &")
ngrok.kill()
# Get your authtoken from https://dashboard.ngrok.com/auth and expose it to the
# notebook through the NGROK_AUTHTOKEN environment variable instead of pasting
# it into this cell, so the secret never ends up in the saved notebook.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN", "")
if ngrok_token:
    ngrok.set_auth_token(ngrok_token)
# Open an HTTPS tunnel to the server port started above
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://2a66-34-66-129-152.ngrok.io


### Retrieve the best model

In [None]:
# Load all runs from the experiment, best first: lowest MAE, ties broken by
# the highest R2 (for R2, larger is better, so it is sorted descending).
experiment_id = mlflow.get_experiment_by_name("baselines").experiment_id
all_runs = mlflow.search_runs(
    experiment_ids=[experiment_id],  # the API expects a list of experiment ids
    order_by=["metrics.MAE ASC", "metrics.R2 DESC"],
)
all_runs

                             run_id experiment_id    status  \
0  152f5ffbaad841ef98a015028ac4eae8             0  FINISHED   
1  7e706301eee14f37b6d766fe82e3c6fe             0  FINISHED   
2  8446d297418c4224ac1d5e0d0efaa96b             0  FINISHED   

                                        artifact_uri  \
0  file:///content/experiments/0/152f5ffbaad841ef...   
1  file:///content/experiments/0/7e706301eee14f37...   
2  file:///content/experiments/0/8446d297418c4224...   

                        start_time                         end_time  \
0 2022-03-06 00:00:20.494000+00:00 2022-03-06 00:00:23.240000+00:00   
1 2022-03-06 00:00:17.418000+00:00 2022-03-06 00:00:20.491000+00:00   
2 2022-03-06 00:00:11.552000+00:00 2022-03-06 00:00:17.414000+00:00   

   metrics.MAE  metrics.RMSE  metrics.R2 params.alpha params.l1_ratio  \
0     0.519056      0.677856    0.348959         0.01             0.5   
1     0.567305      0.736411    0.231624          0.5            0.02   
2     0.675129    

In [None]:
# The first row of the sorted results is the best run; load the model it logged
best_run_id = all_runs["run_id"].iloc[0]
best_run = mlflow.get_run(run_id=best_run_id)
model_uri = f"runs:/{best_run_id}/model"
hh = mlflow.sklearn.load_model(model_uri)

In [None]:
# Sanity check: predict quality for three rows, with the target column removed
hh.predict(df.drop(columns=["quality"]).iloc[20:23])

array([5.55755636, 5.43387419, 5.50192246])