## Hyperparameter Tuning with MLflow

Adapted from https://github.com/mlflow/mlflow/tree/master/examples/hyperparam

### Set up runtime environment

In [None]:
import logging
import os
from anaconda.enterprise.server.common.sdk import load_ae5_user_secrets

# Lower the root logger's threshold to DEBUG so messages from all libraries are emitted.
logging.getLogger().setLevel(logging.DEBUG)

# Load the AE5 user secrets through the platform SDK helper.
# NOTE(review): presumably this exposes credentials/settings (e.g. for the MLflow
# tracking server) to this session — confirm against the SDK documentation.
load_ae5_user_secrets()

### Define our parameter search space

In [None]:
import numpy as np
from hyperopt import hp

# Seed forwarded as the "seed" parameter of every training run.
seed: int = 97531

# Largest finite float64. Serves both as the starting "best so far" value when
# ranking runs and as the loss cap while the null model is being evaluated.
_inf = np.finfo(np.float64).max

# Hyperopt search space (https://hyperopt.github.io/hyperopt/).
# Order matters: the objective unpacks each sample as (lr, momentum).
space = [
    hp.uniform("lr", 1e-5, 1e-1),
    hp.uniform("momentum", 0.0, 1.0),
]

### Create an MLFlow client

In [None]:
from mlflow.tracking import MlflowClient

# Generate a client
# Constructed with no arguments. The cells below use it to search child runs,
# fetch a run's metrics and mark failed runs as terminated.
tracking_client = MlflowClient()

### Define our best run search (report)

In [None]:
def get_best_run(experiment_id, run, metric="rmse"):
    """
    Find the child run of ``run`` with the lowest validation metric.

    :param experiment_id: Experiment that is searched for child runs.
    :param run: Parent run; children are matched on the ``mlflow.parentRunId`` tag.
    :param metric: Metric suffix. The ``train_<metric>``, ``val_<metric>`` and
                   ``test_<metric>`` values of each child run are read.
                   Defaults to ``"rmse"``.

    :return: Tuple ``(best_run, train, valid, test)``. ``best_run`` is ``None`` and
             the three values are ``_inf`` when no child run reports a validation
             metric below ``_inf``.
    """
    train_key = "train_{}".format(metric)
    valid_key = "val_{}".format(metric)
    test_key = "test_{}".format(metric)

    child_runs = tracking_client.search_runs(
        [experiment_id], "tags.mlflow.parentRunId = '{run_id}' ".format(run_id=run.info.run_id)
    )

    best_run = None
    best_val_train = _inf
    best_val_valid = _inf
    best_val_test = _inf
    for candidate in child_runs:
        metrics = candidate.data.metrics
        valid = metrics.get(valid_key)
        if valid is None:
            # A child run that never logged the validation metric (for example
            # one that failed early) cannot be ranked; skip it instead of
            # raising KeyError and losing the whole report.
            continue
        if valid < best_val_valid:
            best_run = candidate
            best_val_valid = valid
            # Fall back to the sentinel when a companion metric is absent.
            best_val_train = metrics.get(train_key, _inf)
            best_val_test = metrics.get(test_key, _inf)
    return best_run, best_val_train, best_val_valid, best_val_test

### Define Training (with parameterization)

In [None]:
import uuid


def parameterized_training(nepochs, lr, momentum, experiment_id, training_data):
    """
    Launch the project's ``train`` entry point inside a nested MLflow run.

    :param nepochs: Number of epochs, passed to the entry point as ``epochs``.
    :param lr: Learning rate, passed to the entry point as ``learning_rate``.
    :param momentum: Momentum, passed to the entry point as ``momentum``.
    :param experiment_id: Experiment the project run is recorded under.
    :param training_data: Value passed to the entry point as ``training_data``.

    :return: Tuple ``(succeeded, submitted_run)`` where ``succeeded`` is the
             result of ``wait()`` on the submitted run.
    """
    run_label = f"parameterized-training-{uuid.uuid4()}"

    with mlflow.start_run(run_name=run_label, nested=True) as child_run:
        entry_point_parameters = {
            "training_data": training_data,
            "epochs": str(nepochs),
            "learning_rate": str(lr),
            "momentum": str(momentum),
            "seed": seed,
        }

        # Wrapped and tracked workflow step run, see
        # https://mlflow.org/docs/latest/python_api/mlflow.projects.html#mlflow.projects.run
        submitted_run = mlflow.projects.run(
            uri=".",
            entry_point="train",
            run_id=child_run.info.run_id,
            env_manager="local",
            backend="adsp",
            parameters=entry_point_parameters,
            experiment_id=experiment_id,
            # Submit without blocking so that a run which fails (e.g. no model
            # was properly created) is reported through wait() below.
            synchronous=False,
        )
        succeeded = submitted_run.wait()

        # Record the sampled hyperparameters on the nested run.
        mlflow.log_params({"lr": lr, "momentum": momentum})

    return succeeded, submitted_run

In [None]:
import mlflow.tracking


def new_eval(
    nepochs,
    experiment_id,
    null_train_loss,
    null_valid_loss,
    null_test_loss,
    training_data,
    return_all=False,
    metric="rmse",
):
    """
    Create a new objective function for the hyperparameter search.

    :param nepochs: Number of epochs to train the model.
    :param experiment_id: Experiment id for the training run.
    :param null_train_loss: Loss of a null model on the training dataset.
    :param null_valid_loss: Loss of a null model on the validation dataset.
    :param null_test_loss: Loss of a null model on the test dataset.
    :param training_data: Training data reference forwarded to the training run.
    :param return_all: Return ``(train, valid, test)`` losses if set, otherwise
                       only the validation loss.
    :param metric: Metric suffix; ``train_<metric>``, ``val_<metric>`` and
                   ``test_<metric>`` are read from the training run.

    :return: new eval function.
    """
    train_key = "train_{}".format(metric)
    valid_key = "val_{}".format(metric)
    test_key = "test_{}".format(metric)

    # Named ``evaluate`` rather than ``eval`` to avoid shadowing the builtin.
    def evaluate(params):
        """
        Train a model with the given parameters by invoking an MLflow run.

        The resulting losses are capped at the null-model losses and logged to
        the currently active run.

        :param params: Sampled hyperparameters, unpacked as ``(lr, momentum)``.
        :return: The validation loss, or ``(train, valid, test)`` losses when
                 ``return_all`` is set.
        """
        lr, momentum = params
        succeeded, p = parameterized_training(
            nepochs=nepochs, experiment_id=experiment_id, lr=lr, momentum=momentum, training_data=training_data
        )

        if succeeded:
            metrics = tracking_client.get_run(p.run_id).data.metrics
            # Cap the loss at the loss of the null model. A metric the run did
            # not log is treated like a failure (null loss) instead of raising
            # KeyError and aborting the whole search.
            train_loss = min(null_train_loss, metrics.get(train_key, null_train_loss))
            valid_loss = min(null_valid_loss, metrics.get(valid_key, null_valid_loss))
            test_loss = min(null_test_loss, metrics.get(test_key, null_test_loss))
        else:
            # run failed => return null loss
            tracking_client.set_terminated(p.run_id, "FAILED")
            train_loss = null_train_loss
            valid_loss = null_valid_loss
            test_loss = null_test_loss

        # Log this tuning run's metrics
        mlflow.log_metrics(
            {
                train_key: train_loss,
                valid_key: valid_loss,
                test_key: test_loss,
            }
        )

        if return_all:
            return train_loss, valid_loss, test_loss
        return valid_loss

    return evaluate

In [None]:
from hyperopt import fmin, tpe, rand
import mlflow


def tune(max_runs, epochs, training_data, metric="rmse", algo="tpe.suggest"):
    """
    Run hyperparameter optimization.

    A null model (zero epochs) is evaluated first to obtain the loss caps, then
    hyperopt searches ``space`` for ``max_runs`` evaluations. Everything is
    recorded under one parent MLflow run.

    :param max_runs: Maximum number of hyperopt evaluations.
    :param epochs: Number of epochs for each training run.
    :param training_data: Training data reference forwarded to each training run.
    :param metric: Metric suffix used for the logged losses.
    :param algo: ``"tpe.suggest"`` selects TPE; any other value selects random search.

    :return: Summary dict with keys ``best`` (params and run id; the run id is
             ``None`` when no best child run could be determined), ``metrics``
             and ``details``.
    """
    train_key = "train_{}".format(metric)
    valid_key = "val_{}".format(metric)
    test_key = "test_{}".format(metric)

    with mlflow.start_run(run_name=f"hyperparamter-optimization-jburt-{str(uuid.uuid4())}") as run:
        experiment_id = run.info.experiment_id

        # Evaluate null model first.
        train_null_loss, valid_null_loss, test_null_loss = new_eval(
            0, experiment_id, _inf, _inf, _inf, training_data, True, metric
        )(params=[0, 0])

        # perform parameter search
        best = fmin(
            fn=new_eval(
                epochs,
                experiment_id,
                train_null_loss,
                valid_null_loss,
                test_null_loss,
                training_data=training_data,
                metric=metric,
            ),
            space=space,
            algo=tpe.suggest if algo == "tpe.suggest" else rand.suggest,
            max_evals=max_runs,
        )
        # log the best parameters
        mlflow.set_tag("best params", str(best))

        # find the best run, log its metrics as the final metrics of this run.
        # NOTE(review): `metric` is not forwarded to this lookup, which ranks
        # the child runs on their *_rmse metrics.
        best_run, best_val_train, best_val_valid, best_val_test = get_best_run(experiment_id, run)

        final_metrics = {
            train_key: best_val_train,
            valid_key: best_val_valid,
            test_key: best_val_test,
        }

        if best_run is None:
            # No child run could be ranked. Report that instead of failing with
            # AttributeError after the whole search has already completed.
            best_run_id = None
            logging.warning("No best child run found for parent run %s", run.info.run_id)
        else:
            best_run_id = best_run.info.run_id
            # Tag final metrics
            mlflow.set_tag("best_run", best_run_id)
            mlflow.log_metrics(final_metrics)

        return {
            "best": {"params": str(best), "run": best_run_id},
            "metrics": final_metrics,
            "details": {"experiment_id": experiment_id, "max_runs": max_runs, "epochs": epochs},
        }

### Perform the Parameter Search

In [None]:
# Data Source
# http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

# Small demonstration search: at most 3 hyperopt evaluations of 16 epochs each
# (plus the null-model evaluation that tune() performs first).
summary = tune(max_runs=3, epochs=16, training_data="data/winequality-white.csv")

### Report

In [None]:
import json

# Pretty-print the summary dict returned by tune() as indented JSON.
print(json.dumps(summary, indent=4))