In [None]:
import time
import subprocess
import sys
import threading
from queue import Queue, Empty

from functools import partial

import mlflow
import mlflow.sklearn

from cuml.metrics.accuracy import accuracy_score
from cuml.preprocessing.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

### Define environment helpers

In [None]:
import os

# Databricks account details. These are left blank here and must be filled in
# before the notebook can reach a workspace.
USER_NAME = ""
ACCOUNT_ID = ""
ACCOUNT_TOKEN = ""

# Name of the MLflow experiment; it lives under the user's workspace folder.
experiment = "rapids_mlflow"

# Environment variables consumed by the MLflow / Databricks client libraries.
dbvars = dict(
    MLFLOW_EXPERIMENT_NAME=f"/Users/{USER_NAME}/{experiment}",
    MLFLOW_TRACKING_URI="databricks",
    DATABRICKS_HOST=f"https://{ACCOUNT_ID}.cloud.databricks.com",
    DATABRICKS_TOKEN=f"{ACCOUNT_TOKEN}",
)

def set_databricks_env():
    """
    Export every entry of ``dbvars`` into the process environment and make
    the configured experiment the active MLflow experiment.
    """
    # os.environ is a mutable mapping, so a bulk update assigns each key in turn.
    os.environ.update(dbvars)

    experiment_path = f"/Users/{USER_NAME}/{experiment}"
    mlflow.set_experiment(experiment_path)
     
# Apply the Databricks settings and select the experiment as soon as this cell runs.
set_databricks_env()

### Define a data loading helper

In [None]:
def load_data(fpath):
    """
    Read a Parquet dataset into GPU memory.

    :param fpath: Path or URL of the Parquet data to ingest
    :return: cuDF DataFrame holding the contents of [fpath]
    """
    # Imported lazily so cuDF is only loaded when data is actually read.
    import cudf

    return cudf.read_parquet(fpath)

### Define our training routine, and Hyperopt entry points.

In [None]:
def _train(params, fpath, hyperopt=False):
    """
    Fit a cuML random forest on the data at [fpath], score it on a held-out
    split, and log the hyperparameters and accuracy to the active MLflow run.

    :param params: hyperparameters as a 3-item sequence, in the order
                   (max_depth, max_features, n_estimators). Its structure is
                   consistent with how the search space is defined. See below.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: the fitted RandomForestClassifier when hyperopt is False;
             otherwise a dict with fields 'loss' (negated accuracy, so that a
             lower loss means a better model) and 'status' (STATUS_OK).
    """
    max_depth, max_features, n_estimators = params
    # Hyperopt samples floats; the tree depth and estimator count must be integers.
    max_depth, max_features, n_estimators = int(max_depth), float(max_features), int(n_estimators)

    df = load_data(fpath)

    X = df.drop(["ArrDelayBinary"], axis=1)
    y = df["ArrDelayBinary"].astype('int32')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    mod = RandomForestClassifier(max_depth=max_depth, max_features=max_features, n_estimators=n_estimators)

    mod.fit(X_train, y_train)
    preds = mod.predict(X_test)
    # Coerce to a plain Python float so the value logs and negates cleanly.
    acc = float(accuracy_score(y_test, preds))

    mlparams = {"max_depth": str(max_depth),
                "max_features": str(max_features),
                "n_estimators": str(n_estimators),
                }
    mlflow.log_params(mlparams)

    mlmetrics = {"accuracy": acc}
    mlflow.log_metrics(mlmetrics)

    if not hyperopt:
        return mod

    # fmin MINIMIZES the reported loss. Returning the raw accuracy would steer
    # the search toward the least accurate model, so report its negation.
    return {'loss': -acc, 'status': STATUS_OK}


def train(params, fpath, hyperopt=False):
    """
    Run _train inside its own nested MLflow run.

    :param params: hyperparameters, forwarded unchanged to _train.
    :param fpath: Path or URL for the training data used with the model.
    :param hyperopt: Use hyperopt for hyperparameter search during training.
    :return: whatever _train returns for the given arguments.
    """
    # nested=True lets this run be opened while another run is already active.
    with mlflow.start_run(nested=True):
        outcome = _train(params, fpath, hyperopt)
    return outcome

### Implement our MLflow training loop, and save our best model to the tracking server.

In [None]:
# Host (and optional path prefix) serving conda.yaml; blank here and must be filled in.
PATH_TO_CONDA_DATA = ""
PATH_TO_AIRLINE_DATA = "rapidsai-cloud-ml-sample-data.s3-us-west-2.amazonaws.com"

algorithm = 'tpe'
conda_env = f'https://{PATH_TO_CONDA_DATA}/conda.yaml'
fpath     = f'https://{PATH_TO_AIRLINE_DATA}/airline_small.parquet'

# Order matters: _train unpacks its params positionally as
# (max_depth, max_features, n_estimators).
param_names = ('max_depth', 'max_features', 'n_estimators')
search_space = [
    hp.uniform('max_depth', 5, 20),
    hp.uniform('max_features', 0., 1.0),
    hp.uniform('n_estimators', 150, 1000)
]

trials = Trials()
algorithm = tpe.suggest if algorithm == 'tpe' else None
fn = partial(train, fpath=fpath, hyperopt=True)
experid = 0

with mlflow.start_run():
    mlflow.set_tag("mlflow.runName", "RAPIDS-Hyperopt-Databricks")
    argmin = fmin(fn=fn,
                  space=search_space,
                  algo=algorithm,
                  max_evals=2,
                  trials=trials)

    print("===========")
    # Retrain once with the best hyperparameters to obtain the model object.
    # argmin is a dict keyed by parameter label; pick the values out by name
    # instead of relying on the dict's iteration order matching _train's unpack.
    fn = partial(train, fpath=fpath, hyperopt=False)
    best_params = tuple(argmin[name] for name in param_names)
    final_model = fn(best_params)

    # Materialize the conda environment spec as a local conda.yaml file.
    conda_data = ""
    if (conda_env.startswith("http")):
        import requests

        resp = requests.get(conda_env)
        # Fail loudly rather than saving an HTTP error page as conda.yaml.
        resp.raise_for_status()
        conda_data = str(resp.text)
    else:
        with open(conda_env, 'r') as reader:
            conda_data = reader.read()

    with open("conda.yaml", 'w') as writer:
        writer.write(conda_data)

    mlflow.sklearn.log_model(final_model,
                             artifact_path="rapids_mlflow_test",
                             registered_model_name="rapids_mlflow_test",
                             conda_env='conda.yaml')

    client = mlflow.tracking.MlflowClient()
    model_versions = client.search_model_versions("name='rapids_mlflow_test'")
    if not model_versions:
        raise RuntimeError("No registered versions found for model 'rapids_mlflow_test'.")
    # Pick the highest version number explicitly; do not assume the search
    # results come back newest-first.
    latest_model = dict(max(model_versions, key=lambda mv: int(mv.version)))
    latest_model_source = latest_model['source']

    # Promote the new version to Production, allowing two attempts in total.
    for _attempt in range(2):
        try:
            # We need to wait for the model to be registered
            time.sleep(10)
            client.transition_model_version_stage(
                name="rapids_mlflow_test",
                version=latest_model['version'],
                stage="Production")
            print(f"Successfully registered model version {latest_model['version']}, as production.")
            break
        except Exception as e:
            print(e, flush=True)
    else:
        # Both attempts failed.
        raise RuntimeError("Failed to update registered model status.")

### Helper to track our server output.

In [None]:
def queue_descriptor_output(out, queue):
    """
    Copy a binary stream into a queue, one line at a time.

    :param out: readable binary stream; it is closed once exhausted.
    :param queue: queue that receives each line, as bytes, in read order.
    """
    while True:
        chunk = out.readline()
        # readline() yields an empty bytes object at end-of-stream.
        if chunk == b'':
            break
        queue.put(chunk)
    out.close()

def follow_subprocess(cmd, timeout=1000, line_timeout=60.00, poll_interval=2):
    """
    Launch a command and echo its combined stdout/stderr to this process's stdout.

    The child is followed until it exits, the overall timeout elapses, or no
    new output line has been seen for line_timeout seconds. In every case the
    child is then killed and any output still queued is written out.

    :param cmd: command to run, passed straight to subprocess.Popen.
    :param timeout: maximum wall-clock seconds to follow the child.
    :param line_timeout: maximum seconds to wait between output lines.
    :param poll_interval: seconds to sleep between checks of the child.
    :raises KeyboardInterrupt: re-raised after killing the child on ctrl+c.
    """
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    q = Queue()
    # A background thread does the blocking reads so this loop can enforce timeouts.
    t = threading.Thread(target=queue_descriptor_output, args=(p.stdout, q))
    t.daemon = True
    t.start()

    def _echo_pending():
        """Write every queued line to stdout; return how many were written."""
        written = 0
        while True:
            try:
                line = q.get(timeout=0.1)
            except Empty:
                return written
            # The pipe yields bytes; decode before writing to the text stream.
            sys.stdout.write(line.decode(errors='replace'))
            written += 1

    start_time = time.perf_counter()
    last_line_time = start_time
    elapsed = 0
    line_elapsed = 0
    try:
        while (p.poll() is None and elapsed < timeout and line_elapsed < line_timeout):
            time.sleep(poll_interval)
            if _echo_pending():
                last_line_time = time.perf_counter()
            # Measure against the wall clock so time spent draining output counts too.
            now = time.perf_counter()
            elapsed = now - start_time
            line_elapsed = now - last_line_time
    except KeyboardInterrupt:
        sys.stderr.write("\nCaught ctrl+c, killing subprocess ({})\n".format(' '.join(cmd)))
        p.kill()
        raise

    try:
        p.kill()
    except OSError:
        # The child may already be gone; there is nothing left to kill.
        pass
    # Collect the exit status so the child does not linger as a zombie.
    p.wait()

    t.join(2)

    ## Drain any remaining text
    _echo_pending()

### Begin serving our trained model using MLflow
### Note: the serving thread will continue to run in this cell. Select the cell and click 'interrupt the kernel' to stop it.

In [None]:
port = 55755
host = 'localhost'
# Build the argument list explicitly rather than splitting a formatted string,
# so a model URI containing whitespace stays a single argument.
command = ["mlflow", "models", "serve",
           "-m", str(latest_model_source),
           "-p", str(port),
           "-h", host]
# Infinite timeouts: keep following the server until it exits or is interrupted.
kwargs = { "cmd": command, "timeout":float('Inf'), "line_timeout": float('Inf') }

threading.Thread(target=follow_subprocess, kwargs=kwargs).start()
# Give the server time to start before the next cell sends requests.
time.sleep(30)

### Make requests against the deployed model

In [None]:
import json
import requests

# NOTE(review): "format" is sent as its own HTTP header here; confirm this is
# what the installed MLflow scoring server expects.
headers = {
    "Content-Type": "application/json",
    "format": "pandas-split"
}

# A single sample row in split orientation: column names plus a list of rows.
data = { 
    "columns": ["Year", "Month", "DayofMonth", "DayofWeek", "CRSDepTime", "CRSArrTime", "UniqueCarrier", "FlightNum", "ActualElapsedTime", "Origin", "Dest", "Distance", "Diverted"],
    "data": [[1987, 10, 1, 4, 1, 556, 0, 190, 247, 202, 162, 1846, 0]]
}

# The server may still be starting, so retry, but give up after a bounded
# number of attempts instead of looping forever.
MAX_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 20

for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        resp = requests.post(url=f"http://{host}:{port}/invocations", data=json.dumps(data), headers=headers)
        # An HTTP error body is not a prediction; do not report it as "LATE".
        resp.raise_for_status()
        print(f'Classification: {"ON-Time" if resp.text == "[0.0]" else "LATE"}')
        break
    except requests.exceptions.RequestException as e:
        errmsg = f"Caught exception attempting to call model endpoint: {e}"
        print(errmsg)
        if attempt == MAX_ATTEMPTS:
            raise
        print("Sleeping")
        time.sleep(RETRY_DELAY_SECONDS)