# Imports

In [None]:
import os, errno

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import mlflow
import mlflow.sklearn

# Function Definitions

In [None]:
def makedirs(path):
    """Create the directory `path`, including any missing parent directories.

    An already-existing directory is not an error. If `path` exists but is
    not a directory (for example a regular file), `FileExistsError` (a
    subclass of `OSError`) is raised instead of being silently ignored, so
    the problem surfaces here rather than at a later file operation.
    """
    os.makedirs(path, exist_ok=True)

In [None]:
def download_wine(basename, dirname):
    """Write scikit-learn's wine dataset to `dirname`/`basename` as CSV.

    The dataset is loaded as a DataFrame; every column that is not listed
    in the dataset's `feature_names` is treated as a target column and its
    integer codes are replaced by the matching entries of `target_names`.
    `dirname` is created if needed.
    """
    print("downloading wine dataset")
    data = datasets.load_wine(as_frame=True)
    df = data.frame
    # target columns are whatever is left once the feature columns are removed
    targets = df.columns[~np.isin(df.columns, data.feature_names)]
    # map integer class codes to their class-name strings
    df[targets] = data.target_names[df[targets]]
    makedirs(dirname)
    # newline="" lets the csv writer control line endings; without it,
    # text-mode newline translation doubles them on Windows
    with open(os.path.join(dirname, basename), "w", newline="") as fp:
        df.to_csv(fp)

In [None]:
def load_wine(basename="data.csv", dirname="./datasets/wine"):
    """Return the wine dataset stored at `dirname`/`basename` as a DataFrame.

    If the CSV file does not exist yet, it is first produced by
    `download_wine(basename, dirname)`. The first CSV column is used as the
    index of the returned frame.
    """
    csv_path = os.path.join(dirname, basename)
    if not os.path.exists(csv_path):
        download_wine(basename, dirname)
    with open(csv_path) as handle:
        return pd.read_csv(handle, index_col=0)

# K Nearest Neighbor

In [None]:
def train_knn(n_neighbors=5, weights="uniform"):
    """Train and evaluate a k-nearest-neighbour classifier as one MLflow run.

    The wine dataset is split 75/25 (stratified, fixed seed), a
    `KNeighborsClassifier` is fitted on the training part, and accuracy plus
    macro-averaged precision, recall and F-score on the test part are printed
    and logged to MLflow together with the parameters and the fitted model.

    Parameters
    ----------
    n_neighbors : number of neighbours passed to `KNeighborsClassifier`.
    weights : weighting scheme passed to `KNeighborsClassifier`.
    """
    # set the name of this run within the experiment
    run_name = "KNN({}, {}) - Simple MLFlow".format(n_neighbors, weights)

    # the context manager ends the run even if a step below raises, so a
    # failure here cannot leave an active run that blocks the next start_run
    with mlflow.start_run(run_name=run_name):
        # load dataset: last column is the label, everything before it a feature
        dataset_df = load_wine()
        X = dataset_df[dataset_df.columns[:-1]]
        y = dataset_df[dataset_df.columns[-1]]

        # create train/test splits
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.25, random_state=0, shuffle=True, stratify=y
        )

        # build classifier
        knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        knn.fit(X_train, y_train)

        # save model parameters
        mlflow.log_param("n_neighbors", n_neighbors)
        mlflow.log_param("weights", weights)

        # evaluate performance (the per-class support value is not used)
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, fscore, _ = precision_recall_fscore_support(
            y_test, y_pred, average="macro"
        )
        print("{1:10.4f} {0:}".format("Accuracy",  accuracy))
        print("{1:10.4f} {0:}".format("Precision", precision))
        print("{1:10.4f} {0:}".format("Recall",    recall))
        print("{1:10.4f} {0:}".format("Fscore",    fscore))

        # save performance metrics
        mlflow.log_metric("accuracy",  accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall",    recall)
        mlflow.log_metric("fscore",    fscore)

        # save the trained model
        mlflow.sklearn.log_model(knn, "model")

# Run Experiments

In [None]:
# set the active MLflow experiment for the runs below
mlflow.set_experiment("Wine (Simple-live)")

# each entry: banner printed before the run, keyword arguments for train_knn
experiments = [
    ("KNN, n_neighbors=1", {"n_neighbors": 1}),
    ("KNN, n_neighbors=5, weights=uniform", {"n_neighbors": 5, "weights": "uniform"}),
    ("KNN, n_neighbors=5, weights=distance", {"n_neighbors": 5, "weights": "distance"}),
]

for banner, knn_kwargs in experiments:
    print(banner)
    train_knn(**knn_kwargs)
    print()

By default, the MLflow Python API logs runs locally to files in an `mlruns` directory, created in the directory from which you ran your program. To see the logged runs, run the `mlflow ui` command from that same directory and then open the following page in a browser: http://localhost:5000