# In [2]:
# Beer Style Classification Sample (k-nearest neighbours)
def train(in_n=5, in_weights="uniform", in_algorithm="auto"):
    """Train a k-nearest-neighbours beer-style classifier and log it to MLflow.

    Reads the prepared beer CSV, label-encodes the ``Style`` column, splits
    the rows into train/test sets, fits a ``KNeighborsClassifier`` and records
    parameters, accuracy, feature/target column lists, the predicted class
    probabilities and the fitted model in a new MLflow run.

    Args:
        in_n: Passed to ``KNeighborsClassifier`` as ``n_neighbors``.
        in_weights: Passed to ``KNeighborsClassifier`` as ``weights``.
        in_algorithm: Passed to ``KNeighborsClassifier`` as ``algorithm``.

    Raises:
        Exception: Whatever error occurred while reading the CSV; it is
            logged and re-raised instead of being swallowed.
    """
    import os
    import warnings

    import pandas as pd
    import numpy as np
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import LabelEncoder

    import mlflow
    import mlflow.sklearn

    from IPython.display import display
    import logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    warnings.filterwarnings("ignore")
    # train_test_split is called without random_state, so it draws from the
    # global NumPy RNG; seeding here keeps the split reproducible.
    np.random.seed(40)

    target_col = "Style"
    artifact_dir = "KneighborsClassifier/data"

    # DVC alternative (disabled):
    #   import dvc.api
    #   path = "data/prepared/beer_profile_and_ratings.csv"
    #   repo = "https://github.com/stijnhering/PreTraineeship"
    #   with dvc.api.open(path, repo=repo) as fd:
    #       data = pd.read_csv(fd, sep=",", index_col="Name")

    # LOCAL
    path = "../data/prepared/beer_profile_and_ratings.csv"

    try:
        with open(path, 'rb') as fd:
            data = pd.read_csv(fd)
    except Exception as e:
        logger.exception("Unable to open training & test CSV, check your path. Error: %s", e)
        # Re-raise: continuing without `data` would only fail later with an
        # unrelated NameError that hides the real cause.
        raise
    display(data.head())

    # Encode the target: string style labels -> integer class ids.
    le = LabelEncoder()
    data[target_col] = le.fit_transform(data[target_col].values.ravel())

    # Split the data into training and test sets. (0.75, 0.25) split.
    # Locals are named *_df so they do not shadow this function's own name.
    train_df, test_df = train_test_split(data)

    # Every column except the target is used as a feature.
    # NOTE(review): assumes all remaining columns are numeric - TODO confirm.
    train_x = train_df.drop([target_col], axis=1)
    test_x = test_df.drop([target_col], axis=1)
    # 1-D targets, the shape scikit-learn classifiers expect for y.
    train_y = train_df[target_col]
    test_y = test_df[target_col]

    # Useful for multiple runs (only doing one run in this sample notebook)
    with mlflow.start_run():
        # Fit the k-nearest-neighbours classifier
        kNeigh = KNeighborsClassifier(n_neighbors=in_n, weights=in_weights, algorithm=in_algorithm)
        kNeigh.fit(train_x, train_y)

        # Evaluate metrics
        predicted_styles = kNeigh.predict(test_x)
        predicted_styles_proba = kNeigh.predict_proba(test_x)
        acc = accuracy_score(test_y, predicted_styles)

        # Print out metrics
        print(
            f"KNeighborsClassifier (n_neighbors={in_n}, weights={in_weights}, "
            f"algorithm={in_algorithm}):"
        )
        print(f"     accuracy: {acc}")

        # Log data params
        mlflow.log_param("input_rows", data.shape[0])
        mlflow.log_param("input_cols", data.shape[1])
        mlflow.log_params(kNeigh.get_params())

        # The artifact directory may not exist on a fresh checkout.
        os.makedirs(artifact_dir, exist_ok=True)

        # Log artifacts: columns used for modeling
        features_path = f"{artifact_dir}/features.csv"
        pd.DataFrame(list(train_x.columns)).to_csv(features_path, header=False, index=False)
        mlflow.log_artifact(features_path)

        targets_path = f"{artifact_dir}/targets.csv"
        pd.DataFrame([target_col]).to_csv(targets_path, header=False, index=False)
        mlflow.log_artifact(targets_path)

        # MLflow metrics must be scalars, so the per-row class probability
        # matrix is stored as a CSV artifact instead of a metric. Columns are
        # the original style labels, rows keep the test-set index.
        proba_path = f"{artifact_dir}/predicted_probabilities.csv"
        pd.DataFrame(
            predicted_styles_proba,
            columns=le.inverse_transform(kNeigh.classes_),
            index=test_x.index,
        ).to_csv(proba_path)
        mlflow.log_artifact(proba_path)

        mlflow.log_metric("accuracy_score", acc)

        mlflow.sklearn.log_model(kNeigh, "model")