In [1]:
# Beer style classification sample (KNeighborsClassifier tracked with MLflow)
def train(in_n=5, in_weights="uniform", in_algorithm="auto",
          path="../data/prepared/beer_profile_and_ratings.csv",
          index_col="Beer Name (Full)", target_col="Style"):
    """Fit a KNeighborsClassifier on a CSV dataset and record the run in MLflow.

    The CSV is loaded, split into train/test sets, a k-nearest-neighbours
    classifier is fitted on the training part, and its accuracy on the test
    part is printed. Parameters, the accuracy metric, three small CSV
    artifacts (feature names, target name, distinct target values) and the
    fitted model are logged to a new MLflow run.

    Args:
        in_n: Passed to ``KNeighborsClassifier`` as ``n_neighbors``.
        in_weights: Passed to ``KNeighborsClassifier`` as ``weights``.
        in_algorithm: Passed to ``KNeighborsClassifier`` as ``algorithm``.
        path: Location of the CSV file to train on.
        index_col: Column of the CSV used as the DataFrame index.
        target_col: Column to predict; every other column is used as a feature.

    Returns:
        None. Results are printed, displayed and logged to MLflow.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raises when the CSV cannot be
            loaded; the error is logged and then re-raised.
    """
    import logging
    import os
    import warnings

    import numpy as np
    import pandas as pd
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    import mlflow
    import mlflow.sklearn

    from IPython.display import display

    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    # NOTE(review): this silences every warning for the rest of the kernel
    # session, not just inside this function.
    warnings.filterwarnings("ignore")
    # train_test_split is called without random_state, so it draws from the
    # global NumPy RNG; seeding it here keeps the split repeatable.
    np.random.seed(40)

    try:
        data = pd.read_csv(path, index_col=index_col)
    except Exception as e:
        logger.exception("Unable to open training & test CSV, check your path. Error: %s", e)
        # Without the data nothing below can run; fail here instead of with a
        # confusing NameError on `data` further down.
        raise

    # Split the data into training and test sets (default 0.75 / 0.25 split).
    train_df, test_df = train_test_split(data)

    # Everything except the target column is a feature.
    train_x = train_df.drop(columns=[target_col])
    test_x = test_df.drop(columns=[target_col])
    # 1-D targets: scikit-learn classifiers expect y of shape (n_samples,).
    train_y = train_df[target_col]
    test_y = test_df[target_col]

    # The artifact CSVs are staged in ./data before being handed to MLflow;
    # make sure that directory exists.
    os.makedirs("./data", exist_ok=True)

    def log_list_artifact(values, local_path):
        """Write `values` one per line to `local_path` and log it as an artifact."""
        pd.DataFrame(list(values)).to_csv(local_path, header=False, index=False)
        mlflow.log_artifact(local_path)

    with mlflow.start_run():
        # Fit the k-nearest-neighbours classifier.
        kNeigh = KNeighborsClassifier(n_neighbors=in_n, weights=in_weights, algorithm=in_algorithm)
        kNeigh.fit(train_x, train_y)

        # Predict on the held-out set and show the class probabilities.
        predicted_styles = kNeigh.predict(test_x)
        predicted_styles_proba = kNeigh.predict_proba(test_x)
        display(predicted_styles_proba)

        acc = accuracy_score(test_y, predicted_styles)

        # Print out metrics
        print(f"KNeighborsClassifier (n_neighbors={in_n}, weights={in_weights}):")
        print(f"     accuracy: {acc}")

        # Log data params
        mlflow.log_param("input_rows", data.shape[0])
        mlflow.log_param("input_cols", data.shape[1])
        mlflow.log_params(kNeigh.get_params())

        # Log artifacts: columns used for modeling and the distinct target values.
        log_list_artifact(train_x.columns, "./data/features.csv")
        log_list_artifact([target_col], "./data/targets.csv")
        log_list_artifact(data[target_col].unique(), "./data/targets_values.csv")

        mlflow.log_metric("accuracy_score", acc)

        mlflow.sklearn.log_model(kNeigh, "model")

In [42]:
class mlflowProject_KNeighbour():
    """Train a k-nearest-neighbours classifier on a CSV dataset, tracked with MLflow.

    The dataset is loaded once when the object is created. Each call to
    ``train`` splits it into train/test sets, fits a ``KNeighborsClassifier``
    and logs parameters, the accuracy metric, three small CSV artifacts and
    the fitted model to a new MLflow run.
    """

    def __init__(self, path, index_col, target_col):
        """Load the dataset and display its first two rows.

        Args:
            path: Location of the CSV file, passed to ``pandas.read_csv``.
            index_col: Column of the CSV used as the DataFrame index.
            target_col: Name of the column to predict.
        """
        import pandas as pd
        from IPython.display import display

        self.target_col = target_col
        self.data = pd.read_csv(path, index_col=index_col)
        display(self.data.head(2))

    def train(self, in_n=5, in_weights="uniform", in_algorithm="auto"):
        """Fit the classifier on a fresh split and log the run to MLflow.

        Args:
            in_n: Passed to ``KNeighborsClassifier`` as ``n_neighbors``.
            in_weights: Passed to ``KNeighborsClassifier`` as ``weights``.
            in_algorithm: Passed to ``KNeighborsClassifier`` as ``algorithm``.

        Returns:
            None. The estimator parameters and accuracy are printed and
            logged to MLflow.
        """
        import logging
        import os
        import warnings

        import numpy as np
        import pandas as pd

        from sklearn.metrics import accuracy_score
        from sklearn.model_selection import train_test_split
        from sklearn.neighbors import KNeighborsClassifier

        import mlflow
        import mlflow.sklearn

        logging.basicConfig(level=logging.WARN)

        # NOTE(review): this silences every warning for the rest of the kernel
        # session, not just inside this method.
        warnings.filterwarnings("ignore")
        # train_test_split is called without random_state, so it draws from
        # the global NumPy RNG; seeding it here keeps the split repeatable.
        np.random.seed(40)

        # Split the data into training and test sets (default 0.75 / 0.25 split).
        train_df, test_df = train_test_split(self.data)

        # Everything except the target column is a feature.
        train_x = train_df.drop(columns=[self.target_col])
        test_x = test_df.drop(columns=[self.target_col])
        # 1-D targets: scikit-learn classifiers expect y of shape (n_samples,).
        train_y = train_df[self.target_col]
        test_y = test_df[self.target_col]

        # The artifact CSVs are staged in ./data before being handed to
        # MLflow; make sure that directory exists.
        os.makedirs("./data", exist_ok=True)

        def log_list_artifact(values, local_path):
            """Write `values` one per line to `local_path` and log it as an artifact."""
            pd.DataFrame(list(values)).to_csv(local_path, header=False, index=False)
            mlflow.log_artifact(local_path)

        with mlflow.start_run():
            # Fit the k-nearest-neighbours classifier and predict the held-out set.
            kNeigh = KNeighborsClassifier(n_neighbors=in_n, weights=in_weights, algorithm=in_algorithm)
            kNeigh.fit(train_x, train_y)
            predicted_styles = kNeigh.predict(test_x)

            # Evaluate metrics
            acc = accuracy_score(test_y, predicted_styles)

            # Print the full estimator configuration followed by the metrics.
            parameters = kNeigh.get_params()
            print("KNeighborsClassifier with paramaters:")
            for par, value in parameters.items():
                print(f"     {par} : {value}")
            print("KNeighborsClassifier Metrics:")
            print(f"     accuracy: {acc}")

            # Log data params
            mlflow.log_param("input_rows", self.data.shape[0])
            mlflow.log_param("input_cols", self.data.shape[1])
            mlflow.log_params(parameters)

            # Log artifacts: columns used for modeling and the distinct target values.
            log_list_artifact(train_x.columns, "./data/features.csv")
            log_list_artifact([self.target_col], "./data/target.csv")
            log_list_artifact(self.data[self.target_col].unique(), "./data/target_values.csv")

            mlflow.log_metric("accuracy_score", acc)

            mlflow.sklearn.log_model(kNeigh, "model")
    
        

In [43]:
# Dataset location and the roles of its columns.
path = "../data/prepared/beer_profile_and_ratings.csv"
index_col, target_col = "Beer Name (Full)", "Style"

# Load the data (shows a two-row preview), then run one training pass with
# the default KNeighborsClassifier settings.
project1 = mlflowProject_KNeighbour(
    path=path,
    index_col=index_col,
    target_col=target_col,
)
project1.train()



Unnamed: 0_level_0,Style,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall
Beer Name (Full),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Alaskan Brewing Co. Alaskan Amber,Altbier,5.3,25,50,13,32,9,47,74,33,0,33,57,8,111,3.498994,3.636821,3.556338,3.643863,3.847082
Long Trail Brewing Co. Double Bag,Altbier,7.2,25,50,12,57,18,33,55,16,0,24,35,12,84,3.798337,3.846154,3.904366,4.024948,4.034304


KNeighborsClassifier with paramaters:
     algorithm : auto
     leaf_size : 30
     metric : minkowski
     metric_params : None
     n_jobs : None
     n_neighbors : 5
     p : 2
     weights : uniform
KNeighborsClassifier Metrics:
     accuracy: 0.55
