In [42]:
class mlflowProject_KNeighbour():
    def __init__(self, path, index_col, target_col):
        import pandas as pd
        from IPython.display import display
        
        self.target_col = target_col
        self.data = pd.read_csv(path, index_col=index_col)
        display(self.data.head(2))
        
        
    def train(self, n_neighbors=5, weights="uniform", algorithm="auto"):
        import warnings
        import numpy as np
        import pandas as pd
        
        from sklearn.metrics import accuracy_score, confusion_matrix
        from sklearn.model_selection import train_test_split
        from sklearn.neighbors import KNeighborsClassifier
        
        from sklearn.preprocessing import StandardScaler
        
        
        import mlflow
        import mlflow.sklearn
        import plotly.express as px
        
        from IPython.display import display
        import logging
        logging.basicConfig(level=logging.WARN)

        def eval_metrics(actual, pred):
            acc = accuracy_score(actual, pred)
            return acc

        warnings.filterwarnings("ignore")
        
        model_data = self.data
        
        # Scale Features
        scaler = StandardScaler()
        model_data[model_data.columns.drop(self.target_col)] = scaler.fit_transform(
            model_data[model_data.columns.drop(self.target_col)])
        
        display(model_data.head(2))
        
       
        # Split the data into training and test sets. (0.75, 0.25) split.
        train, test = train_test_split(model_data)

        #  SET Y VALUE, THIS CAN ALSO BE DONE WITH AN "sys.argv" function if necesary
        # The predicted column is "quality" which is a scalar from [3, 9]
        train_x = train.drop([self.target_col], axis=1)
        test_x = test.drop([self.target_col], axis=1)
        train_y = train[[self.target_col]]
        test_y = test[[self.target_col]]

        # Useful for multiple runs (only doing one run in this sample notebook)    
        with mlflow.start_run():
            # Execute ElasticNet
            kNeigh = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
            kNeigh.fit(train_x, train_y)
            predicted_styles = kNeigh.predict(test_x)
            
            # Evaluate Metrics
            (acc) = eval_metrics(test_y, predicted_styles)
            
            conf_matrix = pd.DataFrame(confusion_matrix(test_y, predicted_styles))
            display(conf_matrix)
            
            # Print out metrics
            print(f"KNeighborsClassifier with paramaters:")
            parameters = kNeigh.get_params()
            for par in parameters:
                print(f"     {par} : {parameters[par]}" )
            print(f"KNeighborsClassifier Metrics:")
            print(f"     accuracy: {acc}")
            
            
            # fig = px.scatter(
            #     X_test, x=0, y=1,
            #     color=y_score, color_continuous_scale='RdBu',
            #     symbol=y_test, symbol_map={'0': 'square-dot', '1': 'circle-dot'},
            #     labels={'symbol': 'label', 'color': 'score of <br>first class'}
            # )
            # fig.update_traces(marker_size=12, marker_line_width=1.5)
            # fig.update_layout(legend_orientation='h')
            # fig.show()

            # Log data params
            # mlflow.log_param("data_url", data_url)
            mlflow.log_param("input_rows", model_data.shape[0])
            mlflow.log_param("input_cols", model_data.shape[1])
            mlflow.log_params(kNeigh.get_params())

            # Log artifacts: columns usded for modeling
            cols_x = pd.DataFrame(list(train_x.columns))
            cols_x.to_csv("./data/features.csv", header=False, index=False)
            mlflow.log_artifact("./data/features.csv")

            cols_y = pd.DataFrame(list(train_y.columns))
            cols_y.to_csv("./data/target.csv", header=False, index=False)
            mlflow.log_artifact("./data/target.csv")
            
            target_values = pd.DataFrame(
                list(model_data[self.target_col].unique()))
            target_values.to_csv("./data/target_values.csv", header=False, index=False)
            mlflow.log_artifact("./data/target_values.csv")
        
            # mlflow.log_artifact("predicted_probabilities", predicted_styles_proba)
            mlflow.log_metric("accuracy_score", acc)

            mlflow.sklearn.log_model(kNeigh, "model")
    
        

In [43]:
path="../data/prepared/beer_profile_and_ratings.csv"
index_col = "Beer Name (Full)"
target_col = "Style"

project1 = mlflowProject_KNeighbour(path, index_col, target_col)
project1.train(n_neighbors=9)



Unnamed: 0_level_0,Style,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall
Beer Name (Full),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Alaskan Brewing Co. Alaskan Amber,Altbier,5.3,25,50,13,32,9,47,74,33,0,33,57,8,111,3.498994,3.636821,3.556338,3.643863,3.847082
Long Trail Brewing Co. Double Bag,Altbier,7.2,25,50,12,57,18,33,55,16,0,24,35,12,84,3.798337,3.846154,3.904366,4.024948,4.034304


Unnamed: 0_level_0,Style,ABV,Min IBU,Max IBU,Astringency,Body,Alcohol,Bitter,Sweet,Sour,Salty,Fruits,Hoppy,Spices,Malty,review_aroma,review_appearance,review_palate,review_taste,review_overall
Beer Name (Full),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Alaskan Brewing Co. Alaskan Amber,Altbier,-0.481696,0.28848,0.515791,-0.337764,-0.54462,-0.464895,0.412438,0.458897,-0.004066,-0.477041,-0.171238,0.528815,-0.435553,0.893892,-0.27785,-0.291487,-0.231379,-0.114903,0.224124
Long Trail Brewing Co. Double Bag,Altbier,0.264397,0.28848,0.515791,-0.433834,0.419002,0.054477,-0.130468,-0.095428,-0.479264,-0.477041,-0.449949,-0.194896,-0.267152,0.217253,0.317111,0.227495,0.542245,0.631911,0.645588


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,3,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,14,0,0,0,0,0,0,0,0,...,0,0,0,0,1,4,0,0,0,0
2,0,0,10,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,0,0,0,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,1,0,3,16,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,7,0,6,0,0,...,0,0,0,0,1,0,0,0,0,0
8,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,1,0,3,...,0,0,0,0,0,0,0,1,0,0


KNeighborsClassifier with paramaters:
     algorithm : auto
     leaf_size : 30
     metric : minkowski
     metric_params : None
     n_jobs : None
     n_neighbors : 9
     p : 2
     weights : uniform
KNeighborsClassifier Metrics:
     accuracy: 0.5875
