# Demo project - Wine quality prediction

## Contents:
* [Import packages](#first-bullet)
* [Load Data](#second-bullet)
* [Exploratory data analysis](#third-bullet)
* [Prepare dataset for training model](#forth-bullet)
* [Build a baseline model](#fifth-bullet)
* [Experiment with a new model](#sixth-bullet)
* [Predict](#seventh-bullet)

## Import packages <a class="anchor" id="first-bullet"></a>

Before importing packages, install any that are required. <br>
Any PyPI package can be installed this way. <br>

In [None]:
!pip install s3fs hyperopt cloudpickle mlflow xgboost

In [None]:
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import tempfile
import os

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import mlflow.pyfunc
import mlflow.sklearn
import numpy as np

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import cloudpickle
import time

from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope
from math import exp
import mlflow.xgboost
import numpy as np
import xgboost as xgb

In [None]:
import boto3
import pandas as pd
import matplotlib.pyplot as plt
import tempfile
import os

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import mlflow.pyfunc
import mlflow.sklearn
import numpy as np

import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
import cloudpickle
import time

from hyperopt import fmin, tpe, hp, SparkTrials, Trials, STATUS_OK
from hyperopt.pyll import scope
from math import exp
import mlflow.xgboost
import numpy as np
import xgboost as xgb

## Load Data <a class="anchor" id="second-bullet"></a>

Assumption: the bucket is already created and "winequality-red.csv" & "winequality-white.csv" are uploaded into the bucket <br>
Read data from object store <br>
Connect to object store and instantiate a client object using boto3 session:

In [None]:
# Object-store credentials and location are read from environment variables.
# Indexing os.environ raises KeyError at this point if any of them is unset.
AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
AWS_S3_ENDPOINT = os.environ['AWS_S3_ENDPOINT']
AWS_S3_BUCKET = os.environ['AWS_S3_BUCKET']

In [None]:
# Open a boto3 session with the credentials read from the environment and
# build an S3 client bound to the configured object-store endpoint.
# The AWS_* names are the ones this notebook actually defines.
s3conn = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
# NOTE(review): verify=False switches off TLS certificate verification; keep it
# only if the endpoint uses a self-signed certificate.
s3_client = s3conn.client('s3', endpoint_url=AWS_S3_ENDPOINT, verify=False)

In [None]:
# List the contents of the bucket named by AWS_S3_BUCKET (the same bucket
# read_data() loads from) rather than a hard-coded bucket name.
s3_client.list_objects(Bucket=AWS_S3_BUCKET)

Using the s3_client, retrieve the data files from the object store:<br>

In [None]:
# Download the red-wine CSV from the configured bucket into ./data.
objectname = "winequality-red.csv"
file_addr = "data/winequality-red.csv"
# Make sure the local target directory exists before writing into it.
os.makedirs(os.path.dirname(file_addr), exist_ok=True)
# Argument order is (Bucket, Key, Filename). download_file returns None, so
# `response` only records that the call completed without raising.
response = s3_client.download_file(AWS_S3_BUCKET, objectname, file_addr)

In [None]:
# Download the white-wine CSV from the configured bucket into ./data.
objectname = "winequality-white.csv"
file_addr = "data/winequality-white.csv"
# Make sure the local target directory exists before writing into it.
os.makedirs(os.path.dirname(file_addr), exist_ok=True)
# Argument order is (Bucket, Key, Filename). download_file returns None, so
# `response` only records that the call completed without raising.
response = s3_client.download_file(AWS_S3_BUCKET, objectname, file_addr)

## Exploratory data analysis <a class="anchor" id="third-bullet"></a>

In [None]:
import s3fs
def read_data(datasrc):
    """Read one semicolon-separated CSV object from the configured S3 bucket.

    Args:
        datasrc: Object key inside the bucket named by ``AWS_S3_BUCKET``.

    Returns:
        The parsed file as a pandas DataFrame.
    """
    # Credentials and endpoint are handed to the filesystem layer through
    # pandas' storage_options.
    s3_options = {
        "key": AWS_ACCESS_KEY_ID,
        "secret": AWS_SECRET_ACCESS_KEY,
        "endpoint_url": AWS_S3_ENDPOINT,
    }
    object_url = "s3://" + "/".join((AWS_S3_BUCKET, datasrc))
    return pd.read_csv(object_url, sep=';', storage_options=s3_options)

In [None]:
## after concatenation, the is_red feature marks which rows are red wine (1) and which are white wine (0)

In [None]:
def transformdata(red_wine,white_wine):
    """Combine the red and white wine tables into a single labelled frame.

    An ``is_red`` indicator column (1 for red, 0 for white) is added, the red
    rows are stacked on top of the white rows, and spaces in column names are
    replaced with underscores.

    The indicator is added to copies, so the frames passed in by the caller
    are not modified.

    Args:
        red_wine: DataFrame of red-wine samples.
        white_wine: DataFrame of white-wine samples.

    Returns:
        A new DataFrame with the red rows first, then the white rows. Rows
        keep their original index labels, so labels can repeat across the
        two sources.
    """
    labelled_red = red_wine.assign(is_red=1)
    labelled_white = white_wine.assign(is_red=0)
    data = pd.concat([labelled_red, labelled_white], axis=0)
    return data.rename(columns=lambda name: name.replace(' ', '_'))

In [None]:
# Load both CSV files from the bucket, then merge them into one frame that
# carries the is_red indicator column.
white_wine = read_data('winequality-white.csv')
red_wine = read_data('winequality-red.csv')
data = transformdata(red_wine, white_wine)

In [None]:
# Preview the first five rows of the combined data.
data.head(5)

Visualize data

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns
# Histogram of the quality column as loaded (kde=False: no density curve).
sns.displot(data.quality, kde=False)

In [None]:
## convert quality into a binary target (1 when quality >= 7, otherwise 0)

In [None]:
def settarget(data):
    """Replace the ``quality`` column with a 0/1 high-quality flag, in place.

    Rows whose quality is 7 or more become 1; all other rows become 0.

    Args:
        data: DataFrame with a ``quality`` column; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    meets_threshold = data.quality.ge(7)
    data['quality'] = meets_threshold.astype(int)
    return data

# NOTE: settarget overwrites `quality` in place, so running this cell a second
# time thresholds the 0/1 labels again and turns every label into 0.
data = settarget(data)

In [None]:
import seaborn as sns
# Same histogram, now over the binarised quality column (0/1 class counts).
sns.displot(data.quality, kde=False)

In [None]:
## median, upper and lower quartile, IQR
## histogram for distribution

In [None]:
# Draw one box plot per feature, grouped by the quality label, on a
# dims[0] x dims[1] grid that is filled row by row.
dims = (3, 4)

f, axes = plt.subplots(dims[0], dims[1], figsize=(25, 15))
# Box plots cannot be used on indicator variables, and quality is the grouping key.
plotted_columns = [name for name in data.columns if name not in ('is_red', 'quality')]
for position, col in enumerate(plotted_columns):
    # Convert the running position into (row, column) grid coordinates.
    grid_row, grid_col = divmod(position, dims[1])
    sns.boxplot(x=data['quality'], y=data[col], ax=axes[grid_row, grid_col])

Check for missing values

In [None]:
## scenarios for missing data - decision for the missing data
## if alcohol is not an indicator, delete that record

## what are we going to do with the outliers? are they real outliers?

In [None]:
# One boolean per column: True where the column holds at least one missing value.
data.isna().any()

## Prepare dataset for training model <a class="anchor" id="forth-bullet"></a>
Split the input data into 3 sets:

- Train (60% of the dataset used to train the model)
- Validation (20% of the dataset used to tune the hyperparameters)
- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)

In [None]:
def get_trainingdata(data):
    """Split the data into train, validation and test sets.

    The ``quality`` column is the label; every other column is a feature.
    60% of the rows go to training, and the remainder is split in half into
    validation and test. Both splits use a fixed random_state of 123.

    Args:
        data: DataFrame that contains a ``quality`` column.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    features = data.drop(columns=["quality"])
    labels = data.quality

    # First cut: 60% for training, the remaining 40% held back.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, train_size=0.6, random_state=123
    )

    # Second cut: divide the held-back rows evenly between validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=123
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Materialise the three splits as module-level names used by the cells below.
(X_train,X_val,X_test,y_train,y_val,y_test) = get_trainingdata(data)

## Build a baseline model (random forest classifier) <a class="anchor" id="fifth-bullet"></a>
Build a simple classifier using scikit-learn. Use MLflow to keep track of the model accuracy. You can read about Classification - ROC and AUC here if you want 
https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc

Enable MLflow autologging

In [None]:
# Name of the MLflow experiment selected with mlflow.set_experiment below.
experiment_name = "WineQuality"

In [None]:
# Point MLflow at the tracking server and select the experiment by name.
# No explicit existence check is done here; per the MLflow docs, set_experiment
# creates the experiment when the name is not found (confirm for the installed version).
mlflow.set_tracking_uri("http://mlflow:5500")
mlflow.set_experiment(experiment_name)

# enable scikit-learn autologging, with input examples requested via log_input_examples=True
mlflow.sklearn.autolog(log_input_examples=True)

In [None]:
def log_featureimportance(model):
    """Log the model's feature importances to MLflow as a JSON artifact.

    The importances are sorted in descending order and written to
    ``feature_importance.json`` in a scratch directory, which is then passed
    to ``mlflow.log_artifact``.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
    """
    # Prefer the column names the estimator recorded when it was fitted, so the
    # labels match the data this model was trained on. Fall back to the
    # notebook-level X_train when the estimator does not expose them.
    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X_train.columns
    feature_importances = pd.DataFrame(
        model.feature_importances_,
        index=list(feature_names),
        columns=['importance'],
    )
    # The scratch directory is deleted when the with-block exits, so no
    # temporary files are left behind. This assumes log_artifact has finished
    # copying the file by the time it returns.
    with tempfile.TemporaryDirectory() as tmpdir:
        filepath = os.path.join(tmpdir, 'feature_importance.json')
        feature_importances.sort_values('importance', ascending=False).to_json(filepath)
        mlflow.log_artifact(filepath)

Train random forest

In [None]:
class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper whose predict() returns probabilities, not class labels."""

    def __init__(self, model):
        # The wrapped estimator must provide predict_proba.
        self.model = model

    def predict(self, context, model_input):
        """Return the second column of ``predict_proba`` for each input row.

        ``context`` is part of the pyfunc signature and is not used here.
        """
        class_probabilities = self.model.predict_proba(model_input)
        return class_probabilities[:, 1]

def train_randomforest(X_train, y_train, X_test, y_test):
    """Train the baseline random forest inside an MLflow run and log it.

    The run is named ``untuned_random_forest``. It records the number of
    trees, the AUC computed on ``X_test``/``y_test``, a pyfunc-wrapped copy
    of the model (with signature and conda environment), and the feature
    importances.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features used to compute the logged AUC.
        y_test: Labels used to compute the logged AUC.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    with mlflow.start_run(run_name='untuned_random_forest'):
        n_estimators = 10
        forest = RandomForestClassifier(
            n_estimators=n_estimators,
            random_state=np.random.RandomState(123),
        )
        forest.fit(X_train, y_train)

        # Score with the probability in column 1 of predict_proba.
        test_scores = forest.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, test_scores)
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_metric('auc', auc_score)

        # Wrap the estimator so the logged pyfunc model returns probabilities.
        pyfunc_model = SklearnModelWrapper(forest)
        signature = infer_signature(X_train, pyfunc_model.predict(None, X_train))

        # Pin the installed cloudpickle and scikit-learn versions in the model environment.
        pip_requirements = [
            "cloudpickle=={}".format(cloudpickle.__version__),
            "scikit-learn=={}".format(sklearn.__version__),
        ]
        conda_env = _mlflow_conda_env(
            additional_conda_deps=None,
            additional_pip_deps=pip_requirements,
            additional_conda_channels=None,
        )
        mlflow.pyfunc.log_model(
            "random_forest_model",
            python_model=pyfunc_model,
            conda_env=conda_env,
            signature=signature,
        )
        log_featureimportance(forest)
        return forest

In [None]:
# Train and log the baseline; the fitted estimator is kept for the sanity checks below.
model = train_randomforest(X_train,y_train,X_test,y_test)

In [None]:
# Sanity-check: recompute the test-set AUC locally.
# This should match the AUC logged by MLflow
print(f'AUC: {roc_auc_score(y_test, model.predict_proba(X_test)[:,1])}')

In [None]:
# Sanity-check: rebuild the importance table locally, most important feature first.
# This should match the feature importance logged by MLflow
feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)

## Experiment with a new model (xgboost) <a class="anchor" id="sixth-bullet"></a>
Use the xgboost library to train a more accurate model. Run hyperparameter tuning to train multiple models. As before, the code tracks the performance of each parameter configuration with MLflow.

In [None]:
# Hyperopt search space for the xgboost parameters. The hp.* entries are sampled
# for each trial; 'objective' and 'seed' are constants passed through unchanged.
# Per the hyperopt docs, loguniform bounds are given in log space, i.e. the
# sampled value is exp(uniform(low, high)).
search_space = {
  'max_depth': scope.int(hp.quniform('max_depth', 50, 100, 10)),  # quantised draw, cast to int
  'learning_rate': hp.loguniform('learning_rate', -3, 0),
  'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
  'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
  'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
  'objective': 'binary:logistic',
  'seed': 123, # Set a seed for deterministic training
}

def train_model(params):
    """Hyperopt objective: train one xgboost model and report its validation AUC.

    Training happens inside a nested MLflow run with xgboost autologging
    switched on. It reads the notebook-level X_train/y_train and X_val/y_val.

    Args:
        params: Parameter dict passed straight to ``xgb.train``.

    Returns:
        Dict with ``status`` (STATUS_OK), ``loss`` (the negated validation AUC,
        so that minimising the loss maximises AUC) and ``booster`` (the
        booster's attributes).
    """
    mlflow.xgboost.autolog()
    with mlflow.start_run(nested=True):
        dtrain = xgb.DMatrix(data=X_train, label=y_train)
        dval = xgb.DMatrix(data=X_val, label=y_val)

        # Up to 100 boosting rounds, with early stopping (50 rounds) on the validation set.
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=100,
            evals=[(dval, "validation")],
            early_stopping_rounds=50,
        )
        val_scores = booster.predict(dval)
        auc_score = roc_auc_score(y_val, val_scores)
        mlflow.log_metric('auc', auc_score)

        signature = infer_signature(X_train, booster.predict(dtrain))
        mlflow.xgboost.log_model(booster, "model", signature=signature)

        outcome = {
            'status': STATUS_OK,
            'loss': -1 * auc_score,
            'booster': booster.attributes(),
        }
        return outcome

# Parent run that groups the nested runs started by train_model. fmin evaluates
# train_model 10 times (max_evals) over search_space using the TPE algorithm;
# its return value is stored as best_params (per the hyperopt docs, the best
# parameter assignment found).
with mlflow.start_run(run_name='xgboost_models'):
  best_params = fmin(
    fn=train_model,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
  )

In [None]:
# Take the run with the highest logged 'auc' metric.
# NOTE(review): no filter is applied, so the random-forest run (which also logs
# 'auc', computed on the test split) presumably competes with the xgboost runs
# scored on the validation split. The later cells load "<run>/model", a path only
# the xgboost runs log to — verify which run wins here.
best_run = mlflow.search_runs(order_by=['metrics.auc DESC']).iloc[0]
best_run_id = best_run["run_id"]
print(f'AUC of Best Run: {best_run["metrics.auc"]}')

In [None]:
# Show the id of the selected run.
best_run_id

## Predict <a class="anchor" id="seventh-bullet"></a>

In [None]:
# Load the selected run's model through the generic pyfunc interface and score
# the held-out test set. This rebinds `model`, which until now referred to the
# random-forest estimator.
# model = mlflow.pyfunc.load_model(f"models:/TestModelD/production")
model = mlflow.pyfunc.load_model("runs:/" + best_run_id + "/model")

test_predictions = model.predict(X_test)
print(f'AUC: {roc_auc_score(y_test, test_predictions)}')

In [None]:
from sklearn.metrics import classification_report

# The target is the binarised quality (settarget encodes quality >= 7 as 1),
# not the wine colour, so the report rows are named after the quality classes.
class_labels = ['quality < 7', 'quality >= 7']
# Turn the predicted scores into hard 0/1 labels with a 0.5 cut-off.
# NOTE: this overwrites the scores held in test_predictions.
test_predictions = np.where(test_predictions>0.5, 1, 0)
print(classification_report(y_test, test_predictions, target_names=class_labels))

In [None]:
# Confusion matrix of true labels vs. the thresholded predictions from the
# previous cell, rendered as a plot.
cm = confusion_matrix(y_test, test_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

disp.plot()
plt.show()

In [None]:
# register the best model
new_model_version = mlflow.register_model(f"runs:/{best_run_id}/model", "WineQuality")

In [None]:
# # Promote the new model version to Production
# client.transition_model_version_stage(
#   name="TestModelD",
#   version=new_model_version.version,
#   stage="Production"
# )

In [None]:
# # clean up models
# from mlflow.tracking import MlflowClient
# client = MlflowClient()
# client.delete_registered_model(name="winequality")