# Tutorial 3: Build and register ML models for imbalanced data

In this tutorial, we demonstrate how to build and register multiple ML models using MLflow. For more information on MLflow, see the [official documentation](https://mlflow.org/docs/latest/index.html).

## Steps

- Create an experiment using the MLflow client
- Create a pandas DataFrame from the _credit_card_train.csv_ file
- Use the imbalanced-learn library for random undersampling
- Build and register multiple models, along with their parameters and metrics, using the MLflow Python APIs

## Import MLflow libraries

In [9]:
import os
import mlflow
import mlflow.sklearn
from  mlflow.tracking import MlflowClient

## Import NumPy, pandas, Matplotlib, scikit-learn, and imbalanced-learn libraries

In [10]:
from numpy import mean
from numpy import std
import pandas as pd
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve, accuracy_score, auc, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN

## Define the precision-recall AUC scoring function and a helper that evaluates and logs a model

In [11]:
# precision-recall AUC, used as the scoring metric during cross-validation
def pr_auc(y_true, probas_pred):
    """Return the area under the precision-recall curve.

    Args:
        y_true: True labels.
        probas_pred: Predicted scores/probabilities passed straight to
            ``precision_recall_curve``.

    Returns:
        The area computed by ``auc`` with recall on the x-axis and
        precision on the y-axis.
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, probas_pred)
    return auc(recall, precision)

# cross-validate a model, then fit it and record it in MLflow
def evaluate_model(X, y, model, name):
    """Cross-validate ``model``, fit it on ``X``/``y`` and log it to MLflow.

    Args:
        X: Training features.
        y: Training targets.
        model: Estimator or pipeline to evaluate; it is fitted in place.
        name: Label stored as the ``Model`` parameter of the MLflow run.

    Returns:
        The scores returned by ``cross_val_score`` (PR AUC per fold of a
        10-split, 3-repeat stratified cross-validation).
    """
    # PR AUC scorer; needs_proba=True asks for probabilities, not labels
    pr_auc_scorer = make_scorer(pr_auc, needs_proba=True)
    # fixed random_state keeps the fold assignment reproducible
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(
        model, X, y, scoring=pr_auc_scorer, cv=folds, n_jobs=-1
    )

    # fit on the full data so the logged artifact is a trained model
    model.fit(X, y)

    # one MLflow run per model: its name, score summary and the model itself
    with mlflow.start_run():
        mlflow.log_param("Model", name)
        mlflow.log_metric("Mean Score", mean(scores))
        mlflow.log_metric("Std Score", std(scores))
        mlflow.sklearn.log_model(model, "model")

    return scores

## Create a list of models to build

In [12]:
# build the candidate models to compare
def get_models():
    """Return the models to evaluate together with their short names.

    Returns:
        A ``(models, names)`` tuple of two parallel lists. Every model
        except CART is a pipeline that standardizes the features before
        the classifier.
    """
    def _scaled(classifier):
        # pipeline: StandardScaler step 's' followed by the classifier step 'm'
        return Pipeline(steps=[('s', StandardScaler()), ('m', classifier)])

    candidates = [
        ('CART', DecisionTreeClassifier()),
        ('KNN', _scaled(KNeighborsClassifier())),
        ('BAG', _scaled(BaggingClassifier(n_estimators=100))),
        ('RF', _scaled(RandomForestClassifier(n_estimators=100))),
        ('ET', _scaled(ExtraTreesClassifier(n_estimators=100))),
    ]
    models = [estimator for _, estimator in candidates]
    names = [label for label, _ in candidates]
    return models, names

## Set the MLflow experiment in which to track your runs

In [13]:
## Setup MLflow
# The tracking server address comes from the TRACKING_URL environment variable.
# NOTE(review): if it is unset this is None; presumably MLflow then falls back
# to its default tracking location -- confirm for the deployed MLflow version.
tracking_uri = os.environ.get("TRACKING_URL")
client = MlflowClient(tracking_uri=tracking_uri)
mlflow.set_tracking_uri(tracking_uri)

experiment_name = "demo"
# Look the experiment up by name rather than listing every experiment:
# MlflowClient.list_experiments() was removed in MLflow 2.0, whereas
# get_experiment_by_name() is available in both 1.x and 2.x and returns
# None when no experiment with that name exists.
if client.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)
# Make it the active experiment so later runs are recorded under it.
mlflow.set_experiment(experiment_name)

## Training samples and targets

In [14]:
# Read the training set and split it into the label column and the features
full_path = 'credit_card_train.csv'
df = pd.read_csv(full_path)
y = df["Target"]
# Features are every column except the label; X and df name the same frame
X = df = df.drop(columns=["Target"])

## Undersampler to handle imbalanced data

In [15]:
# Resample the training data with imbalanced-learn's RandomUnderSampler.
# random_state is fixed so the same resampled subset is drawn on every run.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)

## Train models and get scores

In [16]:
# Alternative resampling strategy (SMOTE instead of random undersampling):
# X_resampled, y_resampled = SMOTE().fit_resample(X, y)
models, names = get_models()
results = []
# Evaluate every candidate on the resampled data and keep its scores
for model, name in zip(models, names):
    scores = evaluate_model(X_resampled, y_resampled, model, name)
    results.append(scores)
    # report the mean and standard deviation of the cross-validation scores
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>CART 0.925 (0.031)
>KNN 0.967 (0.014)
>BAG 0.982 (0.012)
>RF 0.983 (0.012)
>ET 0.984 (0.013)
