# Tutorial 10: Build and register ML Models with Reason Codes and Filtered Columns

In this tutorial, we will demonstrate how to build and register ML models 1) with reason codes and 2) with column filtering that removes any additional columns that may be present in the input dataframe.
More information on MLflow is available [here](https://mlflow.org/docs/latest/index.html).

## Steps

- Create an experiment using mlflow client
- Create a pandas dataframe from the _credit_card_train_new_columns.csv_ file
- Use imbalanced-learn library for random undersampling
- Create a pipeline using sklearn pipeline module to include custom transformers (column filtering, time difference, IP block) as pre-processing steps
- Build and register multiple models, parameters, and metrics using mlflow Python APIs

## Import MLFlow libraries

In [1]:
import os
import mlflow
import mlflow.sklearn
from  mlflow.tracking import MlflowClient
from IPython.display import Markdown, display
import joblib

## Import Numpy, Matplotlib, Sklearn libraries

In [2]:
import numpy as np
from numpy import mean
from numpy import std
from tqdm import tqdm

import pandas as pd
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import precision_recall_curve, accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, ADASYN
import shap

## Define a Column Filter Transformer using Sklearn BaseEstimator and TransformerMixin

In [45]:
class ColumnFilterTransformer(BaseEstimator, TransformerMixin):
    """Keep only a fixed set of columns of the input dataframe.

    Parameters
    ----------
    columns : str or list of str, default="V0,V1,V2"
        Names of the columns to keep, either as one comma-separated
        string or as an already-split list of names.

    Attributes
    ----------
    columns : list of str
        The column names selected by ``transform``.
    n_samples_fit_ : int
        Number of rows of the data passed to ``fit``.
    """

    def __init__(self, columns="V0,V1,V2"):
        # ``sklearn.base.clone`` rebuilds the estimator from ``get_params()``,
        # which returns the already-split list.  Only split when a string is
        # given, and keep a list as the very same object, so that cloning
        # (cross-validation, grid search, ...) works.
        if isinstance(columns, str):
            columns = columns.split(",")
        self.columns = columns

    def fit(self, X, y=None):
        """Record the number of rows seen; nothing is learned from the data."""
        self.n_samples_fit_ = X.shape[0]
        return self

    def transform(self, X):
        """Return ``X`` restricted to the configured columns."""
        return X[self.columns]

## Define a Time Difference Transformer using Sklearn BaseEstimator and TransformerMixin

In [4]:
class TimeDifferenceTransformer(BaseEstimator, TransformerMixin):
    """Add a column with the elapsed time between two timestamp columns.

    Parameters
    ----------
    unit : str, default='second'
        Unit of the result: 'second', 'minute', 'hour' or 'day'.  Any other
        value is treated as 'second'.
    time_pre_column : str, default='time_pre'
        Column holding the earlier timestamp, as a string that
        ``dateutil.parser.parse`` understands.
    time_post_column : str, default='time_post'
        Column holding the later timestamp, in the same form.
    missing_value : default=-999999
        Value written for rows where either timestamp is null.
    output_column : str, default='time_diff'
        Name of the column added by ``transform``.

    Attributes
    ----------
    n_samples_fit_ : int
        Number of rows of the data passed to ``fit``.
    """

    # Length of every supported unit in seconds.
    _SECONDS_PER_UNIT = {'second': 1, 'minute': 60, 'hour': 3600, 'day': 86400}

    def __init__(self, unit='second', time_pre_column='time_pre',
                 time_post_column='time_post', missing_value=-999999,
                 output_column='time_diff'):
        self.unit = unit
        self.time_pre_column = time_pre_column
        self.time_post_column = time_post_column
        self.missing_value = missing_value
        self.output_column = output_column

    def fit(self, X, y=None):
        """Record the number of rows seen; nothing is learned from the data."""
        self.n_samples_fit_ = X.shape[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``output_column`` added.

        The difference ``time_post - time_pre`` is expressed as a whole
        number of ``unit``s, rounded down.  Rows with a null timestamp get
        ``missing_value``.  The input dataframe is left unmodified.
        """
        from dateutil import parser

        # Unknown units fall back to seconds.
        seconds_per_unit = self._SECONDS_PER_UNIT.get(self.unit, 1)

        pre_values = X[self.time_pre_column]
        post_values = X[self.time_post_column]
        # Row-aligned mask, consumed positionally: no per-row index lookup,
        # and duplicate index labels cannot be confused with each other.
        missing_mask = X[[self.time_pre_column, self.time_post_column]].isnull().any(axis=1)

        time_diff_list = []
        for time_pre, time_post, is_missing in tqdm(zip(pre_values, post_values, missing_mask)):
            if is_missing:
                time_diff_list.append(self.missing_value)
                continue
            time_delta = parser.parse(time_post) - parser.parse(time_pre)
            # ``timedelta.seconds`` is only the sub-day remainder, so the
            # days component has to be added back to get the full duration.
            whole_seconds = time_delta.days * 86400 + time_delta.seconds
            time_diff_list.append(whole_seconds // seconds_per_unit)

        # Work on a copy so the caller's dataframe is not modified.
        X = X.copy()
        X[self.output_column] = time_diff_list
        return X

## Define IP2Block Transformer using Sklearn BaseEstimator and TransformerMixin

In [5]:
class IP2BlockTransformer(BaseEstimator, TransformerMixin):
    """Add a column with the first two dot-separated parts of an IP address.

    Parameters
    ----------
    ip_address_column : str, default='ip_address'
        Column holding the IP address as a string.
    missing_value : default=-999999
        Value written for rows where the IP address is null.
    output_column : str, default='ip2block'
        Name of the column added by ``transform``.

    Attributes
    ----------
    n_samples_fit_ : int
        Number of rows of the data passed to ``fit``.
    """

    def __init__(self, ip_address_column='ip_address', missing_value=-999999,
                 output_column='ip2block'):
        self.missing_value = missing_value
        self.ip_address_column = ip_address_column
        self.output_column = output_column

    def fit(self, X, y=None):
        """Record the number of rows seen; nothing is learned from the data."""
        self.n_samples_fit_ = X.shape[0]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``output_column`` added.

        For an address such as ``"a.b.c.d"`` the new value is ``"a.b"``.
        Rows with a null address get ``missing_value``, so the new column
        can mix strings with that value.  The input dataframe is left
        unmodified.
        """
        addresses = X[self.ip_address_column]
        # Row-aligned mask, consumed positionally: no per-row index lookup,
        # and duplicate index labels cannot be confused with each other.
        missing_mask = addresses.isnull()

        ip2block_list = []
        for ip_address, is_missing in tqdm(zip(addresses, missing_mask)):
            if is_missing:
                ip2block_list.append(self.missing_value)
            else:
                ip2block_list.append('.'.join(ip_address.strip().split('.')[:2]))

        # Work on a copy so the caller's dataframe is not modified.
        X = X.copy()
        X[self.output_column] = ip2block_list
        return X

In [30]:
# Load the training data.  The label is the "Target" column of the same file;
# X keeps that column here, and the pipeline's first column filter selects
# only the input columns later on.
full_path = 'credit_card_train_new_columns.csv'
X = pd.read_csv(filepath_or_buffer=full_path)
y = X.loc[:, "Target"]

In [18]:
# Start from the thirty columns V0 .. V29 shared by both lists.
column_names = [f"V{i}" for i in range(30)]
# Columns handed to the classifier: V0 .. V29 plus the derived time difference.
feature_column_names = [*column_names, 'time_diff']
# Raw input columns kept by the first column filter.
column_names = [
    *column_names,
    'time_pre', 'time_post', 'cc_exp',
    'lat1', 'long1', 'lat2', 'long2', 'lat3', 'long3',
    'ip_address',
]

In [46]:
# Pipeline stages, in execution order:
#   cf1      - keep only the raw input columns
#   tdiff    - add the time difference column (default settings)
#   ip2block - add the IP block column (default settings)
#   cf2      - keep only the columns the classifier is trained on
#   m        - LightGBM classifier with 100 estimators
steps = [
    ('cf1', ColumnFilterTransformer(columns=",".join(column_names))),
    ('tdiff', TimeDifferenceTransformer()),
    ('ip2block', IP2BlockTransformer()),
    ('cf2', ColumnFilterTransformer(columns=",".join(feature_column_names))),
    ('m', LGBMClassifier(n_estimators=100)),
]

In [47]:
# Chain the stages defined in `steps` into a single sklearn estimator.
pipeline = Pipeline(steps)

In [48]:
def pr_auc(y_true, probas_pred):
    """Return the area under the precision-recall curve.

    ``y_true`` and ``probas_pred`` are forwarded unchanged to
    ``precision_recall_curve``; the thresholds it returns are not used.
    """
    precision, recall, _thresholds = precision_recall_curve(y_true, probas_pred)
    # Recall is the x axis and precision the y axis of the curve.
    return auc(recall, precision)

# Randomly undersample the data with a fixed seed so the result is repeatable.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)
# Fit the whole pipeline (transformers and classifier) on the resampled data.
pipeline.fit(X=X_resampled, y=y_resampled)

788it [00:00, 3624.52it/s]
788it [00:00, 15617.55it/s]


Pipeline(steps=[('cf1',
                 ColumnFilterTransformer(columns=['V0', 'V1', 'V2', 'V3', 'V4',
                                                  'V5', 'V6', 'V7', 'V8', 'V9',
                                                  'V10', 'V11', 'V12', 'V13',
                                                  'V14', 'V15', 'V16', 'V17',
                                                  'V18', 'V19', 'V20', 'V21',
                                                  'V22', 'V23', 'V24', 'V25',
                                                  'V26', 'V27', 'V28', 'V29', ...])),
                ('tdiff', TimeDifferenceTransformer()),
                ('ip2block', IP2BlockTransformer()),
                ('cf2',
                 ColumnFilterTransformer(columns=['V0', 'V1', 'V2', 'V3', 'V4',
                                                  'V5', 'V6', 'V7', 'V8', 'V9',
                                                  'V10', 'V11', 'V12', 'V13',
                                               

In [49]:
def add_libraries_to_conda_env(_conda_env, libraries=None, conda_dependencies=None):
    """Add extra pip and conda packages to a conda environment dictionary.

    Parameters
    ----------
    _conda_env : dict
        Conda environment description whose "dependencies" entry is a list
        of conda package specs, optionally containing one ``{"pip": [...]}``
        dictionary.  It is updated in place.
    libraries : list of str, optional
        Pip requirement strings appended to the pip section.  When the
        environment has no pip section, one is created at the end of the
        dependency list.
    conda_dependencies : list of str, optional
        Conda package specs appended to the dependency list.

    Returns
    -------
    dict
        The same ``_conda_env`` object, with its "dependencies" updated.
    """
    # None defaults avoid sharing one mutable list between calls.
    libraries = [] if libraries is None else list(libraries)
    conda_dependencies = [] if conda_dependencies is None else list(conda_dependencies)

    dependencies = list(_conda_env.get("dependencies") or []) + conda_dependencies

    # The pip requirements live in the first dict entry that has a "pip" key.
    pip_section = next(
        (entry for entry in dependencies if isinstance(entry, dict) and "pip" in entry),
        None,
    )
    if pip_section is not None:
        pip_section["pip"] = list(pip_section["pip"] or []) + libraries
    elif libraries:
        # No pip section yet: create one instead of failing on a None index.
        dependencies.append({"pip": libraries})

    _conda_env["dependencies"] = dependencies
    return _conda_env

In [58]:
## Setup MLFLOW
# The tracking server location is read from the TRACKING_URL environment
# variable (os.environ.get returns None when it is not set).
tracking_uri = os.environ.get("TRACKING_URL")
client = MlflowClient(tracking_uri=tracking_uri)
mlflow.set_tracking_uri(tracking_uri)
experiment_name = "demo"
# Look the experiment up by name rather than listing every experiment:
# MlflowClient.list_experiments() was removed in MLflow 2.0, whereas
# get_experiment_by_name() exists in both 1.x and 2.x and returns None when
# no experiment has that name.
if client.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)

In [59]:
# Log one MLflow run: four descriptive parameters plus the fitted pipeline.
with mlflow.start_run():
    for param_name, param_value in (
        ("Model", "LightGBM"),
        ("ReasonCodes", "No"),
        ("Filters", "Yes"),
        ("Transforms", "Yes"),
    ):
        mlflow.log_param(param_name, param_value)
    # Extend MLflow's default sklearn environment with the extra packages:
    # lightgbm for the classifier step and python-dateutil for the timestamp
    # parsing done in TimeDifferenceTransformer.transform.
    # NOTE(review): shap is added as a conda dependency although this
    # pipeline does not call it - presumably for reason codes; confirm.
    conda_env = add_libraries_to_conda_env(
        mlflow.sklearn.get_default_conda_env(),
        libraries=["lightgbm==3.1.1", "python-dateutil==2.8.1"],
        conda_dependencies=["shap==0.39.0"],
    )
    mlflow.sklearn.log_model(pipeline, "model", conda_env=conda_env)