## README:

App : **Sample Application**

Stage : **Training**

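This is the sample notebook for the training stage: it loads the prepared train/val/test splits, tunes and evaluates the configured models, and uploads the best model to S3.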

The notebook expects the required inputs in the adjacent `data` folder 

Loading configuration from `config/train.yaml` file

In [None]:
import os
import sys
import time
import joblib
import json
import logging
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from IPython.display import display
from pprint import pprint
from typing import Dict, List, Tuple, Union
from matplotlib.ticker import MaxNLocator

#Rudderlabs data utilities imports
from rudderlabs.data.apps.log import setup_file_logger
from rudderlabs.data.apps.config import read_yaml
from rudderlabs.data.apps.utils import get_latest_folder
from rudderlabs.data.apps.aws.s3 import upload_file_to_s3

from sklearn.metrics import average_precision_score, precision_recall_fscore_support, roc_auc_score, f1_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_curve
from sklearn.metrics import get_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

pd.options.display.max_columns=None
tqdm.pandas()

In [None]:
# Parameters cell for papermill. These values can get overridden by parameters passed by papermill
# job_id: id of the run folder under ../data; when left empty the latest folder is used (see below).
job_id = None
# train_id: numeric suffix of this run's train_<id> output folder; generated when left as None.
train_id = None
# local_input_path: folder holding train.csv / val.csv / test.csv; defaults to ../data/<job_id>/data-prep.
local_input_path = None
# local_output_path: folder for logs, visuals and model artifacts; defaults to ../data/<job_id>/train_<train_id>.
local_output_path = None
# code_path: folder added to sys.path and used to locate config/train.yaml and credentials.yaml.
code_path = "../";

In [None]:
# No job id from papermill: fall back to the most recent run folder under ../data
if not job_id:
    latest_run_folder = get_latest_folder("../data")
    job_id = latest_run_folder.split("/")[-1]
    print(f"Data prep run id is not given. Taking the latest run id: {job_id}")

# The id is interpolated into paths below, so normalise it to a string
job_id = str(job_id)

In [None]:
# Default the input path to this job's data-prep folder when papermill did not pass one.
# (The output path is defaulted further down, once the train id is known.)
if local_input_path is None:
    local_input_path = f"../data/{job_id}/data-prep"

In [None]:
def generate_train_id(job_id_folder):
    """Return the smallest N >= 0 for which ``train_N`` does not exist inside job_id_folder.

    Ids are probed upwards from 0, so the first missing ``train_N`` entry wins
    even if higher-numbered folders already exist.
    """
    candidate = 0
    while os.path.exists(os.path.join(job_id_folder, f"train_{candidate}")):
        candidate += 1
    return candidate

if train_id is None:
    # Train folders are created as ../data/<job_id>/train_<id> (see the default output path
    # below), so existing runs must be looked up in the job folder itself. Probing the
    # data-prep input folder never finds them, which made every run reuse id 0.
    train_id = generate_train_id(f"../data/{job_id}")
print(f"Train job with id {train_id} is starting")


# Default the output location to this run's own train_<id> folder unless papermill supplied one
if local_output_path is None:
    local_output_path = f"../data/{job_id}/train_{train_id}"

In [None]:
# Echo the resolved job id and input/output locations for this run
print(job_id)
print(f"local_input_path {local_input_path}")
print(f"local_output_path {local_output_path}")

In [None]:
#Local imports
# code_path is appended to sys.path so that the data_loader module can be imported from there.
sys.path.append(code_path)
from data_loader import DataIO

In [None]:
# Constants
# All the required constants are defined here
# NOTE(review): IMAGE_FORMAT is not referenced anywhere else in this notebook —
# presumably meant for saving figures; confirm or remove.
IMAGE_FORMAT = 'png'

In [None]:
#Logging setup
# NOTE: the name `logging` is intentionally rebound to the file logger returned by
# setup_file_logger; the rest of the notebook calls logging.info(...) on it.
log_file_path = os.path.join(local_output_path, "logs", "train.log")
try:
    logging = setup_file_logger(log_file_path)
except Exception as err:
    # Best effort: training should not abort because the log file could not be set up.
    # `logging` then still refers to the standard library module, so later logging.info
    # calls keep working; the failure is reported instead of being silently swallowed.
    print(f"Could not set up file logging at {log_file_path}: {err!r}. Falling back to the default logging module.")

logging.info("\n\n\t\tSTARTING TRAINING JOB\n\n")

In [None]:
#Configurations
# Training configuration is read from config/train.yaml under code_path; later cells read
# its "data_path" and "model_params" sections.
notebook_config = read_yaml(os.path.join(code_path, "config/train.yaml"))
print("Notebook config:")
pprint(notebook_config)

In [None]:
# Credentials (AWS / data warehouse settings) are read from credentials.yaml under code_path.
creds_config = read_yaml(os.path.join(code_path, "credentials.yaml"))
print("Credentials config:")
# Never echo secret values into the notebook output (it is exported to HTML and shared):
# show only which sections and keys were loaded, with the values masked.
pprint({
    section: {key: "***" for key in values} if isinstance(values, dict) else "***"
    for section, values in creds_config.items()
})

In [None]:
# Everything this notebook produces is written below local_output_path.
# Resulting layout (when the default output path is used):
# - data
#   - <job_id>
#       - train_<train_id>
#           - visuals
#           - model_artifacts
model_artifacts_dir = os.path.join(local_output_path, "model_artifacts")
visuals_dir = os.path.join(local_output_path, "visuals")

logging.info(f"All the output files will be saved to following location: {local_output_path}")

# Create the root first, then its sub folders; already existing folders are left untouched
for output_path in (local_output_path, visuals_dir, model_artifacts_dir):
    Path(output_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Names of the dataset splits; each one is read from <local_input_path>/<split>.csv
TRAIN, TEST, VAL = "train", "test", "val"

dataframes = {}
for split in (TRAIN, TEST, VAL):
    dataframes[split] = pd.read_csv(os.path.join(local_input_path, f"{split}.csv"))

# Report how many rows each split contains
print("No:of samples in :")
for split, df in dataframes.items():
    print(f"{split}: {len(df)}")

Sample Training Data

In [None]:
# Display the first rows of the training split
dataframes[TRAIN].head()

In [None]:
# One pie chart per split showing how the label values are distributed.
label_col = notebook_config["data_path"]["label_col_name"]

# squeeze=False keeps a 2-D axes grid regardless of the number of splits; take its single row
fig, axs = plt.subplots(1, len(dataframes), figsize=(16,6), squeeze=False)
axs = axs[0]
fig.suptitle("Label distribution:")
for n, (split, df) in enumerate(dataframes.items()):
    label_counts = df[label_col].value_counts()
    # `explode` needs exactly one entry per wedge. Build it from the number of label values
    # present instead of hard-coding two, so a split with a missing (or extra) class does not
    # raise. For the usual two-class case this is still [0, 0.1].
    explode = [0 if i == 0 else 0.1 for i in range(len(label_counts))]
    label_counts.plot.pie(explode=explode, autopct="%1.1f%%", ax=axs[n])
    axs[n].set_title(split)
    axs[n].set_ylabel("")

In [None]:
# Per-model configuration entries from the training config; each entry is later passed
# to build_model as `model_config`.
models = notebook_config["model_params"]["models"]
pprint(models)

In [None]:
# Names of the configured models, printed as a comma separated list
models_to_run = [model["name"] for model in models]
sep = ', '
print(f"Models to explore are: {sep.join(models_to_run)}")
print("**Note: Currently only xgboost is implemented for the POC")

In [None]:
# Split a dataset into model inputs and labels
def prepare_dataset(dataset: pd.DataFrame, label_columns: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Return ``(features, labels)`` for the given dataset.

    ``features`` is ``dataset`` without ``label_columns``; ``labels`` is
    ``dataset[label_columns]``. The input frame itself is not modified.
    """
    features = dataset.drop(columns=label_columns)
    labels = dataset[label_columns]
    return features, labels

# Name of the label column comes from the training config; split every dataset
# into inputs (X_*) and labels (y_*) with it.
label_columns = notebook_config["data_path"]["label_col_name"]
X_train, y_train = prepare_dataset(dataframes[TRAIN], label_columns)
X_test, y_test = prepare_dataset(dataframes[TEST], label_columns)
X_val, y_val = prepare_dataset(dataframes[VAL], label_columns)

In [None]:
# Lookup tables keyed by each function's __name__.
# generate_hyperparameter_space looks expressions up as "hp_<type>" — this assumes hyperopt
# names these functions hp_choice, hp_quniform, ...; TODO confirm against the installed version.
hyperopts_expressions_map = {exp.__name__: exp for exp in [hp.choice, hp.quniform, hp.uniform, hp.loguniform]}
# Metrics selectable through a model's "evaluation_metric" config value.
# NOTE(review): build_model negates the metric result to form the loss, which presumably only
# works for metrics returning a single number — verify precision_recall_fscore_support is usable here.
evalution_metrics_map = {metric.__name__: metric for metric in [average_precision_score, precision_recall_fscore_support]}

#Generate hyper parameter space for given options
def generate_hyperparameter_space(hyperopts: List[dict]):
    """Build the hyperopt search space described by the ``hyperopts`` config entries.

    Each entry must provide ``"type"`` (looked up as ``hp_<type>`` in
    ``hyperopts_expressions_map``) and ``"name"``; all remaining keys are passed
    to the hyperopt expression as keyword arguments. The entries themselves are
    not modified.

    Returns a dict mapping each parameter name to its hyperopt expression.
    """
    space = {}
    for spec in hyperopts:
        kwargs = spec.copy()
        exp_type = kwargs.pop("type")
        param_name = kwargs.pop("name")

        # "choice" accepts either an explicit list of options, or an implicit one
        # given as "low", "high" and optional "step" values (expanded with range()).
        if exp_type == "choice" and not isinstance(kwargs["options"], list):
            bounds = kwargs["options"]
            kwargs["options"] = list(range(bounds["low"], bounds["high"], bounds.get("step", 1)))

        build_expression = hyperopts_expressions_map[f"hp_{exp_type}"]
        space[param_name] = build_expression(param_name, **kwargs)
    return space


def build_model(
    X_train:pd.DataFrame, y_train:pd.DataFrame,
    X_val:pd.DataFrame, y_val:pd.DataFrame,
    model_class: Union[XGBClassifier, RandomForestClassifier, MLPClassifier], 
    model_config: dict, 
    ):
    """Search hyperparameters with hyperopt and return the model refit with the best ones.

    Every trial fits ``model_class`` on the training data and scores its
    ``predict_proba(X_val)[:, 1]`` output against ``y_val`` with the metric named by
    ``model_config["evaluation_metric"]``. The trial loss is the negated score.

    Args:
        X_train, y_train: training inputs and labels.
        X_val, y_val: validation inputs and labels used to score each trial.
        model_class: classifier class to instantiate.
        model_config: config entry providing "hyperopts", "modelparams",
            "evaluation_metric", "hyperopts_config" (with "max_evals") and,
            optionally, "fitparams".

    Returns:
        Tuple ``(clf, trials, best_hyperparams)``: the classifier refit on the training
        data with the best hyperparameters, the hyperopt ``Trials`` object, and the
        value returned by ``fmin``.

    Raises:
        KeyError: if a required config key is missing or the evaluation metric is unknown.
    """
    hyperopt_space = generate_hyperparameter_space(model_config["hyperopts"])
    model_params = model_config["modelparams"]

    # Resolve the metric once, up front: a misconfigured name then fails immediately
    # instead of after the first (possibly long) model fit inside the search.
    eval_metric_name = model_config["evaluation_metric"]
    if eval_metric_name not in evalution_metrics_map:
        raise KeyError(
            f"Unsupported evaluation_metric '{eval_metric_name}'. "
            f"Supported metrics: {sorted(evalution_metrics_map)}"
        )
    evaluate = evalution_metrics_map[eval_metric_name]

    #We can set evaluation set for xgboost model which we cannot directly configure from configuration file
    fit_params = model_config.get("fitparams", {}).copy()
    if model_class.__name__ == "XGBClassifier":
        fit_params["eval_set"] = [(X_train, y_train), (X_val, y_val)]

    #Objective method to run for different hyper-parameter space
    def objective(space):
        clf = model_class(**model_params, **space)
        clf.fit(X_train, y_train, **fit_params)
        pred = clf.predict_proba(X_val)
        score = evaluate(y_val, pred[:, 1])

        # hyperopt minimises the loss, so a higher score must map to a lower loss
        return {'loss': -score, 'status': STATUS_OK, "config": space}

    trials = Trials()
    # return_argmin=False: the fmin result is passed straight to the model constructor below,
    # so it is presumably expected to hold actual parameter values — confirm with hyperopt docs.
    best_hyperparams = fmin(fn = objective,
                            space = hyperopt_space,
                            algo = tpe.suggest,
                            max_evals = model_config["hyperopts_config"]["max_evals"],
                            return_argmin=False,
                            trials = trials)

    # Refit a fresh model with the winning hyperparameters
    clf = model_class(**best_hyperparams, **model_params)
    clf.fit(X_train, y_train, **fit_params)
    return clf, trials, best_hyperparams

In [None]:
# Map the model names used in the config to their classifier classes
models_map = {cls.__name__: cls for cls in (XGBClassifier, RandomForestClassifier, MLPClassifier)}
models_hyper_parameters_search_results = {}

# Tune and fit every configured model, keeping the fitted model and its search history
for model_config in models:
    name = model_config["name"]
    print(f"Training {name}")
    clf, trials, best_hyperparams = build_model(X_train, y_train, X_val, y_val, models_map[name], model_config)

    # Losses are negated scores, so flip the sign to get the metric values back
    eval_metric_values = [-1 * loss for loss in trials.losses()]
    models_hyper_parameters_search_results[name] = dict(
        model=clf,
        trials=trials,
        eval_metric_name=model_config["evaluation_metric"],
        eval_metric_values=eval_metric_values,
    )

In [None]:
def plot_hyperparameter_search_resulsts(name:str, values:list, model_name:str) -> None:
    """Plot the validation score obtained at each hyperparameter-search trial.

    Args:
        name: metric name, used as the y-axis label.
        values: one score per trial, in trial order.
        model_name: model name shown in the plot title.
    """
    _, ax = plt.subplots(figsize=(15, 6))
    ax.plot(values)
    ax.set_title(f"Validation score on various hyperparam combination steps ({model_name})")
    ax.set_xlabel("Trial")
    ax.set_ylabel(name)
    # Trial indices are integers, so keep the x ticks on whole numbers
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.grid(True)

# Draw one search-history plot per trained model
for model_name, results in models_hyper_parameters_search_results.items():
    plot_hyperparameter_search_resulsts(results["eval_metric_name"], results["eval_metric_values"], model_name)

**Top runs with their hyperparameters:**

## Evaluation metrics

In [None]:
def get_classification_metrics(y_true, y_pred_proba, th=0.5):
    """Compute classification metrics for predicted probabilities at threshold ``th``.

    Probabilities strictly greater than ``th`` are turned into label 1, everything else
    into 0. Precision, recall and f1 are taken at index 1 of the per-class results;
    roc_auc and pr_auc are computed from the raw probabilities.

    Returns a dict with keys "precision", "recall", "f1_score", "roc_auc" and "pr_auc".
    """
    y_pred = np.where(y_pred_proba > th, 1, 0)
    per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(y_true, y_pred)
    return {
        "precision": per_class_precision[1],
        "recall": per_class_recall[1],
        "f1_score": per_class_f1[1],
        "roc_auc": roc_auc_score(y_true, y_pred_proba),
        'pr_auc': average_precision_score(y_true, y_pred_proba),
    }
    
def get_best_th(y_true, y_pred_proba):
    """Find the probability threshold with the highest f1 score.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. When several thresholds reach the
    same best f1 score, the largest of them is returned.

    y_true: Array of 1s and 0s. True labels
    y_pred_proba: Array of predicted probabilities

    Returns ``(metrics, threshold)`` where ``metrics`` are the classification metrics
    computed at the selected threshold.
    """
    thresholds = np.arange(0, 1, 0.01)
    f1_by_threshold = [
        f1_score(y_true, np.where(y_pred_proba > th, 1, 0))
        for th in thresholds
    ]

    # max() keeps the first maximum it meets; scanning from the high end therefore
    # resolves ties in favour of the largest threshold.
    best_idx = max(reversed(range(len(thresholds))), key=f1_by_threshold.__getitem__)
    best_th = thresholds[best_idx]

    best_metrics = get_classification_metrics(y_true, y_pred_proba, best_th)
    return best_metrics, best_th

In [None]:
def get_metrics(
    clf: Union[XGBClassifier, RandomForestClassifier, MLPClassifier],
    X_train: pd.DataFrame, y_train: pd.DataFrame,
    X_test: pd.DataFrame, y_test: pd.DataFrame,
    X_val: pd.DataFrame, y_val: pd.DataFrame
):
    """Evaluate a fitted classifier on the train, validation and test splits.

    The probability threshold is chosen on the training predictions (best f1 score)
    and then reused unchanged for the validation and test metrics.

    Returns ``(metrics, predictions, prob_threshold)``; ``metrics`` and ``predictions``
    are dicts keyed by "train", "val" and "test", and the predictions are the
    probabilities from column 1 of ``predict_proba``.
    """
    inputs = {"train": X_train, "val": X_val, "test": X_test}
    labels = {"val": y_val, "test": y_test}

    predictions = {split: clf.predict_proba(X)[:, 1] for split, X in inputs.items()}

    train_metrics, prob_threshold = get_best_th(y_train, predictions["train"])
    metrics = {"train": train_metrics}
    for split, y_true in labels.items():
        metrics[split] = get_classification_metrics(y_true, predictions[split], prob_threshold)

    return metrics, predictions, prob_threshold

In [None]:
# Evaluate every trained model on all splits and keep its metrics, predicted
# probabilities and selected threshold, keyed by model name.
models_evaluations_results = {}

for model_name, results in models_hyper_parameters_search_results.items():
    metrics, predictions, prob_threshold = get_metrics(results["model"], X_train, y_train, X_test, y_test, X_val, y_val)
    models_evaluations_results[model_name] = {
        "metrics" : metrics,
        "predictions" : predictions,
        "prob_threshold" : prob_threshold
    }
    
    # Show the metrics as a table with one row per split, rounded to 3 decimals
    print(f"\n{model_name}")
    results_df = pd.DataFrame.from_records(metrics).T.round(3)
    display(results_df)

### Best Model

Selecting best model based on F1 score for validation dataset

In [None]:
# ToDo: evaluation_metric should be used here too, to pick the best model. Currently it is hard coded as f1_score
# Validation f1 score per model, sorted from highest to lowest; the first entry is the winner.
f1_scores = { name:results["metrics"]["val"]["f1_score"] for name, results in models_evaluations_results.items()}
f1_scores = dict(sorted(f1_scores.items(), key=lambda item: item[1], reverse=True))
best_model_name = list(f1_scores.keys())[0]
best_model = models_hyper_parameters_search_results[best_model_name]["model"]
print(f"Selected best model {best_model_name} with f1 score {f1_scores[best_model_name]}")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, precision_recall_curve,confusion_matrix
from sklearn.metrics import roc_curve, auc

In [None]:
# True labels per split, plus the best model's predicted probabilities and threshold;
# these are used by the plots and the registry metrics below.
y_actuals = {"train": y_train, "test": y_test, "val": y_val}
y_preds = models_evaluations_results[best_model_name]["predictions"]
prob_threshold = models_evaluations_results[best_model_name]["prob_threshold"]

In [None]:

# Evaluation figure for the best model: one confusion matrix per split on the top row,
# followed by the precision/recall curves and the ROC curves of all splits.
fig = plt.figure(constrained_layout=True, figsize=(16,15))
fig.suptitle("Confusion Matrices:")
ax_dict = fig.subplot_mosaic(
    [
        ["train-cm", "val-cm", "test-cm"],
        ["pr-auc", "pr-auc", "pr-auc"],
        ["roc-auc", "roc-auc", "roc-auc"],
    ],
)
pr_aucs  = []
roc_aucs = []
for split in ["train", "val", "test"]:
    y_actual = y_actuals[split]
    y_pred = y_preds[split]

    # Use the same strict ">" rule as get_best_th / get_classification_metrics, so the
    # matrix shows exactly the predictions the reported metrics were computed from.
    predicted_labels = np.where(y_pred > prob_threshold, 1, 0)
    dsp = ConfusionMatrixDisplay(confusion_matrix(y_actual, predicted_labels))
    dsp.plot(ax=ax_dict[f"{split}-cm"])
    ax_dict[f"{split}-cm"].set_title(f"{split}")

    # Curves are drawn with Axes.plot: sns.lineplot's default settings sort and aggregate
    # repeated x values, which distorts PR/ROC curves. Labelling each line directly also
    # keeps the legend entries attached to the right curve.
    pr, re, _ = precision_recall_curve(y_actual, y_pred)
    pr_auc = average_precision_score(y_actual, y_pred)
    pr_aucs.append(pr_auc)
    ax_dict["pr-auc"].plot(pr, re, label=f"{split} ({pr_auc:.2f})")

    fpr, tpr, _ = roc_curve(y_actual, y_pred)
    roc_auc = auc(fpr, tpr)
    roc_aucs.append(roc_auc)
    ax_dict["roc-auc"].plot(fpr, tpr, label=f"{split} ({roc_auc:.2f})")

ax_dict["pr-auc"].set_title("Precision Recall AUC")
ax_dict["pr-auc"].legend()

ax_dict["pr-auc"].set_xlabel("Precision")
ax_dict["pr-auc"].set_ylabel("Recall")
ax_dict["pr-auc"].grid(True)


ax_dict["roc-auc"].set_title("ROC AUC")
ax_dict["roc-auc"].legend()

ax_dict["roc-auc"].set_xlabel("FPR")
ax_dict["roc-auc"].set_ylabel("TPR")
ax_dict["roc-auc"].grid(True)

In [None]:

#save model
saved_model_path = f"{model_artifacts_dir}/saved_model.pkl"
joblib.dump(best_model, saved_model_path)
# Report the actual file that was written (it lives in the model_artifacts sub folder,
# not directly in local_output_path).
print(f"Model is saved as a pickle file in the location:\n\t{saved_model_path}\n\nIt can be loaded using joblib.load")

### Uploading Model files

In [None]:
# Upload the pickled best model to the staging-models prefix of the configured S3 bucket.
# The S3 key contains only job_id (not train_id).
print("Uploading saved model file")
upload_file_to_s3(
    creds = creds_config,
    local_file_path = f"{model_artifacts_dir}/saved_model.pkl",
    s3_bucket_name = creds_config["aws"]["s3Bucket"],
    s3_path = f"{creds_config['aws']['staging_models_s3_prefix']}/{job_id}/saved_model.pkl"
)

# Folder-style S3 location of the uploaded files; stored in the registry entry below
s3_location = f"s3://{creds_config['aws']['s3Bucket']}/{creds_config['aws']['staging_models_s3_prefix']}/{job_id}"
print(f" Model file is uploaded to:\n\t{s3_location}")
logging.info(f" Model file is uploaded to:\n\t{s3_location}")

# Metric values for the registry entry, keyed as "<split>_<metric>"
metrics_dict = {}

#Run metrics on golden dataset
# Each configured metric is evaluated on 0/1 predictions (probability > prob_threshold).
# NOTE(review): _score_func is a private attribute of the sklearn scorer and is called
# without the scorer's own keyword arguments — presumably fine for plain metrics such as
# "f1" or "precision"; verify for metrics that need extra arguments or probabilities.
for metric in notebook_config["model_params"]["evaluation_metrics"]:
    scorer = get_scorer(metric)
    for split in ["train", "val", "test"]:
        metrics_dict[f"{split}_{metric}"] = scorer._score_func(y_actuals[split], np.where(y_preds[split] >  prob_threshold,1,0))

# Single registry row describing this model; the metrics are stored as a JSON string
print("Adding entry to model registry")
data = {
    "job_id": job_id,
    "model_name" : "leadscoring",
    "model_type" : "staging",
    "timestamp" : datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "threshold" : prob_threshold,
    "metrics" : json.dumps(metrics_dict),
    "model_files_location" : s3_location,
    "version" : "1.0.0"
}
print(f"Adding entry to model registry:\n{data}")
logging.info(f"Adding entry to model registry:\n{data}")

# index=[0] turns the dict of scalars into a one-row DataFrame
data = pd.DataFrame(data, index=[0])
data_io = DataIO(notebook_config, creds_config)

# Append the row to the model registry table named in the credentials config
data_io.write_to_wh_table(
    df = data,
    table_name = creds_config["data_warehouse"]["model_registry_table"],
    schema = creds_config["data_warehouse"]["schema"],
    if_exists = "append"
)

In [None]:
## Cell to hide code while converting to a html page
# The HTML object is the last expression of the cell, so the notebook renders it; the
# embedded script hides every element matching `div.input`.
from IPython.display import HTML

HTML('''<script>
$('div.input').hide();
</script>''')