In [None]:
import os
import sys
import time
import joblib
import json
import logging
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from IPython.display import display
from pprint import pprint
from typing import Dict, List, Tuple, Union
from matplotlib.ticker import MaxNLocator

# RudderLabs data utilities imports
from rudderlabs.data.apps.log import setup_file_logger
from rudderlabs.data.apps.config import read_yaml
from rudderlabs.data.apps.utils import get_latest_folder
from rudderlabs.data.apps.aws.s3 import upload_file_to_s3

from sklearn.metrics import average_precision_score, precision_recall_fscore_support, roc_auc_score, f1_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_curve
from sklearn.metrics import get_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns=None
# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

In [None]:
# Papermill parameters cell: every value below is only a default and may be
# overridden by the parameters papermill passes in.
job_id = f"{int(time.time())}"  # epoch seconds at start-up, used as the run identifier
local_input_path = None  # None means "use the per-job default" (resolved in the next cell)
local_output_path = None  # None means "use the per-job default" (resolved in the next cell)
code_path = "../"  # folder holding the project code, config and credentials files

In [None]:
# Fall back to the per-job default folder for any path papermill did not supply.
# Input and output share the same default location.
local_input_path = f"../data/{job_id}/train_automl" if local_input_path is None else local_input_path
local_output_path = f"../data/{job_id}/train_automl" if local_output_path is None else local_output_path

In [None]:
# Echo the run identifier and the resolved input/output locations, one per line
print(
    job_id,
    f"local_input_path {local_input_path}",
    f"local_output_path {local_output_path}",
    sep="\n",
)

In [None]:
#Local imports
# Make the project code importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if code_path not in sys.path:
    sys.path.append(code_path)
from data_loader import DataIO

In [None]:
# Constants
# Notebook-wide constants live in this cell.
# IMAGE_FORMAT is not referenced by any other cell of this notebook.
IMAGE_FORMAT = "png"

In [None]:
#Logging setup
# On success the name `logging` is rebound from the stdlib module to the object
# returned by setup_file_logger; the rest of the notebook only calls `logging.info`.
# NOTE(review): the log file name and the start-up message below look copied
# from another notebook - confirm before renaming, other tooling may rely on them.
try:
    log_file_path = os.path.join(local_output_path, "logs", "sample_notebook.log")
    logging = setup_file_logger(log_file_path)
except Exception as exc:
    # File logging stays best-effort, but the failure is no longer swallowed
    # silently: report it and fall back to a console logger so that later
    # `logging.info` calls keep working. The stdlib module is imported under a
    # private alias because `logging` may already be rebound when the cell is re-run.
    import logging as _stdlib_logging
    _stdlib_logging.basicConfig(level=_stdlib_logging.INFO)
    logging = _stdlib_logging.getLogger("train_automl")
    logging.warning(f"File logger setup failed ({exc!r}); logging to console instead")

logging.info("\n\n\t\tSTARTING FEATURE PREPROCESSING")

In [None]:
#Configurations
# Load this notebook's settings from config/train_automl.yaml under code_path.
# Later cells read the "data" and "model_params" sections of this config.
notebook_config = read_yaml(os.path.join(code_path, "config/train_automl.yaml"))
print("Notebook config:")
# Pretty-print the loaded configuration so it is recorded in the notebook output
pprint(notebook_config)

In [None]:
# Load credentials from credentials.yaml under code_path. Later cells read its
# "aws" and "data_warehouse" sections. Do not print or display this object:
# notebook outputs are saved with the file and would leak the secrets.
creds_config = read_yaml(os.path.join(code_path, "credentials.yaml"))

In [None]:
# Every output file of this run is written below local_output_path, which
# defaults to ../data/<job_id>/train_automl when papermill supplies no path.
# output directory structure
# - <local_output_path>
#     - visuals
#     - model_artifacts
visuals_dir = os.path.join(local_output_path, "visuals")
model_artifacts_dir = os.path.join(local_output_path, "model_artifacts")

logging.info(f"All the output files will be saved to following location: {local_output_path}")
# Create the folders up front (parents included); existing folders are left as they are
for output_path in (local_output_path, visuals_dir, model_artifacts_dir):
    Path(output_path).mkdir(exist_ok=True, parents=True)

In [None]:
#Data splitting
# Split fractions are read from notebook_config, the only configuration loaded
# in this notebook (the previously referenced `data_prep_config` is never
# defined and made this cell fail with NameError).
val_split = notebook_config['data']['val_size']
test_split = notebook_config['data']['test_size']
# train_size is informational only (no other cell reads train_split); when the
# config omits it, it is whatever remains after validation and test.
train_split = notebook_config['data'].get('train_size', 1 - val_split - test_split)

# Columns to drop from the feature table, and the name of the target column
ignore_features = notebook_config['data']['ignore_features']
label_column = notebook_config['data']['label_column']

In [None]:
print("Getting data from warehouse")
# DataIO is the project's data access helper; it is built from the notebook
# config and the credentials loaded above.
dataIO = DataIO(notebook_config, creds_config)
# input_data is used as a pandas DataFrame by the following cells
# (.columns, .drop, .describe).
input_data = dataIO.get_data()

In [None]:
#Ignoring features
# Keep only the configured names that are really present in the feature table,
# so the drop below cannot fail on a missing column.
ignore_features = [name for name in ignore_features if name in input_data.columns]
print(f"Ignoring features {ignore_features}")
logging.info(f"Ignoring features {ignore_features}")
input_data = input_data.drop(ignore_features, axis="columns")

In [None]:
print("Basic stats of all numerical features in input:")
# Lift the column display limit so no feature is hidden in the table below
pd.options.display.max_columns = None
# Summary statistics table produced by DataFrame.describe()
display(input_data.describe())

In [None]:
# Two-stage split: first carve off the combined validation+test hold-out, then
# divide that hold-out into validation and test in the configured proportion.
# random_state is fixed so the three splits are reproducible on
# Restart & Run All; 1001 matches the session_id given to pycaret's setup.
# NOTE(review): the split is not stratified on the label - consider
# `stratify=` if the positive class is rare.
X_train, X_holdout = train_test_split(input_data, test_size=val_split+test_split, random_state=1001)
X_val, X_test = train_test_split(X_holdout, test_size=test_split/(test_split + val_split), random_state=1001)

In [None]:
# Pull the target column out of every split. The X_* frames are left untouched
# and therefore still contain the label column.
y_train, y_test, y_val = (split[label_column] for split in (X_train, X_test, X_val))

In [None]:
# Preview the first rows of the training split (label column included).
# Check the rendered output for sensitive values before sharing the notebook.
X_train.head()

### Utility functions

In [None]:
def get_classification_metrics(y_true, y_pred_proba, th=0.5):
    """
    Returns classification metrics of the positive class (label 1) as a dict
    with the keys precision, recall, f1_score, roc_auc and pr_auc.
    precision, recall and f1_score are computed on the labels obtained by
    thresholding y_pred_proba; roc_auc and pr_auc use the raw probabilities.
    y_true: Array of 1s and 0s. True labels
    y_pred_proba: Array of predicted probabilities
    th: Probabilities strictly greater than th are predicted as 1
    """
    y_pred = np.where(y_pred_proba > th, 1, 0)
    # labels=[0, 1] pins the layout of the per-class result arrays, so index 1
    # is always the positive class. Without it the arrays have a single entry
    # when only one class occurs in y_true and y_pred, and [1] raises IndexError.
    # zero_division=0 keeps the 0.0 that an undefined metric already produced,
    # just without emitting a warning for every call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=[0, 1], zero_division=0
    )
    precision = precision[1]
    recall = recall[1]
    f1 = f1[1]
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    pr_auc = average_precision_score(y_true, y_pred_proba)
    metrics = {"precision": precision, "recall": recall, "f1_score": f1, "roc_auc": roc_auc, 'pr_auc': pr_auc}
    return metrics

def get_best_th(y_true, y_pred_proba):
    """
    Scans the thresholds 0.00, 0.01, ..., 0.99 and selects the one with the
    highest f1 score; when several thresholds tie, the largest one is selected.
    Returns a tuple (metrics at the selected threshold, selected threshold)
    y_true: Array of 1s and 0s. True labels
    y_pred_proba: Array of predicted probabilities
    """
    candidate_ths = np.arange(0, 1, 0.01)
    f1_per_th = [
        f1_score(y_true, np.where(y_pred_proba > candidate, 1, 0))
        for candidate in candidate_ths
    ]

    # argmax returns the first maximum, so search the reversed list to land on
    # the LAST threshold that reaches the best f1 score.
    last_best_idx = len(f1_per_th) - 1 - int(np.argmax(f1_per_th[::-1]))
    best_th = candidate_ths[last_best_idx]

    return get_classification_metrics(y_true, y_pred_proba, best_th), best_th

### Pycaret

In [None]:
# Import only the pycaret helpers this notebook calls instead of `import *`,
# so the origin of every name is visible and no earlier name gets shadowed.
from pycaret.classification import (
    compare_models,
    evaluate_model,
    plot_model,
    predict_model,
    pull,
    save_model,
    setup,
    tune_model,
)
import pycaret

In [None]:
def get_metrics_pycaret(
    best_model,
    X_train: pd.DataFrame, y_train: pd.DataFrame,
    X_test: pd.DataFrame, y_test: pd.DataFrame,
    X_val: pd.DataFrame, y_val: pd.DataFrame
):
    """
    Scores the train, test and validation splits with best_model and evaluates them.
    The probability threshold is the one returned by get_best_th on the train
    split; the same threshold is then applied to the test and validation splits.
    Returns a tuple (metrics, predictions, prob_threshold) where metrics and
    predictions are dicts keyed by "train", "val" and "test".
    """
    def positive_class_scores(features):
        # predict_model output column "prediction_score_1" is read as the
        # score of class 1
        scored = predict_model(best_model, features, raw_score=True)
        return pd.DataFrame(scored)["prediction_score_1"]

    train_preds = positive_class_scores(X_train)
    train_metrics, prob_threshold = get_best_th(y_train, train_preds)

    test_preds = positive_class_scores(X_test)
    test_metrics = get_classification_metrics(y_test, test_preds, prob_threshold)

    val_preds = positive_class_scores(X_val)
    val_metrics = get_classification_metrics(y_val, val_preds, prob_threshold)

    return (
        {"train": train_metrics, "val": val_metrics, "test": test_metrics},
        {"train": train_preds, "val": val_preds, "test": test_preds},
        prob_threshold,
    )

In [None]:
%%time

# Initialise the pycaret experiment on the training split. X_train still
# contains the label column, which is passed as the target. session_id is
# pycaret's seed parameter, fixed here for reproducibility.
experiment = setup(data = X_train, target = label_column ,session_id=1001)

In [None]:
%%time

# Let pycaret train and compare its candidate classifiers; the returned
# best_model is what the following cells evaluate and tune.
best_model = compare_models()

In [None]:
# Fetch the result table of the preceding pycaret call (compare_models).
# A later cell reads the first entry of its "Model" column as the model name.
results = pull()
results

In [None]:
# pycaret's evaluation view for the selected model
# NOTE(review): presumably widget-based - confirm it renders in the papermill/HTML export.
evaluate_model(best_model)

In [None]:
# AUC plot of the selected (not yet tuned) model
plot_model(best_model, plot = 'auc')

## Hyperparameter Tuning

In [None]:
# Tune the hyperparameters of the selected model with pycaret's defaults.
# The (misspelled) name `tunned_model` is kept because later cells refer to it.
tunned_model = tune_model(best_model)

In [None]:
# Score the tuned model on all three splits; the probability threshold is
# selected on the train split inside get_metrics_pycaret.
models_evaluations_results = {}
metrics, predictions, prob_threshold = get_metrics_pycaret(
    tunned_model, X_train, y_train, X_test, y_test, X_val, y_val
)
models_evaluations_results = dict(
    metrics=metrics,
    predictions=predictions,
    prob_threshold=prob_threshold,
)
# Model name = first row of the table pulled after compare_models
model_name = results["Model"].iloc[0]
print(f"\n{model_name}")
# One row per split, one column per metric, rounded to 3 decimals
results_df = pd.DataFrame.from_records(metrics).T.round(3)
display(results_df)

In [None]:
#save model
save_model(tunned_model, os.path.join(model_artifacts_dir, "saved_model"))

In [None]:
# True labels and predicted positive-class scores, both keyed by split name
y_actuals = dict(train=y_train, test=y_test, val=y_val)
y_preds = predictions

### Uploading Model files

In [None]:
# Step 1: push the saved model file to the staging prefix of the S3 bucket,
# inside a folder named after this job.
print("Uploading saved model file")
upload_file_to_s3(
    creds = creds_config,
    local_file_path = f"{model_artifacts_dir}/saved_model.pkl",
    s3_bucket_name = creds_config["aws"]["s3Bucket"],
    s3_path = f"{creds_config['aws']['staging_models_s3_prefix']}/{job_id}/saved_model.pkl"
)

s3_location = f"s3://{creds_config['aws']['s3Bucket']}/{creds_config['aws']['staging_models_s3_prefix']}/{job_id}"
print(f" Model file is uploaded to:\n\t{s3_location}")
logging.info(f" Model file is uploaded to:\n\t{s3_location}")

# Step 2: compute the configured evaluation metrics on the thresholded
# predictions of every split.
metrics_dict = {}

for metric in notebook_config["model_params"]["evaluation_metrics"]:
    scorer = get_scorer(metric)
    # Forward the keyword arguments stored on the scorer (e.g. average="macro"
    # for "f1_macro"); calling the bare score function without them silently
    # computed a different metric than the one named in the config.
    score_kwargs = getattr(scorer, "_kwargs", {})
    for split in ["train", "val", "test"]:
        # NOTE(review): scorers that expect probabilities (e.g. "roc_auc")
        # still receive hard 0/1 labels here - confirm this is intended.
        predicted_labels = np.where(y_preds[split] >  prob_threshold,1,0)
        # float() guarantees a value json.dumps can serialise
        metrics_dict[f"{split}_{metric}"] = float(
            scorer._score_func(y_actuals[split], predicted_labels, **score_kwargs)
        )

# Step 3: append one row describing this model to the model registry table.
print("Adding entry to model registry")
registry_entry = {
    "job_id": job_id,
    "model_name" : "leadscoring",
    "model_type" : "staging",
    "timestamp" : datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "threshold" : prob_threshold,
    "metrics" : json.dumps(metrics_dict),
    "model_files_location" : s3_location,
    "version" : "1.0.0"
}
print(f"Adding entry to model registry:\n{registry_entry}")
logging.info(f"Adding entry to model registry:\n{registry_entry}")

# Separate names for the dict and the single-row frame (previously both `data`)
registry_df = pd.DataFrame(registry_entry, index=[0])
data_io = DataIO(notebook_config, creds_config)

data_io.write_to_wh_table(
    df = registry_df,
    table_name = creds_config["data_warehouse"]["model_registry_table"],
    schema = creds_config["data_warehouse"]["schema"],
    if_exists = "append"
)

In [None]:
## Cell to hide code while converting to a html page
from IPython.display import HTML

# Emits a script that hides every `div.input` element of the rendered page.
# It uses `$`, so it only has an effect where jQuery is available on the page.
HTML('''<script>
$('div.input').hide();
</script>''')