## README:

App : **Sample Application**

Stage : **Validation**

This is a sample notebook that loads the golden dataset from an S3 location and evaluates the staging and production models on it.

Based on the acceptance criteria, the staging model will be moved to production.

If there is no production model, the latest staging model will be moved to production without applying any acceptance criteria.

Information about the staging and production models can be queried from the `model_registry_table` specified in the `credentials.yaml` configuration file.

In [None]:
import os
import sys
import time
import json
import pickle
import joblib
import logging
import tempfile
import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint
from tqdm import tqdm


#Rudderlab data utilities imports
from rudderlabs.data.apps.log import setup_file_logger
from rudderlabs.data.apps.config import read_yaml
from rudderlabs.data.apps.utils import get_latest_folder
from rudderlabs.data.apps.aws.s3 import upload_file_to_s3, download_s3_directory, parse_s3_path, get_s3_resource, copy_s3_to_s3

from sklearn.metrics import get_scorer

# Show every column when a DataFrame is displayed instead of truncating wide tables
pd.options.display.max_columns=None
# Register tqdm's progress-aware helpers (e.g. progress_apply) on pandas objects
tqdm.pandas()

In [None]:
# Parameters cell for papermill. These values can get overridden by parameters passed by papermill
# job_id: run identifier; when empty, the latest folder under ../data is used instead
job_id = None
# local_input_path: NOTE(review) not referenced in the visible cells of this notebook
local_input_path = None
# local_output_path: local folder for this run's outputs and logs; defaults to ../data/<job_id>/validation
local_output_path = None
# validation_output_path: recorded in the model registry as the evaluation files location;
# a literal "<job_id>" in the value is replaced with the actual job id
validation_output_path = None
# code_path: folder appended to sys.path and used to locate the config and credentials YAML files
code_path = "../"

In [None]:
# Fall back to the most recent run folder when no job id was supplied
if not job_id:
    latest_run_dir = get_latest_folder("../data")
    # normpath + basename tolerates a trailing separator and OS-specific
    # separators, both of which a plain split("/")[-1] gets wrong
    job_id = os.path.basename(os.path.normpath(latest_run_dir))
    print(f"Data prep run id is not given. Taking the latest run id: {job_id}")

# Later cells embed job_id in paths and pass it to str.replace, so it must be a string
job_id = str(job_id)

In [None]:
# Default to a per-run folder next to the prepared data when no path is supplied
if local_output_path is None:
    local_output_path = f"../data/{job_id}/validation"

# Create the folder for caller-supplied paths too (previously only the default
# path was created); exist_ok removes the check-then-create race
os.makedirs(local_output_path, exist_ok=True)

# validation_output_path is what gets recorded in the model registry; it falls
# back to the local folder, otherwise the "<job_id>" placeholder is expanded
if validation_output_path is None:
    validation_output_path = local_output_path
else:
    validation_output_path = validation_output_path.replace("<job_id>", job_id)
print(f"Validation output path: {validation_output_path}")

In [None]:
#Local imports
# code_path is put on sys.path first so the project modules below can be resolved
# (presumably model_loader.py and data_loader.py live under code_path — confirm)
sys.path.append(code_path)
from model_loader import ModelLoader
from data_loader import DataIO

In [None]:
# Constants
# All the required constants are defined here
# Presumably the file format for saved plot images.
# NOTE(review): IMAGE_FORMAT is not referenced in the visible cells of this notebook.
IMAGE_FORMAT = 'png'

In [None]:
#Logging setup
# NOTE: on success the name `logging` is rebound from the stdlib module to the
# object returned by setup_file_logger; later cells only call logging.info on it.
log_file_path = os.path.join(local_output_path, "logs", "validation.log")
try:
    logging = setup_file_logger(log_file_path)
except Exception as e:
    # Best effort: keep the notebook running with the stdlib logging module, but
    # report the failure instead of hiding it (a bare except would also swallow
    # KeyboardInterrupt/SystemExit).
    print(f"Could not set up file logger at {log_file_path}: {e}")

# This is the validation stage; the previous message said "PREDICTION"
logging.info("\n\n\t\tSTARTING VALIDATION\n\n")

In [None]:
#Configurations
# Settings written for the data preparation step; reused here for column names
data_prep_config_file = os.path.join(code_path, "config/data_prep.yaml")
data_prep_config = read_yaml(data_prep_config_file)
print("Data Preparation Configurations:")
pprint(data_prep_config)

In [None]:
#Configurations
# Settings for this validation notebook (metrics and acceptance criteria)
validation_config_file = os.path.join(code_path, "config/validation.yaml")
notebook_config = read_yaml(validation_config_file)
print("Validation Configurations:")
pprint(notebook_config)

In [None]:
def _redact_secrets(config, markers=("password", "secret", "token", "key")):
    """Return a copy of ``config`` with secret-looking values masked.

    Any dict entry whose key name contains one of ``markers``
    (case-insensitive) has its value replaced by ``"****"``. Nested dicts
    and lists are processed recursively; other values are returned as-is.
    The input object is not modified.
    """
    if isinstance(config, dict):
        return {
            name: "****"
            if any(marker in str(name).lower() for marker in markers)
            else _redact_secrets(value, markers)
            for name, value in config.items()
        }
    if isinstance(config, list):
        return [_redact_secrets(item, markers) for item in config]
    return config


creds_config = read_yaml(os.path.join(code_path, "credentials.yaml"))
print("Credentials config:")
# Cell output is stored with the executed notebook, so never echo raw
# credentials; creds_config itself is left untouched for later cells.
pprint(_redact_secrets(creds_config))

In [None]:
# Column settings taken from the "data" section of the data preparation config
ignore_features, label_column, entity_column = (
    data_prep_config["data"][setting_name]
    for setting_name in ("ignore_features", "label_column", "entity_column")
)

### Get staging and production models

In [None]:
# Helper used below to look up the latest staging/production models and download their files
model_loader = ModelLoader(creds_config)

#### Staging model

In [None]:
# Look up the newest staging model registered for this run's job_id
print(f"Getting latest model for job_id: {job_id}")
staging_model_data = model_loader.get_latest_model(model_type="staging", job_id=job_id)

print("Stagging model data:")
pprint(staging_model_data)

# Fetch the model files; the returned path is where later cells read
# data_pipeline.pkl and saved_model.pkl from
print("Downloading staging model")
stagginmodel_path = model_loader.download_model_files_to_temp(staging_model_data)

#### Production model

In [None]:
# Look up the newest production model; unlike the staging lookup, no job_id is passed
print("Getting latest production model")
prod_model_data = model_loader.get_latest_model(model_type="production")

print("Latest production model data:")
pprint(prod_model_data)

print("Downloading production model")
# NOTE(review): later cells treat `prod_model_path is None` as "no production model";
# presumably download_model_files_to_temp returns None when there is nothing to
# download — confirm against ModelLoader.
prod_model_path = model_loader.download_model_files_to_temp(prod_model_data)

### Preparing golden dataset

We use the same preprocessing pipeline to evaluate the data on both the staging and production models. If the preprocessing pipeline or configuration used in the `data preparation step` differs between the staging and production models, the user needs to do the following:
* Upload configuration files to s3 location where model files get stored
* Use those configuration and preprocessing pipeline for respective model evaluation

In [None]:
# DataIO is the project helper used here to read the golden dataset
data_io = DataIO(notebook_config, creds_config)
# The golden dataset location is read from the "aws" section of the credentials config
golden_data_s3_location = creds_config["aws"]["golden_dataset_s3_location"]
golden_data = data_io.get_data_from_s3(golden_data_s3_location)

Samples from the golden dataset

In [None]:
# Last expression in the cell, so the notebook renders the first rows as a preview
golden_data.head()

In [None]:
# NOTE(security): pickle and joblib can execute arbitrary code while loading.
# Only load artefacts that come from a trusted model store.

# Preprocessing pipeline shipped alongside the staging model files
with open(os.path.join(stagginmodel_path, "data_pipeline.pkl"), "rb") as f:
    stage_preprocessor = pickle.load(f)

# The trained staging model itself
stage_model = joblib.load(os.path.join(stagginmodel_path, "saved_model.pkl"))

In [None]:
#Ignoring features
#Select valid columns to ignore from the feature table
# A null/missing ignore list in the YAML arrives as None; treat it as "ignore nothing"
configured_ignores = ignore_features or []
# Keep only the names that actually exist, so drop() cannot fail on unknown columns
ignore_features = [col for col in configured_ignores if col in golden_data.columns]
ignore_message = f"Ignoring features {ignore_features}"
print(ignore_message)
logging.info(ignore_message)
data = golden_data.drop(columns=ignore_features)

In [None]:
print("Running preprocessing pipeline")
# The label column is dropped only for the transform input; `data` keeps it for scoring later.
# The staging model's pipeline output is reused for both staging and production evaluation.
input_data = stage_preprocessor.transform(data.drop(columns=[label_column]))

### Validation

In [None]:
# Metrics and acceptance rule both come from the "model" section of the validation config
model_settings = notebook_config["model"]
metric_names = model_settings["evaluation_metrics"]
print(f"Metrics to calculate: {metric_names}")

acceptance_criteria = model_settings["acceptance_criteria"]
acceptance_metric_name = acceptance_criteria["metric_name"]
acceptance_metric_threshold = acceptance_criteria["threshold"]

# The acceptance metric must always be computed. metric_names is the very list
# held inside notebook_config, so this append also changes the config object.
if acceptance_metric_name not in metric_names:
    metric_names.append(acceptance_metric_name)

print(f"Acceptance metric: {acceptance_metric_name}")
print(f"Acceptance metric threshold: {acceptance_metric_threshold}")

In [None]:
#Calculating metrics
def get_metrics(model, X_data, Y_data, threshold, metrics_to_compute=None):
    """Score ``model`` on ``X_data`` against ``Y_data`` at a probability threshold.

    The second column of ``model.predict_proba(X_data)`` is converted to 0/1
    labels (1 where the value is strictly greater than ``threshold``) and each
    requested metric is evaluated on those labels.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_data: Preprocessed feature matrix passed to ``predict_proba``.
        Y_data: True labels.
        threshold: Cut-off applied to the second ``predict_proba`` column.
        metrics_to_compute: Iterable of scikit-learn scorer names. Defaults to
            the notebook-level ``metric_names`` list.

    Returns:
        dict mapping metric name to the raw score-function value. The scorer's
        sign convention is not applied, because the underlying score function
        is called directly. Metrics that fail are reported and left out.
    """
    if metrics_to_compute is None:
        metrics_to_compute = metric_names

    positive_probabilities = model.predict_proba(X_data)[:, 1]
    # Threshold once; the labels are identical for every metric
    predicted_labels = np.where(positive_probabilities > threshold, 1, 0)

    metrics = {}
    for metric in metrics_to_compute:
        try:
            scorer = get_scorer(metric)
            # Scorers such as "f1_macro" keep their configuration (e.g. average="macro")
            # in _kwargs; without forwarding it the default variant would be computed.
            # NOTE(review): _score_func/_kwargs are private scikit-learn attributes.
            scorer_kwargs = getattr(scorer, "_kwargs", None) or {}
            metrics[metric] = scorer._score_func(Y_data, predicted_labels, **scorer_kwargs)
        except Exception as e:
            # Best effort: a failing metric must not abort the rest, but say which one failed
            print(f"Could not compute metric '{metric}': {e}")
    return metrics

In [None]:
# The threshold stored in the staging model data is used to binarise predicted probabilities
staging_model_threshold = staging_model_data["threshold"]
print(f"Calculating metrics for staging model with threshold {staging_model_threshold}")
# True labels come from `data`, which still contains the label column
staging_metrics = get_metrics(stage_model, input_data, data[label_column], staging_model_threshold)
print(f"Stagging model metrics: {staging_metrics}")

In [None]:
prod_metrics = {}

if prod_model_path is None:
    # Nothing to evaluate: reuse the staging metrics so the comparison cell still has values
    print("No production model found")
    print("Copying staging model metrics to production metrics, so that the acceptance criteria can be checked")
    prod_metrics = staging_metrics
else:
    # Evaluate the production model on the same preprocessed input, at its own threshold
    prod_model_threshold = prod_model_data["threshold"]
    print(f"Calculating metrics for production model with threshold {prod_model_threshold}")
    prod_model_file = os.path.join(prod_model_path, "saved_model.pkl")
    prod_model = joblib.load(prod_model_file)
    prod_metrics = get_metrics(
        prod_model,
        input_data,
        data[label_column],
        prod_model_threshold,
    )

print(f"Production model metrics: {prod_metrics}")

#### Comparing Staging and Production models

In [None]:
#Comparing metrics
# Decides whether the staging model replaces the production model and, if so,
# copies its files to the production location.
#   accepted      -> True only when the model passed AND its files were copied
#   new_prod_data -> registry row for the new production model (set when passed)
print(f"Comparing metrics for staging model and production model")
accepted = False

try:
    # With no production model the staging model is promoted unconditionally.
    # Comparing it against its own copied metrics would reject it whenever the
    # configured threshold factor is >= 1.
    no_production_model = prod_model_path is None
    if no_production_model:
        print("No production model available, skipping acceptance criteria")
        staging_passes = True
    else:
        staging_passes = (
            staging_metrics[acceptance_metric_name]
            > acceptance_metric_threshold * prod_metrics[acceptance_metric_name]
        )

    if staging_passes:
        print(f"Stagging model is accepted")
        logging.info(f"Stagging model is accepted")
        new_prod_data = staging_model_data.copy()
        #Remove ID
        if "id" in new_prod_data:
            new_prod_data.pop("id")

        new_prod_data["timestamp"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        new_prod_data["metrics"] = json.dumps(staging_metrics)
        new_prod_data["model_type"] = "production"

        s3_resource = get_s3_resource(creds_config)
        staging_model_s3_location = staging_model_data["model_files_location"]

        # The production location mirrors the staging one, with the prefix swapped
        production_model_s3_location = staging_model_s3_location.replace(
            creds_config["aws"]["staging_models_s3_prefix"],
            creds_config["aws"]["production_models_s3_prefix"]
        )

        new_prod_data["model_files_location"] = production_model_s3_location

        print(f"Copying staging model files from {staging_model_s3_location} to production model location: {production_model_s3_location}")

        copy_s3_to_s3(
            s3_resource = s3_resource,
            source = staging_model_s3_location,
            destination = production_model_s3_location,
            delete_source = False
        )

        # Flag acceptance only after the files are in place, so a failed copy
        # can never lead to a production registry row without model files
        accepted = True
    else:
        print(f"Stagging model is rejected")
        logging.info(f"Stagging model is rejected")
except Exception as e:
    # Best effort: do not crash the notebook, but make sure nothing is registered
    accepted = False
    print(str(e))
    print("Staging model was not promoted to production")
    logging.info(f"Staging model was not promoted to production: {e}")

#### Updating Model Registry

In [None]:
model_registry_table = creds_config["data_warehouse"]["model_registry_table"]

def update_model_registry(data_io, table_data):
    """Append ``table_data`` as a single row to the model registry table.

    Args:
        data_io: Object providing ``write_to_wh_table``.
        table_data: Mapping of column name to value for the new row.
    """
    # index=[0] is needed to build a one-row frame from scalar values
    single_row = pd.DataFrame(table_data, index=[0])

    write_options = {
        "df": single_row,
        "table_name": model_registry_table,
        "schema": creds_config["data_warehouse"]["schema"],
        "if_exists": "append",
    }
    data_io.write_to_wh_table(**write_options)

In [None]:
# Attach the evaluation results to the staging model's registry data
staging_model_data["evaluation_files_location"] = validation_output_path
staging_model_data["metrics"] = json.dumps(staging_metrics)

data_io = DataIO(notebook_config, creds_config)
print(f"Updating model registry table {model_registry_table}")

print("Updating staging model data")
pprint(staging_model_data)

# The id selects the row to update and must not be part of the written columns
id_value = staging_model_data.pop("id")
registry_schema = creds_config["data_warehouse"]["schema"]
staging_row_filter = f"id = '{id_value}'"
data_io.update_wh_table(
    data=staging_model_data,
    table_name=model_registry_table,
    schema=registry_schema,
    where=staging_row_filter,
)

# A promoted model gets an additional production row
if accepted:
    print("Updating production model data")
    pprint(new_prod_data)
    update_model_registry(data_io, new_prod_data)