## README:

App : **Sample Application**

Stage : **Prediction**

This sample notebook downloads the model files from their S3 location, scores the prediction data, and writes the prediction scores back to the warehouse.

The latest production model information is read from the `model_registry_table` named in the `credentials.yaml` configuration file. (Note: the model-loading cell below currently requests `model_type="staging"`.)

In [None]:
import os
import sys
import time
import pickle
import joblib
import logging
import tempfile
import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint


#Rudderlab data utilities imports
from rudderlabs.data.apps.log import setup_file_logger
from rudderlabs.data.apps.config import read_yaml
from rudderlabs.data.apps.utils import get_latest_folder
from rudderlabs.data.apps.aws.s3 import upload_file_to_s3, download_s3_directory, parse_s3_path, get_s3_resource

In [None]:
# Parameters cell for papermill. These values can get overridden by parameters passed by papermill
# job_id: run identifier; when left empty, a later cell picks the latest folder under ../data.
job_id = None
# local_input_path: not read by any cell visible in this notebook.
local_input_path = None
# local_output_path: folder for plots, logs and predictions.csv; defaulted in a later cell when None.
local_output_path = None
# code_path: appended to sys.path for the local imports and used to locate the YAML config files.
code_path = "../"

In [None]:
# Fall back to the most recent run folder when papermill did not supply a job id.
if not job_id:
    # normpath removes any trailing separator and basename honours the OS
    # separator, so the folder name is returned even for paths such as
    # "../data/123/" (a plain split("/")[-1] would yield "" there).
    job_id = os.path.basename(os.path.normpath(get_latest_folder("../data")))
    print(f"Data prep run id is not given. Taking the latest run id: {job_id}")

# Papermill may pass the id as a number; downstream code builds paths from it.
job_id = str(job_id)

In [None]:
# Default the output folder to this run's "prediction" folder when none was given.
if local_output_path is None:
    local_output_path = f"../data/{job_id}/prediction"

# Create the folder in both cases: a path supplied through papermill may not
# exist yet either, and later cells save figures and CSV files into it.
os.makedirs(local_output_path, exist_ok=True)

In [None]:
#Local imports
# model_loader and data_loader are resolved through code_path, so the path has
# to be on sys.path before the two imports below run.
sys.path.append(code_path)
from model_loader import ModelLoader
from data_loader import DataIO

In [None]:
# Constants
# All the required constants are defined here
# NOTE(review): IMAGE_FORMAT is not referenced by any cell visible in this
# notebook; the savefig calls spell out ".png" in their file names.
IMAGE_FORMAT = 'png'

In [None]:
#Logging setup
# Send log records to <local_output_path>/logs/prediction.log. File logging is
# best effort: on failure the name `logging` keeps pointing at the standard
# library module imported above, so the logging.info calls below still work.
try:
    log_file_path = os.path.join(local_output_path, "logs", "prediction.log")
    # Create the logs folder up front in case setup_file_logger does not do it.
    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
    logging = setup_file_logger(log_file_path)
except Exception as err:
    # Report the problem instead of hiding it; a bare `except: pass` would also
    # swallow KeyboardInterrupt and leave no trace of why the log file is missing.
    print(f"File logging could not be set up, using the standard logging module instead: {err!r}")

logging.info("\n\n\t\tSTARTING PREDICTION\n\n")

In [None]:
#Configurations
# Read the notebook settings from the YAML file kept under the code folder
# and echo them so the run records which settings were used.
notebook_config_path = os.path.join(code_path, "config/data_prep.yaml")
notebook_config = read_yaml(notebook_config_path)

print("Notebook config:")
pprint(notebook_config)

In [None]:
creds_config = read_yaml(os.path.join(code_path, "credentials.yaml"))


def _mask_secrets(node):
    """Return a copy of ``node`` with the same nesting but every leaf value masked.

    Dictionaries keep their keys and lists keep their length, so the structure
    of the configuration stays visible while no value is shown.
    """
    if isinstance(node, dict):
        return {key: _mask_secrets(value) for key, value in node.items()}
    if isinstance(node, (list, tuple)):
        return [_mask_secrets(item) for item in node]
    return "****"


# Cell outputs are stored in the notebook file and in any exported page, so the
# raw credentials must not be printed; show only the masked structure.
print("Credentials config:")
pprint(_mask_secrets(creds_config))

In [None]:
# Column-related settings all live under the "data" section of the notebook config.
data_settings = notebook_config["data"]

ignore_features = data_settings["ignore_features"]
label_column = data_settings["label_column"]
entity_column = data_settings["entity_column"]

### Loading model from model registry

In [None]:
# Look up the most recent model entry through the project's ModelLoader.
# NOTE(review): the README at the top of this notebook speaks of the latest
# production model, while this call asks for model_type="staging" - confirm
# which one is intended.
model_loader = ModelLoader(creds_config)
model_data = model_loader.get_latest_model(model_type="staging")

print("Model data:")
pprint(model_data)

In [None]:
# Fetch the model files into a fresh temporary folder on the local disk.
# NOTE(review): nothing visible in this notebook removes the folder afterwards.
temp_folder = tempfile.mkdtemp()
print(f"Downloading model data to temporary location {temp_folder}")

# The registry entry holds the S3 location of the files; split it into bucket and prefix.
model_files_location = model_data["model_files_location"]
s3_bucket, s3_prefix = parse_s3_path(model_files_location)

s3_resource = get_s3_resource(creds_config)
download_s3_directory(s3_resource, s3_bucket, s3_prefix, temp_folder)

In [None]:
# Load the two artifacts downloaded into temp_folder: the preprocessing
# pipeline (pickle) and the trained model (joblib).
# SECURITY: pickle.load and joblib.load can execute arbitrary code contained in
# the file. Only run this cell against a model location you trust.
pipeline_path = os.path.join(temp_folder, "data_pipeline.pkl")
with open(pipeline_path, "rb") as pipeline_file:
    preprocessor = pickle.load(pipeline_file)

model = joblib.load(os.path.join(temp_folder, "saved_model.pkl"))

### Predictions

In [None]:
# Fetch the rows to score through the project's DataIO helper.
# NOTE(review): DataIO receives an empty dict here, whereas the warehouse-write
# cell further down passes notebook_config - confirm the empty config is intended.
print("Loading sample data")
data_io = DataIO({}, creds_config)
input_data = data_io.get_data_for_prediction()

In [None]:
#Ignoring features
# Keep only the configured names that actually exist in the input table, so the
# drop below never refers to a missing column.
available_columns = input_data.columns
ignore_features = [col for col in ignore_features if col in available_columns]

# Report the final list both on screen and in the log file.
ignore_message = f"Ignoring features {ignore_features}"
print(ignore_message)
logging.info(ignore_message)

data = input_data.drop(columns=ignore_features)

In [None]:
print("Running preprocessing pipeline")
# The label is not a model input, so it is removed before the pipeline runs.
# errors="ignore" keeps this working when the label column is absent from the
# data or was already dropped because it is listed in ignore_features.
data = preprocessor.transform(data.drop(columns=[label_column], errors="ignore"))

In [None]:
print("Predicting")
prediction_scores = model.predict_proba(data)

# Put the score next to the original rows. Column 1 of predict_proba is used as
# the score (presumably the positive-class probability - verify against the model).
display_output = pd.concat(
    [input_data.reset_index(), pd.Series(prediction_scores[:, 1], name="prediction_score")],
    axis=1,
)

# Show entity, label and score first, followed by every remaining column once.
# The exclusion list uses entity_column rather than a hard-coded "user", so the
# entity column is not listed twice when it has a different name.
leading_cols = [entity_column, label_column, "prediction_score"]
cols = leading_cols + [col for col in display_output.columns if col not in leading_cols]

display_output[cols].head()

Above are sample data points with their actual labels, prediction scores, and features.

In [None]:
# Density of the prediction score, drawn separately for each label value.
plt.figure(figsize=(16,6))
plt.title("Prediction scores distribution of converted and non converted users")
plt.xlim([0,1])
# The column is passed as x= by keyword: in seaborn >= 0.12 the first positional
# parameter of kdeplot is `data`, so a positional column name clashes with data=.
# common_norm=False normalises each hue group on its own.
sns.kdeplot(data=display_output, x="prediction_score", hue=label_column, common_norm=False);
plt.savefig(os.path.join(local_output_path, "prediction_scores_dist.png"))

In [None]:
# Build a two-column frame: the actual label and the score from column 1 of predict_proba.
actual = input_data[label_column]
actual_labels = actual.reset_index(drop=True).rename("actual")
score_values = pd.Series(prediction_scores[:, 1], name="prob_c")
data_predictions = pd.concat([actual_labels, score_values], axis=1)

# Rank the scores first (ties broken by order of appearance) so that qcut can
# always cut ten equally sized bins, then store the bin of each row.
score_rank = data_predictions["prob_c"].rank(method="first")
data_predictions["deciles"] = pd.qcut(score_rank, 10)

In [None]:
# Per decile: number of positive labels ("sum") and number of rows ("count").
decile_labels = data_predictions.groupby("deciles")["actual"]
lift = decile_labels.agg(["sum", "count"]).reset_index()

# Overall conversion rate in the data (all positives / all rows).
x = sum(lift['sum']) / sum(lift['count'])

# Conversion rate inside each decile.
lift['prob_con'] = lift['sum'] / lift['count']

# Cumulative positives counted from the highest-score decile downwards; the
# assignment aligns on the index, so row i holds the total for deciles i..9.
lift['sum_c'] = lift['sum'].iloc[::-1].cumsum()
# Same quantity as a share of all positives.
lift['prop_c'] = lift['sum_c'] / lift['sum'].sum()

# Baseline: positives expected per decile if rows were picked at the overall rate.
lift['old'] = x * lift['count']


## Lift plot

Here we compare the cumulative share of converted customers captured when leads are ranked by our model's score with the share we would capture by contacting leads at random.



In [None]:
# Lift plot
plt.figure(figsize=(16,5))
ax = plt.gca()

# Ten evenly spaced fractions 0.1 ... 1.0: x tick labels and the random baseline.
d = np.linspace(0.1,1,10).round(1)

# prop_c is reversed so the first point is the highest-score decile on its own.
ax.plot(lift.index, lift['prop_c'].iloc[::-1]*100, marker='o')
ax.plot(lift.index, d*100, color = 'red', marker='o')
plt.legend(["Using lead score", "Random"])
plt.xticks(lift.index, labels=d*100)
plt.title("Lift Chart")
plt.xlabel("% of Leads")
plt.ylabel("% of Conversions")
plt.grid(True)
# Saving the figure is best effort, but a failure is reported rather than
# silently discarded so a missing lift.png can be explained.
try:
    plt.savefig(os.path.join(local_output_path, "lift.png"))
except Exception as err:
    print(f"Could not save the lift chart: {err!r}")
plt.show()

In [None]:
# Persist the scored rows as predictions.csv inside the run's output folder.
predictions_csv_path = os.path.join(local_output_path, "predictions.csv")

print(f"Copying the predction outputs as csv to location:\n\t{local_output_path}")
display_output.to_csv(predictions_csv_path, index=False)

In [None]:
# Preview the first rows of the frame that was just written to predictions.csv.
display_output.head()

### Writing the predictions back to warehouse

In [None]:
# Keep only the entity identifier and its score for the warehouse table.
# .copy() makes output_data an independent frame, so adding a column below does
# not act on a slice of display_output (which raises SettingWithCopyWarning).
output_data = display_output[[entity_column, "prediction_score"]].copy()
# Timestamp of this scoring run (naive local time, as returned by datetime.now()).
output_data["updated_at"] = datetime.datetime.now()

# Normalise column names: lower case, spaces replaced by underscores.
output_data.columns = [ col.lower().replace(" ", "_") for col in output_data.columns ]

In [None]:
# Preview the rows that will be written to the warehouse.
output_data.head()

In [None]:
# Number of rows about to be written to the warehouse.
print(f"Output data length : {len(output_data)}")

In [None]:
# Append the prediction rows to the prediction store table in the warehouse.
data_io = DataIO(notebook_config, creds_config)

# Table and schema names both come from the "data_warehouse" section of the credentials file.
warehouse_settings = creds_config["data_warehouse"]

data_io.write_to_wh_table(
    df=output_data,
    table_name=warehouse_settings["prediction_store_table"],
    schema=warehouse_settings["schema"],
    if_exists="append",
)

In [None]:
# Name the table the predictions were appended to, as configured in the credentials file.
print(f'The output data is stored in the warehouse table: {creds_config["data_warehouse"]["prediction_store_table"]}')

In [None]:
## Cell to hide code while converting to a html page
# The returned HTML object injects a <script> that hides every `div.input`
# element through the `$` function.
# NOTE(review): assumes jQuery (`$`) and `div.input` containers exist in the
# rendered page - confirm for the export template in use.
from IPython.display import HTML

HTML('''<script>
$('div.input').hide();
</script>''')