Producing lists out of prediction files
===
This notebook aims to produce lists as a JSON file, with each entry containing the following fields:
- siret
- periode
- score
- timestamp
- algo
- alert

It takes as inputs a set of CSV prediction files produced by `predictsignauxfaibles`, typically:
- one file corresponding to the "default" model
- one file corresponding to the "small" model

In [None]:
%config Completer.use_jedi = False

In [None]:
# Set logging level to INFO
import logging
logging.getLogger().setLevel(logging.INFO)

# Import required libraries and modules
from datetime import datetime
import pandas as pd
from pathlib import Path
import json

from predictsignauxfaibles.config import OUTPUT_FOLDER

Functions to merge the predictions of several models and to compute the alert flag

In [None]:
def merge_models(model_list: list):
    """
    Builds a single list of predicted probabilities based on several models,
    listed by decreasing order of priority.
    For a given SIRET, if no prediction is found for the first model in the list,
    a prediction for the next model will be considered, etc.

    The list passed by the caller is left untouched.

    Arguments:
        model_list: list
            A non-empty list of pandas DataFrame containing, at least, the
            following columns: siret, predicted_probability

    Returns:
        A pandas DataFrame with the columns siret and predicted_probability.
        When model_list holds a single DataFrame, that DataFrame is returned
        as is (all of its columns are kept).

    Raises:
        ValueError: if model_list is empty.
    """
    if not model_list:
        raise ValueError("model_list must contain at least one DataFrame")

    # Unpack instead of pop() so the caller's list is not emptied.
    *higher_priority, merged = model_list

    # Start from the lowest-priority model and walk back up: at each step the
    # higher-priority model is "_main" and only its gaps are filled from "_supp".
    for model in reversed(higher_priority):
        merged = pd.merge(
            model,
            merged,
            on="siret",
            how="outer",
            suffixes=("_main", "_supp"),
        )
        merged["predicted_probability"] = merged["predicted_probability_main"].fillna(
            merged["predicted_probability_supp"]
        )
        # Explicit copy so that callers can add columns without pandas
        # chained-assignment warnings.
        merged = merged[["siret", "predicted_probability"]].copy()
    return merged

def assign_flag(pred: float, t_rouge: float, t_orange: float):
    """
    Returns the alert label matching a predicted probability.

    The thresholds are tested in order, with strict comparisons:
    - pred > t_rouge: "Alerte seuil F1"
    - pred > t_orange: "Alerte seuil F2"
    - anything else: "Pas d'alerte"
    """
    return (
        "Alerte seuil F1"
        if pred > t_rouge
        else "Alerte seuil F2"
        if pred > t_orange
        else "Pas d'alerte"
    )

def split_predictions(preds: pd.DataFrame, t_rouge: float, t_orange: float):
    """
    Adds an "alert" column to preds based on two thresholds and prints how
    many rows fall above each of them.

    Each row gets the label returned by assign_flag for its
    predicted_probability. The DataFrame is modified in place and also
    returned.

    Arguments:
        preds: pd.DataFrame
            Must contain a predicted_probability column.
        t_rouge: float
            Threshold above which (strictly) a row is counted as "rouge".
        t_orange: float
            Threshold above which (strictly) a row is counted as "orange",
            unless it is already counted as "rouge".

    Returns:
        The same DataFrame, with the extra "alert" column.

    Raises:
        ValueError: if preds has no predicted_probability column.
    """
    # Explicit check rather than `assert`, which is stripped when Python runs with -O.
    if "predicted_probability" not in preds.columns:
        raise ValueError("preds must contain a 'predicted_probability' column")

    probabilities = preds["predicted_probability"]
    preds["alert"] = probabilities.apply(lambda x: assign_flag(x, t_rouge, t_orange))

    total = preds.shape[0]
    num_rouge = int((probabilities > t_rouge).sum())
    # Rows above t_orange include the "rouge" ones: remove them from the count.
    num_orange = int((probabilities > t_orange).sum()) - num_rouge

    def percentage(count: int) -> float:
        # An empty DataFrame would otherwise trigger a division by zero.
        return round(count / total * 100, 2) if total else 0.0

    print(f"{num_rouge} rouge ({percentage(num_rouge)}%)")
    print(f"{num_orange} orange ({percentage(num_orange)}%)")

    return preds

Let's load CSV data produced by a run with the default model and a run with the small model:

In [None]:
# NOTE(review): absolute paths to one user's home directory; adjust them
# before running this notebook on another machine.
default_predictions_csv = "/home/simon.lebastard/predictsignauxfaibles/predictsignauxfaibles/model_runs/20210507-195755/predictions-20210507-195755.csv"
small_predictions_csv = "/home/simon.lebastard/predictsignauxfaibles/predictsignauxfaibles/model_runs/20210507-195735/predictions-20210507-195735.csv"

default = pd.read_csv(default_predictions_csv)
small = pd.read_csv(small_predictions_csv)

In [None]:
# `default` comes first: its predictions take priority, and `small` only
# fills in the SIRETs for which `default` has no prediction.
merged = merge_models(model_list=[default, small])

In [None]:
# Add the "alert" column: strictly above 0.75 is counted as "rouge",
# strictly above 0.3 (and not "rouge") as "orange".
merged = split_predictions(merged, t_rouge=0.75, t_orange=0.3)

In [None]:
# Display the merged predictions for a visual check.
merged

In [None]:
# Each export is written to its own folder under OUTPUT_FOLDER, named after
# the current date and time.
list_id = datetime.now().strftime("%Y%m%d-%H%M%S")
run_path = Path(OUTPUT_FOLDER) / list_id
run_path.mkdir(parents=True, exist_ok=True)

# DataFrame.to_json() already returns a JSON document: passing it through
# json.dumps() again would store it as a single escaped JSON string.
# orient="records" writes one JSON object per row, i.e. a list of entries.
with open(run_path / "scores.json", "w", encoding="utf-8") as stats_file:
    stats_file.write(merged.to_json(orient="records"))

In [None]:
# Display the folder in which scores.json was written.
run_path