Producing lists out of prediction files
===
This notebook aims to produce lists as JSON files, with each entry containing the following fields:
- siret
- periode
- score
- timestamp
- algo
- alert

It takes as inputs a set of CSV prediction files produced by `predictsignauxfaibles`, typically:
- one file corresponding to the "default" model
- one file corresponding to the "small" model

In [None]:
# Turn off the Jedi-based completer for this IPython session
%config Completer.use_jedi = False

In [None]:
from collections import namedtuple
import logging
# Set the level of the root logger to INFO
logging.getLogger().setLevel(logging.INFO)

# Import required libraries and modules
from datetime import datetime
import pandas as pd
from pathlib import Path
import json

from predictsignauxfaibles.config import OUTPUT_FOLDER
from predictsignauxfaibles.merge_models import merge_models
from predictsignauxfaibles.utils import assign_flag, log_splits_size

In [None]:
# Imported only for debugging, to be removed
from matplotlib import pyplot as plt
import numpy as np

## Loading CSV predictions from last run of "default" and "small" models

In [None]:
# Prediction files of the model runs to merge, keyed by model name.
PREDICTION_FILES = {
    "default": "/home/simon.lebastard/predictsignauxfaibles/predictsignauxfaibles/model_runs/default_20210531-165745/predictions.csv",
    "small": "/home/simon.lebastard/predictsignauxfaibles/predictsignauxfaibles/model_runs/small_20210531-180946/predictions.csv",
}

default = pd.read_csv(PREDICTION_FILES["default"])
small = pd.read_csv(PREDICTION_FILES["small"])

## Making alert flag
Using thresholds selected through the precision/recall curve.
Note that the default behaviour in our CLI will be to select thresholds through $f_{\beta}$ score maximisation.

In [None]:
# Alert thresholds used for the "default" model predictions.
DEFAULT_THRESHOLDS = {"t_rouge": 0.903, "t_orange": 0.134}

# Series.apply forwards the extra keyword arguments to assign_flag for each
# predicted probability.
default["alert"] = default["predicted_probability"].apply(
    assign_flag, **DEFAULT_THRESHOLDS
)

In [None]:
# Alert thresholds used for the "small" model predictions.
SMALL_THRESHOLDS = {"t_rouge": 0.683, "t_orange": 0.127}

# Series.apply forwards the extra keyword arguments to assign_flag for each
# predicted probability.
small["alert"] = small["predicted_probability"].apply(
    assign_flag, **SMALL_THRESHOLDS
)

(((( Debugging

In [None]:
# Debug: display the "default" predictions, now carrying the "alert" column
default

In [None]:
# Debug: one point per row of `default`; y tells whether expl_selection is null
default_expl_missing = default["expl_selection"].isnull()
plt.scatter(np.arange(len(default_expl_missing)), default_expl_missing)

In [None]:
# Debug: display the "small" predictions, now carrying the "alert" column
small

In [None]:
# Debug: one point per row of `small`; y tells whether expl_selection is null
small_expl_missing = small["expl_selection"].isnull()
plt.scatter(np.arange(len(small_expl_missing)), small_expl_missing)

))))

## Merging the two model predictions

Model "default" has priority over "small"

In [None]:
# Lightweight (name, predictions) pair handed to merge_models.
ModelPreds = namedtuple("ModelPreds", "name predictions")

# Order is kept as in the original cell: "default" first, then "small".
model_list = [
    ModelPreds(model_name, model_predictions)
    for model_name, model_predictions in (("default", default), ("small", small))
]

In [None]:
# Columns passed to merge_models as `cols_to_merge`.
COLS_TO_MERGE = [
    "alert",
    "predicted_probability",
    "expl_selection",
    "macro_expl",
    "micro_expl",
    "macro_radar",
]

merged = merge_models(model_list, cols_to_merge=COLS_TO_MERGE)

(((( Quality control

In [None]:
# Quality control: display the merged predictions
merged

In [None]:
# Quality control: one point per row of `merged`; y tells whether alert is null
merged_alert_missing = merged["alert"].isnull()
plt.scatter(np.arange(len(merged_alert_missing)), merged_alert_missing)

In [None]:
# Quality control: number of non-null siret values per `which_model` value
merged.groupby("which_model")["siret"].count()

In [None]:
# Quality control: one point per row of `merged`; y tells whether
# expl_selection is null
merged_expl_missing = merged["expl_selection"].isnull()
plt.scatter(np.arange(len(merged_expl_missing)), merged_expl_missing)

In [None]:
# Quality control: percentage of rows per alert level
# (non-null siret count per level, times 100, divided by the number of rows)
merged.groupby("alert")["siret"].count().mul(100).div(len(merged))

))))

## Applying Redressement a posteriori

In [None]:
from predictsignauxfaibles.pipelines import REDRESSEMENTS_PIPELINE, run_pipeline

## Fetching datasets necessary to apply redressements

In [None]:
# SFDataset is used below but was never imported in this notebook, so the cell
# failed with a NameError on a fresh kernel.
# NOTE(review): module path assumed to be predictsignauxfaibles.data — confirm.
from predictsignauxfaibles.data import SFDataset

# Fields requested for both datasets used by the redressements.
REDR_FEATURES = [
    "siret",
    "siren",
    "periode",
    "code_naf",
    "cotisation_moy12m",
    "ratio_dette",
    "ratio_dette_moy12m",
    "tag_debit",
    "tag_default",
    "cotisation",
    "montant_part_ouvriere",
    "montant_part_patronale",
    "delai_nb_jours_total",
    "delai_montant_echeancier",
    "delai_nb_jours_restants",
]

# Dataset restricted to July 2020.
july_2020 = SFDataset(
    date_min="2020-07-01",
    date_max="2020-07-31",
    fields=REDR_FEATURES,
    sample_size=1_000_000,
)

# Dataset restricted to January 2021.
# Fix: this was previously assigned to `july_2020` as well, which overwrote the
# July dataset and left `jan_2021` undefined for the cells that follow.
jan_2021 = SFDataset(
    date_min="2021-01-01",
    date_max="2021-01-31",
    fields=REDR_FEATURES,
    sample_size=1_000_000,
)

In [None]:
# Source of the redressement datasets: "csv" (local dumps) or "mongo".
READ_FROM = "csv"

if READ_FROM == "csv":
    july_2020 = pd.read_csv("/home/common/july_2020.csv")
    jan_2021 = pd.read_csv("/home/common/jan_2021.csv")
elif READ_FROM == "mongo":
    # NOTE(review): after fetch_data() these names still refer to the dataset
    # objects, while the next cells call DataFrame methods (set_index, join)
    # on them — confirm whether the underlying frame must be extracted here.
    july_2020.fetch_data()
    jan_2021.fetch_data()
else:
    # Fail fast instead of silently leaving the datasets unloaded.
    raise ValueError(f'READ_FROM must be "csv" or "mongo", got {READ_FROM!r}')

In [None]:
# Index the three frames by siret, in place, so they can be joined on it.
# Not re-runnable as-is: once "siret" is the index, it is no longer a column.
for frame_to_index in (jan_2021, july_2020, merged):
    frame_to_index.set_index("siret", inplace=True)

### Joining merged with the datasets for Redressements

In [None]:
# Index-based left joins: first the July 2020 data, then the January 2021 data.
# Overlapping column names are disambiguated by the suffixes given to each join.
merged_redr = (
    merged
    .join(july_2020, rsuffix="_july2020")
    .join(jan_2021, lsuffix="_july2020", rsuffix="_latest")
)

(((( Quality control: did we lose SIRETs along the way?

))))

In [None]:
# Run the redressements pipeline on the joined frame.
# The result is stored under the same name, so re-running this cell feeds the
# pipeline its own previous output; re-run the join cell first.
merged_redr = run_pipeline(merged_redr, REDRESSEMENTS_PIPELINE)

In [None]:
# Peek at 3 random rows (no random_state is passed, so rows differ between runs)
merged_redr.sample(n=3)

In [None]:
# TODO: filter the fields that we want to keep (not implemented yet)


Preparing a new dummy list
---
From the output of a successful run of `python3 -m predictsignauxfaibles` using the new `explain` function, let's produce a list that we can communicate to the front-end team.

Collection `Scores` on MongoDB needs to receive documents that look like this:
```
{
    "siret": "12345678901234",
    "periode": "2019-01-01",
    "score": 0.996714234,
    "batch": "1904",
    "timestamp": 2019-01-01T14:56:58.418+00:00,
    "algo": "algo_avec_urssaf",
    "alert" :"Alerte seuil F1"
}
```

In [None]:
# Constant fields stamped on every row of the list, in insertion order.
EXPORT_CONSTANTS = {
    "periode": "2021-06-01",
    "batch": "<BATCH_NAME>",
    "algo": "default",
}

for constant_column, constant_value in EXPORT_CONSTANTS.items():
    merged_redr[constant_column] = constant_value

In [None]:
# Display the frame after the periode / batch / algo columns have been set
merged_redr

In [None]:
# One output folder per export, named after the current timestamp.
list_id = datetime.now().strftime("%Y%m%d-%H%M%S")
run_path = Path(OUTPUT_FOLDER) / list_id
run_path.mkdir(parents=True, exist_ok=True)

# DataFrame.to_json() already returns a JSON document as a string. Wrapping it
# in json.dumps() (as before) encoded that string a second time, so the file
# held a single escaped string literal instead of a JSON object.
# NOTE(review): this exports `merged`, not `merged_redr` (the frame carrying
# the redressements and the periode/batch/algo columns) — confirm which one is
# meant to be exported.
with open(run_path / "scores.json", "w", encoding="utf-8") as scores_file:
    scores_file.write(merged.to_json())

Alternative method:

In [None]:
# to_dict("records") does not include the index. An earlier cell moves "siret"
# to the index of `merged`, so it has to be restored as a column first;
# otherwise every exported record would be missing its siret.
records_source = merged.reset_index() if merged.index.name == "siret" else merged
pred_dict = records_source.to_dict("records")

In [None]:
# `json` and `Path` are imported in the notebook's top import cell.
export_path = Path("/home/simon.lebastard/predictsignauxfaibles/data/explain/scores_export_test.json")

# Serialise the records straight into the file (default options; passing
# allow_nan=False would make NaN values raise instead of being written as NaN).
with export_path.open("w", encoding="utf-8") as export_file:
    json.dump(pred_dict, export_file)