# Automated Post-integration Report - Signaux Faibles
Run this notebook after each new data integration performed by the [opensignauxfaibles](https://github.com/signaux-faibles/opensignauxfaibles) codebase.

In [None]:
# Fields passed to SFDataset when building the datasets used in this report.
VARIABLES = [
    "financier_court_terme",
    "interets",
    "ca",
    "equilibre_financier",
    "endettement",
    "degre_immo_corporelle",
    "liquidite_reduite",
    "poids_bfr_exploitation",
    "productivite_capital_investi",
    "rentabilite_economique",
    "rentabilite_nette",
    "cotisation",
    "cotisation_moy12m",
    "montant_part_ouvriere",
    "montant_part_ouvriere_past_1",
    "montant_part_ouvriere_past_12",
    "montant_part_ouvriere_past_2",
    "montant_part_ouvriere_past_3",
    "montant_part_ouvriere_past_6",
    "montant_part_patronale",
    "montant_part_patronale_past_1",
    "montant_part_patronale_past_12",
    "montant_part_patronale_past_2",
    "montant_part_patronale_past_3",
    "montant_part_patronale_past_6",
    "ratio_dette",
    "ratio_dette_moy12m",
    "effectif",
    "apart_heures_consommees_cumulees",
    "apart_heures_consommees",
    "paydex_nb_jours",
    "paydex_nb_jours_past_12",
]
# These fields are always queried, whatever the selection above.
VARIABLES.extend(
    ["outcome", "periode", "siret", "siren", "time_til_outcome", "code_naf"]
)

# Current period.
LATEST_PERIODE = "2021-02-01"

## Fetch a random sample of the data

In [None]:
%config Completer.use_jedi = False
import pandas as pd

In [None]:
from predictsignauxfaibles.data import SFDataset

In [None]:
# Build a dataset limited to the fields in VARIABLES, capped at 100,000 rows.
dataset = SFDataset(fields=VARIABLES, sample_size=100_000)
# The trailing semicolon keeps the notebook from echoing the call's result.
dataset.fetch_data();

## Temporal Coverage and NA values

In [None]:
# Parse the period column into pandas datetimes so it can be grouped by date.
dataset.data["periode"] = pd.to_datetime(dataset.data["periode"])

In [None]:
# Earliest and latest period found in the fetched data, as plain dates.
date_range = tuple(
    bound.date()
    for bound in (dataset.data.periode.min(), dataset.data.periode.max())
)
print(f"Data goes from {date_range[0]} to {date_range[1]}")

In [None]:
# Percentage of missing values per column, highest first.
# NOTE(review): relies on SFDataset supporting len() — confirm it matches the row count.
na_rates_df = (
    dataset.data.isna()
    .sum()
    .div(len(dataset))
    .mul(100)
    .sort_values(ascending=False)
    .to_frame(name="NA rate")
)
na_rates_df

## Coverage over time for selected variables

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def count_na_prop(values):
    """Return the percentage (0-100) of missing entries in ``values``.

    Used as the aggregation function for the monthly NA-rate plots below.
    The result is expressed in percent so that it matches the fixed
    ``[0, 100]`` y-axis of those plots. An empty group yields NaN.
    """
    # The mean of the boolean NA mask is the fraction of missing entries.
    return values.isna().mean() * 100


# One panel per variable, stacked vertically. squeeze=False always returns an
# array of axes, which is then flattened to one dimension.
fig, axs = plt.subplots(len(VARIABLES), squeeze=False, figsize=(10, 100))
axs = axs.ravel()
# The monthly grouping is identical for every variable, so build it once.
monthly = dataset.data.groupby(pd.Grouper(key="periode", freq="M"))
for ax, variable in zip(axs, VARIABLES):
    grouped = monthly.agg({variable: count_na_prop})
    ax.set_title(variable)
    ax.set_ylim([0, 100])
    # plot() handles datetime x-values directly; plot_date is deprecated.
    ax.plot(grouped.index, grouped[variable], "-")
    ax.set(adjustable="box")
# Lay out after drawing so that the panel titles are taken into account.
fig.tight_layout()

## Average over time

In [None]:
from pandas.api.types import is_numeric_dtype

In [None]:
# Keep only the columns with a numeric dtype: these are the ones averaged below.
VARIABLES_TO_AVERAGE = [var for var in VARIABLES if is_numeric_dtype(dataset.data[var])]
# squeeze=False always returns an array of axes, even when a single variable
# qualifies; it is then flattened to one dimension.
fig, axs = plt.subplots(len(VARIABLES_TO_AVERAGE), squeeze=False, figsize=(10, 100))
axs = axs.ravel()
# The monthly grouping is identical for every variable, so build it once.
monthly = dataset.data.groupby(pd.Grouper(key="periode", freq="M"))
for ax, variable in zip(axs, VARIABLES_TO_AVERAGE):
    grouped = monthly.agg({variable: "mean"})
    ax.set_title(variable)
    # The y-axis is left to autoscale for each variable.
    # plot() handles datetime x-values directly; plot_date is deprecated.
    ax.plot(grouped.index, grouped[variable], "-")
    ax.set(adjustable="box")
# Lay out after drawing so that the panel titles are taken into account.
fig.tight_layout()

## Codes NAF

In [None]:
import seaborn as sns
# Number of rows with a non-null outcome for each NAF code.
grouped = (
    dataset.data
    .groupby("code_naf", as_index=False)
    .agg({"outcome": "count"})
)
sns.barplot(x=grouped["code_naf"], y=grouped["outcome"]);

## Codes NAF over time

In [None]:
# Count non-null outcomes per NAF code within two-quarter time bins.
grouped = (
    dataset.data
    .groupby([pd.Grouper(key="periode", freq="2Q"), "code_naf"])
    .agg({"outcome": "count"})
    .reset_index()
)
plt.figure(figsize=(15, 10))
sns.lineplot(
    x=grouped["periode"],
    y=grouped["outcome"],
    hue=grouped["code_naf"],
);

## Analysis for latest period only

In [None]:
# Rebind `dataset` to a new fetch with the same fields, passing LATEST_PERIODE as date_min.
# NOTE(review): date_min is presumably the earliest period kept — confirm against SFDataset.
dataset = SFDataset(fields=VARIABLES, date_min=LATEST_PERIODE, sample_size=100_000)
# The trailing semicolon keeps the notebook from echoing the call's result.
dataset.fetch_data();

In [None]:
# Percentage of missing values per column, highest first, for the current `dataset`.
# NOTE(review): relies on SFDataset supporting len() — confirm it matches the row count.
na_rates_df = (
    dataset.data.isna()
    .sum()
    .div(len(dataset))
    .mul(100)
    .sort_values(ascending=False)
    .to_frame(name="NA rate")
)
na_rates_df