In [1]:
import pandas as pd
from pathlib import Path
from utils import define_label
from functools import partial
import joblib

In [2]:
# Locations of the raw CSV exports, all resolved from the shared data folder.
path_data = Path("../data")
path_data_raw = path_data.joinpath("raw")
path_peda = path_data_raw.joinpath("donnees-peda.csv")
path_sat_sta = path_data_raw.joinpath("satisfaction-stagiaires.csv")
path_sat_ens = path_data_raw.joinpath("satisfaction-enseignants.csv")

# Load the three tables: pedagogical data, trainee satisfaction and
# teacher satisfaction.
dfp = pd.read_csv(path_peda)
# The first line of the trainee-satisfaction file is skipped before parsing.
dfs_s = pd.read_csv(path_sat_sta, skiprows=1)
dfs_e = pd.read_csv(path_sat_ens)

In [3]:
# Build a "NOM_COMPLET" key (NOM, a space, then PRENOM) on every table; the
# trainee tables are later matched and merged on it.
dfp["NOM_COMPLET"] = dfp["NOM"] + " " + dfp["PRENOM"]

# Drop satisfaction rows that have no NOM. The explicit copy makes the
# filtered frame independent of the original one, so the column assignments
# made on it afterwards do not raise SettingWithCopyWarning.
dfs_s = dfs_s.loc[dfs_s["NOM"].notna()].copy()
dfs_s["NOM_COMPLET"] = dfs_s["NOM"] + " " + dfs_s["PRENOM"]

dfs_e["NOM_COMPLET"] = dfs_e["NOM"] + " " + dfs_e["PRENOM"]

In [4]:
## Trainees in initial training (FI) or continuing education (FC)
b_fi = dfp["FI/FC"].eq("FI")
b_fc = dfp["FI/FC"].eq("FC")

noms_sta_satis = dfs_s["NOM_COMPLET"]
noms_sta_FI = dfp.loc[b_fi, "NOM_COMPLET"]
noms_sta_FC = dfp.loc[b_fc, "NOM_COMPLET"]

# Add the FI / FC column to the satisfaction data: a trainee is tagged "FI"
# when their full name appears among the FI rows of the pedagogical table,
# and "FC" in every other case.
dfs_s["is_FI"] = noms_sta_satis.isin(noms_sta_FI)
dfs_s["FI/FC"] = dfs_s["is_FI"].map({True: "FI", False: "FC"})

# Keep initial-training rows only, in both tables.
dfs_s = dfs_s.loc[dfs_s["FI/FC"] == "FI"]
dfp = dfp.loc[b_fi]

In [5]:
# Combine the two datasets (pedagogical + satisfaction). The right join keeps
# every row of dfp, including trainees without a satisfaction answer.
df = dfs_s.merge(dfp, how="right", on=["NOM_COMPLET", "NOM", "PRENOM"])

## feature engineering

In [6]:
# Registry of label lists, filled in by the feature cells and saved at the end.
labels = {}

In [7]:
## LIKERT - MEAN
# For every category, average its five "<category>.EV1" .. "<category>.EV5"
# columns row by row and store the result as "MOYENNE_<category>".
cats = ["Satisfaction globale", "motivation", "Pédagogie", "Organisation"]
ev_numbers = range(1, 6)
for cat in cats:
    cols = [f"{cat}.EV{k}" for k in ev_numbers]
    df[f"MOYENNE_{cat}"] = df.loc[:, cols].mean(axis=1)

In [8]:
## ATTENDANCE RATE
# Per-module rate: module k uses the three consecutive columns
# PJ(3k-2) .. PJ(3k), for k = 1..5.
for imod in range(1, 6):
    first_day = 3 * (imod - 1) + 1
    cols = [f"PJ{day}" for day in range(first_day, first_day + 3)]
    df[f"pre_{imod}_tx"] = df[cols].sum(axis=1) / len(cols)

# Overall rate over the fifteen columns PJ1 .. PJ15.
cols = [f"PJ{day}" for day in range(1, 16)]
df["pre_tx"] = df[cols].sum(axis=1) / len(cols)

# Turn the overall rate into one of three labels through define_label; the
# exact boundary handling at 0.5 and 0.79 is decided by that helper.
cuts = [0.5, 0.79]
labels["presence"] = ["absent(e)", "intermittent(e)", "assidu(e)"]
define_pre_label = partial(define_label, cuts=cuts, labs=labels["presence"])

df["pre_tx_lab"] = df["pre_tx"].apply(define_pre_label)

In [9]:
## ACADEMIC PERFORMANCE
# Overall mean of each trainee over REV1 .. REV6. The row sum ignores missing
# values but is still divided by 6, so a missing grade counts as 0.
cols = [f"REV{i}" for i in range(1, 7)]
df["MOYENNE_EV"] = df[cols].sum(axis=1)/len(cols)

# Cut points shared by the pass/fail flag and the four-level label below.
aca_cuts = [5, 6, 7.9]

# Pass/fail flag. The previous threshold (0.5) did not match the label cuts
# applied to the same column (5, 6, 7.9), so nearly every trainee was flagged
# as successful; the pass mark is now the first cut, i.e. the upper limit of
# the failing label.
# NOTE(review): this assumes MOYENNE_EV is on a 0-10 scale, as the cuts
# suggest - confirm against the grading scheme.
df["SUCCES"] = (df["MOYENNE_EV"] > aca_cuts[0])

# Four-level label: fail, average, good, excellent (boundary handling is
# decided by define_label).
labels["perf_academique"] = ["échec", "moyen", "bon", "excellent"]
define_academic_label = partial(define_label, cuts = aca_cuts, labs = labels["perf_academique"])
df["academic_lab"] = df["MOYENNE_EV"].apply(define_academic_label)

In [10]:
## NPS SCORE
# Label each "NPS.EV6" answer as detractor / passive / promoter.
# NOTE(review): the usual NPS convention is 0-6 detractors, 7-8 passives,
# 9-10 promoters; whether cuts [7, 8] reproduce it depends on how
# define_label treats values equal to a cut - confirm.
cuts = [7, 8]
nps_names = ["detracteurs", "passifs", "promoteurs"]
labels["NPS"] = nps_names
define_nps_label = partial(define_label, cuts=cuts, labs=labels["NPS"])
df["NPS_LABEL"] = df["NPS.EV6"].apply(define_nps_label)

In [11]:
## PLATFORM ENGAGEMENT
# Engagement rate: OC_JC divided by 15.
# NOTE(review): 15 presumably is the number of course days (PJ1..PJ15) - confirm.
df["OC_engag_tx"] = df["OC_JC"]/15

# Three-level engagement label derived from the rate through define_label.
cuts = [0.4,0.75]
labels["OC_engagement"] = ["désengagé(e)", "moy. engagé(e)", "très engagé(e)"]
define_engag_label = partial(define_label, cuts = cuts, labs = labels["OC_engagement"])
df["OC_engag_label"] = df["OC_engag_tx"].apply(define_engag_label)

# Frequency label: OC_F codes 1, 2, 3 map to the three labels in order.
# An explicit code -> label mapping replaces positional list indexing, which
# silently wrapped around for codes <= 0 (code 0 became "haute") and raised
# on missing or float-typed values. Codes outside 1..3 now give NaN.
labels["OC_frequence"] = ["faible", "moyenne", "haute"]
oc_freq_by_code = dict(enumerate(labels["OC_frequence"], start=1))
df["OC_F_lab"] = df["OC_F"].map(oc_freq_by_code)

In [12]:
## For each module, which trainees attended all of its in-person sessions?
# Module k covers columns PJ(3k-2) .. PJ(3k); the flag is True when the three
# values add up to 3.
for i_mod in range(1, 6):
    start = 3 * (i_mod - 1) + 1
    cols = [f"PJ{day}" for day in range(start, start + 3)]
    attended = df[cols].sum(axis=1)
    df[f"present_tout_module_{i_mod}"] = attended.eq(3)

In [13]:
## Complete absence for each module
# The flag is True when the three PJ columns of the module add up to 0.
for imod in range(1, 6):
    last_day = 3 * imod
    cols = [f"PJ{day}" for day in (last_day - 2, last_day - 1, last_day)]
    df[f"absent_module_{imod}"] = df[cols].sum(axis=1).eq(0)

In [14]:
# The trainee dropped out after module k: not fully absent from module k,
# fully absent from every later module, and PEF equal to 0.
pef_is_zero = df["PEF"] == 0
for mod in range(1, 6):
    dropped = ~df[f"absent_module_{mod}"]
    later_mod = mod + 1
    while later_mod < 6:
        dropped = dropped & df[f"absent_module_{later_mod}"]
        later_mod += 1
    df[f"abandon_apres_module_{mod}"] = dropped & pef_is_zero

In [15]:
# Output locations, next to the raw data folder.
path_df_stagiaire = path_data.joinpath("df_stagiaire.pkl")
path_df_intervenant = path_data.joinpath("df_intervenant.pkl")
path_labels = path_data.joinpath("labels.pkl")

# Persist the enriched trainee table and the teacher table as pickles.
df.to_pickle(path_df_stagiaire)
dfs_e.to_pickle(path_df_intervenant)

# Persist the label registry with joblib through an explicitly opened file.
with open(path_labels, "wb") as f:
    joblib.dump(labels, f)