In [1]:
import pandas as pd
import warnings
from lifelines import CoxPHFitter
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
warnings.filterwarnings("ignore")

# Import des données

In [2]:
df = pd.read_csv('../data/base_modelis_pass_det.csv',sep = ';')

In [3]:
df['date_entree_defaut'] = pd.to_datetime(df['date_entree_defaut'], format='%d/%m/%y')
df['date_sortie_defaut'] = pd.to_datetime(df['date_sortie_defaut'], format='%d/%m/%y', errors='coerce')
df['arrete'] = pd.to_datetime(df['arrete'], format='%d/%m/%y')
df['DT_MEP_OPE'] = pd.to_datetime(df['DT_MEP_OPE'], format='%d%b%Y:%H:%M:%S.%f')
df['dt_arr_last_enc_ope'] = pd.to_datetime(df['dt_arr_last_enc_ope'], errors='coerce')
df['dt_arr_1st_enc_ope'] = pd.to_datetime(df['dt_arr_1st_enc_ope'], errors='coerce')
df['dtHJD_prov'] = pd.to_datetime(df['dtHJD_prov'], errors='coerce')
df['dtHJD_def'] = pd.to_datetime(df['dtHJD_def'], errors='coerce')

In [4]:
for column in df.columns :
    if df[column].isna().sum()/df.shape[0] > 0.5 : 
        df.drop(columns = [column], inplace = True)

In [5]:
df.drop(columns = ["CD_POST_BIEN_PFI"], inplace = True)

In [6]:
var_to_categorize = [
    "FL_REL_NON_ANNULE",
    "FL_ETR",
    "CD_ETAT_CIVIL",
    "CD_DNE_CHR_LGD_SAIN",
    "fl_prise_de_gar",
    "fl_fam_vam",
    "fam_ETH",
    "fam_ENQ",
    "fam_exp",
    "fam_hyp",
    "fam_sim",
    "fam_AEP",
    "dat_dec_echec",
    "solution",
    "fam_PCD",
    "CD_MTF_ENE_CTX",
    "niv_gravite_mtf_dne",
    "niv_gravite_mtf_dfs",
    "no_pass_def",
    "fl_def_ss_pass_CTX",
    "fl_prt_Conso",
    "fl_fonc",
    "AMI",
    "EXE",
    "PTG"
]
for var in var_to_categorize : 
    df[var] = df[var].astype("object")

In [7]:
cols_numeriques = df.select_dtypes(include=['float64', 'int64']).columns.to_list()
cols_categorielles = df.select_dtypes(include=['object']).columns.to_list()
cols_categorielles.remove("cd_op")

for col in cols_numeriques:
    df[col].fillna(df[col].mean(), inplace=True)

for col in cols_categorielles:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [8]:
def encode_columns(df):
    encoders = {}
    for column in cols_categorielles:
        unique_values = df[column].nunique()
        if unique_values <= 3:
            # Appliquer OneHotEncoder
            encoder = OneHotEncoder(sparse=False)
            encoded = encoder.fit_transform(df[[column]])
            encoders[column] = encoder

            # Remplacer la colonne originale par de nouvelles colonnes pour chaque catégorie
            for i, category in enumerate(encoder.categories_[0]):
                df[str(column) + '_' + str(category)] = encoded[:, i]
            df.drop(column, axis=1, inplace=True)
        else:
            # Appliquer LabelEncoder
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            encoders[column] = encoder
    return df, encoders

encoded_df, encoders = encode_columns(df)

In [86]:
from scipy.stats import pointbiserialr


def calculate_pointbiserial_corr(df, binary_var):
    corr_dict = {}
    for col in df.columns:
        if col != binary_var:
            corr, p_value = pointbiserialr(df[binary_var], df[col])
            corr_dict[col] = corr
    return corr_dict

# Calcul de la corrélation point-bisériale
corr_dict = calculate_pointbiserial_corr(encoded_df, 'fl_pass_DET')

# Filtrer les variables avec une corrélation > 0.30
selected_variables = {k: v for k, v in corr_dict.items() if abs(v) > 0.05}

selected_variables.keys()

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('float64'), dtype('<U10')) -> None

In [None]:
selected_columns

# Préparation des données

In [9]:
df_validation = df[df['arrete']>"2020-12-30"]
df_train = df[df['arrete']<"2020-12-30"]

In [10]:
df_train

Unnamed: 0,cd_op,date_entree_defaut,arrete,fl_pass_DET,nb_prt,DT_MEP_OPE,CD_NAT_EMP1,CD_NAT_EMP2,ANC_BANC_MAX_PFI,CD_DEST_PFI,...,fl_prt_Conso_0,fl_prt_Conso_1,fl_fonc_0,fl_fonc_1,AMI_0.0,AMI_1.0,EXE_0.0,EXE_1.0,PTG_0.0,PTG_1.0
0,I860708600,2017-01-16,2017-01-31,0,1,1986-09-15,3,2,83.291914,5,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,I860708600,2017-01-16,2017-02-28,0,1,1986-09-15,3,2,83.291914,5,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,I860708600,2017-01-16,2017-03-31,0,1,1986-09-15,3,2,83.291914,5,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
3,I860708600,2017-01-16,2017-04-30,0,1,1986-09-15,3,2,83.291914,5,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
4,I860708600,2017-01-16,2017-05-31,0,1,1986-09-15,3,2,83.291914,5,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327989,M200618962,2020-10-22,2020-10-31,0,1,2020-09-14,3,2,40.000000,0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
327990,M200618962,2020-10-22,2020-11-30,0,1,2020-09-14,3,2,40.000000,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
328215,M200807648,2020-09-19,2020-09-30,0,1,2020-09-19,3,2,9.000000,3,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
328216,M200807648,2020-09-19,2020-10-31,0,1,2020-09-19,3,2,9.000000,3,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [11]:
data_haz = df_train.copy()
data_haz = data_haz.drop_duplicates(subset = 'cd_op', keep = 'last')

In [12]:
data_haz

Unnamed: 0,cd_op,date_entree_defaut,arrete,fl_pass_DET,nb_prt,DT_MEP_OPE,CD_NAT_EMP1,CD_NAT_EMP2,ANC_BANC_MAX_PFI,CD_DEST_PFI,...,fl_prt_Conso_0,fl_prt_Conso_1,fl_fonc_0,fl_fonc_1,AMI_0.0,AMI_1.0,EXE_0.0,EXE_1.0,PTG_0.0,PTG_1.0
11,I860708600,2017-01-16,2017-12-31,0,1,1986-09-15,3,2,83.291914,5,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
24,I950404213,2017-03-15,2018-03-31,0,1,1995-06-06,3,2,83.291914,3,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
33,I950706618,2016-01-28,2017-04-30,1,1,1995-09-25,3,2,83.291914,3,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
39,I950901248,2017-01-27,2017-06-30,0,1,1995-10-18,3,2,167.000000,3,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
41,I960300578,2016-06-28,2016-07-31,0,1,1996-04-10,3,2,5.000000,3,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327905,M200503777,2020-08-19,2020-11-30,0,1,2020-08-19,3,2,173.000000,0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
327924,M200504641,2020-09-15,2020-11-30,0,1,2020-07-15,3,2,1.000000,3,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
327927,M200504653,2020-09-17,2020-11-30,0,1,2020-09-17,3,2,1.000000,3,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
327990,M200618962,2020-10-22,2020-11-30,0,1,2020-09-14,3,2,40.000000,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [13]:
data_haz["Survie"] =  data_haz["arrete"] - data_haz["date_entree_defaut"]

In [14]:
data_haz["Survie"] = data_haz["Survie"].dt.days

In [15]:
data_haz = data_haz.select_dtypes(exclude=['datetime64'])

In [16]:
data_haz.drop(columns = ["cd_op"], inplace = True)

# Modélisation

In [17]:
data_haz

Unnamed: 0,fl_pass_DET,nb_prt,CD_NAT_EMP1,CD_NAT_EMP2,ANC_BANC_MAX_PFI,CD_DEST_PFI,CD_NATUR_OP_PFI,MT_PATRIM_MOB,MT_CHA_HORS_OPE_PFI,NB_CHARGE_HORS_OPE_PFI,...,fl_prt_Conso_1,fl_fonc_0,fl_fonc_1,AMI_0.0,AMI_1.0,EXE_0.0,EXE_1.0,PTG_0.0,PTG_1.0,Survie
11,0,1,3,2,83.291914,5,16,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,349
24,0,1,3,2,83.291914,3,2,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,381
33,1,1,3,2,83.291914,3,2,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,458
39,0,1,3,2,167.000000,3,2,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,154
41,0,1,3,2,5.000000,3,2,0.0,2484.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327905,0,1,3,2,173.000000,0,10,15323.0,23862.0,3.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,103
327924,0,1,3,2,1.000000,3,10,3897.0,3912.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,76
327927,0,1,3,2,1.000000,3,16,4130.0,12840.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,74
327990,0,1,3,2,40.000000,0,16,150047.0,40006.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,39


In [18]:
cph = CoxPHFitter()

cph.fit(data_haz, duration_col='Survie', event_col='fl_pass_DET')

# Affichage du résumé du modèle
print(cph.summary)

ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [97]:
cph.check_assumptions(data_haz, p_value_threshold=0.05, show_plots=True)

AttributeError: Must call `fit` first.

In [98]:
cph.plot()

AttributeError: Must call `fit` first.