In [1]:
import numpy as np
import pandas as pd
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, impute
import pickle

# Load the main dataset
main_csv = "data/df2.csv"
df = pd.read_csv(main_csv)

# One-hot encoding pour variables catégorielles avec get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df`` with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. The caller's frame is left untouched; a new frame
        is returned.
    nan_as_category : bool, default True
        Forwarded to ``get_dummies`` as ``dummy_na``.

    Returns
    -------
    tuple
        ``(encoded, added)``: the encoded frame, and the list of column names
        present in the encoded frame but absent from the input, in the order
        they appear in the encoded frame.
    """
    known = set(df.columns)

    # Columns are selected on their dtype being exactly 'object'.
    object_cols = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_cols.append(name)

    encoded = pd.get_dummies(df, columns=object_cols, dummy_na=nan_as_category)
    added = [name for name in encoded.columns if name not in known]
    return encoded, added

# Prétraitement des demandes de crédit antérieures chez Home Credit

In [2]:
# Load the previous-application records and one-hot encode their object
# columns; cat_cols receives the names of the indicator columns created.
prev, cat_cols = one_hot_encoder(
    pd.read_csv('data/previous_application.csv'),
    nan_as_category=True,
)

In [3]:
# Outlier handling: the value 365243 in the DAYS_* columns below is replaced
# by NaN so that it is treated as missing.
#
# The replaced columns are assigned back to `prev` instead of calling
# `prev[col].replace(..., inplace=True)`: an in-place method applied to a
# column selected from the frame is a chained assignment, which warns in
# recent pandas and does not update `prev` at all under Copy-on-Write.
days_sentinel = 365243
days_columns = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
prev[days_columns] = prev[days_columns].replace(days_sentinel, np.nan)

In [4]:
# New feature: ratio of the amount applied for to the amount of credit granted.
# Rows whose AMT_CREDIT is 0 yield inf or NaN here.
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'].div(prev['AMT_CREDIT'])

In [5]:
# Aggregation plans used by the per-client group-by

# Numeric columns: statistics requested for each of them
# (key order is kept because it drives the order of the output columns)
num_aggregations = dict(
    AMT_ANNUITY=['min', 'max', 'mean'],
    AMT_APPLICATION=['min', 'max', 'mean'],
    AMT_CREDIT=['min', 'max', 'mean'],
    APP_CREDIT_PERC=['min', 'max', 'mean', 'var'],
    AMT_DOWN_PAYMENT=['min', 'max', 'mean'],
    AMT_GOODS_PRICE=['min', 'max', 'mean'],
    HOUR_APPR_PROCESS_START=['min', 'max', 'mean'],
    RATE_DOWN_PAYMENT=['min', 'max', 'mean'],
    DAYS_DECISION=['min', 'max', 'mean'],
    CNT_PAYMENT=['mean', 'sum'],
)

# One-hot indicator columns: only their mean is aggregated
cat_aggregations = {indicator: ['mean'] for indicator in cat_cols}

cat_aggregations

{'NAME_CONTRACT_TYPE_Cash loans': ['mean'],
 'NAME_CONTRACT_TYPE_Consumer loans': ['mean'],
 'NAME_CONTRACT_TYPE_Revolving loans': ['mean'],
 'NAME_CONTRACT_TYPE_XNA': ['mean'],
 'NAME_CONTRACT_TYPE_nan': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_FRIDAY': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_MONDAY': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_SATURDAY': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_SUNDAY': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_THURSDAY': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_TUESDAY': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY': ['mean'],
 'WEEKDAY_APPR_PROCESS_START_nan': ['mean'],
 'FLAG_LAST_APPL_PER_CONTRACT_N': ['mean'],
 'FLAG_LAST_APPL_PER_CONTRACT_Y': ['mean'],
 'FLAG_LAST_APPL_PER_CONTRACT_nan': ['mean'],
 'NAME_CASH_LOAN_PURPOSE_Building a house or an annex': ['mean'],
 'NAME_CASH_LOAN_PURPOSE_Business development': ['mean'],
 'NAME_CASH_LOAN_PURPOSE_Buying a garage': ['mean'],
 'NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land': ['mean'],
 'NAME_CASH_LOAN_PU

In [6]:
# Per-client aggregates (one row per SK_ID_CURR)

# All previous applications: numeric statistics plus indicator means.
# The two-level (column, statistic) labels are flattened to PREV_<COLUMN>_<STAT>.
full_plan = {**num_aggregations, **cat_aggregations}
prev_agg = prev.groupby('SK_ID_CURR').agg(full_plan)
prev_agg.columns = pd.Index(
    [f"PREV_{col}_{stat.upper()}" for col, stat in prev_agg.columns.tolist()]
)

# Numeric statistics restricted to the rows flagged as approved
approved = prev.loc[prev['NAME_CONTRACT_STATUS_Approved'].eq(1)]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
approved_agg.columns = pd.Index(
    [f"APPROVED_{col}_{stat.upper()}" for col, stat in approved_agg.columns.tolist()]
)
prev_agg = prev_agg.join(approved_agg, on='SK_ID_CURR', how='left')

# Numeric statistics restricted to the rows flagged as refused
refused = prev.loc[prev['NAME_CONTRACT_STATUS_Refused'].eq(1)]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(
    [f"REFUSED_{col}_{stat.upper()}" for col, stat in refused_agg.columns.tolist()]
)
prev_agg = prev_agg.join(refused_agg, on='SK_ID_CURR', how='left')

In [7]:
# KNN imputation of the aggregated table

# +inf is turned into NaN first so that the imputer treats it as missing.
prev_agg.replace(np.inf, np.nan, inplace=True)

# Fit the imputer (default settings) on the whole table and write the filled
# values back under the same column labels.
prev_filled = impute.KNNImputer().fit_transform(prev_agg)
prev_agg[prev_agg.columns] = prev_filled

In [8]:
# Attach the previous-application aggregates to the main dataset
# (left join on SK_ID_CURR)
df = df.join(prev_agg, on='SK_ID_CURR', how='left')

# The aggregate table is no longer needed: drop it and collect garbage
del prev_agg
gc.collect()

# Report the size of the dataset and its overall share of missing cells
print(f"Taille du data set : {df.shape}")
n_missing = df.isna().sum().sum()
missing_values_prop = n_missing/df.size
print(f"Proportion de valeurs manquantes : {missing_values_prop}")

Taille du data set : (356251, 494)
Proportion de valeurs manquantes : 0.05814231850384066


In [9]:
# Save a checkpoint of the dataset (the index is not written)
checkpoint_csv = "data/df3.csv"
df.to_csv(checkpoint_csv, index=False)

# Prétraitement des données POS-Cash

In [10]:
# Load the POS-Cash balances and one-hot encode their object columns
pos, cat_cols = one_hot_encoder(
    pd.read_csv('data/POS_CASH_balance.csv'),
    nan_as_category=True,
)

# Aggregation plan: numeric statistics first, then the mean of each indicator
aggregations = dict(
    MONTHS_BALANCE=['max', 'mean', 'size'],
    SK_DPD=['max', 'mean'],
    SK_DPD_DEF=['max', 'mean'],
)
aggregations.update({indicator: ['mean'] for indicator in cat_cols})

# One row per SK_ID_CURR; labels flattened to POS_<COLUMN>_<STAT>
pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
pos_agg.columns = pd.Index(
    [f"POS_{col}_{stat.upper()}" for col, stat in pos_agg.columns.tolist()]
)

# Number of POS-Cash rows per client
pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()

# KNN imputation: +inf is turned into NaN first, then the filled values are
# written back under the same column labels
pos_agg.replace(np.inf, np.nan, inplace=True)
pos_filled = impute.KNNImputer().fit_transform(pos_agg)
pos_agg[pos_agg.columns] = pos_filled

# Left join onto the main dataset
df = df.join(pos_agg, on='SK_ID_CURR', how='left')

# The aggregate table is no longer needed
del pos_agg
gc.collect()

# Report the size of the dataset and its overall share of missing cells
print(f"Taille du data set : {df.shape}")
n_missing = df.isna().sum().sum()
missing_values_prop = n_missing/df.size
print(f"Proportion de valeurs manquantes : {missing_values_prop}")

Taille du data set : (356251, 512)
Proportion de valeurs manquantes : 0.05797354340759745


In [11]:
# Save a checkpoint of the dataset (the index is not written)
checkpoint_csv = "data/df4.csv"
df.to_csv(checkpoint_csv, index=False)

# Prétraitement des données des versements

In [12]:
# Load the installment payments and one-hot encode their object columns
ins = pd.read_csv('data/installments_payments.csv')
ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)

# New features: ratio and difference between the amount paid and the
# installment amount. A zero AMT_INSTALMENT yields inf or NaN in the ratio.
ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

# New features: days past due (DPD) and days before due (DBD), floored at 0.
# The clamp is a vectorised `where` rather than a per-row Python lambda; like
# the lambda, it maps every value that is not strictly positive (NaN
# included) to 0.
days_past_due = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
days_before_due = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
ins['DPD'] = days_past_due.where(days_past_due > 0, 0)
ins['DBD'] = days_before_due.where(days_before_due > 0, 0)

# Aggregation plan: numeric statistics first, then the mean of each indicator
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max', 'mean', 'sum'],
    'DBD': ['max', 'mean', 'sum'],
    'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
    'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
}
for cat in cat_cols:
    aggregations[cat] = ['mean']

# One row per SK_ID_CURR; labels flattened to INSTAL_<COLUMN>_<STAT>
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])

# Number of installment rows per client
ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()

# KNN imputation: +inf is turned into NaN first so that the imputer treats it
# as missing
ins_agg.replace(to_replace=np.inf, value=np.nan, inplace=True)
ins_agg[ins_agg.columns] = impute.KNNImputer().fit_transform(ins_agg)

# Left join onto the main dataset
df = df.join(ins_agg, how='left', on='SK_ID_CURR')

# The aggregate table is no longer needed
del ins_agg
gc.collect()

# Report the size of the dataset and its overall share of missing cells
print(f"Taille du data set : {df.shape}")
missing_values_prop = df.isnull().sum().sum()/df.size
print(f"Proportion de valeurs manquantes : {missing_values_prop}")

Taille du data set : (356251, 538)
Proportion de valeurs manquantes : 0.057432941243475435


In [13]:
# Save a checkpoint of the dataset (the index is not written)
checkpoint_csv = "data/df5.csv"
df.to_csv(checkpoint_csv, index=False)

# Prétraitement des données sur les cartes de crédit

In [14]:
# Load the credit-card balances and one-hot encode their object columns
cc, cat_cols = one_hot_encoder(
    pd.read_csv('data/credit_card_balance.csv'),
    nan_as_category=True,
)

# SK_ID_PREV is removed before aggregating so that it is not summarised;
# every remaining column gets the same five statistics per SK_ID_CURR.
cc = cc.drop(columns=['SK_ID_PREV'])
cc_stats = ['min', 'max', 'mean', 'sum', 'var']
cc_agg = cc.groupby('SK_ID_CURR').agg(cc_stats)
cc_agg.columns = pd.Index(
    [f"CC_{col}_{stat.upper()}" for col, stat in cc_agg.columns.tolist()]
)

# Number of credit-card rows per client
cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()

# KNN imputation: +inf is turned into NaN first, then the filled values are
# written back under the same column labels
cc_agg.replace(np.inf, np.nan, inplace=True)
cc_filled = impute.KNNImputer().fit_transform(cc_agg)
cc_agg[cc_agg.columns] = cc_filled

# Left join onto the main dataset
df = df.join(cc_agg, on='SK_ID_CURR', how='left')

# The aggregate table is no longer needed
del cc_agg
gc.collect()

# Report the size of the dataset and its overall share of missing cells
print(f"Taille du data set : {df.shape}")
n_missing = df.isna().sum().sum()
missing_values_prop = n_missing/df.size
print(f"Proportion de valeurs manquantes : {missing_values_prop}")

Taille du data set : (356251, 679)
Proportion de valeurs manquantes : 0.19280098840143192


In [15]:
# Save a checkpoint of the dataset (the index is not written)
checkpoint_csv = "data/df6.csv"
df.to_csv(checkpoint_csv, index=False)

# Enregistrement des données finales

In [16]:
# Split on TARGET: rows with a value go to the train set, rows without one
# go to the test set
has_target = df['TARGET'].notna()
train_df = df[has_target]
test_df = df[~has_target]
print(f"Train : {train_df.shape}")
print(f"Test : {test_df.shape}")

Train : (307507, 679)
Test : (48744, 679)


In [17]:
# Write both splits to CSV (the index is not written)
for split_frame, split_csv in (
    (train_df, "data/train_df.csv"),
    (test_df, "data/test_df.csv"),
):
    split_frame.to_csv(split_csv, index=False)

