In [1]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import model_selection, dummy, metrics
import pickle

print(pd.__version__) #1.1.3
print(np.__version__) #1.19.4
print(sns.__version__) #0.11.0
print(matplotlib.__version__) # 3.3.2
print(sklearn.__version__) # 0.23.2

1.1.3
1.19.4
0.11.0
3.3.2
0.23.2


# Récupération des données

In [2]:
data = pd.read_csv("data/train_df.csv", index_col="index")
cat_features = pickle.load(open("pickle_cat_features.pkl", "rb"))

# Isolation de la cible et séparation des données (train, test)

In [3]:
X = data.drop(columns=['TARGET','SK_ID_CURR'])
y = data["TARGET"]
cat_features_index = np.where(X.columns.isin(cat_features))

del data
gc.collect()

20

In [9]:
list(cat_features_index[0])

[0, 1, 2, 3, 9, 10, 11, 12, 13, 26, 30, 38, 84, 85, 87, 88, 120]

In [10]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_train : {y_train.shape}")
print(f"y_test : {y_test.shape}")

del X, y
gc.collect()

X_train : (246005, 676)
X_test : (61502, 676)
y_train : (246005,)
y_test : (61502,)


3808

# Etablissement d'une baseline

In [11]:
# dummy classifier 
dummy_classifier = dummy.DummyClassifier(strategy="stratified", random_state=123)
dummy_classifier.fit(X_train,y_train)

DummyClassifier(random_state=123, strategy='stratified')

In [12]:
# performances
probas_pred = dummy_classifier.predict_proba(X_test)[:, 1]
p, r, _ = metrics.precision_recall_curve(y_test, probas_pred)
baseline = metrics.auc(r,p)
print(f"Baseline AUCPR par dummy classifier  : {baseline}")

del dummy_classifier
gc.collect()

Baseline AUCPR par dummy classifier  : 0.11880592645594451


40

# Entraînement d'un classifieur LightGBM sur le train set sur 5 folds (5 classifieurs)

In [13]:
# rectification des noms des features pour le classifieur LGBM

def replace_name(name):
    for c in ["[","]",",","{","}",'"',":"," "]:
        if c in name :
            name = name.replace(c,"_")
    return name
            
features = list(map(replace_name, X_train.columns))
X_train.columns = features
X_test.columns = features

In [14]:
# métrique LGBM personnalisée utilisant l'aire sous la courbe precision-recall (AUCPR)

def f_aucpr(y_true, probas_pred):
    
    p, r, _ = metrics.precision_recall_curve(y_true, probas_pred)
    score = metrics.auc(r,p)
    return "AUCPR", score, True

In [None]:
# modèle de cross validation pour entraînement par fold
folds = model_selection.StratifiedKFold(n_splits= 5, shuffle=True, random_state=123)

# création des dataframes pour stocker les résultats
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])
feature_importance_df = pd.DataFrame()

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    train_x, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    valid_x, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # paramètres déterminés par optimisation Bayésienne
    clf = lgb.LGBMClassifier(
        n_jobs=-1,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=37,
        colsample_bytree=0.1626780977682063,
        subsample=0.9061817454792065,
        max_depth=5,
        reg_alpha=0.8515028010996651,
        reg_lambda=0.5421869781111216,
        min_split_gain=0.021063972265591233,
        min_child_weight=29.09287622047518,
        silent=-1,
        verbose=-1,
        random_state=123,
        categorical_feature=list(cat_features_index[0]))
    
    
    # entraînement de chaque fold
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
        eval_metric=f_aucpr, verbose= 200, early_stopping_rounds= 200)

    # stockage des prédictions
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
    # stockage des features importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = X_train.columns
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    _, score, _ = f_aucpr(valid_y, oof_preds[valid_idx])
    print('Fold %2d AUCPR : %.6f' % (n_fold + 1, score))
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's binary_logloss: 0.241213	training's AUCPR: 0.302481	valid_1's binary_logloss: 0.245612	valid_1's AUCPR: 0.264616
[400]	training's binary_logloss: 0.230647	training's AUCPR: 0.33594	valid_1's binary_logloss: 0.23904	valid_1's AUCPR: 0.279485
[600]	training's binary_logloss: 0.224771	training's AUCPR: 0.361489	valid_1's binary_logloss: 0.236785	valid_1's AUCPR: 0.286548


In [None]:
# affichage des performances

_, train_score, _ = f_aucpr(y_train, oof_preds)
_, test_score, _ = f_aucpr(y_test, sub_preds)
print(f"AUCPR sur train set  : {train_score}")
print(f"AUCPR sur test set : {test_score}")

In [None]:
# affichage de l'importance des features

cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean()\
                    .sort_values(by="importance", ascending=False)[:40].index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('img/lgbm_importances01.png')

# Entraînement d'un LGBM Classifier unique sur le train set

In [None]:
# création d'un set de validation

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_train, y_train, test_size=0.2, 
                                                                      random_state=123)

In [None]:
# modèle avec les mêmes paramètres

clf = lgb.LGBMClassifier(
        n_jobs=-1,
        n_estimators=10000,        
        learning_rate=0.02,
        num_leaves=37,
        colsample_bytree=0.1626780977682063,
        subsample=0.9061817454792065,
        max_depth=5,
        reg_alpha=0.8515028010996651,
        reg_lambda=0.5421869781111216,
        min_split_gain=0.021063972265591233,
        min_child_weight=29.09287622047518,
        silent=-1,
        verbose=-1,
        random_state=123,
        categorical_feature=list(cat_features_index[0]))

# entraînement
clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
    eval_metric=f_aucpr, verbose= 200, early_stopping_rounds= 200)

In [None]:
# performances

y_pred = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1]
f_aucpr(y_test, y_pred)

In [None]:
# features importances

feature_importance_df = pd.DataFrame()
feature_importance_df["feature"] = X_train.columns
feature_importance_df["importance"] = clf.feature_importances_
best_features_idx = feature_importance_df.sort_values(by="importance", ascending=False)[:40].index
plt.figure(figsize=(8, 10))
sns.barplot(x="importance", y="feature", data=feature_importance_df.iloc[best_features_idx].sort_values(by="importance", ascending=False))
plt.title('LightGBM Features')
plt.tight_layout()

Les performances sont à peine moins bonnes, je garde donc ce principe d'un classifieur unique.


In [None]:
# AUROC du modèle

print(metrics.roc_auc_score(y_test, y_pred))


In [None]:
# recherche du seuil optimisant le f1-score

for thres in np.linspace(0.1,0.19,10):
    y_pred_label = [0 if i<=thres else 1 for i in y_pred]
    score = metrics.f1_score(y_test,y_pred_label)
    print(f"Seuil {thres} f1-score {score}")

In [None]:
# rapport de performances 

thres = 0.15
y_pred_label = [0 if i<=thres else 1 for i in y_pred]
print(metrics.classification_report(y_test, y_pred_label, digits=4))
print(metrics.confusion_matrix(y_test, y_pred_label))