In [1]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import model_selection, dummy, metrics
import pickle

# Report the version of each key library so the run can be reproduced.
# Versions recorded when the notebook was written:
#   pandas 1.1.3, numpy 1.19.4, seaborn 0.11.0, matplotlib 3.3.2, scikit-learn 0.23.2
for library in (pd, np, sns, matplotlib, sklearn):
    print(library.__version__)


1.1.3
1.19.4
0.11.0
3.3.2
0.23.2


# Récupération et préparation des données

In [2]:

# Load the (resampled) training split and the untouched test split.
X_train = pd.read_csv("data/X_train_resampled.csv")
y_train = pd.read_csv("data/y_train_resampled.csv")
X_test = pd.read_csv("data/X_test.csv")
y_test = pd.read_csv("data/y_test.csv")

# Names of the categorical features, read from a pickle file.
# A context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open("data/pickle_cat_features.pkl", "rb") as cat_features_file:
    cat_features = pickle.load(cat_features_file)

# Positions of the categorical columns in X_train. np.where returns a tuple, the
# index array is its first element. The positions are relative to the columns
# exactly as read from the CSV, i.e. before any column is dropped or renamed.
cat_features_index = np.where(X_train.columns.isin(cat_features))

# beta of the F-beta score used throughout the notebook
# (beta > 1 gives recall more weight than precision).
BETA = 2

NameError: name 'X' is not defined

In [None]:
# Remove the index/identifier columns so that only features (X) and the target (y) remain.
# Reassignment is used instead of inplace=True so each frame's lineage stays explicit.
X_train = X_train.drop(columns="Unnamed: 0")
X_test = X_test.drop(columns="SK_ID_CURR")
y_train = y_train.drop(columns="Unnamed: 0")
y_test = y_test.drop(columns="SK_ID_CURR")

# The categorical positions computed at load time referred to the column layout
# *before* "Unnamed: 0" was removed; dropping a column shifts every position after
# it. Recompute them against the final layout so the indices handed to LightGBM
# point at the intended columns. This must happen before the columns are renamed,
# because cat_features holds the original names.
cat_features_index = np.where(X_train.columns.isin(cat_features))

print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_train : {y_train.shape}")
print(f"y_test : {y_test.shape}")

def replace_name(name):
    """Return ``name`` with each special character replaced by an underscore.

    The characters replaced are ``[ ] , { } " :`` and the space. Every
    occurrence is replaced one-for-one, so the length of the string is kept.
    (Presumably needed because LightGBM rejects such characters in feature
    names - confirm.)

    Parameters
    ----------
    name : str
        Original column name.

    Returns
    -------
    str
        The sanitised column name.
    """
    forbidden = frozenset('[],{}": ')
    return "".join("_" if char in forbidden else char for char in name)
            
# Sanitise the training column names once and give both frames the same names.
# X_test is relabelled positionally with the names derived from X_train, so this
# assumes both frames have their columns in the same order - TODO confirm.
features = [replace_name(column) for column in X_train.columns]
X_train.columns = features
X_test.columns = features

# Etablissement d'une baseline

In [None]:
%%time
# Baseline model: a dummy classifier using the "stratified" strategy, seeded for
# reproducibility. It serves as the reference score for the real models.
dummy_classifier = dummy.DummyClassifier(
    strategy="stratified",
    random_state=123,
)
dummy_classifier.fit(X_train, y_train)

In [None]:
# Baseline performance.
# fbeta_score expects class labels, so the labels are taken from predict()
# instead of a column of predict_proba(). (With the "stratified" strategy the
# probability rows happen to be one-hot, which is why the previous form worked,
# but it would break for any other strategy.)
y_pred = dummy_classifier.predict(X_test)
baseline = metrics.fbeta_score(y_test, y_pred, beta=BETA)
print(f"Baseline F_beta par dummy classifier  : {baseline}")

# The baseline model is no longer needed: release it and reclaim memory.
del dummy_classifier
gc.collect()

# Entraînement d'un classifieur LightGBM sur le train set sur 5 folds (5 classifieurs)

In [None]:
# métrique F_beta

def f_beta(y_true, probas_pred):
    """F-beta evaluation metric usable as a LightGBM ``eval_metric`` callable.

    Scores are converted to class labels with a fixed 0.5 cut-off (score < 0.5
    gives 0, anything else gives 1) and compared with ``y_true`` using the
    notebook-level ``BETA``.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    probas_pred : array-like
        Predicted scores for the positive class.

    Returns
    -------
    tuple
        ``("F_beta", score, True)``: metric name, metric value, and a flag
        telling LightGBM that higher values are better.

    Notes
    -----
    Keep exactly two parameters: LightGBM's scikit-learn wrapper decides what to
    pass (labels/predictions, plus weights, plus groups) from the number of
    parameters of the callable, so an extra argument would receive sample weights.
    """
    probas = np.asarray(probas_pred)
    # Vectorised thresholding replaces the per-element np.vectorize(lambda ...),
    # which was a Python-level loop executed on every boosting round.
    # `probas < 0.5` (rather than `>= 0.5`) keeps the original mapping for NaN (-> 1).
    y_pred = np.where(probas < 0.5, 0, 1)
    score = metrics.fbeta_score(y_true, y_pred, beta=BETA)
    return "F_beta", score, True

In [None]:
%%time
# Cross-validation splitter: one model is trained per fold.
folds = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# Containers for the results:
#   oof_preds        - out-of-fold probability for every training row
#   sub_preds        - test-set probability averaged over the fold models
#   fold_importances - one feature-importance frame per fold
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])
fold_importances = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    train_x, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
    valid_x, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # Hyper-parameters found by Bayesian optimisation.
    # n_estimators is only an upper bound: early stopping (see fit) picks the
    # actual number of trees.
    # NOTE(review): LightGBM documents categorical_feature as an argument of
    # fit(); confirm the installed version honours it as a constructor keyword.
    clf = lgb.LGBMClassifier(
        n_jobs=-1,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=37,
        colsample_bytree=0.26424255740815,
        subsample=0.9222047021355166,
        max_depth=6,
        reg_alpha=0.8495823622837118,
        reg_lambda=0.7247308695357746,
        min_split_gain=0.05365093112258974,
        min_child_weight=28.91981182288273,
        silent=-1,
        verbose=-1,
        random_state=123,
        categorical_feature=list(cat_features_index[0]))

    # Train on this fold; evaluation is logged every 200 rounds and training
    # stops after 200 rounds without improvement.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric=f_beta, verbose= 200, early_stopping_rounds= 200)

    # Store the predictions made with the best iteration found by early stopping.
    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    # Store this fold's feature importances. Frames are collected in a list and
    # concatenated once after the loop instead of growing a DataFrame with
    # pd.concat on every iteration (which re-copies all previous rows each time).
    fold_importance_df = pd.DataFrame({
        "feature": X_train.columns,
        "importance": clf.feature_importances_,
        "fold": n_fold + 1,
    })
    fold_importances.append(fold_importance_df)

    _, score, _ = f_beta(valid_y, oof_preds[valid_idx])
    print('Fold %2d F_beta : %.6f' % (n_fold + 1, score))

    # Free the per-fold objects before the next iteration.
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()

# One row per (feature, fold).
feature_importance_df = pd.concat(fold_importances, axis=0)


In [None]:
# Display the performance of the 5-fold ensemble.
# The "train" score is computed on the out-of-fold predictions; the "test" score
# on the test-set predictions averaged over the fold models.
# f_beta returns (name, value, higher_is_better); only the value is needed here.
train_score = f_beta(y_train, oof_preds)[1]
test_score = f_beta(y_test, sub_preds)[1]
print(f"F_beta sur train set  : {train_score}")
print(f"F_beta sur test set : {test_score}")

In [None]:
# Plot the feature importances.

# The 40 features with the highest importance averaged over the folds, best first.
cols = (
    feature_importance_df[["feature", "importance"]]
    .groupby("feature")
    .mean()
    .sort_values(by="importance", ascending=False)
    .head(40)
    .index
)
# Keep every per-fold row of those features so the bar plot can average them.
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(8, 10))
# `order` ranks the bars by the mean importance computed above. Sorting the
# per-fold rows instead (as before) ordered the bars by the single best fold
# value of each feature, which can disagree with the averaged ranking the
# title announces.
sns.barplot(x="importance", y="feature", data=best_features, order=list(cols))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('img/lgbm_importances01.png')

# Entraînement d'un LGBM Classifier unique sur le train set

In [None]:
# Hold out 20 % of the training data as a validation set (seeded for reproducibility).
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=123,
)

In [None]:
%%time
# Single model with the same hyper-parameters as the per-fold models.
clf = lgb.LGBMClassifier(
    # ensemble size (upper bound, early stopping decides) and learning rate
    n_estimators=10000,
    learning_rate=0.02,
    # tree shape
    num_leaves=37,
    max_depth=6,
    min_split_gain=0.05365093112258974,
    min_child_weight=28.91981182288273,
    # row / column sub-sampling
    colsample_bytree=0.26424255740815,
    subsample=0.9222047021355166,
    # regularisation
    reg_alpha=0.8495823622837118,
    reg_lambda=0.7247308695357746,
    # categorical columns, given by position
    categorical_feature=list(cat_features_index[0]),
    # runtime settings
    n_jobs=-1,
    silent=-1,
    verbose=-1,
    random_state=123,
)

# Training: both the training and validation sets are evaluated with f_beta,
# results are logged every 200 rounds, and training stops after 200 rounds
# without improvement.
clf.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (valid_x, valid_y)],
    eval_metric=f_beta,
    verbose=200,
    early_stopping_rounds=200,
)

In [None]:
# Performance of the single model on the test set.
# Probabilities come from the best iteration found by early stopping; column 1
# is the positive class.
test_probabilities = clf.predict_proba(X_test, num_iteration=clf.best_iteration_)
y_pred = test_probabilities[:, 1]
f_beta(y_test, y_pred)

In [None]:
# Feature importances of the single model.
feature_importance_df = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": clf.feature_importances_,
    }
)

# Row labels of the 40 most important features, most important first.
best_features_idx = (
    feature_importance_df
    .sort_values(by="importance", ascending=False)
    .index[:40]
)

plt.figure(figsize=(8, 10))
sns.barplot(
    x="importance",
    y="feature",
    data=feature_importance_df.iloc[best_features_idx].sort_values(by="importance", ascending=False),
)
plt.title('LightGBM Features')
plt.tight_layout()

Les performances sont à peine moins bonnes, je garde donc ce principe d'un classifieur unique.


In [None]:
# Area under the ROC curve of the model, computed from the predicted
# probabilities (no decision threshold involved).
auroc = metrics.roc_auc_score(y_test, y_pred)
print(auroc)


In [None]:
# Search for the decision threshold that maximises the F-beta score.
# NOTE(review): the threshold is tuned against the test set, so the score
# reported for the chosen threshold is optimistic; consider tuning on the
# validation split instead.

# Single definition of the candidate thresholds, shared by the loop and the plot
# (they were previously built twice and could silently drift apart).
thresholds = np.linspace(0.1, 0.25, 16)
thresolds = thresholds  # original (misspelt) name kept as an alias for other cells
scores = list()

probabilities = np.asarray(y_pred)
for thres in thresholds:
    # label 0 when the probability is <= threshold, 1 otherwise (vectorised)
    y_pred_label = np.where(probabilities <= thres, 0, 1)
    score = metrics.fbeta_score(y_test, y_pred_label, beta=BETA)
    scores.append(score)
    print(f"Seuil {thres} f-beta-score {score}")

sns.lineplot(x=thresholds, y=scores)
plt.title("F-beta-score en fonction du seuil de décision")
plt.xlabel("Seuil")
plt.ylabel("F-beta-score")

In [None]:
# Performance report at the retained decision threshold.

# Presumably the value picked from the threshold search in the previous cell - confirm.
thres = 0.19
# label 0 when the probability is <= threshold, 1 otherwise
y_pred_label = np.where(np.asarray(y_pred) <= thres, 0, 1).tolist()

report = metrics.classification_report(y_test, y_pred_label, digits=4)
print(report)

conf_matrix = metrics.confusion_matrix(y_test, y_pred_label)
print(conf_matrix)
sns.heatmap(conf_matrix, annot=True, fmt="d")
