# <a id='1'>0. Sommaire</a>

- <a href='#1'>0. Sommaire</a>  
- <a href='#2'>1. Librairies</a>
- <a href='#3'>2. Data</a>
- <a href='#4'>3. Initalisation</a>
- <a href='#5'>4. Bilan</a>

# <a id='1'>1. Librairies</a>

In [78]:
import os
import pkg_resources
import types
#import lightgbm_with_simple_features as fe
from importlib import reload


import numpy as np
import pandas as pd



from pycaret.classification import *
from sklearn.utils import check_random_state
import outils_model

from boruta import BorutaPy
from BorutaShap import BorutaShap
from collections import Counter

import pickle
import lightgbm as lgb
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFECV

In [2]:
class BorutaPyForLGB(BorutaPy):
    def __init__(self, estimator, n_estimators=1000, perc=100, alpha=0.05,
                 two_step=True, max_iter=100, random_state=None, verbose=0):
        super().__init__(estimator, n_estimators, perc, alpha,
                         two_step, max_iter, random_state, verbose)
        self._is_lightgbm = 'lightgbm' in str(type(self.estimator))
        
    def _fit(self, X, y):
        # check input params
        self._check_params(X, y)

        if not isinstance(X, np.ndarray):
            X = self._validate_pandas_input(X) 
        if not isinstance(y, np.ndarray):
            y = self._validate_pandas_input(y)

        self.random_state = check_random_state(self.random_state)
        # setup variables for Boruta
        n_sample, n_feat = X.shape
        _iter = 1
        # holds the decision about each feature:
        # 0  - default state = tentative in original code
        # 1  - accepted in original code
        # -1 - rejected in original code
        dec_reg = np.zeros(n_feat, dtype=np.int)
        # counts how many times a given feature was more important than
        # the best of the shadow features
        hit_reg = np.zeros(n_feat, dtype=np.int)
        # these record the history of the iterations
        imp_history = np.zeros(n_feat, dtype=np.float)
        sha_max_history = []

        # set n_estimators
        if self.n_estimators != 'auto':
            self.estimator.set_params(n_estimators=self.n_estimators)

        # main feature selection loop
        while np.any(dec_reg == 0) and _iter < self.max_iter:
            # find optimal number of trees and depth
            if self.n_estimators == 'auto':
                # number of features that aren't rejected
                not_rejected = np.where(dec_reg >= 0)[0].shape[0]
                n_tree = self._get_tree_num(not_rejected)
                self.estimator.set_params(n_estimators=n_tree)

            # make sure we start with a new tree in each iteration
            if self._is_lightgbm:
                self.estimator.set_params(random_state=self.random_state.randint(0, 10000))
            else:
                self.estimator.set_params(random_state=self.random_state)

            # add shadow attributes, shuffle them and train estimator, get imps
            cur_imp = self._add_shadows_get_imps(X, y, dec_reg)

            # get the threshold of shadow importances we will use for rejection
            imp_sha_max = np.percentile(cur_imp[1], self.perc)

            # record importance history
            sha_max_history.append(imp_sha_max)
            imp_history = np.vstack((imp_history, cur_imp[0]))

            # register which feature is more imp than the max of shadows
            hit_reg = self._assign_hits(hit_reg, cur_imp, imp_sha_max)

            # based on hit_reg we check if a feature is doing better than
            # expected by chance
            dec_reg = self._do_tests(dec_reg, hit_reg, _iter)

            # print out confirmed features
            if self.verbose > 0 and _iter < self.max_iter:
                self._print_results(dec_reg, _iter, 0)
            if _iter < self.max_iter:
                _iter += 1

        # we automatically apply R package's rough fix for tentative ones
        confirmed = np.where(dec_reg == 1)[0]
        tentative = np.where(dec_reg == 0)[0]
        # ignore the first row of zeros
        tentative_median = np.median(imp_history[1:, tentative], axis=0)
        # which tentative to keep
        tentative_confirmed = np.where(tentative_median
                                       > np.median(sha_max_history))[0]
        tentative = tentative[tentative_confirmed]

        # basic result variables
        self.n_features_ = confirmed.shape[0]
        self.support_ = np.zeros(n_feat, dtype=np.bool)
        self.support_[confirmed] = 1
        self.support_weak_ = np.zeros(n_feat, dtype=np.bool)
        self.support_weak_[tentative] = 1

        # ranking, confirmed variables are rank 1
        self.ranking_ = np.ones(n_feat, dtype=np.int)
        # tentative variables are rank 2
        self.ranking_[tentative] = 2
        # selected = confirmed and tentative
        selected = np.hstack((confirmed, tentative))
        # all rejected features are sorted by importance history
        not_selected = np.setdiff1d(np.arange(n_feat), selected)
        # large importance values should rank higher = lower ranks -> *(-1)
        imp_history_rejected = imp_history[1:, not_selected] * -1

        # update rank for not_selected features
        if not_selected.shape[0] > 0:
                # calculate ranks in each iteration, then median of ranks across feats
                iter_ranks = self._nanrankdata(imp_history_rejected, axis=1)
                rank_medians = np.nanmedian(iter_ranks, axis=0)
                ranks = self._nanrankdata(rank_medians, axis=0)

                # set smallest rank to 3 if there are tentative feats
                if tentative.shape[0] > 0:
                    ranks = ranks - np.min(ranks) + 3
                else:
                    # and 2 otherwise
                    ranks = ranks - np.min(ranks) + 2
                self.ranking_[not_selected] = ranks
        else:
            # all are selected, thus we set feature supports to True
            self.support_ = np.ones(n_feat, dtype=np.bool)

        self.importance_history_ = imp_history

        # notify user
        if self.verbose > 0:
            self._print_results(dec_reg, _iter, 1)
        return self

In [3]:
def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
            
        # Some packages are weird and have different
        # imported names vs. system/pip names. Unfortunately,
        # there is no systematic way to get pip names from
        # a package's imported name. You'll have to add
        # exceptions to this list manually!
        poorly_named_packages = {
            "PIL": "Pillow",
            "sklearn": "scikit-learn"
        }
        if name in poorly_named_packages.keys():
            name = poorly_named_packages[name]
            
        yield name
imports = list(set(get_imports()))

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names 
# of installed packages vs. imported packages
requirements = []
for m in pkg_resources.working_set:
    if m.project_name in imports and m.project_name!="pip":
        requirements.append((m.project_name, m.version))
for r in requirements:
    print("{}=={}".format(*r))

lightgbm==3.3.2
scikit-learn==1.1.3
BorutaShap==1.0.16
numpy==1.23.5
pandas==1.5.3
pycaret==3.0.2


# <a id='2'>2. Data</a>

In [4]:
df = pd.read_pickle("df_final_prot5.pkl")

In [5]:
train_df = df[df['TARGET'].notnull()]
test_df = df[df['TARGET'].isnull()]

# <a id='4'>3. Initialisation</a>

In [6]:
s = setup(data = train_df, 
          target = 'TARGET', 
          train_size = 0.8,
          ignore_features = "SK_ID_CURR",
          #log_data = True,
          #log_experiment = "dagshub",
          #experiment_name = "LGBM optimization",
          fix_imbalance = True,
          fix_imbalance_method = 'SMOTE',
          normalize = True,
          normalize_method="robust",
          #feature_selection = True,
          #feature_selection_method = "classic",
          #remove_multicollinearity = True,
          #multicollinearity_threshold = 0.9,
          session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,TARGET
2,Target type,Binary
3,Original data shape,"(307507, 546)"
4,Transformed data shape,"(513792, 545)"
5,Transformed train set shape,"(452290, 545)"
6,Transformed test set shape,"(61502, 545)"
7,Ignore features,10
8,Numeric features,544
9,Preprocess,True


In [7]:
X_train = get_config('X_train_transformed')
y_train = get_config('y_train_transformed')
X_test = get_config('X_test_transformed')
y_test = get_config('y_test_transformed')

In [8]:
X_train_pickle = 'X_train.pickle'
with open(X_train_pickle, 'wb') as f:
    pickle.dump(X_train, f, pickle.HIGHEST_PROTOCOL)

In [9]:
y_train_pickle = 'y_train.pickle'
with open(y_train_pickle, 'wb') as f:
    pickle.dump(y_train, f, pickle.HIGHEST_PROTOCOL)

In [10]:
X_train_col = X_train.keys().to_list()

In [11]:
lgbm = create_model('lightgbm', num_boost_round = 100)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9195,0.7761,0.0217,0.5309,0.0416,0.0355,0.095
1,0.9194,0.7681,0.0237,0.5222,0.0453,0.0385,0.0982
2,0.9193,0.7713,0.0191,0.5,0.0369,0.0311,0.0857
3,0.9192,0.7645,0.0206,0.494,0.0396,0.0334,0.0883
4,0.9198,0.7723,0.0206,0.5942,0.0399,0.0347,0.1
5,0.9195,0.7716,0.0176,0.5385,0.0341,0.0292,0.0865
6,0.9194,0.773,0.0247,0.5158,0.0471,0.04,0.0994
7,0.9202,0.7786,0.0227,0.6818,0.0439,0.0389,0.1144
8,0.9199,0.7633,0.0242,0.5926,0.0464,0.0404,0.108
9,0.9193,0.7713,0.0196,0.5132,0.0378,0.0321,0.0884


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
save_model(lgbm, "lgbm")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=C:\Users\EBERTH~1\AppData\Local\Temp\joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['CODE_GENDER', 'FLAG_OWN_CAR',
                                              'FLAG_OWN_REALTY', 'CNT_CHILDREN',
                                              'AMT_INCOME_TOTAL', 'AMT_CREDIT',
                                              'AMT_ANNUITY', 'AMT_GOODS_PRICE',
                                              'REGION_POPULATION_RELATIVE',
                                              'DAYS_BIRTH', 'DAYS_EMPLOYED',
                                              'DAYS_REGISTRATI...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20, min_child_weigh

In [13]:
pipeline = load_model('lgbm')

Transformation Pipeline and Model Successfully Loaded


## <a id='3.1'>3.1 LGBM</a>

### round 1

In [14]:
lgbm_features_imp = lgbm.feature_importances_

In [15]:
df_lgbm_fi = pd.DataFrame({'feature': X_train_col,
                           'importance': lgbm_features_imp}).sort_values('importance', ascending = False)

In [16]:
# Exploration des variables avec zéro importance
zero_features_sans = \
    list(df_lgbm_fi[df_lgbm_fi['importance'] == 0.0][
        'feature'])
print(f'{len(zero_features_sans)} variables avec 0.0 importance :\n')
zero_features_sans

254 variables avec 0.0 importance :



['PREV_NAME_GOODS_CATEGORY_Jewelry_MEAN',
 'PREV_NAME_GOODS_CATEGORY_MedicalSupplies_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Tourism_MEAN',
 'PREV_NAME_GOODS_CATEGORY_OfficeAppliances_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Insurance_MEAN',
 'PREV_NAME_TYPE_SUITE_Other_B_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Medicine_MEAN',
 'FLAG_DOCUMENT_17',
 'PREV_NAME_GOODS_CATEGORY_HouseConstruction_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Vehicles_MEAN',
 'PREV_NAME_TYPE_SUITE_Spousepartner_MEAN',
 'PREV_NAME_TYPE_SUITE_Other_A_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Computers_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Homewares_MEAN',
 'PREV_NAME_GOODS_CATEGORY_ClothingandAccessories_MEAN',
 'PREV_NAME_GOODS_CATEGORY_AutoAccessories_MEAN',
 'PREV_NAME_GOODS_CATEGORY_ConstructionMaterials_MEAN',
 'PREV_NAME_GOODS_CATEGORY_ConsumerElectronics_MEAN',
 'PREV_NAME_GOODS_CATEGORY_AudioVideo_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Other_MEAN',
 'PREV_NAME_GOODS_CATEGORY_Education_MEAN',
 'PREV_NAME_GOODS_CATEGORY_AdditionalService_MEAN',
 'PREV

### round 2

In [17]:
feature_important_round_1 = df_lgbm_fi.loc[df_lgbm_fi["importance"] > 0]['feature'].to_list()

In [18]:
lgbm_round_2 = lgbm.fit(X_train[feature_important_round_1],y_train)

In [19]:
lgbm_features_imp_2 = lgbm_round_2.feature_importances_

In [20]:
df_lgbm_fi_2 = pd.DataFrame({'feature': feature_important_round_1,
                           'importance': lgbm_features_imp_2}).sort_values('importance', ascending = False)

In [21]:
# Exploration des variables avec zéro importance
zero_features_sans = \
    list(df_lgbm_fi_2[df_lgbm_fi_2['importance'] == 0.0][
        'feature'])
print(f'{len(zero_features_sans)} variables avec 0.0 importance :\n')
zero_features_sans

20 variables avec 0.0 importance :



['ORGANIZATION_TYPE_Mobile',
 'PREV_NAME_CASH_LOAN_PURPOSE_Other_MEAN',
 'PREV_NAME_PORTFOLIO_Cash_MEAN',
 'POS_NAME_CONTRACT_STATUS_Demand_MEAN',
 'BURO_CREDIT_DAY_OVERDUE_MAX',
 'CLOSED_DAYS_CREDIT_MEAN',
 'PREV_NAME_CONTRACT_STATUS_Approved_MEAN',
 'CLOSED_DAYS_CREDIT_UPDATE_MEAN',
 'PREV_NAME_TYPE_SUITE_Children_MEAN',
 'FONDKAPREMONT_MODE_regoperaccount',
 'LIVINGAREA_MODE',
 'PREV_NAME_TYPE_SUITE_Family_MEAN',
 'PREV_CODE_REJECT_REASON_SCO_MEAN',
 'BURO_CREDIT_TYPE_Realestateloan_MEAN',
 'APPROVED_AMT_GOODS_PRICE_MEAN',
 'INSTAL_DBD_MAX',
 'APPROVED_RATE_DOWN_PAYMENT_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN',
 'POS_NAME_CONTRACT_STATUS_Canceled_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN']

### round 3

In [22]:
feature_important_round_2 = df_lgbm_fi_2.loc[df_lgbm_fi_2["importance"] > 0]['feature'].to_list()

In [23]:
lgbm_round_3 = lgbm.fit(X_train[feature_important_round_2],y_train)

In [24]:
lgbm_features_imp_3 = lgbm_round_3.feature_importances_

In [25]:
df_lgbm_fi_3 = pd.DataFrame({'feature': feature_important_round_2,
                           'importance': lgbm_features_imp_3}).sort_values('importance', ascending = False)

In [26]:
# Exploration des variables avec zéro importance
zero_features_sans = \
    list(df_lgbm_fi_3[df_lgbm_fi_3['importance'] == 0.0][
        'feature'])
print(f'{len(zero_features_sans)} variables avec 0.0 importance :\n')
zero_features_sans

0 variables avec 0.0 importance :



[]

In [27]:
# Sauvegarde des features importances avec boruta
fs_lgbm = 'fs_lgbm.pickle'
with open(fs_lgbm, 'wb') as f:
    pickle.dump(df_lgbm_fi_3, f, pickle.HIGHEST_PROTOCOL)

## <a id='3.2'>3.2 Boruta</a>

In [28]:
X_train_boruta = np.array(X_train)
y_train_boruta = np.array(y_train)

In [29]:
boruta_feature_selector = BorutaPyForLGB(lgbm,
                                   n_estimators = 'auto',
                                   verbose = 2,
                                   random_state = 42,
                                   max_iter = 50,
                                   perc= 90)

In [30]:
boruta_feature_selector.fit(X_train_boruta, y_train_boruta)

Iteration: 	1 / 50
Confirmed: 	0
Tentative: 	544
Rejected: 	0
Iteration: 	2 / 50
Confirmed: 	0
Tentative: 	544
Rejected: 	0
Iteration: 	3 / 50
Confirmed: 	0
Tentative: 	544
Rejected: 	0
Iteration: 	4 / 50
Confirmed: 	0
Tentative: 	544
Rejected: 	0
Iteration: 	5 / 50
Confirmed: 	0
Tentative: 	544
Rejected: 	0
Iteration: 	6 / 50
Confirmed: 	0
Tentative: 	544
Rejected: 	0
Iteration: 	7 / 50
Confirmed: 	0
Tentative: 	544
Rejected: 	0
Iteration: 	8 / 50
Confirmed: 	149
Tentative: 	139
Rejected: 	256
Iteration: 	9 / 50
Confirmed: 	149
Tentative: 	139
Rejected: 	256
Iteration: 	10 / 50
Confirmed: 	149
Tentative: 	139
Rejected: 	256
Iteration: 	11 / 50
Confirmed: 	149
Tentative: 	139
Rejected: 	256
Iteration: 	12 / 50
Confirmed: 	164
Tentative: 	124
Rejected: 	256
Iteration: 	13 / 50
Confirmed: 	164
Tentative: 	124
Rejected: 	256
Iteration: 	14 / 50
Confirmed: 	164
Tentative: 	116
Rejected: 	264
Iteration: 	15 / 50
Confirmed: 	164
Tentative: 	116
Rejected: 	264
Iteration: 	16 / 50
Confirmed: 	

In [31]:
X_train_filtered = boruta_feature_selector.transform(X_train_boruta)


In [32]:
# Liste des variables confirmées avec une haute importance
fs_boruta = list()
features = [f for f in X_train.columns]
indexes = np.where(boruta_feature_selector.support_ == True)
for x in np.nditer(indexes):
    fs_boruta.append(features[x])
display(f'fs_boruta : {fs_boruta}')

"fs_boruta : ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'FLOORSMAX_MODE', 'FLOORSMAX_MEDI', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_6', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'NAME_CONTRACT_TYPE_Cashloans', 'NAME_TYPE_SUITE_Unaccompanied', 'NAME_INCOME_TYPE_Commercialassociate', 'NAME_INCOME_TYPE_Pensioner', 'NAME_INCOME_TYPE_Working', 'NAME_EDUCATION_TYPE_Highereducation', 'NAME_EDUCATION_TYPE_Secondarysecondaryspecial', 'NAME_FAMILY_STATUS_

In [33]:
# Dataframe de features importance avec boruta
df_fs_boruta = pd.DataFrame(fs_boruta)

In [34]:
# Sauvegarde des features importances avec boruta
fs_boruta = 'fs_boruta.pickle'
with open(fs_lgbm, 'wb') as f:
    pickle.dump(df_fs_boruta, f, pickle.HIGHEST_PROTOCOL)

## <a id='3.3'>3.3 BorutaSHAP</a>

In [35]:
# Create the model with several hyperparameters
lgbm_borutashap = lgb.LGBMClassifier(objective='binary',
                          boosting_type='goss',
                          n_estimators=10000,
                          class_weight='balanced',
                          num_boost_round=100)

In [36]:
# Initialisation de BorutaShap
Feature_Selector = BorutaShap(model=lgbm_borutashap,
                              importance_measure='shap',
                              classification=True)

In [37]:
# Entraînement
Feature_Selector.fit(X=X_train, y=y_train, n_trials=100, random_state=42)

  0%|          | 0/100 [00:00<?, ?it/s]

178 attributes confirmed important: ['APPROVED_AMT_ANNUITY_MEAN', 'WALLSMATERIAL_MODE_Panel', 'BURO_AMT_CREDIT_SUM_LIMIT_MEAN', 'APPROVED_APP_CREDIT_PERC_MIN', 'CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN', 'INSTAL_PAYMENT_DIFF_MEAN', 'PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN', 'DAYS_EMPLOYED_PERC', 'APPROVED_RATE_DOWN_PAYMENT_MIN', 'AMT_ANNUITY', 'PREV_AMT_APPLICATION_MIN', 'INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE', 'INSTAL_DAYS_ENTRY_PAYMENT_MEAN', 'DEF_60_CNT_SOCIAL_CIRCLE', 'REGION_POPULATION_RELATIVE', 'CNT_CHILDREN', 'ORGANIZATION_TYPE_Selfemployed', 'OCCUPATION_TYPE_Salesstaff', 'BURO_DAYS_CREDIT_MEAN', 'EMERGENCYSTATE_MODE_No', 'PREV_CHANNEL_TYPE_APCashloan_MEAN', 'ANNUITY_INCOME_PERC', 'BURO_AMT_CREDIT_MAX_OVERDUE_MEAN', 'APPROVED_RATE_DOWN_PAYMENT_MAX', 'AMT_CREDIT', 'PREV_AMT_DOWN_PAYMENT_MIN', 'FLAG_WORK_PHONE', 'EXT_SOURCE_2', 'INSTAL_DAYS_ENTRY_PAYMENT_SUM', 'PREV_PRODUCT_COMBINATION_Cash_MEAN', 'PREV_APP_CREDIT_PERC_MAX', 'CLOSED_AMT_CREDIT_SUM_DEBT_MAX', 'CLOSED_MONTHS_BALANCE

In [38]:
# Liste des variables avec une haute importance
fs_borshap = Feature_Selector.accepted
fs_borshap

['APPROVED_AMT_ANNUITY_MEAN',
 'WALLSMATERIAL_MODE_Panel',
 'BURO_AMT_CREDIT_SUM_LIMIT_MEAN',
 'APPROVED_APP_CREDIT_PERC_MIN',
 'CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN',
 'INSTAL_PAYMENT_DIFF_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_WEDNESDAY_MEAN',
 'DAYS_EMPLOYED_PERC',
 'APPROVED_RATE_DOWN_PAYMENT_MIN',
 'AMT_ANNUITY',
 'PREV_AMT_APPLICATION_MIN',
 'INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE',
 'INSTAL_DAYS_ENTRY_PAYMENT_MEAN',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'REGION_POPULATION_RELATIVE',
 'CNT_CHILDREN',
 'ORGANIZATION_TYPE_Selfemployed',
 'OCCUPATION_TYPE_Salesstaff',
 'BURO_DAYS_CREDIT_MEAN',
 'EMERGENCYSTATE_MODE_No',
 'PREV_CHANNEL_TYPE_APCashloan_MEAN',
 'ANNUITY_INCOME_PERC',
 'BURO_AMT_CREDIT_MAX_OVERDUE_MEAN',
 'APPROVED_RATE_DOWN_PAYMENT_MAX',
 'AMT_CREDIT',
 'PREV_AMT_DOWN_PAYMENT_MIN',
 'FLAG_WORK_PHONE',
 'EXT_SOURCE_2',
 'INSTAL_DAYS_ENTRY_PAYMENT_SUM',
 'PREV_PRODUCT_COMBINATION_Cash_MEAN',
 'PREV_APP_CREDIT_PERC_MAX',
 'CLOSED_AMT_CREDIT_SUM_DEBT_MAX',
 'CLOSED_MONTHS_BALANCE_SIZ

In [39]:
# Dataframe
df_fs_borshap = pd.DataFrame(fs_borshap)
df_fs_borshap.shape

(178, 1)

In [40]:
# Liste des variables à ne pas conserver
cols_to_supp_borutashap = Feature_Selector.features_to_remove
cols_to_supp_borutashap

array(['AMT_INCOME_TOTAL', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL',
       'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION',
       'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
       'LIVE_CITY_NOT_WORK_CITY', 'YEARS_BEGINEXPLUATATION_AVG',
       'FLOORSMAX_AVG', 'LIVINGAREA_AVG', 'LIVINGAREA_MODE',
       'YEARS_BEGINEXPLUATATION_MEDI', 'LIVINGAREA_MEDI',
       'TOTALAREA_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4',
       'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
       'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
       'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
       'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'NAME_CONTRACT_TYPE_Revolvingloans',
       'NAME_TYPE_S

In [41]:
# Sauvegarde du feature selector avec borutashap
fic_sav_fSelector_borshap = \
    'features_selector_borshap.pickle'
with open(fic_sav_fSelector_borshap, 'wb') as f:
    pickle.dump(Feature_Selector, f, pickle.HIGHEST_PROTOCOL)

In [42]:
# Sauvegarde des features importances avec borutashap
fi_borutashap = \
    'fi_borutashap.pickle'
with open(fi_borutashap, 'wb') as f:
    pickle.dump(df_fs_borshap, f, pickle.HIGHEST_PROTOCOL)

## <a id='3.4'>3.4 Permutation</a>

In [43]:
lgbm_permut = create_model("lightgbm", objective='binary',
                          boosting_type='goss',
                          n_estimators=10000,
                          class_weight='balanced',
                          num_boost_round=10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9191,0.7046,0.0,0.0,0.0,-0.0004,-0.0042
1,0.919,0.692,0.0,0.0,0.0,-0.0006,-0.005
2,0.919,0.6979,0.001,0.2,0.002,0.0012,0.0088
3,0.9191,0.6941,0.0,0.0,0.0,-0.0002,-0.0033
4,0.9191,0.6951,0.0,0.0,0.0,-0.0003,-0.0038
5,0.9193,0.7057,0.0,0.0,0.0,0.0,0.0
6,0.9191,0.6962,0.001,0.2222,0.002,0.0013,0.0099
7,0.9192,0.7099,0.0,0.0,0.0,-0.0001,-0.0019
8,0.9191,0.6917,0.0005,0.1667,0.001,0.0005,0.0049
9,0.9189,0.6987,0.0,0.0,0.0,-0.0008,-0.006


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [44]:
permut = permutation_importance(lgbm_permut, 
                                X_test, 
                                y_test,
                                random_state=42,
                                n_jobs=-1)

In [45]:
sorted_idx = np.abs(permut.importances_mean).argsort()

In [46]:
# Constitution du dataframe de travail
df_fs_perm_imp_sklearn = \
    pd.DataFrame({'Variables': X_test.columns,
                  'Importance': permut.importances_mean,
                  'Importance_abs': np.abs(permut.importances_mean)})

In [47]:
# Sauvegarde des features importances avec permutation importance sklearn
fi_permut = \
    'fi_permut.pickle'
with open(fi_permut, 'wb') as f:
    pickle.dump(df_fs_perm_imp_sklearn, f, pickle.HIGHEST_PROTOCOL)

## <a id='3.5'>3.5 RFECV</a>

In [57]:
lgbm_rfecv = lgb.LGBMClassifier(random_state=42)

In [58]:
selector = RFECV(estimator=lgbm_rfecv, 
                 step=1,
                 scoring='accuracy', 
                 cv=5, 
                 verbose=1
                )

In [59]:
selector.fit(X_train, y_train)

Fitting estimator with 544 features.
Fitting estimator with 543 features.
Fitting estimator with 542 features.
Fitting estimator with 541 features.
Fitting estimator with 540 features.
Fitting estimator with 539 features.
Fitting estimator with 538 features.
Fitting estimator with 537 features.
Fitting estimator with 536 features.
Fitting estimator with 535 features.
Fitting estimator with 534 features.
Fitting estimator with 533 features.
Fitting estimator with 532 features.
Fitting estimator with 531 features.
Fitting estimator with 530 features.
Fitting estimator with 529 features.
Fitting estimator with 528 features.
Fitting estimator with 527 features.
Fitting estimator with 526 features.
Fitting estimator with 525 features.
Fitting estimator with 524 features.
Fitting estimator with 523 features.
Fitting estimator with 522 features.
Fitting estimator with 521 features.
Fitting estimator with 520 features.
Fitting estimator with 519 features.
Fitting estimator with 518 features.
F

In [61]:
# Sauvegarde des features importances avec RFECV
selector_svg = \
    'selector_svg.pickle'
with open(selector_svg, 'wb') as f:
    pickle.dump(selector, f, pickle.HIGHEST_PROTOCOL)

In [70]:
df_RFECV = pd.DataFrame()
df_RFECV['variables'] = selector.get_feature_names_out()
df_RFECV['importance'] = selector.estimator_.feature_importances_

df_RFECV = df_RFECV.sort_values(by='importance', ascending=False)

In [71]:
df_RFECV

Unnamed: 0,variables,importance
29,EXT_SOURCE_2,167
30,EXT_SOURCE_3,146
0,CODE_GENDER,102
198,PAYMENT_RATE,75
95,NAME_FAMILY_STATUS_Married,57
...,...,...
81,NAME_INCOME_TYPE_Businessman,0
83,NAME_INCOME_TYPE_Maternityleave,0
297,PREV_AMT_GOODS_PRICE_MIN,0
85,NAME_INCOME_TYPE_Stateservant,0


In [72]:
# Sauvegarde des features importances avec RFECV
fi_rfecv = \
    'fi_rfecv.pickle'
with open(fi_rfecv, 'wb') as f:
    pickle.dump(df_RFECV, f, pickle.HIGHEST_PROTOCOL)

# <a id='5'>4 Bilan</a>

In [76]:
# Récupération des variables conservées pour chaque feature selection
liste_var = df_lgbm_fi_3['feature'].to_list()
print(len(liste_var))

liste_var.extend(df_fs_boruta[0].to_list())
print(len(liste_var))

liste_var.extend(df_fs_borshap[0].to_list())
print(len(liste_var))

liste_var.extend(df_fs_perm_imp_sklearn['Variables'].to_list())
print(len(liste_var))

liste_var.extend(df_RFECV['variables'].to_list())
print(len(liste_var))

270
455
633
1177
1694


In [79]:
# Nomre de répétitions de chacune des variables
dico_nbre_repet_var = Counter(liste_var)
len(dico_nbre_repet_var)

544

In [80]:
df_nbr_repet_var = pd.DataFrame.from_dict(dico_nbre_repet_var,
                                          orient='index',
                                          columns=['Nbr_repetition'])\
    .reset_index().rename(columns={'index':'Variables'}) \
    .sort_values(by='Nbr_repetition', ascending=False)
df_nbr_repet_var.style.hide_index()

Variables,Nbr_repetition
EXT_SOURCE_2,5
PREV_NAME_CONTRACT_TYPE_Revolvingloans_MEAN,5
BURO_DAYS_CREDIT_MAX,5
WEEKDAY_APPR_PROCESS_START_MONDAY,5
APPROVED_AMT_ANNUITY_MAX,5
APPROVED_APP_CREDIT_PERC_MAX,5
NAME_TYPE_SUITE_Unaccompanied,5
APPROVED_AMT_DOWN_PAYMENT_MIN,5
CLOSED_AMT_CREDIT_SUM_OVERDUE_MEAN,5
BURO_AMT_CREDIT_SUM_SUM,5


In [87]:
# Les variables présentes plus de 6 fois pour les 8 méthodes de feature
#  sélection seront conservées
var_cons_train_set = \
    df_nbr_repet_var[df_nbr_repet_var['Nbr_repetition'] > 4][
        'Variables'].to_list()
print(f'{len(var_cons_train_set)} variables conservées pour le train_set')

In [93]:
var_cons_train_df = pd.DataFrame(var_cons_train_set, columns=["variables"])

In [94]:
var_cons_train_df

Unnamed: 0,variables
0,ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN
1,ACTIVE_AMT_CREDIT_SUM_DEBT_SUM
2,ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN
3,ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM
4,ACTIVE_AMT_CREDIT_SUM_OVERDUE_MEAN
...,...
154,WALLSMATERIAL_MODE_Stonebrick
155,WEEKDAY_APPR_PROCESS_START_FRIDAY
156,WEEKDAY_APPR_PROCESS_START_MONDAY
157,WEEKDAY_APPR_PROCESS_START_TUESDAY


In [107]:
# Sauvegarde des features conservées sous forme de liste
var_cons_list = 'var_cons_list.pickle'
with open(os.path.join(current_dir, var_cons_list), 'wb') as f:
    pickle.dump(var_cons_list, f, pickle.HIGHEST_PROTOCOL)

In [101]:
# Sauvegarde des features conservées sous forme de dataframe
var_cons_df = r'.\Data\var_cons_df.pickle'
with open(var_cons_df, 'wb') as f:
    pickle.dump(var_cons_train_df, f, pickle.HIGHEST_PROTOCOL)

In [109]:
var_cons_train_df.to_pickle("var_cons_df.pkl")

In [111]:
current_dir = os.getcwd()
current_dir

'C:\\Users\\eberthaud\\Documents\\OC\\Support\\7.P7'

In [104]:
print(os.path.join(current_dir, var_cons_list))

C:\Users\eberthaud\Documents\OC\Support\7.P7\var_cons_list.pickle
