# <a id='0'>0. Sommaire</a>

# <a id='1'>1. Librairies</a>

In [19]:
import os

import lightgbm_with_simple_features as fe
from importlib import reload

import numpy as np
import pandas as pd
from verstack import NaNImputer
from verstack import PandasOptimizer
import re
import pkg_resources
import types

In [20]:
def get_imports():
    """Yield the root package name behind every module or class bound in ``globals()``.

    Only globals that are modules (``import x`` / ``import x as y``) or classes
    (``from x import SomeClass``) contribute a name; every other global
    (plain variables, functions, dunder entries) is skipped so that an
    ordinary variable name can never be mistaken for a package.

    Yields:
        str: the first dotted component of the module name, translated to its
        pip distribution name when listed in ``poorly_named_packages``. The
        same name may be yielded several times; callers deduplicate.
    """
    # Some packages have different imported names vs. system/pip names.
    # There is no systematic way to get the pip name from the imported name,
    # so exceptions have to be added to this mapping manually.
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }

    # Iterate over a snapshot: this is a lazy generator, and a consumer that
    # binds a new global between two items would otherwise make the live
    # dict iteration fail.
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures we get the root package, not a submodule.
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            # Classes report the module that defines them.
            name = val.__module__.split(".")[0]
        else:
            continue

        yield poorly_named_packages.get(name, name)
# Deduplicated root package names referenced from the notebook namespace.
imports = list(set(get_imports()))

# A version cannot be looked up from an imported name alone, so walk the
# installed distributions and keep those whose project name was imported
# (pip itself is always left out).
requirements = [
    (dist.project_name, dist.version)
    for dist in pkg_resources.working_set
    if dist.project_name != "pip" and dist.project_name in imports
]

# Print one "name==version" line per matched distribution.
for requirement in requirements:
    print("{}=={}".format(*requirement))

verstack==3.7.1
numpy==1.23.5
pandas==1.5.3


# 2. Feature Engineering

In [2]:
# Row limit handed to every `fe` loader below; None is passed through as-is
# (presumably meaning "use every row" — confirm against the fe module).
num_rows = None

# Base frame; the recorded output reports the train and test sample counts.
df = fe.application_train_test(num_rows)

# One entry per auxiliary table:
# (timer label, loader function, label printed in front of the frame shape).
aggregation_steps = [
    ("Process bureau and bureau_balance", fe.bureau_and_balance, "Bureau df shape:"),
    ("Process previous_applications", fe.previous_applications, "Previous applications df shape:"),
    ("Process POS-CASH balance", fe.pos_cash, "Pos-cash balance df shape:"),
    ("Process installments payments", fe.installments_payments, "Installments payments df shape:"),
    ("Process credit card balance", fe.credit_card_balance, "Credit card balance df shape:"),
]

for timer_label, load_features, shape_label in aggregation_steps:
    with fe.timer(timer_label):
        features = load_features(num_rows)
        print(shape_label, features.shape)
        # Left join on SK_ID_CURR keeps every row of the base frame.
        df = df.join(features, how='left', on='SK_ID_CURR')
        # Release the intermediate frame before loading the next table. The
        # collect call now also runs after the last table, which the original
        # cell skipped.
        del features
        fe.gc.collect()

Train samples: 307511, test samples: 48744
Bureau df shape: (305811, 116)
Process bureau and bureau_balance - done in 31s
Previous applications df shape: (338857, 249)
Process previous_applications - done in 35s
Pos-cash balance df shape: (337252, 18)
Process POS-CASH balance - done in 16s
Installments payments df shape: (339587, 26)
Process installments payments - done in 39s
Credit card balance df shape: (103558, 141)
Process credit card balance - done in 19s


In [3]:
# Overwrite +inf / -inf with 0, modifying `df` in place.
df.replace(to_replace=[np.inf, -np.inf], value=0, inplace=True)

# 3. Imputation

In [4]:
# Keep an independent copy of the id and target columns so TARGET can be
# re-attached after imputation.
df_target = df.loc[:, ["SK_ID_CURR", "TARGET"]].copy()

In [5]:
# Every column of `df` except TARGET, which must not be imputed.
columns_to_impute = df.columns.tolist()
columns_to_impute.remove("TARGET")

In [6]:
# Independent copy of the columns handed to the imputer.
df_to_impute = df.loc[:, columns_to_impute].copy()

In [7]:
# Build verstack's NaNImputer with its default parameters.
imputer = NaNImputer()
# Impute the non-target columns.
# NOTE(review): the recorded log reports columns such as OWN_CAR_AGE and
# EXT_SOURCE_1 being dropped as "NaNs and a constant non-NaN value", and the
# frame goes from 796 input columns to 545 — confirm this loss is intended.
df_imputed = imputer.impute(df_to_impute)


 * Initiating NaNImputer.impute
     . Dataset dimensions:
     .. rows:         356251
     .. columns:      796
     .. mb in memory: 1849.92
     .. NaN cols num: 616

   - Drop hopeless NaN cols

   - droped column OWN_CAR_AGE with NaNs and a constant non-NaN value

   - droped column EXT_SOURCE_1 with NaNs and a constant non-NaN value

   - droped column APARTMENTS_AVG with NaNs and a constant non-NaN value

   - droped column BASEMENTAREA_AVG with NaNs and a constant non-NaN value

   - droped column YEARS_BUILD_AVG with NaNs and a constant non-NaN value

   - droped column COMMONAREA_AVG with NaNs and a constant non-NaN value

   - droped column ELEVATORS_AVG with NaNs and a constant non-NaN value

   - droped column ENTRANCES_AVG with NaNs and a constant non-NaN value

   - droped column FLOORSMIN_AVG with NaNs and a constant non-NaN value

   - droped column LANDAREA_AVG with NaNs and a constant non-NaN value

   - droped column LIVINGAPARTMENTS_AVG with NaNs and a constant n

In [8]:
# Per-column count of NaNs left after imputation (0 for every column shown
# in the recorded output).
df_imputed.isna().sum()

SK_ID_CURR                        0
CODE_GENDER                       0
FLAG_OWN_CAR                      0
FLAG_OWN_REALTY                   0
CNT_CHILDREN                      0
                                 ..
INSTAL_AMT_PAYMENT_SUM            0
INSTAL_DAYS_ENTRY_PAYMENT_MAX     0
INSTAL_DAYS_ENTRY_PAYMENT_MEAN    0
INSTAL_DAYS_ENTRY_PAYMENT_SUM     0
INSTAL_COUNT                      0
Length: 545, dtype: int64

In [9]:
# Re-attach TARGET: left-join the imputed features onto the id/target frame,
# matching rows on SK_ID_CURR.
df_final = df_target.merge(df_imputed, on="SK_ID_CURR", how="left")

# 4. Optimisation mémoire

In [10]:
# Build verstack's PandasOptimizer with its default settings.
opt = PandasOptimizer()

In [11]:
# Reduce the memory footprint of the merged frame; the recorded run reports
# 1170.43 MB before and 429.44 MB after.
df_final_opt = opt.optimize_memory_usage(df_final)


   - Original data size: 1170.43 MB

   - Optimized data size: 429.44 MB

   - Optimized data percentage of origianl data: 36.69%


In [3]:
# LightGBM does not support special JSON characters in feature names, so
# strip everything except ASCII letters, digits and underscores.
cleaned_names = {
    col: re.sub(r'[^A-Za-z0-9_]+', '', col)
    for col in df_final_opt.columns
}
new_n_list = list(cleaned_names.values())

# LightGBM also refuses a feature name that appears more than once: when a
# cleaned name was already produced by an earlier column, append the current
# column's position to make it distinct.
new_names = {}
already_used = set()
for i, (col, new_col) in enumerate(cleaned_names.items()):
    if new_col in already_used:
        new_names[col] = f'{new_col}_{i}'
    else:
        new_names[col] = new_col
    already_used.add(new_col)

df_final_opt = df_final_opt.rename(columns=new_names)

In [4]:
# Persist the optimized frame with pickle protocol 5.
# The path is assembled with os.path.join instead of a non-raw
# backslash-separated literal: sequences such as "\D" and "\P" are invalid
# string escapes, and a hard-coded "\" separator only works on Windows.
df_final_opt.to_pickle(
    os.path.join("..", "Data", "Processed_data", "df_final_prot5.pkl"),
    protocol=5,
)

In [5]:
# Persist the optimized frame a second time.
# The path is assembled with os.path.join instead of a non-raw
# backslash-separated literal: sequences such as "\D" and "\P" are invalid
# string escapes, and a hard-coded "\" separator only works on Windows.
# NOTE(review): the file name says "prot0", but protocol=-1 selects the
# highest pickle protocol available rather than protocol 0 — confirm which
# one is intended.
df_final_opt.to_pickle(
    os.path.join("..", "Data", "Processed_data", "df_final_prot0.pkl"),
    protocol=-1,
)

In [12]:
# Rows with a TARGET value go to train_df, rows without one go to test_df.
# NOTE(review): this splits `df`, not the imputed and optimized
# `df_final_opt` — confirm that is intended.
has_target = df['TARGET'].notnull()
train_df = df[has_target]
test_df = df[~has_target]

In [14]:
# Persist both splits (protocol=-1 selects the highest pickle protocol
# available). The directory is assembled with os.path.join so the path is
# not tied to the Windows "\" separator.
processed_dir = os.path.join("..", "Data", "Processed_data")
train_df.to_pickle(os.path.join(processed_dir, "train_df.pkl"), protocol=-1)
test_df.to_pickle(os.path.join(processed_dir, "test_df.pkl"), protocol=-1)