Notice: This notebook has not yet been optimized for memory or performance. Please use it with caution when handling large datasets.

# Feature engineering

This notebook processes BDSE12_03G_HomeCredit_V1.csv to prepare the final input for the bear LGBM model.

### Prepare work environment

In [None]:
# Pandas for managing datasets
import numpy as np
import pandas as pd

In [None]:
np.__version__, pd.__version__

In [None]:
# math for operating numbers
import math

In [None]:
import gc

In [None]:
# Change the pandas display format for floats (thousands separator, 4 decimal places)
pd.options.display.float_format = '{:,.4f}'.format

In [None]:
# Just an easy way to completely show a dataframe in a cell
def completeShow(dfToShow, rowLimit:int = 1000, colLimit:int = 1000):
    """
    Print a dataframe with temporarily raised pandas display limits.

    dfToShow: the dataframe (or other printable pandas object) to print in full
    rowLimit: maximum number of rows to display; None or any integer (default: 1000)
    colLimit: maximum number of columns to display; None or any integer (default: 1000)

    The limits are applied through pd.option_context, so they only hold while
    the object is being printed.
    """
    display_overrides = (
        'display.max_rows', rowLimit,
        'display.max_columns', colLimit,
    )
    with pd.option_context(*display_overrides):
        print(dfToShow)
    
# to show complete output of a cell: eg.
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(df.apply(lambda x:x.unique().size))

In [None]:
# Matplotlib for additional customization
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Seaborn for plotting and styling
import seaborn as sns
#Seaborn set() to set aesthetic parameters in one step.
sns.set() 

---

## To Study:

In [None]:
# for memory management
# !pip install psutil
# https://psutil.readthedocs.io/en/latest/

# import os, psutil, gc
# def usage():
#     process = psutil.Process(os.getpid())
#     return process.memory_info()[0] / float(2 ** 20)

In [None]:
# ToRead: multi-threading
# http://violin-tao.blogspot.com/2017/05/python3_26.html
# https://medium.com/@peilee_98185/%E6%94%BE%E9%96%8B%E9%82%A3%E8%A8%98%E6%86%B6%E9%AB%94-%E4%B9%8B-python-%E8%99%95%E7%90%86%E5%A4%A7%E8%B3%87%E6%96%99-84fd41806694
# import multiprocessing as mp
# pool = mp.Pool(6) # the argument is the number of cores (worker processes) to start
# result = pool.map( data_process_function, dfs )
# # dfs is a list holding many Pandas DataFrames

---

### Read & combine datasets

In [None]:
# Read dataset
# appl_train_df = pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_eda/application_train.csv')
# appl_test_df = pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_eda/application_test.csv')

appl_all_df = pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_fteng/BDSE12_03G_HomeCredit_V1.csv',index_col=0)
# appl_all_df = pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_fteng/BDSE12_03G_HomeCredit_V1.csv').drop(['unnamed 0'],axis=1)

In [None]:
appl_all_df.info()

In [None]:
# appl_train_df.shape, appl_test_df.shape

In [None]:
# Combine application_train and application_test
# appl_all_df = pd.concat([appl_train_df, appl_test_df], sort=False, ignore_index=True)
# appl_all_df.shape

In [None]:
appl_all_df.apply(lambda x:x.unique().size).describe()

In [None]:
appl_all_df['TARGET'].unique(), \
appl_all_df['TARGET'].unique().size

In [None]:
appl_all_df['TARGET'].value_counts()

In [None]:
appl_all_df['TARGET'].isnull().sum(), \
appl_all_df['TARGET'].size, \
(appl_all_df['TARGET'].isnull().sum()/appl_all_df['TARGET'].size).round(4)

In [None]:
# Make sure we can use the nullness of 'TARGET' column to separate train & test
# assert appl_all_df['TARGET'].isnull().sum() == appl_test_df.shape[0]

---

#### Integration from other tables?

---

## Randomized sampling:

#### If the dataset is too large, use the following randomized sampling of the original dataset to facilitate development and testing

In [None]:
# Randomized sampling from original dataset.
# This is just for simplifying the development process
# After coding is complete, replace every `df` with the full dataframe
# (presumably appl_all_df — TODO confirm) and remove this cell
# NOTE(review): the EDA and Appendix cells further down still reference `df`,
# which only exists when the sampling line below is uncommented.
# Reference: https://yiidtw.github.io/blog/2018-05-29-how-to-shuffle-dataframe-in-pandas/

# df= appl_all_df.sample(n = 1000).reset_index(drop=True)
# df.shape

In [None]:
# df.head()

---

## Tool: Get numerical/ categorical variables(columns) from a dataframe

In [None]:
def get_num_df (data_df, unique_value_threshold: int):
    """
    Select the columns that are treated as numerical variables.

    A column counts as numerical when its number of distinct values
    (NaN counted as one value) is strictly greater than the threshold.

    Input:
        data_df: original dataframe
        unique_value_threshold(int): cut-off on the per-column distinct-value count
    Output:
        a new dataframe holding only the qualifying columns of data_df
    e.g. unique_value_threshold = 3 keeps every column with > 3 unique values.
    """
    # dropna=False makes the count match Series.unique().size, which includes NaN
    distinct_counts = data_df.nunique(dropna=False)
    numerical_columns = data_df.columns[distinct_counts > unique_value_threshold]
    return data_df[numerical_columns]

def get_cat_df (data_df, unique_value_threshold: int):
    """
    Select the columns that are treated as categorical variables.

    A column counts as categorical when its number of distinct values
    (NaN counted as one value) is less than or equal to the threshold.

    Input:
        data_df: original dataframe
        unique_value_threshold(int): cut-off on the per-column distinct-value count
    Output:
        a new dataframe holding only the qualifying columns of data_df
    e.g. unique_value_threshold = 3 keeps every column with <= 3 unique values.
    """
    # dropna=False makes the count match Series.unique().size, which includes NaN
    distinct_counts = data_df.nunique(dropna=False)
    categorical_columns = data_df.columns[distinct_counts <= unique_value_threshold]
    return data_df[categorical_columns]


In [None]:
# Be careful when doing this assertion with large datasets
# assert get_cat_df(appl_all_df, 3).columns.size + get_num_df(appl_all_df, 3).columns.size == appl_all_df.columns.size

---

#### Splitting id_target_df, cat_df, num_df

In [None]:
# Set the id and target columns aside before any further processing
id_target_df = appl_all_df[['SK_ID_CURR','TARGET']]

# The operating dataframe is everything except the id and target columns
appl_all_df_opr = appl_all_df.drop(columns=['SK_ID_CURR','TARGET'])

# A quick check of their shapes: original, id/target part, operating part
appl_all_df.shape, id_target_df.shape, appl_all_df_opr.shape

In [None]:
# Split the columns into categorical and numerical variables with the tools described above.
# Columns with at most 100 unique values are treated as categorical, all others as numerical.
cat_df = get_cat_df(appl_all_df_opr, unique_value_threshold=100)
num_df = get_num_df(appl_all_df_opr, unique_value_threshold=100)

In [None]:
cat_df.info()
num_df.info()

In [None]:
# A quick check of their shapes
appl_all_df_opr.shape, cat_df.shape, num_df.shape

In [None]:
# Column bookkeeping: the categorical and numerical parts plus the id/target
# columns must add up to the columns of the original dataframe.
assert (
    cat_df.shape[1] + num_df.shape[1] + id_target_df.shape[1]
    == appl_all_df_opr.shape[1] + id_target_df.shape[1]
    == appl_all_df.shape[1]
)

# Row bookkeeping: every part keeps all rows of the original dataframe.
assert (
    cat_df.shape[0]
    == num_df.shape[0]
    == id_target_df.shape[0]
    == appl_all_df_opr.shape[0]
    == appl_all_df.shape[0]
)

In [None]:
# Print a last overview of both frames, then release them if memory is running low
appl_all_df_opr.info()
appl_all_df.info()
del appl_all_df_opr, appl_all_df
gc.collect()

---

## Dealing with categorical variables

#### Transform to String (i.e., python object) and fill nan with String 'nan'

In [None]:
cat_df_obj = cat_df.astype(str)

In [None]:
# Every column must be of object (string) dtype.
# The comparison is done column by column: reducing the dtypes with np.all()
# first and comparing the reduced value to `object` does not test each column.
assert (cat_df_obj.dtypes == object).all()

# There are no NA left in any column.
# A reduction such as all(null_counts) == 0 would pass as soon as a single
# column is NA-free, so check for any null cell anywhere instead.
assert not cat_df_obj.isnull().any().any()

In [None]:
# The float nan will be transformed to String 'nan'
# i.e. the cells that are null in cat_df must be exactly the cells equal to 'nan' in cat_df_obj
# Use this assertion carefully when dealing with extra-large datasets
assert cat_df.isnull().equals(cat_df_obj.isin({'nan'}))

#### Dealing with special columns

Replace 'nan' with 'not specified' in column 'FONDKAPREMONT_MODE'

In [None]:
# Do the replacement and re-assign the modified column back to the original dataframe
cat_df_obj['FONDKAPREMONT_MODE'] = cat_df_obj['FONDKAPREMONT_MODE'].replace('nan','not specified')

In [None]:
# check again the unique value, it should be 1 less than the original cat_df
assert cat_df['FONDKAPREMONT_MODE'].unique().size == cat_df_obj['FONDKAPREMONT_MODE'].unique().size +1

In [None]:
# Apply the following gc if memory is running low
cat_df.info()
del cat_df
gc.collect()

#### Do one-hot encoding

Check the input dataframe (i.e., cat_df_obj)

In [None]:
cat_df_obj.shape

In [None]:
cat_df_obj.apply(lambda x:x.unique().size).sum()

In [None]:
# ?pd.get_dummies

In [None]:
# pd.get_dummies() method deals only with categorical variables.
# Although it has a built-in argument 'dummy_na' to manage the na value,
# our na values have already been converted to the string 'nan', so the method
# sees them as an ordinary category.
# dtype is passed explicitly so the indicator columns are uint8 on every pandas
# version (the default became bool in pandas 2.0); the checks in the next cell
# expect uint8.
cat_df_obj_ohe = pd.get_dummies(cat_df_obj, drop_first=True, dtype=np.uint8)
cat_df_obj_ohe.shape

In [None]:
# Make sure the ohe is successful: every cell is either 0 or 1
assert np.isin(cat_df_obj_ohe.values, [0, 1]).all()
# Every indicator column must be uint8. Compare column by column: reducing the
# dtypes with np.all() before comparing does not test each column.
assert (cat_df_obj_ohe.dtypes == np.uint8).all()
# make sure the column counts are correct:
# with drop_first=True each source column yields (number of unique values - 1) columns
assert cat_df_obj.apply(lambda x:x.unique().size).sum() == cat_df_obj_ohe.shape[1] + cat_df_obj.shape[1]

In [None]:
cat_df_obj_ohe.info()

In [None]:
# Apply the following gc if memory is running low
cat_df_obj.info()
del cat_df_obj
gc.collect()

In [None]:
# %timeit np.isin(cat_df_obj_ohe.values,[0,1])
# # 1.86 s ± 133 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# %timeit cat_df_obj_ohe.isin([0 , 1])
# # 3.38 s ± 32.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [None]:
# %timeit np.all(np.isin(cat_df_obj_ohe.values,[0,1]))
# # 1.85 s ± 28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# %timeit np.all(cat_df_obj_ohe.isin([0 , 1]))
# # 3.47 s ± 193 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

---

## Dealing with numerical variables

#### Get na flags

In [None]:
num_df.shape

In [None]:
# How many columns contain na value.
num_df.isna().any().sum()

In [None]:
# Split the numerical columns into those with at least one NA and those with none
num_isna_df = num_df.loc[:, num_df.isna().any()]
num_notna_df = num_df.loc[:, num_df.notna().all()]

# The two parts must cover every column exactly once and keep all rows
assert num_isna_df.shape[1] + num_notna_df.shape[1] == num_df.shape[1]
assert num_isna_df.shape[0] == num_notna_df.shape[0] == num_df.shape[0]

In [None]:
num_isna_df.shape, num_notna_df.shape

In [None]:
# Build the na-flag dataframe from the NA-containing columns:
# isna() marks each missing cell, the booleans are cast to uint8 (1 = missing),
# and every column name gets the suffix '_na'
num_naFlag_df = (
    num_isna_df
    .isna()
    .astype(np.uint8)
    .add_suffix('_na')
)
num_naFlag_df.info()

#### replace na with zero

In [None]:
num_isna_df = num_isna_df.fillna(0)
num_isna_df.shape

In [None]:
# How many columns contain na value.
num_isna_df.isna().any().sum()

In [None]:
num_isna_df.info()

In [None]:
assert num_isna_df.shape == num_naFlag_df.shape

In [None]:
num_df = pd.concat([num_notna_df,num_isna_df,num_naFlag_df], axis = 'columns')

In [None]:
assert num_notna_df.shape[1] + num_isna_df.shape[1] + num_naFlag_df.shape[1] == num_df.shape[1]

In [None]:
num_df.info(verbose=False)

In [None]:
# Release the intermediate numerical frames if memory is running low
del num_notna_df, num_isna_df, num_naFlag_df
gc.collect()

In [None]:
# might not be very useful at this point
def summary_df (data_df):
    """
    Build a per-column summary table for a dataframe.

    Input: data_df, the original dataframe
    Output: a new dataframe made of the rows of data_df.describe(include='all')
        followed by three extra rows:
        'dtypes'  - dtype of each column
        'isnull'  - number of null values in each column
        'uniqAll' - number of unique values in each column (size of Series.unique())
    """
    describe_rows = data_df.describe(include='all')
    dtype_row = data_df.dtypes.to_frame(name='dtypes').T
    null_count_row = data_df.isnull().sum().to_frame(name='isnull').T
    unique_count_row = data_df.apply(lambda col: col.unique().size).to_frame(name='uniqAll').T
    return pd.concat([describe_rows, dtype_row, null_count_row, unique_count_row])

In [None]:
# summary_df(num_df)

#### re-casting to reduce memory use (beta)

In [None]:
# np.isfinite(num_df).all().value_counts()

In [None]:
# num_df_finite = num_df[num_df.columns[np.isfinite(num_df).all()]]
# num_df_infinite = num_df[num_df.columns[np.isfinite(num_df).all() == False]]
# num_df_finite.shape, num_df_infinite.shape

In [None]:
# assert num_df_finite.shape[0] == num_df_infinite.shape[0] == num_df.shape[0]
# assert num_df_finite.shape[1] + num_df_infinite.shape[1] == num_df.shape[1]

In [None]:
# def reduce_mem_usage(props, finite:bool = True):
#     props.info(verbose=False)
#     start_mem_usg = props.memory_usage().sum() / 1024**2 
#     print("Memory usage of properties dataframe is :",start_mem_usg," MB")
#     if finite == True:  
#         props[props.columns[(props.min()>=0) & (props.max()<255)]] = \
#         props[props.columns[(props.min()>=0) & (props.max()<255)]].astype(np.uint8, copy=False)
#         props.info(verbose=False)

#         props[props.columns[(props.min()>=0) &(props.max() >= 255) & (props.max()<65535)]] = \
#         props[props.columns[(props.min()>=0) &(props.max() >= 255) & (props.max()<65535)]] \
#         .astype(np.uint16, copy=False)
#         props.info(verbose=False)

#         props[props.columns[(props.min()>=0) &(props.max() >= 65535) & (props.max()<4294967295)]] = \
#         props[props.columns[(props.min()>=0) &(props.max() >= 65535) & (props.max()<4294967295)]] \
#         .astype(np.uint32, copy=False)
#         props.info(verbose=False)

#         props[props.columns[(props.min()>=0) &(props.max() >= 4294967295)]] = \
#         props[props.columns[(props.min()>=0) &(props.max() >= 4294967295)]] \
#         .astype(np.uint64, copy=False)
#         props.info(verbose=False)
#     else:
#         props = props.astype(np.float32, copy=False)
#         props.info(verbose=False)
        
#     print("___MEMORY USAGE AFTER COMPLETION:___")
#     mem_usg = props.memory_usage().sum() / 1024**2 
#     print("Memory usage is: ",mem_usg," MB")
#     print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    
#     return props

# if num_na_df_finite.min()>=0:
#     if num_na_df_finite.max() < 255:
#         props[col] = props[col].astype(np.uint8)
#     elif num_na_df_finite.max() < 65535:
#         props[col] = props[col].astype(np.uint16)
#     elif num_na_df_finite.max() < 4294967295:
#         props[col] = props[col].astype(np.uint32)
#     else:
#         props[col] = props[col].astype(np.uint64)

In [None]:
# num_df_finite.info()

In [None]:
# num_df_finite = reduce_mem_usage(num_df_finite, finite = True)

In [None]:
# num_df_infinite.info()

In [None]:
# num_df_infinite = reduce_mem_usage(num_df_infinite, finite = False)

In [None]:
# num_df = pd.concat([num_df_finite, num_df_infinite], axis ='columns')
# num_df.info()

In [None]:
# assert num_df_finite.shape[0] == num_df_infinite.shape[0] == num_df.shape[0]
# assert num_df_finite.shape[1] + num_df_infinite.shape[1] == num_df.shape[1]

In [None]:
# del num_df_finite
# del num_df_infinite
# gc.collect()

---

#### Normalization (DO LATER!!)

##### Generally, in tree-based models, the scale of the features does not matter.
https://scikit-learn.org/stable/modules/preprocessing.html#normalization
https://datascience.stackexchange.com/questions/22036/how-does-lightgbm-deal-with-value-scale

---

## Combine to a complete, processed dataset

In [None]:
# Keep the parts in a plain list: pd.concat takes a sequence of DataFrames, and
# wrapping DataFrames of different widths in np.array makes NumPy try to build a
# ragged array out of their contents.
frames = [id_target_df, cat_df_obj_ohe, num_df]

In [None]:
id_target_df.shape, cat_df_obj_ohe.shape, num_df.shape

In [None]:
appl_all_processed_df = pd.concat(frames, axis ='columns')
appl_all_processed_df.shape

In [None]:
assert appl_all_processed_df.shape[1] == id_target_df.shape[1] + cat_df_obj_ohe.shape[1] + num_df.shape[1]

In [None]:
appl_all_processed_df.info()

In [None]:
# Release the three parts that were combined, if memory is running low
del id_target_df, cat_df_obj_ohe, num_df
gc.collect()

---

Below not executed

## Balance the 'TARGET' column

In [None]:
appl_all_processed_df['TARGET'].value_counts()

In [None]:
# Ratio of TARGET == 0 rows to TARGET == 1 rows, rounded to the nearest integer
balanceFactor = (
    appl_all_processed_df['TARGET'].value_counts()[0]
    / appl_all_processed_df['TARGET'].value_counts()[1]
).round(0).astype(int)
balanceFactor
# appl_all_processed_df['TARGET'].value_counts()[0]
# appl_all_processed_df['TARGET'].value_counts()[1]

In [None]:
default_df = appl_all_processed_df[appl_all_processed_df['TARGET']==1]
default_df.shape

In [None]:
# Repeat the TARGET == 1 rows (balanceFactor - 1) times, so that together with
# the original rows they appear balanceFactor times in the combined dataset.
# When balanceFactor <= 1 there is nothing to add and pd.concat([]) would raise,
# so use an empty frame with the same columns instead.
if balanceFactor > 1:
    default_df_balanced = pd.concat( [default_df] * (balanceFactor - 1), sort=False, ignore_index=True )
else:
    default_df_balanced = default_df.iloc[0:0]
default_df_balanced.shape

In [None]:
appl_all_processed_df_balanced = pd.concat([appl_all_processed_df , default_df_balanced], sort=False, ignore_index=True)
appl_all_processed_df_balanced.shape

In [None]:
(appl_all_processed_df_balanced['TARGET'].unique(),
(appl_all_processed_df_balanced['TARGET'].value_counts()[1], \
appl_all_processed_df_balanced['TARGET'].value_counts()[0], \
appl_all_processed_df_balanced['TARGET'].isnull().sum()))

In [None]:
# Apply the following gc if memory is running low
del appl_all_processed_df
gc.collect()

---

## Export to CSV

In [None]:
appl_all_processed_df_balanced.to_csv('../../../BDSE12-Group3/datasets/homecdt_ss_output/ss_fteng_fromBDSE12_03G_HomeCredit_V1_20200201b_balanced.csv', index = False)
# appl_all_processed_df_balanced.to_csv('../../../BDSE12-Group3/datasets/homecdt_fteng/ss_output/ss_fteng_appl_all_v1_20200128.csv', index = False)

In [None]:
# Apply the following gc if memory is running low
del appl_all_processed_df_balanced
gc.collect()

---

---

# Todo

Todo:
* cleaning:
    * num_df: normalize with z-score
* feature engineering:
    * make reciprocal and polynomial columns from the existing columns, e.g. 1/x, x^x.
    * multiplying each columns, two columns at a time.
    * asset items, income items, willingness (history + misc profile) items, loading (principal + interest) items
    * Integration from other tables?

https://ithelp.ithome.com.tw/articles/10202059
https://stackoverflow.com/questions/26414913/normalize-columns-of-pandas-data-frame
https://www.kaggle.com/parasjindal96/how-to-normalize-dataframe-pandas
    

---

## EDA

### Quick check for numerical columns

In [None]:
# NOTE(review): `df` is only assigned in the commented-out sampling cell earlier in this
# notebook, so this cell raises NameError on a fresh kernel unless that line is re-enabled
numcol = df['CNT_FAM_MEMBERS']

In [None]:
numcol.describe(), \
numcol.isnull().sum(), \
numcol.size

In [None]:
numcol.value_counts(sort=True), numcol.unique().size

In [None]:
# numcol_toYear = pd.to_numeric(\
#                               ((numcol.abs() / 365) \
#                                .round(0)) \
#                               ,downcast='integer')
# numcol_toYear.describe()

In [None]:
# numcol_toYear.value_counts(sort=True), numcol_toYear.unique().size

### Quick check for categorical columns

In [None]:
catcol = df['HOUR_APPR_PROCESS_START']

In [None]:
catcol.unique(), \
catcol.unique().size

In [None]:
catcol.value_counts(sort=True)

In [None]:
catcol.isnull().sum(), \
catcol.size

In [None]:
catcol.isnull().sum(), \
catcol.size

## Appendix

### Tool: Getting summary dataframe

In [None]:
# might not be very useful at this point
def summary_df (data_df):
    """
    Summarise every column of a dataframe in one table.

    Input: data_df, the original dataframe
    Output: a new dataframe whose rows are, in order:
        the rows of data_df.describe(include='all'),
        'dtypes'  (dtype of each column),
        'isnull'  (number of null values in each column),
        'uniqAll' (number of unique values in each column, size of Series.unique())
    """
    parts = [data_df.describe(include='all')]
    parts.append(data_df.dtypes.to_frame(name='dtypes').T)
    parts.append(data_df.isnull().sum().to_frame(name='isnull').T)
    parts.append(data_df.apply(lambda col: col.unique().size).to_frame(name='uniqAll').T)
    return pd.concat(parts)

def data_quality_df (data_df):
    """
    Build a compact data-quality table for a dataframe.

    Input: data_df, the original dataframe
    Output: a new dataframe with one column per column of data_df and four rows,
        in this order:
        'dtypes'  - dtype of the column
        'uniqAll' - number of unique values (size of Series.unique())
        'isnull'  - number of null values
        'count'   - the 'count' row of data_df.describe(include='all')
    """
    quality_rows = pd.concat([
        data_df.describe(include='all'),
        data_df.dtypes.to_frame(name='dtypes').T,
        data_df.isnull().sum().to_frame(name='isnull').T,
        data_df.apply(lambda col: col.unique().size).to_frame(name='uniqAll').T,
    ])
    # Select by row label rather than by position: the number of rows describe()
    # returns depends on the column types present (e.g. fewer rows for an
    # all-numeric frame), so fixed positions do not always point at these rows.
    return quality_rows.loc[['dtypes', 'uniqAll', 'isnull', 'count'], :]


In [None]:
# NOTE(review): appl_all_df is deleted in the memory clean-up cell right after the
# cat/num split, so on Restart & Run All this call raises NameError unless the CSV is read again
data_quality_df(appl_all_df)

In [None]:
# df.to_csv(file_name, encoding='utf-8', index=False)
# data_quality_df(df).to_csv("./eda_output/application_train_data_quality.csv")

In [None]:
df['CNT_CHILDREN'].value_counts()

In [None]:
df['CNT_CHILDREN'].value_counts().sum()

In [None]:
df.describe()

In [None]:
summary_df(df)

In [None]:
# df.to_csv(file_name, encoding='utf-8', index=False)
# summary_df(df).to_csv("./eda_output/application_train_summary_df.csv")

---

### .nunique() function

In [None]:
# nunique() function excludes NaN 
# i.e. it does not consider NaN as a "value", therefore NaN is not counted as a "unique value"
df.nunique()

In [None]:
df.nunique() == df.apply(lambda x:x.unique().shape[0])

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].unique().shape[0]

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].nunique()

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].unique().size

### .value_counts() function

In [None]:
# .value_counts() function has similar viewpoint towards NaN.
# i.e. it does not consider null as a value, therefore not counted in .value_counts()

In [None]:
df['NAME_TYPE_SUITE'].value_counts()

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].isnull().sum()

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].size

In [None]:
df['AMT_REQ_CREDIT_BUREAU_YEAR'].value_counts().sum() + df['AMT_REQ_CREDIT_BUREAU_YEAR'].isnull().sum() == \
df['AMT_REQ_CREDIT_BUREAU_YEAR'].size

### Duplicate values (重複值)

In [None]:
# Counting unique values (cf. .nunique() function, see above section)
# This code was retrieved from HT

df.apply(lambda x:x.unique().shape[0])

In [None]:
# It is the same if you write (df.apply(lambda x:x.unique().size))
# .all() has to be called: a bare `.all` is a bound method, which is always truthy,
# so the assertion could never fail
assert (df.apply(lambda x:x.unique().shape[0])==df.apply(lambda x:x.unique().size)).all()

In [None]:
# # %timeit showed the performances are similar
# %timeit df.apply(lambda x:x.unique().shape[0])
# %timeit df.apply(lambda x:x.unique().size)

### Null values (空值)

In [None]:
# Share of columns that contain at least one null value
print(f"{df.isnull().any().sum()} in {df.shape[1]} columns (ratio: {(df.isnull().any().sum()/df.shape[1]).round(2)}) has empty value(s)")
