# Summary

Feature engineering is the process of using domain knowledge of the data to create features that make machine learning algorithms work. Feature engineering is fundamental to the application of machine learning, and is both difficult and expensive.

In this section, we will cover some of the steps and methods generally used for feature engineering. Upon completion of this chapter, you will know:


*   How to manually create more features via feature engineering

# Load Libraries

In [1]:
## system config ##
import sys
import os
# Build the path of the "Self-defined Modules" folder located under the
# current working directory and register it on the import search path.
cwd_parent, cwd_name = os.path.split(os.getcwd())
nb_dir = "/".join((cwd_parent, cwd_name, "Self-defined Modules"))
sys.path.append(nb_dir) # Add a directory into sys path

print ('Python', sys.version)

Python 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)]


In [2]:
## public libraries ##
import numpy as np

In [3]:
## self-defined modules ##
import SupportingFunctions as SF

# Import Data

# More Data & More Features

## 1 - Bureau data

In [4]:
## data loading ##
# Paths are assembled with os.path.join instead of hard-coded "\\" separators
# so the notebook resolves the data folder on any OS; on Windows the resulting
# strings are the same as before.
data_dir = os.path.join(os.path.split(os.getcwd())[0], "2. Data")
df_bureau = SF.load_csv(os.path.join(data_dir, "bureau.csv"))
df_bureau_bal = SF.load_csv(os.path.join(data_dir, "bureau_balance.csv"))

time elapsed: 2.4176523685455322 seconds
dataframe dimension: (1716428, 17)
time elapsed: 6.938809633255005 seconds
dataframe dimension: (27299925, 3)


In [5]:
# Roll the monthly bureau balance history up to one row per bureau credit
# (SK_ID_BUREAU), using only rows with MONTHS_BALANCE >= -24.
temp01 = df_bureau_bal[df_bureau_bal['MONTHS_BALANCE'] >= -24].copy() # past 2 years

# get past DPD info: a month counts as DPD60+ when STATUS is "3", "4" or "5"
temp01["DPD60+"] = 0
temp01.loc[temp01["STATUS"].isin(["3", "4", "5"]), "DPD60+"] = 1
# N_DPD60 = number of flagged months, FLAG_DPD60 = 1 if any month was flagged
temp01_agg = temp01.groupby('SK_ID_BUREAU').agg({'DPD60+':['sum', 'max']}).reset_index() # roll-up
temp01_agg.columns = ['SK_ID_BUREAU', 'N_DPD60', 'FLAG_DPD60']

# get T0 status (Open/Close)
# .copy() makes temp02 an independent frame; without it the assignment below
# writes into a slice of temp01 and pandas emits SettingWithCopyWarning.
temp02 = temp01.loc[temp01["MONTHS_BALANCE"] == 0, ["SK_ID_BUREAU", "STATUS"]].copy()
temp02.loc[temp02["STATUS"] != "C", "STATUS"] = "O"  # anything other than "C" is treated as open

print ("\n"+"="*50)
SF.merge_rate(temp02, temp01_agg, 'SK_ID_BUREAU')
print ("="*50)

# One row per credit seen in the window; credits without a MONTHS_BALANCE == 0
# row have no T0 status after the left join and are filled with "C".
temp03 = temp01[['SK_ID_BUREAU']].drop_duplicates()
df_bureau_bal_agg = temp03.merge(temp02, on='SK_ID_BUREAU', how='left').merge(temp01_agg, on='SK_ID_BUREAU', how='left')
df_bureau_bal_agg['STATUS'] = df_bureau_bal_agg['STATUS'].fillna('C')


% left only: 0.0
% right only: 0.15694080309093417
% merge rate: 0.8430591969090658



In [6]:
# Sanity check: overall shape, then the number of distinct values of each id
# column (printed as the shape of the de-duplicated single-column frame).
print (df_bureau.shape)
for id_col in ('SK_ID_BUREAU', 'SK_ID_CURR'):
    print (df_bureau[[id_col]].drop_duplicates().shape)

(1716428, 17)
(1716428, 1)
(305811, 1)


In [7]:
# Attach the per-credit balance roll-up to every bureau record.
temp01 = df_bureau.merge(df_bureau_bal_agg, how="left", on=["SK_ID_BUREAU"])

# 0/1 indicator columns derived from the credit state and the T0 status.
temp01["CREDIT_CLOSED"] = (temp01["CREDIT_ACTIVE"] == "Closed").astype("int64")
temp01["CREDIT_BAD"] = (temp01["CREDIT_ACTIVE"] == "Bad debt").astype("int64")
temp01["STATUS_OPEN"] = (temp01["STATUS"] == "O").astype("int64")

# Aggregation applied to each column when collapsing to one row per applicant.
bureau_agg_spec = {
    "SK_ID_BUREAU": "count",
    "CREDIT_CLOSED": "sum",
    "CREDIT_BAD": "sum",
    "DAYS_CREDIT": "max",
    "CREDIT_DAY_OVERDUE": "max",
    "AMT_CREDIT_MAX_OVERDUE": "max",
    "CNT_CREDIT_PROLONG": "sum",
    "AMT_CREDIT_SUM": "sum",
    "AMT_CREDIT_SUM_DEBT": "sum",
    "AMT_CREDIT_SUM_LIMIT": "sum",
    "AMT_CREDIT_SUM_OVERDUE": "sum",
    "AMT_ANNUITY": "sum",
    "STATUS_OPEN": "sum",
    "N_DPD60": "sum",
    "FLAG_DPD60": "max",
}
df_bureau_processed = temp01.groupby("SK_ID_CURR").agg(bureau_agg_spec).reset_index()

# rename columns: tag selected features with the statistic that produced them
suffix_map = {
    "SK_ID_BUREAU": "CREDIT_CT",
    "CREDIT_CLOSED": "CREDIT_CLOSED_SUM",
    "CREDIT_BAD": "CREDIT_BAD_SUM",
    "DAYS_CREDIT": "DAYS_CREDIT_MAX",
    "CREDIT_DAY_OVERDUE": "CREDIT_DAY_OVERDUE_MAX",
    "AMT_CREDIT_MAX_OVERDUE": "AMT_CREDIT_MAX_OVERDUE_MAX",
    "CNT_CREDIT_PROLONG": "CNT_CREDIT_PROLONG_SUM",
    "STATUS_OPEN": "STATUS_OPEN_CT",
}
df_bureau_processed = df_bureau_processed.rename(index=str, columns=suffix_map)

# prefix every feature (everything except the join key) with its source table
prefix_map = {
    name: "bureau_" + name
    for name in df_bureau_processed.columns.tolist()
    if name != 'SK_ID_CURR'
}
df_bureau_processed = df_bureau_processed.rename(index=str, columns=prefix_map)

In [8]:
## delete data to release memory ##
# Drop the references to the two raw bureau frames loaded earlier.
del df_bureau, df_bureau_bal

## 2 - Previous Application

In [9]:
# df_pos_cash_bal
# Monthly POS/cash balance history, rolled up to one row per applicant.
# The path is built with os.path.join so it resolves on any OS.
df_pos_cash_bal = SF.load_csv(os.path.join(os.path.split(os.getcwd())[0], "2. Data", "POS_CASH_balance.csv"))

# keep rows with MONTHS_BALANCE >= -24 and derive two 0/1 indicators per month
df_temp01 = df_pos_cash_bal[df_pos_cash_bal['MONTHS_BALANCE'] >= -24].copy()
df_temp01['NAME_CONTRACT_STATUS_C'] = np.where(df_temp01['NAME_CONTRACT_STATUS'] == 'Completed', 1, 0)
df_temp01['SK_DPD60+'] = np.where(df_temp01['SK_DPD'] > 60, 1, 0)

# step 1: per previous loan, was it ever Completed / ever more than 60 DPD
df_pos_cash_bal_agg = df_temp01.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg({"NAME_CONTRACT_STATUS_C":"max",
                                                                           "SK_DPD60+":"max"}).reset_index()

# step 2: per applicant, how many previous loans carry each flag
df_pos_cash_bal_agg = df_pos_cash_bal_agg.groupby('SK_ID_CURR').agg({"NAME_CONTRACT_STATUS_C":"sum",
                                                                     "SK_DPD60+":"sum"}).reset_index()

# Prefix the feature columns of THIS frame. The previous loop renamed
# df_bureau_processed by mistake, so these columns stayed unprefixed and
# collided with the identically named credit-card columns in the later merge.
# The prefix now ends with "_" like the other source-table prefixes.
# NOTE(review): this changes the column names that reach the merged modelling
# dataset - confirm any downstream consumer of the old names.
df_pos_cash_bal_agg = df_pos_cash_bal_agg.rename(
    columns={col: "pos_cash_" + col
             for col in df_pos_cash_bal_agg.columns.tolist()
             if col != 'SK_ID_CURR'})

del df_pos_cash_bal

time elapsed: 5.878181457519531 seconds
dataframe dimension: (10001358, 8)


In [10]:
# df_installments_payments
# Instalment payment history, rolled up to one row per applicant.
# The path is built with os.path.join so it resolves on any OS.
df_installments_payments = SF.load_csv(os.path.join(os.path.split(os.getcwd())[0], "2. Data", "installments_payments.csv"))

df_temp01 = df_installments_payments.copy()
# LATE_PAYMENT = 1 when DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT is negative
df_temp01['LATE_PAYMENT'] = np.where(df_temp01['DAYS_INSTALMENT'] - df_temp01['DAYS_ENTRY_PAYMENT'] < 0, 1, 0)
# INSUFFICIENT_PAYMENT = 1 when AMT_PAYMENT is below AMT_INSTALMENT
df_temp01['INSUFFICIENT_PAYMENT'] = np.where(df_temp01['AMT_PAYMENT'] - df_temp01['AMT_INSTALMENT'] < 0, 1, 0)

# step 1: count flagged instalments per previous loan
df_installments_payments_agg = df_temp01.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg({"LATE_PAYMENT":"sum",
                                                                                    "INSUFFICIENT_PAYMENT":"sum"}).reset_index()

# step 2: total the counts per applicant
df_installments_payments_agg = df_installments_payments_agg.groupby('SK_ID_CURR').agg({"LATE_PAYMENT":"sum",
                                                                                       "INSUFFICIENT_PAYMENT":"sum"}).reset_index()

# Prefix the feature columns of THIS frame. The previous loop renamed
# df_bureau_processed by mistake, so the "installments_" prefix was never
# applied to these columns.
# NOTE(review): this changes the column names that reach the merged modelling
# dataset - confirm any downstream consumer of the old names.
df_installments_payments_agg = df_installments_payments_agg.rename(
    columns={col: "installments_" + col
             for col in df_installments_payments_agg.columns.tolist()
             if col != 'SK_ID_CURR'})

del df_installments_payments

time elapsed: 9.524728059768677 seconds
dataframe dimension: (13605401, 8)


In [11]:
# df_credit_card_bal
# Monthly credit-card balance history, rolled up to one row per applicant.
# The path is built with os.path.join so it resolves on any OS.
df_credit_card_bal = SF.load_csv(os.path.join(os.path.split(os.getcwd())[0], "2. Data", "credit_card_balance.csv"))

df_temp01 = df_credit_card_bal[df_credit_card_bal['MONTHS_BALANCE'] >= -24].copy() # past 2 years
# UTILIZATION = AMT_BALANCE / AMT_CREDIT_LIMIT_ACTUAL, NaN when the limit is 0
df_temp01['UTILIZATION'] = np.where(df_temp01['AMT_CREDIT_LIMIT_ACTUAL'] == 0, np.nan, df_temp01['AMT_BALANCE'] * 1.0 / df_temp01['AMT_CREDIT_LIMIT_ACTUAL'])
df_temp01['NAME_CONTRACT_STATUS_C'] = np.where(df_temp01['NAME_CONTRACT_STATUS'] == 'Completed', 1, 0)
df_temp01['SK_DPD60+'] = np.where(df_temp01['SK_DPD'] > 60, 1, 0)

# step 1: per previous loan, peak utilization and ever-Completed / ever-60+DPD flags
df_credit_card_bal_agg = df_temp01.groupby(['SK_ID_PREV', 'SK_ID_CURR']).agg({"UTILIZATION":"max",
                                                                              "NAME_CONTRACT_STATUS_C":"max",
                                                                              "SK_DPD60+":"max"}).reset_index()

# step 2: per applicant, peak utilization and the number of loans carrying each flag
df_credit_card_bal_agg = df_credit_card_bal_agg.groupby('SK_ID_CURR').agg({"UTILIZATION":"max",
                                                                           "NAME_CONTRACT_STATUS_C":"sum",
                                                                           "SK_DPD60+":"sum"}).reset_index()

# Prefix the feature columns of THIS frame. The previous loop renamed
# df_bureau_processed by mistake, so the "credit_card_" prefix was never
# applied and these columns collided with the identically named POS/cash
# columns in the later merge.
# NOTE(review): this changes the column names that reach the merged modelling
# dataset - confirm any downstream consumer of the old names.
df_credit_card_bal_agg = df_credit_card_bal_agg.rename(
    columns={col: "credit_card_" + col
             for col in df_credit_card_bal_agg.columns.tolist()
             if col != 'SK_ID_CURR'})

del df_credit_card_bal

time elapsed: 7.600109338760376 seconds
dataframe dimension: (3840312, 23)


In [12]:
# df_pre_application
# Previous applications, rolled up to one row per applicant.
# The path is built with os.path.join so it resolves on any OS.
df_pre_application = SF.load_csv(os.path.join(os.path.split(os.getcwd())[0], "2. Data", "previous_application.csv"))

# 0/1 indicator: the previous application has status 'Refused'
df_pre_application['REFUSED_CONTRACT'] = np.where(df_pre_application['NAME_CONTRACT_STATUS'] == 'Refused', 1, 0)

df_pre_application_agg = df_pre_application.groupby('SK_ID_CURR').agg({'SK_ID_PREV':'count',
                                                                       'NAME_CONTRACT_TYPE':'nunique',
                                                                       'AMT_ANNUITY':'sum',
                                                                       'AMT_APPLICATION':'sum',
                                                                       'AMT_CREDIT':'sum',
                                                                       'AMT_DOWN_PAYMENT':'sum',
                                                                       'AMT_GOODS_PRICE':'sum',
                                                                       'RATE_DOWN_PAYMENT':'max',
                                                                       'RATE_INTEREST_PRIMARY':'max',
                                                                       'RATE_INTEREST_PRIVILEGED':'max',
                                                                       'NAME_CASH_LOAN_PURPOSE':'nunique',
                                                                       'REFUSED_CONTRACT':['sum', 'max'],
                                                                       'NAME_PAYMENT_TYPE':'nunique',
                                                                       'PRODUCT_COMBINATION':'nunique',
                                                                       'DAYS_FIRST_DUE':'max',
                                                                       'NFLAG_INSURED_ON_APPROVAL':'max'}).reset_index()

# rename columns
# The aggregation yields two-level (column, statistic) labels. Flatten them to
# 'pre_app_<column>_<statistic>'; the key column restored by reset_index has an
# empty second level and keeps its plain name.
df_pre_application_agg.columns = [
    name if stat == '' else 'pre_app_' + name + '_' + stat
    for name, stat in df_pre_application_agg.columns.tolist()
]

del df_pre_application

time elapsed: 6.822546005249023 seconds
dataframe dimension: (1670214, 37)


In [13]:
# Start from the previous-application roll-up and left-join each payment
# history roll-up onto it by applicant id, one join per statement.
df_pre_application_processed_agg = df_pre_application_agg.merge(
    df_pos_cash_bal_agg, how='left', on='SK_ID_CURR')
df_pre_application_processed_agg = df_pre_application_processed_agg.merge(
    df_installments_payments_agg, how='left', on='SK_ID_CURR')
df_pre_application_processed_agg = df_pre_application_processed_agg.merge(
    df_credit_card_bal_agg, how='left', on='SK_ID_CURR')

In [14]:
# delete data to release memory
# The four per-source roll-ups have been merged into one frame; drop their names.
del (df_pre_application_agg,
     df_pos_cash_bal_agg,
     df_installments_payments_agg,
     df_credit_card_bal_agg)

# Merge datasets

In [15]:
# Load the base modelling table and left-join the engineered features onto it
# by applicant id. The path is built with os.path.join so it resolves on any OS.
df_model_basic = SF.load_csv(os.path.join(os.path.split(os.getcwd())[0], "3. Output", "Intermediate Data", "modeling_ds_basic.csv"))
df_model = (
    df_model_basic
    .merge(df_pre_application_processed_agg, on="SK_ID_CURR", how="left")
    .merge(df_bureau_processed, on="SK_ID_CURR", how="left")
)

time elapsed: 3.8680663108825684 seconds
dataframe dimension: (307511, 51)


In [16]:
# Write the merged modelling dataset without the index column. The path is
# built with os.path.join so it resolves on any OS.
df_model.to_csv(os.path.join(os.path.split(os.getcwd())[0], "3. Output", "Intermediate Data", "modeling_ds.csv"), index=False)