In [2]:
import warnings
import numpy as np
import scorecardpy as sc
import pandas as pd

# Install a blanket "ignore" filter: no category/module restriction is given,
# so this applies to every warning raised after this cell runs.
# NOTE(review): this also hides deprecation notices from pandas/scorecardpy —
# confirm that is intended rather than filtering a specific category.
warnings.filterwarnings("ignore")

In [3]:
def preprocess_application_record(data: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the application records.

    Two rules are applied to ``OCCUPATION_TYPE``:

    1. Rows whose ``DAYS_EMPLOYED`` is positive are labelled ``'Unemployed'``
       (presumably the dataset encodes "not currently employed" as a positive
       day count — confirm against the data dictionary).
    2. Any occupation still missing afterwards is labelled ``'Unknown'``.

    The input frame is NOT modified; a new DataFrame is returned so the cell
    calling this function can be re-run safely on the same raw data.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``DAYS_EMPLOYED`` and ``OCCUPATION_TYPE``.

    Returns
    -------
    pd.DataFrame
        Same rows and columns as ``data`` with ``OCCUPATION_TYPE`` filled in.

    Raises
    ------
    KeyError
        If either required column is absent.
    """
    # Rule 1 must run before rule 2: an unemployed applicant with a missing
    # occupation should end up as 'Unemployed', not 'Unknown'.
    occupation = np.where(
        data['DAYS_EMPLOYED'] > 0,
        'Unemployed',
        data['OCCUPATION_TYPE'],
    )
    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    return (
        data
        .assign(OCCUPATION_TYPE=occupation)
        .fillna({'OCCUPATION_TYPE': 'Unknown'})
    )

def preprocess_credit_record(data: pd.DataFrame, max_bad_months: int = 1) -> pd.DataFrame:
    """Collapse the monthly credit history into one approval label per ID.

    A record counts as "bad" when its ``STATUS`` is one of ``'2'``, ``'3'``,
    ``'4'`` or ``'5'`` (presumably the more severe delinquency buckets —
    confirm against the data dictionary). Bad records are counted per ``ID``;
    an ID is labelled ``IS_APPROVED = 0`` when that count is strictly greater
    than ``max_bad_months`` and ``1`` otherwise.

    The input frame is NOT modified; no helper column is written back to it.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``ID`` and ``STATUS``.
    max_bad_months : int, default 1
        Largest number of bad records an ID may have and still be labelled
        approved. The default keeps the original ``> 1`` rule, i.e. a single
        bad record is tolerated.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``ID`` with the columns ``ID`` and
        ``IS_APPROVED``.

    Raises
    ------
    KeyError
        If ``ID`` or ``STATUS`` is absent.
    """
    bad_statuses = ['2', '3', '4', '5']

    # Compare as strings so the rule also holds when STATUS was parsed as a
    # numeric column instead of text.
    is_bad = np.where(data['STATUS'].astype(str).isin(bad_statuses), 1, 0)

    # assign() creates a new frame, so the caller's data gains no IS_BAD column.
    bad_counts = (
        data
        .assign(IS_BAD=is_bad)
        .groupby('ID', as_index=False)['IS_BAD']
        .sum()
    )

    bad_counts['IS_APPROVED'] = np.where(bad_counts['IS_BAD'] > max_bad_months, 0, 1)
    return bad_counts.drop(columns=['IS_BAD'])

def merge_dataset(predictors:pd.DataFrame, target:pd.DataFrame) -> pd.DataFrame:
    """Inner-join the predictor frame with the target frame on ``ID``.

    Only IDs present in both frames are kept, so applicants without a label
    (and labels without an applicant) are dropped from the result.
    """
    return predictors.merge(target, how='inner', on='ID')

def feature_selection(data:pd.DataFrame) -> pd.DataFrame:
    """Filter the predictor columns with ``scorecardpy.var_filter``.

    ``IS_APPROVED`` is passed as the target column and every other option is
    left at the library default, so which columns are removed is decided
    entirely by scorecardpy's default filtering rules.
    """
    return sc.var_filter(data, y='IS_APPROVED')

def woe_transformer(data:pd.DataFrame) -> pd.DataFrame:
    """Bin every predictor against ``IS_APPROVED`` and apply the WoE mapping.

    The bins are fitted with ``scorecardpy.woebin`` on ``data`` and then
    applied back to that same ``data`` with ``scorecardpy.woebin_ply``.

    NOTE(review): scorecardpy appears to treat the value 1 of the target as
    the positive ("bad") class by default, while here 1 means approved —
    confirm the resulting WoE sign convention is the intended one.
    """
    woe_bins = sc.woebin(data, y='IS_APPROVED')
    return sc.woebin_ply(data, woe_bins)

In [4]:
# Load the two raw CSV extracts.
raw_application_record = pd.read_csv('../data/01_raw/application_record.csv')
raw_credit_record = pd.read_csv('../data/01_raw/credit_record.csv')

# Clean the predictors and derive the IS_APPROVED target.
application_preprocessed = preprocess_application_record(data=raw_application_record)
credit_preprocessed = preprocess_credit_record(data=raw_credit_record)

# Join predictors and target on ID.
merged_dataset = merge_dataset(
    predictors=application_preprocessed,
    target=credit_preprocessed,
)

# Filter the predictor columns.
reduced_dataset = feature_selection(data=merged_dataset)

# Replace the remaining predictors with their WoE encoding.
dataset_woe = woe_transformer(data=reduced_dataset)

[INFO] filtering variables ...
[INFO] creating woe binning ...


  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  init_bin = init_bin.groupby('brkp', group_keys=False).agg({
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum, 'bin':lambda x:'%,%'.join(x)}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum})

[INFO] converting into woe values ...


In [5]:
# Show which columns the WoE-transformed frame contains.
dataset_woe.columns

Index(['IS_APPROVED', 'NAME_INCOME_TYPE_woe', 'OCCUPATION_TYPE_woe',
       'NAME_HOUSING_TYPE_woe', 'NAME_FAMILY_STATUS_woe', 'CNT_CHILDREN_woe',
       'CNT_FAM_MEMBERS_woe', 'AMT_INCOME_TOTAL_woe',
       'NAME_EDUCATION_TYPE_woe', 'FLAG_OWN_REALTY_woe', 'CODE_GENDER_woe',
       'DAYS_BIRTH_woe', 'DAYS_EMPLOYED_woe'],
      dtype='object')