## **4.1 Pre-processing Training Set**

In this part, we preprocess the training set by replacing each value with the WOE of the bin (or category) it belongs to.
The WOE-transformed training set is then saved to the path defined in the config file.

In [25]:
# Import library
import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

Update the config file so that it contains the `WOE_map_dict_path` key.

In [26]:
# Reload the configuration and display it to check that it contains the
# 'WOE_map_dict_path' entry used by the cells below.
config_data = utils.config_load()
config_data

{'raw_dataset_path': 'data/raw/UCI_Credit_Card.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'response_variable': 'default.payment.next.month',
 'test_size': 0.3,
 'num_columns': ['LIMIT_BAL',
  'AGE',
  'BILL_AMT1',
  'BILL_AMT2',
  'BILL_AMT3',
  'BILL_AMT4',
  'BILL_AMT5',
  'BILL_AMT6',
  'PAY_AMT1',
  'PAY_AMT2',
  'PAY_AMT3',
  'PAY_AMT4',
  'PAY_AMT5',
  'PAY_AMT6'],
 'cat_columns': ['SEX',
  'EDUCATION',
  'MARRIAGE',
  'PAY_

In [27]:
# Function to generate the WOE mapping dictionary
def get_woe_map_dict(WOE_table=None):
    """Build the WOE mapping dictionary, one inner mapping per characteristic.

    Parameters
    ----------
    WOE_table : pandas.DataFrame, optional
        Table with (at least) the columns 'Characteristic', 'Attribute' and
        'WOE'. When omitted (the default), the table is loaded from
        ``config_data['WOE_table_path']`` and the resulting dictionary is
        dumped to ``config_data['WOE_map_dict_path']``. When a table is
        passed in, nothing is read from or written to disk.

    Returns
    -------
    dict
        ``{characteristic: {attribute: woe}}``. Characteristics appear in the
        order in which they first occur in the table.
    """
    # Only touch the disk when the caller did not supply a table
    use_config = WOE_table is None
    if use_config:
        # Relies on the notebook-level `config_data` loaded in an earlier cell
        WOE_table = utils.pickle_load(config_data['WOE_table_path'])

    # Initialize the dictionary
    WOE_map_dict = {}

    # `.unique()` keeps first-appearance order; iterating a `set` of strings
    # would give a different key order from one kernel session to the next.
    for char in WOE_table['Characteristic'].unique():
        # Get the Attribute & WOE info for the current characteristic
        current_data = WOE_table.loc[WOE_table['Characteristic'] == char,
                                     ['Attribute', 'WOE']]

        # Pair the two columns positionally, so the result does not depend on
        # the index labels (which may repeat if the table was concatenated).
        WOE_map_dict[char] = dict(zip(current_data['Attribute'],
                                      current_data['WOE']))

    # Validate data
    print('Number of key : ', len(WOE_map_dict.keys()))

    # Dump
    if use_config:
        utils.pickle_dump(WOE_map_dict, config_data['WOE_map_dict_path'])

    return WOE_map_dict

In [28]:
# Build the WOE mapping (called without arguments, this also pickles it to
# config_data['WOE_map_dict_path']) and display it.
WOE_map_dict = get_woe_map_dict()
WOE_map_dict

Number of key :  23


{'PAY_AMT1_bin': {Interval(-0.001, 1000.0, closed='right'): -0.4808484068698445,
  Interval(1000.0, 2118.5, closed='right'): -0.017381952326989074,
  Interval(2118.5, 5021.25, closed='right'): 0.1086121122944235,
  Interval(5021.25, 873552.0, closed='right'): 0.5613018943420917},
 'MARRIAGE': {0: 0.9384822886116739,
  1: -0.06637853913696033,
  2: 0.06386136818011379,
  3: -0.3517251936357196},
 'PAY_AMT6_bin': {Interval(-0.001, 100.0, closed='right'): -0.34856290840739995,
  Interval(100.0, 1500.0, closed='right'): -0.12349223892327439,
  Interval(1500.0, 4000.0, closed='right'): 0.08425107650183983,
  Interval(4000.0, 527143.0, closed='right'): 0.5112046085975265},
 'SEX': {1: -0.12245680357507098, 2: 0.08519142335722953},
 'BILL_AMT3_bin': {Interval(-61506.001, 2729.25, closed='right'): -0.008548914790808163,
  Interval(2729.25, 20012.0, closed='right'): -0.008303906826823824,
  Interval(20012.0, 60042.5, closed='right'): -0.08742244058182495,
  Interval(60042.5, 855086.0, closed='r

Next, transform the input data using the mapping dictionary above.
Update the config file so that it contains the path (`X_train_woe_path`) where the WOE-transformed data will be saved.

In [29]:
# Reload the configuration and display it to check that it contains the
# 'X_train_woe_path' entry that transform_woe(type='train', ...) writes to.
config_data = utils.config_load()
config_data

{'raw_dataset_path': 'data/raw/UCI_Credit_Card.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'default.payment.next.month',
 'test_size': 0.3,
 'num_columns': ['LIMIT_BAL',
  'AGE',
  'BILL_AMT1',
  'BILL_AMT2',
  'BILL_AMT3',
  'BILL_AMT4',
  'BILL_AMT5',
  'BILL_AMT6',
  'PAY_AMT1',
  'PAY_AMT2',
  'PAY_AMT3',
  'PAY_AMT4',
  'PAY_AMT5',
  'PAY_AMT6'],
 'cat_co

In [32]:
# Function to replace the raw data in the train set with WOE values
def transform_woe(raw_data=None, type=None, config_data=None):
    """Replace every value of a predictor table with its WOE.

    Parameters
    ----------
    raw_data : pandas.DataFrame, optional
        Predictors to transform. Ignored when `type` is given.
    type : str, optional
        Name of a saved data set (e.g. 'train'). When given, the predictors
        are loaded from ``config_data[f'{type}_path'][0]`` and the result is
        dumped to ``config_data[f'X_{type}_woe_path']``.
        (The name shadows the builtin `type`; it is kept for backward
        compatibility with existing callers.)
    config_data : dict, optional
        Configuration dictionary. Loaded with ``utils.config_load()`` when
        omitted.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with each column mapped through the WOE dictionary.
        Values with no matching entry (missing or outside every bin) are NaN;
        a warning listing the affected columns is emitted in that case.

    Raises
    ------
    ValueError
        If neither `raw_data` nor `type` is provided.
    KeyError
        If a column has no entry in the WOE mapping dictionary.
    """
    # Local import: the notebook's import cell does not import `warnings`
    import warnings

    if raw_data is None and type is None:
        raise ValueError("Provide either `raw_data` or `type`.")

    if config_data is None:
        config_data = utils.config_load()

    # Load the numerical columns
    num_cols = config_data['num_columns']

    # Load the WOE_map_dict
    WOE_map_dict = utils.pickle_load(config_data['WOE_map_dict_path'])

    # Load the saved data if type is not None
    if type is not None:
        raw_data = utils.pickle_load(config_data[f'{type}_path'][0])

    # Map the data
    woe_data = raw_data.copy()
    unmapped = {}
    for col in woe_data.columns:
        # Numerical columns are stored in the mapping under '<col>_bin'
        map_col = col + '_bin' if col in num_cols else col

        woe_data[col] = woe_data[col].map(WOE_map_dict[map_col])

        # Values without a matching attribute/bin come back as NaN
        n_unmapped = int(woe_data[col].isna().sum())
        if n_unmapped:
            unmapped[col] = n_unmapped

    # Surface missing / out-of-range values instead of passing NaN on silently.
    # NOTE(review): no imputation policy is defined in this notebook, so the
    # NaN values are reported but left untouched.
    if unmapped:
        warnings.warn(
            f'Values without a WOE mapping (left as NaN): {unmapped}',
            stacklevel=2,
        )

    # Validate
    print('Raw data shape : ', raw_data.shape)
    print('WOE data shape : ', woe_data.shape)

    # Dump data
    if type is not None:
        utils.pickle_dump(woe_data, config_data[f'X_{type}_woe_path'])

    return woe_data

In [33]:
# Transform the train set: type='train' makes transform_woe load the predictors
# from config_data['train_path'][0] and dump the result to
# config_data['X_train_woe_path'].
X_train_woe = transform_woe(type='train', config_data=config_data)

Raw data shape :  (21000, 23)
WOE data shape :  (21000, 23)


In [35]:
# Preview the first 10 rows of the WOE-transformed train set
X_train_woe.head(10)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
11018,0.289268,0.085191,-0.083675,0.063861,0.148229,0.67445,0.410773,0.289754,0.228802,0.204829,...,0.10979,0.102737,0.113327,0.081753,0.561302,0.596787,0.550725,0.481097,0.460033,0.511205
1710,-0.151806,0.085191,0.160746,0.063861,0.148229,0.67445,0.410773,0.289754,0.228802,-1.475777,...,-0.087422,-0.110235,0.113327,0.081753,0.108612,0.596787,0.550725,-0.361988,0.062411,0.084251
4618,-0.151806,-0.122457,0.160746,0.063861,0.148229,0.67445,0.410773,0.289754,0.228802,0.204829,...,0.10979,-0.110235,-0.145598,-0.174806,0.108612,0.087882,-0.06462,0.09523,0.062411,0.084251
5482,-0.511254,0.085191,-0.083675,-0.066379,-0.088671,0.67445,0.410773,0.423769,0.403478,0.207339,...,-0.008549,-0.003038,-0.016236,0.015826,-0.017382,-0.426817,-0.427002,-0.361988,-0.320885,-0.348563
26187,-0.511254,0.085191,0.160746,0.063861,0.148229,0.675981,0.286901,0.277892,0.238141,0.207339,...,-0.008549,-0.003038,-0.016236,0.015826,-0.017382,-0.426817,-0.427002,-0.361988,-0.320885,-0.348563
11716,-0.151806,-0.122457,-0.083675,0.063861,0.148229,-2.08118,-1.494481,-1.332519,-1.37126,-1.475777,...,-0.087422,0.102737,0.113327,0.081753,0.561302,-0.094842,0.080008,-0.361988,0.460033,0.084251
10804,-0.151806,0.085191,0.160746,0.063861,-0.088671,0.67445,0.410773,0.289754,0.228802,0.204829,...,-0.008304,0.016942,0.058978,-0.174806,-0.017382,-0.094842,0.080008,0.09523,0.062411,0.084251
11312,0.619303,-0.122457,-0.083675,-0.066379,0.148229,0.67445,0.410773,0.289754,0.228802,0.204829,...,0.10979,0.102737,0.113327,0.081753,0.561302,0.596787,0.550725,0.09523,0.062411,0.511205
15150,-0.511254,0.085191,-0.083675,0.063861,-0.088671,0.67445,0.410773,0.289754,0.228802,0.204829,...,-0.087422,0.016942,0.058978,0.015826,0.108612,-0.094842,-0.06462,-0.361988,0.460033,0.511205
27959,0.289268,0.085191,-0.083675,-0.066379,0.148229,0.675981,0.286901,0.277892,0.238141,0.207339,...,-0.008549,-0.003038,-0.016236,0.015826,-0.480848,-0.426817,-0.427002,-0.361988,-0.320885,-0.348563
