## **4.1 Pre-processing Training Set**

In this part, we preprocess the training set by replacing each value with the WOE of the bin (or category) it belongs to.
The WOE-transformed training set is then saved to the path specified in the config file.

In [2]:
# Import library
import warnings

import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

Update the config file so that it contains `WOE_map_dict_path`.

In [3]:
# Load the configuration and display it; the functions below read
# 'WOE_table_path' and 'WOE_map_dict_path' from this dictionary.
config_data = utils.config_load()
config_data

{'raw_dataset_path': 'data/raw/Training Data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'response_variable': 'risk_flag',
 'test_size': 0.3,
 'num_columns': ['income',
  'age',
  'experience',
  'current_job_years',
  'current_house_years'],
 'cat_columns': ['married',
  'house_ownership',
  'car_ownership',
  'profession',
  'city',
  'state'],
 'num_of_bins': 4}

In [4]:
# Function to generate the WOE mapping dictionary
def get_woe_map_dict():
    """Build, save, and return the WOE mapping dictionary.

    Reads the WOE table from ``config_data['WOE_table_path']`` (``config_data``
    is the notebook-level configuration) and converts its ``Characteristic``,
    ``Attribute`` and ``WOE`` columns into a nested dictionary::

        {characteristic: {attribute: woe, ...}, ...}

    The dictionary is then pickled to ``config_data['WOE_map_dict_path']``.

    Returns
    -------
    dict
        Mapping of each characteristic to its ``{attribute: WOE}`` dictionary.
        If an attribute appears more than once within a characteristic, the
        last row wins.
    """
    # Load the WOE table
    WOE_table = utils.pickle_load(config_data['WOE_table_path'])

    # Initialize the dictionary
    WOE_map_dict = {}

    # unique() keeps first-appearance order, so the key order of the saved
    # dictionary is reproducible between runs (iterating a set is not).
    for char in WOE_table['Characteristic'].unique():
        # Get the Attribute & WOE info for each characteristic
        is_current = WOE_table['Characteristic'] == char
        current_data = WOE_table.loc[is_current, ['Attribute', 'WOE']]

        # Pair the two columns positionally instead of looking rows up by
        # index label: a label-based lookup returns a Series (not a scalar)
        # whenever the table's index contains repeated labels.
        WOE_map_dict[char] = dict(zip(current_data['Attribute'],
                                      current_data['WOE']))

    # Validate data
    print('Number of key : ', len(WOE_map_dict))

    # Dump
    utils.pickle_dump(WOE_map_dict, config_data['WOE_map_dict_path'])

    return WOE_map_dict

Next, transform the input data using the mapping dictionary above.
Update the config file so that it contains the path (`X_train_woe_path`) where the WOE-transformed data will be saved.

In [5]:
# Build the mapping (this also pickles it to config_data['WOE_map_dict_path'])
# and display it for inspection.
WOE_map_dict = get_woe_map_dict()
WOE_map_dict

Number of key :  11


{'profession': {'Air_traffic_controller': -0.08670372374040573,
  'Analyst': 0.03611998273911602,
  'Architect': -0.07379213383425075,
  'Army_officer': -0.24846386452866992,
  'Artist': -0.002304449464694245,
  'Aviator': -0.10130673038820888,
  'Biomedical_Engineer': -0.028357385669495506,
  'Chartered_Accountant': -0.24550956730222667,
  'Chef': 0.05349410309375176,
  'Chemical_engineer': 0.11356399220637219,
  'Civil_engineer': -0.12064244939190821,
  'Civil_servant': -0.006526947497931333,
  'Comedian': 0.025289716247096566,
  'Computer_hardware_engineer': -0.08575754580918107,
  'Computer_operator': -0.0002736800610991145,
  'Consultant': -0.042462617752923,
  'Dentist': 0.15212427181485977,
  'Design_Engineer': 0.1850669826646566,
  'Designer': 0.10180896529681781,
  'Drafter': 0.09806419655580703,
  'Economist': 0.2663465070214861,
  'Engineer': 0.050368442765399875,
  'Fashion_Designer': 0.03605056159572165,
  'Financial_Analyst': 0.17789785358372312,
  'Firefighter': -0.11651

In [6]:
# Reload the configuration after editing the config file, so that the new
# 'X_train_woe_path' entry used by transform_woe is available.
config_data = utils.config_load()
config_data

{'raw_dataset_path': 'data/raw/Training Data.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'risk_flag',
 'test_size': 0.3,
 'num_columns': ['income',
  'age',
  'experience',
  'current_job_years',
  'current_house_years'],
 'cat_columns': ['married',
  'house_ownership',
  'car_ownership',
  'profession',
  'city',
  'state'],
 'num_of_bins': 4}

In [7]:
# Function to replace the raw data in the train set with WOE values
def transform_woe(raw_data=None, type=None, config_data=None):
    """Replace every value of a predictor table with its WOE.

    Parameters
    ----------
    raw_data : pandas.DataFrame, optional
        Data to transform. Ignored when ``type`` is given, because the data
        is then loaded from disk instead.
    type : str, optional
        Name of a saved data set (e.g. ``'train'``). When given, the input is
        loaded from the first entry of ``config_data[f'{type}_path']`` and
        the result is pickled to ``config_data[f'X_{type}_woe_path']``.
        (The name shadows the ``type`` builtin inside this function; it is
        kept because callers pass it by keyword.)
    config_data : dict
        Configuration holding ``'num_columns'``, ``'WOE_map_dict_path'`` and,
        when ``type`` is used, the two path entries described above.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the same shape, each column mapped through the
        WOE dictionary. Columns listed in ``config_data['num_columns']`` are
        looked up under the key ``'<column>_bin'``, all others under their
        own name. Values without a mapping entry become NaN.

    Raises
    ------
    ValueError
        If ``config_data`` is missing, or neither ``raw_data`` nor ``type``
        is provided.
    KeyError
        If a column has no entry in the WOE mapping dictionary.

    Warns
    -----
    UserWarning
        If any column contains NaN after mapping (missing or out-of-range
        values); the message lists the NaN count per column.
    """
    if config_data is None:
        raise ValueError('config_data is required')
    if raw_data is None and type is None:
        raise ValueError("Provide either raw_data or type (e.g. type='train')")

    # Load the numerical columns
    num_cols = config_data['num_columns']

    # Load the WOE_map_dict
    WOE_map_dict = utils.pickle_load(config_data['WOE_map_dict_path'])

    # Load the saved data if type is not None
    if type is not None:
        raw_data = utils.pickle_load(config_data[f'{type}_path'][0])

    # Map the data on a copy so the caller's frame is left untouched
    woe_data = raw_data.copy()
    nan_counts = {}
    for col in woe_data.columns:
        # Numerical columns are stored in the mapping under their binned name
        map_col = col + '_bin' if col in num_cols else col

        if map_col not in WOE_map_dict:
            raise KeyError(
                f"No WOE mapping for column '{col}' (expected key '{map_col}')"
            )

        woe_data[col] = woe_data[col].map(WOE_map_dict[map_col])

        # Missing or out-of-range values have no mapping entry and end up NaN
        n_nan = int(woe_data[col].isna().sum())
        if n_nan:
            nan_counts[col] = n_nan

    # Surface unmapped values instead of letting NaN pass through silently
    if nan_counts:
        warnings.warn(
            f'Values without a WOE mapping were left as NaN: {nan_counts}',
            stacklevel=2,
        )

    # Validate
    print('Raw data shape : ', raw_data.shape)
    print('WOE data shape : ', woe_data.shape)

    # Dump data
    if type is not None:
        utils.pickle_dump(woe_data, config_data[f'X_{type}_woe_path'])

    return woe_data

In [8]:
# Transform the train set: type='train' makes transform_woe load the saved
# train predictors from the config paths and dump the WOE version to
# config_data['X_train_woe_path'].
X_train_woe = transform_woe(type='train', config_data=config_data)

Raw data shape :  (176400, 11)
WOE data shape :  (176400, 11)


In [9]:
# Preview the first rows to check that every column now holds WOE values
X_train_woe.head(10)

Unnamed: 0,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years
203209,0.04274,0.029236,0.14139,-0.02083,-0.024684,0.11611,0.063605,-0.299325,0.216157,0.087933,0.012016
42903,-0.037107,0.029236,-0.005662,-0.02083,-0.024684,0.11611,-0.101307,0.099907,-0.056232,-0.001491,0.012016
59095,0.04274,0.029236,-0.160299,-0.02083,-0.024684,-0.046671,0.02529,-0.062463,0.067921,0.053387,0.012016
69516,-0.037107,0.034512,-0.160299,0.199691,-0.024684,-0.046671,-0.089838,0.089267,0.044885,-0.13045,-0.020778
232569,0.042678,0.070343,0.07089,0.199691,-0.024684,-0.046671,-0.006527,-0.261752,0.084975,0.087933,0.018878
144582,-0.037107,0.034512,-0.005662,-0.02083,0.257258,-0.046671,0.053494,-0.016763,0.067921,0.087933,0.018878
207633,-0.037107,-0.11791,-0.005662,-0.02083,-0.024684,0.11611,0.049067,-0.270268,0.067921,0.087933,0.011189
241892,0.04274,0.034512,-0.005662,-0.02083,-0.024684,-0.046671,-0.193092,0.096416,0.607073,0.087933,-0.020778
195223,0.04274,0.070343,-0.005662,-0.02083,-0.024684,0.11611,-0.006527,-0.334061,-0.056232,-0.13045,0.012016
84082,0.04274,0.029236,-0.160299,-0.02083,-0.024684,0.11611,0.035509,0.338252,0.084975,-0.13045,0.018878
