# Import Libraries

In [21]:
import pandas as pd
import numpy as np
import src.utils as utils

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler

# Load Configuration File

In [3]:
# Load the project configuration through the project helper and display it.
# The dict holds the file paths, column names, seed and other settings read by
# every function defined in this notebook.
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/data.csv',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'index_column': 'Unnamed: 0',
 'output_column': 'SeriousDlqin2yrs',
 'seed': 42,
 'test_size': 0.2,
 'clean_late_col': 'NumberOfTimes90DaysLate',
 'clean_late_val': 96,
 'clean_unsecure_col': 'RevolvingUtilizationOfUnsecuredLines',
 'constant_imputer_col': 'NumberOfDependents',
 'constant_imputer_path': 'data/output/constant_imputer.pkl',
 'constant_imputer_val': 0.0,
 'median_imputer_col': 'MonthlyIncome',
 'median_imputer_path': 'data/output/median_imputer.pkl',
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'd

### Preprocessing Plan
EDA Summary:
- Rows with <span style="background-color: #767676;">NumberOfTimes90DaysLate = 96</span> or <span style="background-color: #767676;">NumberOfTimes90DaysLate = 98</span> will be deleted.

- Rows with <span style="background-color: #767676;">RevolvingUtilizationOfUnsecuredLines > 1.35</span> will be deleted.

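- Missing values in <span style="background-color: #767676;">MonthlyIncome</span> will be imputed with the median, and missing values in <span style="background-color: #767676;">NumberOfDependents</span> will be imputed with the constant <span style="background-color: #767676;">0.0</span>.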

- Standardize the data

- Balance the data

# Load Dataset

In [5]:
def load_dataset(return_file=True):
    """Load the pickled train/valid/test splits and print their shapes.

    For each split the paths are read from ``CONFIG_DATA['<split>_set_path']``:
    element 0 is the feature file, element 1 is the target file.

    Parameters
    ----------
    return_file : bool, default True
        If True, return the loaded objects; otherwise return None.

    Returns
    -------
    tuple or None
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` when
        ``return_file`` is True.
    """
    def _load_pair(config_key):
        # Features first, then target, for a single split
        features = utils.pickle_load(CONFIG_DATA[config_key][0])
        target = utils.pickle_load(CONFIG_DATA[config_key][1])
        return features, target

    X_train, y_train = _load_pair('train_set_path')
    X_valid, y_valid = _load_pair('valid_set_path')
    X_test, y_test = _load_pair('test_set_path')

    # Report the shape of every loaded object
    report = (
        ("X_train shape :", X_train),
        ("y_train shape :", y_train),
        ("X_valid shape :", X_valid),
        ("y_valid shape :", y_valid),
        ("X_test shape  :", X_test),
        ("y_test shape  :", y_test),
    )
    for label, loaded in report:
        print(label, loaded.shape)

    return (X_train, X_valid, X_test, y_train, y_valid, y_test) if return_file else None

X_train, X_valid, X_test, y_train, y_valid, y_test = load_dataset()

X_train shape : (96000, 10)
y_train shape : (96000,)
X_valid shape : (24000, 10)
y_valid shape : (24000,)
X_test shape  : (30000, 10)
y_test shape  : (30000,)


# Preprocess Train

#### 1. Delete <span style="background-color: #767676;">NumberOfTimes90DaysLate = 96</span> or <span style="background-color: #767676;">NumberOfTimes90DaysLate = 98</span>

In [6]:
def clean_late_data(X, y):
    """Drop rows whose late-payment count is at or above the configured value.

    The column name is ``CONFIG_DATA['clean_late_col']`` and the cut-off is
    ``CONFIG_DATA['clean_late_val']``; every row with a value greater than or
    equal to the cut-off is removed from both ``X`` and ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the late-payment column.
    y : pandas.Series
        Target values carrying the same index labels as ``X``.

    Returns
    -------
    tuple
        ``(X_drop, y_drop)`` with the flagged rows removed.
    """
    late_col = CONFIG_DATA['clean_late_col']
    cutoff = CONFIG_DATA['clean_late_val']

    # Index labels of the rows to discard
    flagged_labels = X.loc[X[late_col] >= cutoff].index.tolist()

    # Remove the same labels from features and target
    X_drop, y_drop = X.drop(index=flagged_labels), y.drop(index=flagged_labels)

    print("X shape :", X_drop.shape)
    print("y shape :", y_drop.shape)

    return X_drop, y_drop

X_train_drop_1, y_train_drop_1 = clean_late_data(X_train, y_train)

X shape : (95838, 10)
y shape : (95838,)


#### 2. Delete <span style="background-color: #767676;">RevolvingUtilizationOfUnsecuredLines > 1.35</span>

In [7]:
def clean_unsecured_data(X, y):
    """Drop IQR outliers of the configured unsecured-lines column.

    The column is ``CONFIG_DATA['clean_unsecure_col']``. The bounds are
    ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``, computed from the ``X`` passed in,
    and every row outside them is removed from both ``X`` and ``y``.

    The quartiles are computed with ``np.nanquantile`` so that missing values
    in the column are ignored. With ``np.quantile`` a single NaN makes both
    bounds NaN, every comparison evaluates to False, and the function would
    silently drop nothing. Rows whose value is NaN are kept, because a NaN
    compares False against both bounds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing the column to clean.
    y : pandas.Series
        Target values carrying the same index labels as ``X``.

    Returns
    -------
    tuple
        ``(X_drop, y_drop)`` with the outlier rows removed.
    """
    values = X[CONFIG_DATA['clean_unsecure_col']]

    # Find upper boundary & lower boundary (NaN-safe quartiles)
    q1, q3 = np.nanquantile(values, q = [0.25, 0.75])
    iqr = q3 - q1
    ub = q3 + 1.5*iqr
    lb = q1 - 1.5*iqr

    # Filter data: anything strictly outside [lb, ub] is an outlier
    is_outlier = (values > ub) | (values < lb)
    index_to_drop = X[is_outlier].index.tolist()

    # Drop data
    X_drop = X.drop(index = index_to_drop)
    y_drop = y.drop(index = index_to_drop)

    # Print
    print("X shape :", X_drop.shape)
    print("y shape :", y_drop.shape)

    return X_drop, y_drop

X_train_drop_2, y_train_drop_2 = clean_unsecured_data(X_train_drop_1, y_train_drop_1)

X shape : (95350, 10)
y shape : (95350,)


#### 3. Impute Missing Values

In [8]:
def fit_imputer(data, return_file=True):
    """Fit and persist the constant and median imputers.

    The constant imputer fills NaN with ``CONFIG_DATA['constant_imputer_val']``
    and is fitted on the ``CONFIG_DATA['constant_imputer_col']`` column; the
    median imputer is fitted on the ``CONFIG_DATA['median_imputer_col']``
    column. Both fitted objects are pickled to their configured paths.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both columns to fit on.
    return_file : bool, default True
        If True, return the fitted imputers; otherwise return None.

    Returns
    -------
    tuple or None
        ``(constant_imputer, median_imputer)`` when ``return_file`` is True.
    """
    constant_imputer = SimpleImputer(
        missing_values=np.nan,
        strategy="constant",
        fill_value=CONFIG_DATA['constant_imputer_val'],
    )
    median_imputer = SimpleImputer(missing_values=np.nan, strategy="median")

    # (imputer, config key of the column to fit, config key of the pickle path)
    plan = (
        (constant_imputer, 'constant_imputer_col', 'constant_imputer_path'),
        (median_imputer, 'median_imputer_col', 'median_imputer_path'),
    )

    # Fit both imputers first ...
    for imputer, column_key, _ in plan:
        imputer.fit(data[[CONFIG_DATA[column_key]]])

    # ... and only then persist them
    for imputer, _, path_key in plan:
        utils.pickle_dump(imputer, CONFIG_DATA[path_key])

    return (constant_imputer, median_imputer) if return_file else None

In [9]:
# Fit both imputers on the cleaned training features (after both row-removal steps)
constant_imputer, median_imputer = fit_imputer(data = X_train_drop_2)

In [10]:
def transform_imputer(data, constant_imputer, median_imputer):
    """Apply the fitted imputers to a copy of ``data``.

    The ``CONFIG_DATA['constant_imputer_col']`` column is replaced by the
    output of ``constant_imputer`` and the ``CONFIG_DATA['median_imputer_col']``
    column by the output of ``median_imputer``. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both columns to impute.
    constant_imputer, median_imputer
        Fitted imputers exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the two columns imputed.
    """
    imputed = data.copy()
    constant_col = CONFIG_DATA['constant_imputer_col']
    median_col = CONFIG_DATA['median_imputer_col']

    # Compute both replacements before writing anything back
    constant_values = constant_imputer.transform(imputed[[constant_col]])
    median_values = median_imputer.transform(imputed[[median_col]])

    imputed[constant_col] = constant_values
    imputed[median_col] = median_values

    print('data shape :', imputed.shape)

    return imputed

In [11]:
# Impute the cleaned training features with the imputers fitted above
X_train_imputed = transform_imputer(data = X_train_drop_2,
                                    constant_imputer = constant_imputer,
                                    median_imputer = median_imputer)

data shape : (95350, 10)


In [14]:
# Verify that no column has missing values left after imputation
X_train_imputed.isna().sum()

RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

#### 4. Standardize the data

In [15]:
def fit_standardize(data, return_file=True):
    """Fit a ``StandardScaler`` on ``data`` and persist it.

    The fitted scaler is pickled to ``CONFIG_DATA['standardizer_path']``.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Data the scaler is fitted on.
    return_file : bool, default True
        If True, return the fitted scaler; otherwise return None.

    Returns
    -------
    StandardScaler or None
        The fitted scaler when ``return_file`` is True.
    """
    # ``fit`` returns the estimator itself, so creation and fitting can be chained
    standardizer = StandardScaler().fit(data)

    utils.pickle_dump(standardizer, CONFIG_DATA['standardizer_path'])

    return standardizer if return_file else None

In [16]:
# Fit the standardizer on the imputed training features
standardizer = fit_standardize(data=X_train_imputed)

In [17]:
def transform_standardize(data, standardizer):
    """Standardize ``data`` and return it as a DataFrame.

    The result of ``standardizer.transform(data)`` is wrapped in a DataFrame
    that carries the column names and index of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to standardize.
    standardizer
        Fitted object exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        Standardized values with the labels of ``data``.
    """
    scaled_values = standardizer.transform(data)
    return pd.DataFrame(scaled_values, columns=data.columns, index=data.index)

In [18]:
# Standardize the imputed training features with the fitted standardizer
X_train_std = transform_standardize(data = X_train_imputed,
                                    standardizer = standardizer)

In [19]:
# Sanity check: after standardization every column should have mean ~0 and std ~1
X_train_std.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0,95350.0
mean,8.395545e-17,-5.305783e-17,3.8153950000000005e-17,-4.471166e-18,2.2132270000000003e-17,-1.080532e-16,-3.87501e-18,7.515284e-17,-3.5918360000000006e-17,-6.326699e-17
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-0.9054568,-2.128718,-0.3511723,-0.1734617,-0.4700236,-1.65205,-0.1855588,-0.9025782,-0.1946529,-0.6655234
25%,-0.820495,-0.7721203,-0.3511723,-0.1733744,-0.1844315,-0.6790339,-0.1855588,-0.9025782,-0.1946529,-0.6655234
50%,-0.4718411,-0.02599161,-0.3511723,-0.1732795,-0.07468973,-0.09522398,-0.1855588,-0.01959509,-0.1946529,-0.6655234
75%,0.6590006,0.720137,-0.3511723,-0.173033,0.07173021,0.4885859,-0.1855588,0.863388,-0.1946529,0.2388183
max,2.940152,3.840311,16.93327,161.5339,219.8005,9.634941,31.96411,46.77851,34.04343,11.09092


#### 5. Balance the data

In [20]:
# Check the class proportions of the training target after the row-removal steps
y_train_drop_2.value_counts(normalize=True)

0    0.93526
1    0.06474
Name: SeriousDlqin2yrs, dtype: float64

_Because the classes are imbalanced, we have to balance the data._
_We will apply **downsampling** (to the training data only)._

In [23]:
def random_undersampler(X, y):
    """Balance the classes by randomly under-sampling.

    Uses ``RandomUnderSampler`` seeded with ``CONFIG_DATA['seed']`` and prints
    the class counts before and after resampling.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Class labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(X_resample, y_resample)`` as returned by ``fit_resample``.
    """
    under_sampler = RandomUnderSampler(random_state=CONFIG_DATA['seed'])
    X_resample, y_resample = under_sampler.fit_resample(X, y)

    # Same five output lines as separate prints: header, counts, blank, header, counts
    print(
        'Distribution before resampling :',
        y.value_counts(),
        "",
        'Distribution after resampling  :',
        y_resample.value_counts(),
        sep="\n",
    )

    return X_resample, y_resample


X_train_clean, y_train_clean = random_undersampler(X_train_std, y_train_drop_2)

Distribution before resampling :
0    89177
1     6173
Name: SeriousDlqin2yrs, dtype: int64

Distribution after resampling  :
0    6173
1    6173
Name: SeriousDlqin2yrs, dtype: int64


In [24]:
# Bundle all fitted preprocessors into one dict and dump it as a single pickle
preprocessor = dict(
    constant_imputer=constant_imputer,
    median_imputer=median_imputer,
    standardizer=standardizer,
)

utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

# Run all preprocessing steps

In [25]:
def clean_data(data, constant_imputer, median_imputer, standardizer):
    """Impute and then standardize ``data`` with already-fitted preprocessors.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to transform.
    constant_imputer, median_imputer
        Fitted imputers passed on to ``transform_imputer``.
    standardizer
        Fitted scaler passed on to ``transform_standardize``.

    Returns
    -------
    pandas.DataFrame
        The imputed and standardized table.
    """
    # Imputation runs first; its output feeds the standardizer
    return transform_standardize(
        transform_imputer(data, constant_imputer, median_imputer),
        standardizer,
    )

def _preprocess_data(data):
    """Transform ``data`` with the persisted preprocessor bundle.

    The bundle is loaded from ``CONFIG_DATA['preprocessor_path']`` and must
    contain the keys ``'constant_imputer'``, ``'median_imputer'`` and
    ``'standardizer'``; they are handed to ``clean_data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to transform.

    Returns
    -------
    pandas.DataFrame
        The imputed and standardized table.
    """
    bundle = utils.pickle_load(CONFIG_DATA['preprocessor_path'])

    return clean_data(
        data,
        bundle['constant_imputer'],
        bundle['median_imputer'],
        bundle['standardizer'],
    )

In [26]:
def generate_preprocessor(return_file=True):
    """Fit every preprocessor on the training split and persist the bundle.

    Loads the training features and target from
    ``CONFIG_DATA['train_set_path']``, removes rows with ``clean_late_data``
    and ``clean_unsecured_data``, fits the imputers and the standardizer, and
    pickles them as one dict to ``CONFIG_DATA['preprocessor_path']``.

    Parameters
    ----------
    return_file : bool, default True
        If True, return the bundle; otherwise return None.

    Returns
    -------
    dict or None
        Dict with keys ``'constant_imputer'``, ``'median_imputer'`` and
        ``'standardizer'`` when ``return_file`` is True.
    """
    # Raw training split
    X_raw = utils.pickle_load(CONFIG_DATA['train_set_path'][0])
    y_raw = utils.pickle_load(CONFIG_DATA['train_set_path'][1])

    # Row removal: late-payment rows first, then unsecured-lines outliers
    X_late, y_late = clean_late_data(X_raw, y_raw)
    X_kept, _ = clean_unsecured_data(X_late, y_late)

    # Imputers are fitted on the cleaned features; the standardizer on the imputed ones
    constant_imputer, median_imputer = fit_imputer(data=X_kept)
    standardizer = fit_standardize(
        transform_imputer(X_kept, constant_imputer, median_imputer)
    )

    preprocessor = dict(
        constant_imputer=constant_imputer,
        median_imputer=median_imputer,
        standardizer=standardizer,
    )
    utils.pickle_dump(preprocessor, CONFIG_DATA['preprocessor_path'])

    return preprocessor if return_file else None

preprocessor = generate_preprocessor()

X shape : (95838, 10)
y shape : (95838,)
X shape : (95350, 10)
y shape : (95350,)
data shape : (95350, 10)


In [27]:
# Function to preprocess one data split (train, valid or test)

def preprocess_data(type='train', return_file=True):
    """Preprocess one split end to end and persist the result.

    The split is loaded from ``CONFIG_DATA[f'{type}_set_path']``, transformed
    with the persisted preprocessors via ``_preprocess_data``, and dumped to
    ``CONFIG_DATA[f'{type}_clean_path']`` (features at index 0, target at
    index 1). For ``type == 'train'`` only, rows are first removed with
    ``clean_late_data`` and ``clean_unsecured_data``, and the transformed data
    is under-sampled with ``random_undersampler``.

    Parameters
    ----------
    type : str, default 'train'
        Split name; both config keys built from it must exist. This notebook
        calls it with 'train', 'valid' and 'test'.
    return_file : bool, default True
        If True, return the cleaned data; otherwise return None.

    Returns
    -------
    tuple or None
        ``(X_clean, y_clean)`` when ``return_file`` is True.
    """
    is_train = type == 'train'

    # Load the requested split
    X = utils.pickle_load(CONFIG_DATA[f'{type}_set_path'][0])
    y = utils.pickle_load(CONFIG_DATA[f'{type}_set_path'][1])

    # Row removal applies to the training split only
    if is_train:
        X, y = clean_late_data(X, y)
        X, y = clean_unsecured_data(X, y)

    # Impute + standardize; the target passes through unchanged
    X_clean, y_clean = _preprocess_data(X), y

    # Under-sampling applies to the training split only
    if is_train:
        X_clean, y_clean = random_undersampler(X_clean, y_clean)

    print("X clean shape:", X_clean.shape)
    print("y clean shape:", y_clean.shape)

    # Persist features, then target
    utils.pickle_dump(X_clean, CONFIG_DATA[f'{type}_clean_path'][0])
    utils.pickle_dump(y_clean, CONFIG_DATA[f'{type}_clean_path'][1])

    return (X_clean, y_clean) if return_file else None

In [28]:
# Preprocess the training split: row removal, imputation, standardization, under-sampling
X_train_clean, y_train_clean = preprocess_data(type = 'train')

X shape : (95838, 10)
y shape : (95838,)
X shape : (95350, 10)
y shape : (95350,)
data shape : (95350, 10)
Distribution before resampling :
0    89177
1     6173
Name: SeriousDlqin2yrs, dtype: int64

Distribution after resampling  :
0    6173
1    6173
Name: SeriousDlqin2yrs, dtype: int64
X clean shape: (12346, 10)
y clean shape: (12346,)


In [29]:
# Preprocess the validation split: imputation and standardization only
X_valid_clean, y_valid_clean = preprocess_data(type = 'valid')

data shape : (24000, 10)
X clean shape: (24000, 10)
y clean shape: (24000,)


In [30]:
# Preprocess the test split: imputation and standardization only
X_test_clean, y_test_clean = preprocess_data(type = 'test')

data shape : (30000, 10)
X clean shape: (30000, 10)
y clean shape: (30000,)


In [31]:
# Summary statistics of the training features after preprocessing and under-sampling
X_train_clean.describe()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,12346.0,12346.0,12346.0,12346.0,12346.0,12346.0,12346.0,12346.0,12346.0,12346.0
mean,0.482452,-0.192693,0.477863,-0.017201,-0.027364,-0.035405,0.548438,-0.00193,0.459825,0.075252
std,1.142308,0.9676,1.584223,0.544945,0.553847,1.046218,2.07208,1.113499,1.883419,1.044152
min,-0.905457,-2.128718,-0.351172,-0.173462,-0.470024,-1.65205,-0.185559,-0.902578,-0.194653,-0.665523
25%,-0.700233,-0.90778,-0.351172,-0.173368,-0.209835,-0.679034,-0.185559,-0.902578,-0.194653,-0.665523
50%,0.315171,-0.229481,-0.351172,-0.173263,-0.07469,-0.289827,-0.185559,-0.019595,-0.194653,-0.665523
75%,1.730008,0.448818,1.089198,-0.173009,0.042446,0.488586,-0.185559,0.863388,-0.194653,0.238818
max,2.928098,3.162013,16.933266,16.095748,47.086222,9.440338,31.964107,27.35288,34.043432,7.473552


In [32]:
# List the feature column names of the cleaned training set
X_train_clean.columns.tolist()

['RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']