In [1]:
import numpy as np
import pandas as pd

from copy import deepcopy
from src.preprocessing import (drop_duplicate_data, preprocess_data,
                               numerical_imputer_fit, numerical_imputer_transform,
                               ohe_fit, ohe_transform,
                               scaler_fit, scaler_transform)
from src.utils import (deserialize_data, serialize_data,
                       concat_data, NUMERICAL_COLUMNS, CATEGORICAL_COLUMNS)

In [2]:
# Deserialize the six interim pickles in one pass. The generator is consumed
# in order, so the files are read in the same sequence as the explicit calls:
# train, valid, test -- and X before y within each split.
X_train, y_train, X_valid, y_valid, X_test, y_test = (
    deserialize_data(f'../data/interim/{part}_{split}.pkl')
    for split in ('train', 'valid', 'test')
    for part in ('X', 'y')
)

In [3]:
# De-duplicate the training split; only X_train / y_train are passed to
# drop_duplicate_data in this cell.
# NOTE: both names are rebound to the returned objects, so the originally
# loaded training data is no longer reachable under them after this cell runs.
X_train, y_train = drop_duplicate_data(X_train, y_train)

Fungis drop_duplicate_data telah divalidasi.
Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah (26064, 11).
Fungsi drop_duplicate_data: shape dari data yang duplicate adalah (118, 11).
Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah (25946, 11).
Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah (25946, 11).


In [4]:
# Fit the numerical imputer on the training split's NUMERICAL_COLUMNS subset
# and apply it to that same subset.
X_train_numerical = X_train[NUMERICAL_COLUMNS]

numerical_imputer = numerical_imputer_fit(X_train_numerical)
X_train_numerical_imputed = numerical_imputer_transform(X_train_numerical, numerical_imputer)

# Enforce the invariant instead of relying on a visual check of the displayed
# counts: stop before the imputer is persisted if any value is still missing.
remaining_missing = X_train_numerical_imputed.isna().sum(axis=0)
assert remaining_missing.sum() == 0, (
    f"Imputation left missing values:\n{remaining_missing[remaining_missing > 0]}"
)

# Persist the fitted imputer only once it is known to have filled every gap.
serialize_data(numerical_imputer, "../preprocessing/numerical_imputer.pkl")

# Per-column count of remaining missing values (displayed as the cell output).
remaining_missing

age                      0
income                   0
employment_length        0
loan_amount              0
loan_interest_rate       0
loan_percent_income      0
credit_history_length    0
dtype: int64

In [5]:
# Fit the encoder on the training split's CATEGORICAL_COLUMNS subset, apply it
# to that same subset, and persist the fitted encoder object.
X_train_categorical = X_train[CATEGORICAL_COLUMNS]

ohe = ohe_fit(X_train_categorical)
X_train_categorical_encoded = ohe_transform(X_train_categorical, ohe)

serialize_data(ohe, "../preprocessing/ohe.pkl")
# Preview the encoded columns (last expression, so it is rendered).
X_train_categorical_encoded.head()

Unnamed: 0,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
32377,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1338,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7047,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8225,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7178,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
# Combine the imputed numerical features with the encoded categorical features
# and preview the result.
# NOTE(review): how concat_data aligns the two inputs is not visible from this
# notebook -- confirm it joins on the shared index.
X_train_cleaned = concat_data(X_train_numerical_imputed, X_train_categorical_encoded)
X_train_cleaned.head()

Unnamed: 0,age,income,employment_length,loan_amount,loan_interest_rate,loan_percent_income,credit_history_length,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
32377,64.0,46000.0,2.0,4800.0,11.09,0.1,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1338,26.0,26000.0,0.0,8500.0,16.45,0.33,3.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7047,23.0,51000.0,3.0,16000.0,13.11,0.31,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
8225,22.0,56004.0,6.0,6000.0,7.88,0.11,4.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7178,24.0,79000.0,3.0,7000.0,12.54,0.09,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Fit the scaler on the assembled training frame and apply it.
# NOTE(review): X_train_cleaned holds the encoded categorical columns as well
# as the numerical ones, so both groups are handed to the scaler -- confirm
# that scaling the indicator columns is intended.
scaler = scaler_fit(X_train_cleaned)
X_train_scaled = scaler_transform(X_train_cleaned, scaler)

serialize_data(scaler, "../preprocessing/scaler.pkl")

# Independent copy under the name that the later serialization cell writes out.
X_train_clean = deepcopy(X_train_scaled)

# The preview must be the cell's final expression to be rendered; the original
# mid-cell `.head()` was evaluated and its result silently discarded.
X_train_scaled.head()

In [8]:
# Run the validation and test features through preprocess_data.
# NOTE(review): preprocess_data is given only the raw split, not the fitted
# imputer/encoder/scaler objects. Presumably it loads the pickles written by
# the cells above -- confirm, since that would make this cell depend on those
# files already existing on disk.
X_valid_clean = preprocess_data(X_valid)
X_test_clean = preprocess_data(X_test)

In [9]:
# Preview the first rows of the preprocessed validation features.
X_valid_clean.head()

Unnamed: 0,age,income,employment_length,loan_amount,loan_interest_rate,loan_percent_income,credit_history_length,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
32198,2.223359,0.060012,0.30403,0.379467,-0.007135,-0.004348,2.754194,-0.838149,-0.056307,-0.293362,...,-0.458687,-0.702105,1.453196,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594
6319,-0.746602,-0.490119,-0.928333,-0.57064,-0.007135,-0.004348,-0.942323,-0.838149,-0.056307,-0.293362,...,-0.458687,-0.702105,1.453196,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594
32482,3.94281,0.21721,0.796974,0.854521,-1.42926,0.182568,4.479235,-0.838149,-0.056307,3.408761,...,-0.458687,1.424289,-0.688139,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594
27255,0.503908,-0.097186,0.30403,0.854521,0.035171,0.743315,-0.203019,-0.838149,-0.056307,-0.293362,...,2.180136,-0.702105,1.453196,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594
6148,-0.433975,-0.675707,-0.68186,-0.57064,-0.980168,0.836773,-0.449454,-0.838149,-0.056307,-0.293362,...,2.180136,1.424289,-0.688139,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594


In [10]:
# Preview the first rows of the preprocessed test features.
X_test_clean.head()

Unnamed: 0,age,income,employment_length,loan_amount,loan_interest_rate,loan_percent_income,credit_history_length,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
26024,0.503908,0.068658,1.043447,1.329574,2.840371,0.462941,1.029153,1.193106,-0.056307,-0.293362,...,-0.458687,-0.702105,-0.688139,-0.49794,-0.354182,-0.173239,11.705154,-0.043941,-2.17111,2.17111
6126,-0.590288,-0.725979,0.796974,-0.57064,-1.260037,1.210604,-0.942323,-0.838149,-0.056307,-0.293362,...,2.180136,1.424289,-0.688139,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594
20835,0.19128,-0.380143,-1.174805,-0.728991,1.284819,-0.471637,0.782719,-0.838149,-0.056307,-0.293362,...,-0.458687,-0.702105,-0.688139,-0.49794,2.82341,-0.173239,-0.085432,-0.043941,0.460594,-0.460594
12036,-0.902916,0.101575,-0.435388,-0.586475,-1.146137,-0.845468,-0.695888,1.193106,-0.056307,-0.293362,...,-0.458687,1.424289,-0.688139,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594
29681,2.6923,-0.490182,0.30403,-0.760661,-0.007135,-0.284721,1.522022,-0.838149,17.759917,-0.293362,...,-0.458687,-0.702105,1.453196,-0.49794,-0.354182,-0.173239,-0.085432,-0.043941,0.460594,-0.460594


In [12]:
# Write every processed split under ../data/processed/. Entries are listed
# (and therefore written) in the same order as the original explicit calls:
# the three feature frames first, then the three targets.
processed_artifacts = (
    ("X_train_prep", X_train_clean),
    ("X_valid_prep", X_valid_clean),
    ("X_test_prep", X_test_clean),
    ("y_train_prep", y_train),
    ("y_valid_prep", y_valid),
    ("y_test_prep", y_test),
)
for stem, payload in processed_artifacts:
    serialize_data(payload, f"../data/processed/{stem}.pkl")

In [13]:
# Print a summary of the final training frame (index, columns, non-null
# counts, dtypes) as a last sanity check.
X_train_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25946 entries, 32377 to 23654
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            25946 non-null  float64
 1   income                         25946 non-null  float64
 2   employment_length              25946 non-null  float64
 3   loan_amount                    25946 non-null  float64
 4   loan_interest_rate             25946 non-null  float64
 5   loan_percent_income            25946 non-null  float64
 6   credit_history_length          25946 non-null  float64
 7   home_ownership_MORTGAGE        25946 non-null  float64
 8   home_ownership_OTHER           25946 non-null  float64
 9   home_ownership_OWN             25946 non-null  float64
 10  home_ownership_RENT            25946 non-null  float64
 11  loan_intent_DEBTCONSOLIDATION  25946 non-null  float64
 12  loan_intent_EDUCATION          25946 non-null  