# 1. Load Required Libraries

In [139]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np 
import joblib
import os
import yaml
import src.util as util

# 2. Load Configuration File

In [140]:
# Load the project configuration through the local `util` helper. Later cells read
# keys such as "dataset_path", "predictors", "label" and the "*_set_path" entries from it.
config = util.load_config()

# 3. Load Dataset

In [141]:
# Read the raw dataset from the CSV location stored under the "dataset_path" config key.
raw_dataset = pd.read_csv(config["dataset_path"])

In [142]:
# Display the raw frame as loaded, before any cleaning.
raw_dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [143]:
# Work on a copy of the raw data without the Loan_ID column; raw_dataset is left untouched.
dataset = raw_dataset.drop(columns=["Loan_ID"])

# 4. Data Validation

### 4.1 Null values

In [144]:
# Count missing values per column before imputation.
dataset.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [145]:
# Columns that contain missing values, grouped by how they are imputed.
numeric_na_columns = ["LoanAmount", "Loan_Amount_Term", "Credit_History"]
categorical_na_columns = ["Gender", "Married", "Dependents", "Self_Employed"]

# Numeric gaps are filled with each column's median.
# NOTE(review): the medians are computed on the full dataset, before the
# train/valid/test split performed later in this notebook, so held-out rows
# influence the imputed values. Consider deriving them from the training split only.
numeric_medians = dataset[numeric_na_columns].median()
dataset[numeric_na_columns] = dataset[numeric_na_columns].fillna(numeric_medians)

# Categorical gaps become an explicit "Unknown" category.
dataset[categorical_na_columns] = dataset[categorical_na_columns].fillna("Unknown")

In [146]:
# Re-count missing values per column to confirm the imputation left none.
dataset.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### 4.2 Data Types

In [147]:
# Inspect the dtype of every column.
dataset.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

### 4.3 Range Data

In [148]:
# Summary statistics, used to eyeball the value range of each numeric column.
dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,614.0,614.0,614.0
mean,5403.459283,1621.245798,145.752443,342.410423,0.855049
std,6109.041673,2926.248369,84.107233,64.428629,0.352339
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.25,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,164.75,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


### 4.4 Data Dimensions

In [149]:
# (number of rows, number of columns) of the working frame.
dataset.shape

(614, 12)

### 4.5 Remove Duplicates

In [150]:
def removeDuplicates(data):
    """Drop duplicate rows and print the shape before and after.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate. It is not modified; ``drop_duplicates`` is
        called without ``inplace`` and its result is returned.

    Returns
    -------
    pandas.DataFrame
        The result of ``data.drop_duplicates()``.
    """
    shape_before = data.shape
    print(f"shape awal                    : {shape_before}, (#observasi, #fitur)")

    # Rows that are identical across every column are collapsed to one.
    deduplicated = data.drop_duplicates()
    shape_after = deduplicated.shape
    print(f"shape setelah drop duplikat   : {shape_after}, (#observasi, #fitur)")

    return deduplicated

In [151]:
# De-duplicate the working frame (the helper prints the shape before and after),
# then display the result.
dataset = removeDuplicates(dataset)
dataset

shape awal                    : (614, 12), (#observasi, #fitur)
shape setelah drop duplikat   : (614, 12), (#observasi, #fitur)


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


# 5. Data Defense

In [152]:
def check_data(input_data, params):
    """Validate a loan dataset against the expectations stored in ``params``.

    Three groups of checks run in order; the first failing check stops
    validation by raising ``AssertionError``.

    1. Column dtypes: the columns selected by ``select_dtypes`` for "float",
       "object" and "int" must equal, in order, the lists stored under
       ``params["float64_columns"]``, ``params["object_columns"]`` and
       ``params["int64_columns"]``.
    2. Numeric ranges: every value of each numeric column must lie within
       ``params["range_<column>"][0]`` and ``params["range_<column>"][1]``
       (``Series.between``; missing values fail the check).
    3. Categorical values: the distinct values of each categorical column
       must be a subset of ``params["range_<column>"]``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame containing the loan columns checked below.
    params : mapping
        Configuration holding the ``*_columns`` and ``range_*`` entries.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        When a check fails; the message names the failing dtype group or column.
    KeyError
        When a required column or ``params`` entry is missing.
    """
    # (select_dtypes selector, params key, label used in the error message)
    dtype_checks = (
        ("float", "float64_columns", "float64"),
        ("object", "object_columns", "object"),
        ("int", "int64_columns", "int64"),
    )
    for selector, params_key, label in dtype_checks:
        actual_columns = input_data.select_dtypes(selector).columns.to_list()
        assert actual_columns == params[params_key], f"an error occurs in {label} column(s)."

    # Numeric columns: every value must fall inside the configured [low, high] pair.
    numeric_columns = (
        "ApplicantIncome",
        "CoapplicantIncome",
        "LoanAmount",
        "Loan_Amount_Term",
        "Credit_History",
    )
    for column in numeric_columns:
        values = input_data[column]
        bounds = params[f"range_{column}"]
        assert values.between(bounds[0], bounds[1]).all(), f"an error occurs in {column} range."

    # Categorical columns: only values listed in the configuration are allowed.
    categorical_columns = (
        "Gender",
        "Married",
        "Dependents",
        "Education",
        "Self_Employed",
        "Property_Area",
        "Loan_Status",
    )
    for column in categorical_columns:
        observed = set(input_data[column])
        allowed = set(params[f"range_{column}"])
        assert observed.issubset(allowed), f"an error occurs in {column} range."
   

In [153]:
# Validate the cleaned frame against the configuration; an AssertionError here
# means a dtype, range or category expectation was violated.
check_data(dataset, config)

# 6. Data Splitting

In [154]:
def splitInputOtput(data, params=None):
    """Split a dataset into its predictor columns and its label column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the predictor columns and the label column.
    params : mapping, optional
        Configuration providing ``"predictors"`` (the columns to select as
        inputs) and ``"label"`` (the target column). When omitted, the
        notebook-level ``config`` is used, which keeps existing
        single-argument calls working.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is a copy of ``data[params["predictors"]]`` and
        ``y`` is a copy of ``data[params["label"]]``. Both are copies, so
        modifying them does not affect ``data``.

    Raises
    ------
    KeyError
        If a configured column is missing from ``data`` or a key is missing
        from ``params``.
    """
    if params is None:
        # Backward-compatible default: fall back to the notebook-level configuration.
        params = config

    x = data[params["predictors"]].copy()
    y = data[params["label"]].copy()
    return x, y

In [155]:
# Separate predictors (x) from the label (y) using the columns named in the config.
x,y = splitInputOtput(dataset)

In [156]:
# List the columns of the working frame for comparison with the predictor selection.
dataset.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [157]:
# Display the predictor frame; its column order follows config["predictors"].
x

Unnamed: 0,ApplicantIncome,Gender,Married,Dependents,Education,Self_Employed,Property_Area,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849,Male,No,0,Graduate,No,Urban,0.0,128.0,360.0,1.0
1,4583,Male,Yes,1,Graduate,No,Rural,1508.0,128.0,360.0,1.0
2,3000,Male,Yes,0,Graduate,Yes,Urban,0.0,66.0,360.0,1.0
3,2583,Male,Yes,0,Not Graduate,No,Urban,2358.0,120.0,360.0,1.0
4,6000,Male,No,0,Graduate,No,Urban,0.0,141.0,360.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
609,2900,Female,No,0,Graduate,No,Rural,0.0,71.0,360.0,1.0
610,4106,Male,Yes,3+,Graduate,No,Rural,0.0,40.0,180.0,1.0
611,8072,Male,Yes,1,Graduate,No,Urban,240.0,253.0,360.0,1.0
612,7583,Male,Yes,2,Graduate,No,Urban,0.0,187.0,360.0,1.0


In [158]:
# Class balance of the label; it is imbalanced, which is why the splits in the following cells are stratified.
y.value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [159]:
# Hold out 30% of the rows; stratify on y so both parts keep the label proportions.
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42, stratify = y)

In [160]:
# Split the 30% hold-out in half: one half becomes the validation set, the other the final test set.
# NOTE(review): x_test / y_test are both the input and an output of this call, so
# re-running this cell on its own splits the already-halved test set again.
# Re-run the previous split cell first.
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [161]:
# Persist every split. Each "*_set_path" config entry holds two locations:
# index 0 for the predictors, index 1 for the labels.
for path_key, (split_features, split_labels) in (
    ("train_set_path", (x_train, y_train)),
    ("valid_set_path", (x_valid, y_valid)),
    ("test_set_path", (x_test, y_test)),
):
    util.pickle_dump(split_features, config[path_key][0])
    util.pickle_dump(split_labels, config[path_key][1])