# 1. Load Required Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np 
import joblib
import os
import yaml
import src.util as util

# 2. Load Configuration File

In [4]:
# Load the project configuration through the helper in src.util.
config = util.load_config()

# 3. Load Dataset

In [5]:
# Read the raw CSV from the location stored under the "dataset_path" config key.
raw_dataset = pd.read_csv(config["dataset_path"])

In [7]:
# Preview the raw frame; pandas truncates the display to the leading and trailing rows.
raw_dataset

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [9]:
# Build the working frame without the Loan_ID column; raw_dataset is left untouched.
dataset = raw_dataset.drop(columns="Loan_ID")

# 4. Data Validation

### 4.1 Null values

In [12]:
# Count missing values per column.
dataset.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### 4.2 Data Types

In [13]:
# Show the dtype pandas assigned to each column.
dataset.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

### 4.3 Range Data

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [1]:
# Inspect only the bounds of "step". Converting the whole column to a Python
# list would flood the cell output on a multi-million-row frame.
dataset["step"].agg(["min", "max"])

NameError: name 'dataset' is not defined

### 4.4 Data Dimensions

In [18]:
# (number of rows, number of columns) of the working frame.
dataset.shape

(6362620, 10)

### 4.5 Remove Duplicates

In [19]:
def removeDuplicates(data):
    """Drop fully duplicated rows and report the shape before and after.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to de-duplicate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the first occurrence of each distinct row, with
        the original index labels of the rows that were kept.
    """
    print(f"shape awal                    : {data.shape}, (#observasi, #fitur)")

    # drop_duplicates() returns a new frame, so the caller's object stays intact.
    deduplicated = data.drop_duplicates()
    print(f"shape setelah drop duplikat   : {deduplicated.shape}, (#observasi, #fitur)")

    return deduplicated

In [20]:
# Rebind the working frame to its de-duplicated version, then display it.
dataset = removeDuplicates(dataset)
dataset

shape awal                    : (6362620, 10), (#observasi, #fitur)
shape setelah drop duplikat   : (6362620, 10), (#observasi, #fitur)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0
...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1


# 5. Data Defense

In [None]:
def check_data(input_data, params):
    """Validate a transaction frame against the schema and ranges in ``params``.

    Checks run in a fixed order and stop at the first failure: column dtypes,
    then numeric ranges, then categorical membership.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to validate. It must contain ``isFraud`` and every column named
        in the checks below.
    params : dict
        Expected schema. The function reads:

        * ``float64_columns``, ``object_columns``, ``int64_columns`` - ordered
          lists of column names;
        * a two-element ``[low, high]`` bound per numeric column, looked up
          under ``range_step``, ``range_amount``, ``range_oldbalanceOrg``,
          ``newbalanceOrig``, ``oldbalanceDest`` and ``newbalanceDest``. If
          one of those keys is absent, the other spelling (with or without
          the ``range_`` prefix) is accepted instead;
        * ``range_type``, ``range_nameOrig``, ``range_nameDest`` - iterables
          of allowed values.

    Raises
    ------
    AssertionError
        If a check fails; the message names the failing dtype group or column.
    KeyError
        If a required column or configuration key is missing.

    Notes
    -----
    The checks are ``assert`` statements, so they are skipped entirely when
    Python runs with ``-O``.
    """
    # Dtype checks compare ordered lists, so column order matters as well as membership.
    assert input_data.select_dtypes("float").columns.to_list() == params["float64_columns"], "an error occurs in float64 column(s)."
    # The label column is removed before the object columns are collected.
    assert input_data.drop(columns='isFraud').select_dtypes("object").columns.to_list() == params["object_columns"], "an error occurs in object column(s)."
    assert input_data.select_dtypes("int").columns.to_list() == params["int64_columns"], "an error occurs in int64 column(s)."

    # (column, primary configuration key). The primary key is tried first so
    # existing configurations resolve to exactly the same bounds as before.
    numeric_ranges = (
        ("step", "range_step"),
        ("amount", "range_amount"),
        ("oldbalanceOrg", "range_oldbalanceOrg"),
        ("newbalanceOrig", "newbalanceOrig"),
        ("oldbalanceDest", "oldbalanceDest"),
        ("newbalanceDest", "newbalanceDest"),
    )
    for column, key in numeric_ranges:
        values = input_data[column]
        # The config mixes "range_<column>" and bare "<column>" keys; accept
        # the other spelling only when the primary one is missing.
        alternate = key[len("range_"):] if key.startswith("range_") else f"range_{key}"
        if key not in params and alternate in params:
            key = alternate
        low, high = params[key][0], params[key][1]
        # between() is inclusive on both ends and False for NaN, so missing values fail.
        assert values.between(low, high).all(), f"an error occurs in {column} range."

    # Categorical columns: every observed value must be in the allowed set.
    for column in ("type", "nameOrig", "nameDest"):
        assert set(input_data[column]).issubset(set(params[f"range_{column}"])), f"an error occurs in {column} range."
   

In [None]:
# The loaded configuration is bound to `config`; no `config_data` name is
# defined in the visible cells, so the old call fails on a fresh kernel.
check_data(dataset, config)

# 6. Data Splitting

In [None]:
def splitInputOtput(data, params=None):
    """Split a frame into predictor columns and the label column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the predictor and label columns.
    params : dict, optional
        Mapping with ``"predictors"`` and ``"label"`` entries used to index
        ``data``. When omitted, the notebook-level ``config_data`` object is
        used, so existing one-argument calls behave as before.

    Returns
    -------
    tuple
        ``(x, y)``: copies of ``data[params["predictors"]]`` and
        ``data[params["label"]]``. Editing them does not modify ``data``.

    Raises
    ------
    NameError
        If ``params`` is omitted and no global ``config_data`` exists.
    KeyError
        If a configuration key or a referenced column is missing.
    """
    if params is None:
        # Fall back to the global the original implementation relied on.
        params = config_data
    x = data[params["predictors"]].copy()
    y = data[params["label"]].copy()
    return x, y