In [10]:
import numpy as np
import pandas as pd

from copy import deepcopy

In [11]:
from src.utils import deserialize_data, serialize_data

X_train = deserialize_data("data/interim/X_train.pkl")
y_train = deserialize_data("data/interim/y_train.pkl")

X_valid = deserialize_data("data/interim/X_valid.pkl")
y_valid = deserialize_data("data/interim/y_valid.pkl")

X_test = deserialize_data("data/interim/X_test.pkl")
y_test = deserialize_data("data/interim/y_test.pkl")

In [12]:
X_test

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
26024,31,70550,MORTGAGE,9.0,MEDICAL,F,18000,19.74,0.22,Y,10
6126,24,20000,RENT,8.0,VENTURE,A,6000,7.14,0.30,N,2
20835,29,42000,RENT,0.0,HOMEIMPROVEMENT,D,5000,14.96,0.12,N,9
12036,22,72644,MORTGAGE,3.0,PERSONAL,A,5900,7.49,0.08,N,3
29681,45,35000,OTHER,6.0,PERSONAL,B,4800,,0.14,N,12
...,...,...,...,...,...,...,...,...,...,...,...
2145,24,48000,RENT,0.0,PERSONAL,C,2400,12.99,0.05,Y,4
23920,28,67386,MORTGAGE,11.0,HOMEIMPROVEMENT,A,1500,6.91,0.02,N,6
26739,34,110000,MORTGAGE,15.0,EDUCATION,B,6000,9.45,0.05,N,6
26939,29,120000,MORTGAGE,12.0,VENTURE,B,12000,10.65,0.10,N,9


In [13]:
def drop_duplicate_data(X, y):
    """
    This function drops duplicated data from row X and y.
    
    Parameters
    -----------
    X : dataframe
        features of dataset
        
    y : series
        target of dataset

    Returns
    -------
    X : dataframe
        dropped duplicated data features of dataset
        
    y : dataframe
        dropped duplicated data target of dataset
    """
    
    if not isinstance(X, pd.DataFrame):
        raise TypeError("Fungsi median_imputation: parameter X haruslah bertipe DataFrame!")
    
    if not isinstance(y, pd.Series):
        raise TypeError("Fungsi median_imputation: parameter y haruslah bertipe DataFrame!")
    
    print(f"Fungis drop_duplicate_data telah divalidasi.")
    
    X = X.copy()
    y = y.copy()
    print(f"Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah {X.shape}.")
    
    X_duplicate = X_train[X_train.duplicated()]
    print(f"Fungsi drop_duplicate_data: shape dari data yang duplicate adalah {X_duplicate.shape}.")

    X_clean = (X.shape[0] - X_duplicate.shape[0], X.shape[1])
    print(f"Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah {X_clean}.")
    
    X.drop_duplicates(inplace=True)
    y = y[X.index]
    
    print(f"Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah {X.shape}.")
    
    return X, y

In [14]:
X_train, y_train = drop_duplicate_data(X_train, y_train)

Fungis drop_duplicate_data telah divalidasi.
Fungsi drop_duplicate_data: shape dataset sebelum dropping duplicate adalah (26064, 11).
Fungsi drop_duplicate_data: shape dari data yang duplicate adalah (118, 11).
Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah (25946, 11).
Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah (25946, 11).


In [15]:
def median_imputation(data, subset_data, fit):
    """
    Parameters
    -----------
    data : dataframe
        dataset to be imputed
        
    subset_data : list of string
        columns name

    fit : boolean
        if fit=true, this function will return median of subset_data
        if fit=false, this function will impute the data based on subset_data

    Returns
    -------
    X : dataframe
        dropped duplicated data features of dataset
        
    y : dataframe
        dropped duplicated data target of dataset
    """
    
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Fungsi median_imputation: parameter data haruslah bertipe DataFrame!")
    
    if fit is True and not isinstance(subset_data, list):
        raise TypeError(
            "Fungsi median_imputation: untuk nilai parameter fit = True, subset_data harus bertipe list dan berisi " 
            "daftar nama kolom yang ingin dicari nilai mediannya guna menjadi data imputasi pada kolom tersebut")
    
    if fit is False and not isinstance(subset_data, dict):
        raise TypeError(
            "Fungsi median_imputation: untuk nilai parameter fit = False, subset_data harus bertipe dict dan berisi "
            "key yang merupakan nama kolom beserta value yang merupakan nilai median dari kolom tersebut")
    
    if not isinstance(fit, bool):
        raise TypeError("Fungsi median_imputation: parameter fit haruslah bertipe boolean, bernilai True atau False.")
    
    print("Fungsi median_imputation: parameter telah divalidasi.")
    
    data = data.copy()
    subset_data = deepcopy(subset_data) 
    
    """
    Handles fitting data
    """
    if fit is True:
        imputation_data = {}
        for subset in subset_data:
            imputation_data[subset] = data[subset].median(numeric_only=True)
            
        print(f"Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {imputation_data}")
            
        return imputation_data
    
    """
    Handles transforming data
    """
    print("Fungsi median_imputation: informasi count na sebelum dilakukan imputasi")
    print(data.isna().sum())
    print()
    
    for subset in subset_data:
        data[subset] = data[subset].fillna(subset_data[subset])

    print("Fungsi median_imputation: informasi count na setelah dilakukan imputasi.")
    print(data.isna().sum())
    print()
    
    return data

In [16]:
X_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
32377,64,46000,RENT,2.0,PERSONAL,C,4800,11.09,0.1,Y,24
1338,26,26000,OWN,0.0,DEBTCONSOLIDATION,E,8500,16.45,0.33,N,3
7047,23,51000,MORTGAGE,3.0,PERSONAL,C,16000,13.11,0.31,Y,3
8225,22,56004,MORTGAGE,6.0,MEDICAL,A,6000,7.88,0.11,N,4
7178,24,79000,RENT,3.0,PERSONAL,C,7000,12.54,0.09,N,3


In [17]:
subset_data = ["person_age", "person_income", "person_emp_length", "loan_amnt", "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length"]
subset_data = median_imputation(X_train, subset_data, fit=True)

X_train = median_imputation(X_train, subset_data, fit=False)
X_valid = median_imputation(X_valid, subset_data, fit=False)
X_test = median_imputation(X_test, subset_data, fit=False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {'person_age': np.float64(26.0), 'person_income': np.float64(55000.0), 'person_emp_length': np.float64(4.0), 'loan_amnt': np.float64(8000.0), 'loan_int_rate': np.float64(10.99), 'loan_percent_income': np.float64(0.15), 'cb_person_cred_hist_length': np.float64(4.0)}
Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              711
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 2463
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi.
person_ag

In [77]:
from sklearn.preprocessing import OneHotEncoder


def create_onehot_encoder(categories, path):
    """
    create_onehot_encoder create encoder based on categories and save it in path
    
    :param categories: 
    :param path: 
    :return: 
    """
    
    if not isinstance(categories, list):
        raise TypeError("Fungsi create_onehot_encoder: parameter categories haruslah bertipe list, berisi kategori yang akan dibuat encodernya.")
    
    if not isinstance(path, str):
        raise TypeError("Fungsi create_onehot_encoder: parameter path haruslah bertipe string, berisi lokasi pada disk komputer dimana encoder akan disimpan.")
    
    ohe = OneHotEncoder()
    categories_ = np.array(categories).reshape(-1, 1)    
    ohe.fit(categories_)

    serialize_data(ohe, path)
    
    print(f"Kategori yang telah dipelajari adalah {categories_[0].tolist()}")
    
    return ohe

In [78]:
person_home_ownership = ["RENT", "OWN", "MORTGAGE", "OTHER"]
loan_intent = ["PERSONAL", "DEBTCONSOLIDATION", "MEDICAL", "HOMEIMPROVEMENT", "VENTURE", "EDUCATION"]
loan_grade = ["C", "E", "A", "B", "D", "F", "G"]
cb_person_default_on_file = ["Y", "N"]

In [79]:
ohe_person_home_ownership = create_onehot_encoder(person_home_ownership, "models/person_home_ownership.pkl")
ohe_loan_intent = create_onehot_encoder(loan_intent, "models/loan_intent.pkl")
ohe_loan_grade = create_onehot_encoder(loan_grade, "models/loan_grade.pkl")
ohe_cb_person_default_on_file = create_onehot_encoder(cb_person_default_on_file, "models/cb_person_default_on_file.pkl")

Kategori yang telah dipelajari adalah ['RENT']
Kategori yang telah dipelajari adalah ['PERSONAL']
Kategori yang telah dipelajari adalah ['C']
Kategori yang telah dipelajari adalah ['Y']


In [80]:
def ohe_transform(dataset, subset, prefix, ohe):
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("Fungsi ohe_transform: parameter dataset harus bertipe DataFrame!")
    
    if not isinstance(subset, str):
        raise TypeError("Fungsi ohe_transform: parameter ohe harus bertipe OneHotEncoder!")
    
    if not isinstance(prefix, str):
        raise TypeError("Fungsi ohe_transform: parameter prefix harus bertipe str")
    
    if not isinstance(ohe, OneHotEncoder):
        raise TypeError("Fungsi ohe_transform: parameter subset harus bertipe str!")
    
    try:
        dataset[subset]
    except:
        raise RuntimeError("Fungsi ohe_transform: parameter subset string namun data tidak ditemukan dalam daftar kolom yang terdapat pada parameter datase")
    
    print("Fungsi ohe_transform: parameter telah divalidasi.")

    print(f"Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah {list(dataset.columns)}")
    
    dataset = dataset.copy()

    col_names = []
    for col in ohe.categories_[0]:
        col_name = f"{prefix}_{col}"
        col_names.append(col_name)

    encoded = pd.DataFrame(
        ohe.transform(dataset[subset].to_frame()).toarray(),
        columns=col_names,
        index=dataset[subset].index
    )

    dataset = pd.concat([dataset, encoded], axis=1)
    dataset = dataset.drop(subset, axis=1)
    
    print(f"Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah {list(dataset.columns)}")
    
    return dataset   

In [81]:
X_train = ohe_transform(X_train, "person_home_ownership", "home_ownership", ohe_person_home_ownership)
X_train = ohe_transform(X_train, "loan_intent", "loan_intent", ohe_loan_intent)
X_train = ohe_transform(X_train, "loan_grade", "grade", ohe_loan_grade)
X_train = ohe_transform(X_train, "cb_person_default_on_file", "default_on_file", ohe_cb_person_default_on_file)

Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length']
Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT']
Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',



In [84]:
serialize_data(X_train, "data/processed/X_train_prep.pkl")
serialize_data(X_valid, "data/processed/X_valid_prep.pkl")
serialize_data(X_train, "data/processed/X_test_prep.pkl")

serialize_data(y_train, "data/processed/y_train_prep.pkl")