## Loan Dataset

Dataset Link:  https://www.kaggle.com/datasets/burak3ergun/loan-data-set

### Important Libraries

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
import pickle

### Dataset

In [2]:
# Load the raw loan dataset from the CSV file in the notebook's directory
# and preview its first rows.
df = pd.read_csv("./loan_data_set.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Missing Values

In [3]:
def rmv_missing_vals(d):
    """Fill missing values separately for each ``Loan_Status`` group.

    The ``Loan_ID`` column is dropped and all work happens on a copy, so the
    caller's frame ``d`` is left unmodified. For every remaining column, rows
    whose value is missing are filled based on the row's ``Loan_Status``:

    * columns named in the categorical list receive the category that occurs
      most often among rows with the same ``Loan_Status``;
    * every other column receives the median of the rows with the same
      ``Loan_Status``.

    Rows whose ``Loan_Status`` is itself missing never match a group and are
    therefore left as they are.

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame; must contain ``Loan_ID`` and ``Loan_Status`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Loan_ID`` and with the group-wise fills applied.
    """
    categorical = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Credit_History", "Property_Area", "Loan_Status"]
    filled = d.copy().drop(columns=["Loan_ID"])

    for column in filled.columns:
        if column in categorical:
            # most frequent category of this column for each loan status
            per_status = pd.crosstab(filled[column], filled["Loan_Status"]).idxmax()
        else:
            # median of this column for each loan status
            per_status = filled.groupby("Loan_Status")[column].median()

        for status, replacement in per_status.to_dict().items():
            gap = filled[column].isnull() & (filled["Loan_Status"] == status)
            filled.loc[gap, column] = replacement

    return filled

### Scaling

In [8]:
def scalling(d):
    """Standardize the numeric columns of ``d`` and place them first.

    Every column of ``d`` that is not named in ``cat_var`` is treated as
    numeric and scaled with a newly created ``StandardScaler``. The scaler is
    fitted on ``d`` on every call and then pickled to ``./scaler.pkl``,
    overwriting any previous file.

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame. Must contain every column listed in ``cat_var``; all
        other columns are passed to the scaler.

    Returns
    -------
    pandas.DataFrame
        A new frame with the scaled numeric columns first, followed by the
        unchanged ``cat_var`` columns. The index of ``d`` is preserved and
        ``d`` itself is not modified.
    """
    cat_var = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Credit_History", "Property_Area", "Loan_Status"]
    num_var = [x for x in d.columns if x not in cat_var]
    scaler = StandardScaler()

    # Give the scaled values the input's own index and column names. Wrapping
    # them in a default-indexed DataFrame and assigning it back aligns on
    # index labels, which yields NaNs / misplaced rows whenever d.index is not
    # exactly 0..n-1 (e.g. after filtering or a train/test split).
    scaled = pd.DataFrame(
        scaler.fit_transform(d[num_var]),
        columns=num_var,
        index=d.index,
    )

    # Persist the fitted scaler so it can be reloaded elsewhere.
    with open('./scaler.pkl', "wb") as f:
        pickle.dump(scaler, f)

    # Numeric columns first, then the categorical ones, row-aligned on d.index.
    return pd.concat([scaled, d[cat_var]], axis=1)

### Encoding

In [5]:
def encode(d):
    """One-hot encode the categorical columns of ``d``.

    The columns named in ``cat_var`` are passed to ``pd.get_dummies`` with
    ``drop_first=True`` and the result replaces them in the output; all other
    columns are kept unchanged and come first.

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame; must contain every column listed in ``cat_var``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the non-categorical columns followed by the
        ``get_dummies`` output. ``d`` is not modified.
    """
    cat_var = ["Gender", "Married", "Dependents", "Education", "Self_Employed", "Credit_History", "Property_Area", "Loan_Status"]
    frame = d.copy()

    untouched = frame.drop(columns=cat_var)
    encoded = pd.get_dummies(frame[cat_var], drop_first=True)

    return pd.concat([untouched, encoded], axis=1)

### Preprocessing Pipeline

In [9]:
# Preprocessing pipeline without encoding: missing-value treatment followed
# by scaling. Both steps are plain functions wrapped in FunctionTransformer.
# NOTE(review): `scalling` creates and fits a fresh StandardScaler and rewrites
# ./scaler.pkl on every call, so the scaler is re-fitted whenever that step is
# invoked rather than being fitted once and reused.
pipe_without_encode = Pipeline([
    ('rmvmissing', FunctionTransformer(rmv_missing_vals)),
    ('scaler', FunctionTransformer(scalling))
])

pipe_without_encode.fit(df)

In [10]:
# Run the raw frame through missing-value treatment and scaling (no encoding).
df_preprocessed_without_encode = pipe_without_encode.transform(df)

In [11]:
# Display the imputed and scaled, but not yet encoded, frame.
df_preprocessed_without_encode

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area,Loan_Status
0,0.072991,-0.554487,-0.234815,0.273231,Male,No,0,Graduate,No,1.0,Urban,Y
1,-0.134412,-0.038732,-0.211017,0.273231,Male,Yes,1,Graduate,No,1.0,Rural,N
2,-0.393747,-0.554487,-0.948735,0.273231,Male,Yes,0,Graduate,Yes,1.0,Urban,Y
3,-0.462062,0.251980,-0.306207,0.273231,Male,Yes,0,Not Graduate,No,1.0,Urban,Y
4,0.097728,-0.554487,-0.056335,0.273231,Male,No,0,Graduate,No,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.410130,-0.554487,-0.889241,0.273231,Female,No,0,Graduate,No,1.0,Rural,Y
610,-0.212557,-0.554487,-1.258100,-2.522836,Male,Yes,3+,Graduate,No,1.0,Rural,Y
611,0.437174,-0.472404,1.276316,0.273231,Male,Yes,1,Graduate,No,1.0,Urban,Y
612,0.357064,-0.554487,0.491004,0.273231,Male,Yes,2,Graduate,No,1.0,Urban,Y


In [10]:
# Save the imputed and scaled (unencoded) data to CSV; index=False keeps the
# row index out of the file.
df_preprocessed_without_encode.to_csv('./loan_data_set_preprocessed_unencoded.csv', index=False)

In [16]:
# Full preprocessing pipeline: missing-value treatment, scaling, then one-hot
# encoding of the categorical columns. Each step is a plain function wrapped
# in FunctionTransformer.
# NOTE(review): the first two steps are the same functions used in
# `pipe_without_encode`, so imputation and scaling are executed again here.
pipe = Pipeline([
    ('rmvmissing', FunctionTransformer(rmv_missing_vals)),
    ('scaler', FunctionTransformer(scalling)),
    ('encoder', FunctionTransformer(encode))
])

pipe.fit(df)

In [17]:
# Run the raw frame through missing-value treatment, scaling and encoding.
df_preprocessed = pipe.transform(df)

In [18]:
# Preview the first rows of the fully preprocessed (encoded) frame.
df_preprocessed.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,0.072991,-0.554487,-0.234815,0.273231,1.0,1,0,0,0,0,0,0,0,1,1
1,-0.134412,-0.038732,-0.211017,0.273231,1.0,1,1,1,0,0,0,0,0,0,0
2,-0.393747,-0.554487,-0.948735,0.273231,1.0,1,1,0,0,0,0,1,0,1,1
3,-0.462062,0.25198,-0.306207,0.273231,1.0,1,1,0,0,0,1,0,0,1,1
4,0.097728,-0.554487,-0.056335,0.273231,1.0,1,0,0,0,0,0,0,0,1,1


In [123]:
# Save the fully preprocessed (imputed, scaled and encoded) data to CSV;
# index=False keeps the row index out of the file.
df_preprocessed.to_csv('./loan_data_set_preprocessed.csv', index=False)