# Import Libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load Data

In [8]:
# Read both CSV splits in one pass, then preview the first training rows.
raw_train, raw_test = map(pd.read_csv, ("../data/loan-train.csv", "../data/loan-test.csv"))
raw_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [9]:
# Dimensions of the raw training frame as (rows, columns).
raw_train.shape

(614, 13)

In [10]:
# Count of distinct non-null values per column; low counts point to categorical features.
raw_train.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [11]:
# Work on independent deep copies so the raw frames stay untouched by later edits.
train_df, test_df = (raw.copy(deep=True) for raw in (raw_train, raw_test))

In [13]:
# Dtypes and non-null counts of the training frame (features plus the Loan_Status target).
train_df.info() # training split only

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [14]:
# Dtypes and non-null counts of the test frame, which is only used for prediction.
test_df.info() # test split only (no Loan_Status column)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [15]:
# Set the target aside as a one-column frame, remove it from the features,
# then re-inspect what is left.
train_y = train_df.loc[:, ["Loan_Status"]]
train_df.drop(columns="Loan_Status", inplace=True)
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


# Drop Unnecessary Columns

In [17]:
# Loan_ID is a per-row identifier (614 distinct values in 614 rows), so remove it from both frames.
for frame in (train_df, test_df):
    frame.drop(columns="Loan_ID", inplace=True)

In [23]:
# Print each frame's column names in sorted order so the two lists can be compared by eye.
for frame in (train_df, test_df):
    print(sorted(frame.columns))

['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']
['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed']


# Check Duplicates

In [25]:
# Show training rows that exactly repeat an earlier row (first occurrences are not flagged).
train_df[train_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [27]:
# Remove exact duplicate feature rows from the training frame.
train_df.drop_duplicates(inplace=True)
# The target was split off earlier, so re-select it by the surviving index labels;
# otherwise any dropped row would leave features and labels misaligned.
train_y = train_y.loc[train_df.index]

In [26]:
# Show test rows that exactly repeat an earlier row (first occurrences are not flagged).
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
195,Male,No,0,Graduate,Yes,5833,0,116.0,360.0,1.0,Urban


Since the duplicate row is in the **test df**, it has no impact on model training, so it's up to you whether to drop the duplicated rows from the test df.

# Check Missing Values

In [29]:
# Missing-value count per training column, largest first.
train_df.isna().sum().sort_values(ascending=False)

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
dtype: int64

# Impute Missing Values

* Impute **Categorical** Feature ➾ **Mode** (most frequent value)
* Impute **Numerical** Feature ➾ **Mode** (most frequent value)

In [31]:
# Re-check dtypes and non-null counts to see which columns need imputing.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [33]:
# Feature names in alphabetical order.
train_df.columns.sort_values()

Index(['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents',
       'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married',
       'Property_Area', 'Self_Employed'],
      dtype='object')

In [30]:
# Distinct-value counts, largest first: high counts suggest numerical columns, low counts categorical ones.
train_df.nunique().sort_values(ascending=False)

ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Dependents             4
Property_Area          3
Gender                 2
Married                2
Education              2
Self_Employed          2
Credit_History         2
dtype: int64

In [41]:
# Numerical columns are listed by hand; every other column, in its original
# order, is treated as categorical.
numerical_cols: list[str] = ["ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"]
is_categorical = ~train_df.columns.isin(numerical_cols)
categorical_cols: list[str] = train_df.columns[is_categorical].tolist()
print(categorical_cols)

['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']


In [47]:
# Fill gaps in the categorical columns with each column's most frequent value.
# The imputer learns those values from the training frame only and then
# applies them unchanged to the test frame.
categorical_imputer = SimpleImputer(strategy="most_frequent")
train_df[categorical_cols] = categorical_imputer.fit_transform(train_df[categorical_cols])
test_df[categorical_cols] = categorical_imputer.transform(test_df[categorical_cols])

train_df[categorical_cols].head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area
0,Male,No,0,Graduate,No,1.0,Urban
1,Male,Yes,1,Graduate,No,1.0,Rural
2,Male,Yes,0,Graduate,Yes,1.0,Urban
3,Male,Yes,0,Not Graduate,No,1.0,Urban
4,Male,No,0,Graduate,No,1.0,Urban


In [48]:
# Fill gaps in the numerical columns with each column's most frequent value.
# The imputer learns those values from the training frame only and then
# applies them unchanged to the test frame.
numerical_imputer = SimpleImputer(strategy="most_frequent")
train_df[numerical_cols] = numerical_imputer.fit_transform(train_df[numerical_cols])
test_df[numerical_cols] = numerical_imputer.transform(test_df[numerical_cols])

test_df[numerical_cols].head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
0,5720.0,0.0,110.0,360.0
1,3076.0,1500.0,126.0,360.0
2,5000.0,1800.0,208.0,360.0
3,2340.0,2546.0,100.0,360.0
4,3276.0,0.0,78.0,360.0


# Re-Check Missing Values

In [49]:
train_df.isna().sum().sort_values(ascending=False)

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

# Feature Engineering

In [50]:
train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849.0,0.0,120.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583.0,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000.0,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583.0,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000.0,0.0,141.0,360.0,1.0,Urban


In [53]:
# Fold the co-applicant's income into ApplicantIncome (which then holds the
# combined income) and discard the now-redundant CoapplicantIncome column.
for frame in (train_df, test_df):
    frame["ApplicantIncome"] += frame["CoapplicantIncome"]
    frame.drop(columns=["CoapplicantIncome"], inplace=True)

# Keep the list of numerical columns in sync with the frames.
numerical_cols.remove("CoapplicantIncome")

# Encoding Features
* Transforming Categorical Data into Numerical

In [55]:
# Distinct-value counts, smallest first, to review the columns that will be encoded.
train_df.nunique().sort_values()

Gender                2
Married               2
Education             2
Self_Employed         2
Credit_History        2
Property_Area         3
Dependents            4
Loan_Amount_Term     10
LoanAmount          203
ApplicantIncome     583
dtype: int64

In [57]:
train_df["Property_Area"].unique() # three area types; can be treated as nominal or ordinal

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [58]:
train_df["Dependents"].unique() # ordinal ('0' < '1' < '2' < '3+'), so label encoding is appropriate

array(['0', '1', '2', '3+'], dtype=object)

In [59]:
# Convert each categorical column to integer codes.
# The encoder is fitted on the training column only and then reused for the
# test column, so a given category gets the same integer in both frames.
# Re-fitting on the test frame could assign different codes whenever the two
# frames contain different sets of categories.
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(train_df[col])
    train_df[col] = le.transform(train_df[col])
    # Raises ValueError if the test frame holds a category unseen in training,
    # which is preferable to silently encoding it inconsistently.
    test_df[col] = le.transform(test_df[col])

train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849.0,120.0,360.0,1,2
1,1,1,1,0,0,9107.0,128.0,360.0,1,0
2,1,1,0,0,1,3000.0,66.0,360.0,1,2
3,1,1,0,1,0,9657.0,120.0,360.0,1,2
4,1,0,0,0,0,6000.0,141.0,360.0,1,2


# Log Transformation on Numerical Features
* **`Log transformation`** is a common technique used in machine learning to transform numerical features that have a highly skewed distribution. The significance of log transformation is that it can help to improve the performance of machine learning models by making the data more normally distributed, reducing the effect of outliers, and stabilizing the variance.

In [61]:
# Replace every numerical column with its natural logarithm in both frames.
# NOTE(review): np.log yields -inf for 0 and NaN for negative inputs; confirm
# that neither occurs in these columns.
for frame in (train_df, test_df):
    frame[numerical_cols] = np.log(frame[numerical_cols])

train_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,8.674026,4.787492,5.886104,1,2
1,1,1,1,0,0,9.116799,4.85203,5.886104,1,0
2,1,1,0,0,1,8.006368,4.189655,5.886104,1,2
3,1,1,0,1,0,9.175438,4.787492,5.886104,1,2
4,1,0,0,0,0,8.699515,4.94876,5.886104,1,2


# Scale the Entire Data

In [64]:
# Rescale every feature to the [0, 1] range observed in the training data.
minmax = MinMaxScaler()

# Learn each column's min and max from the training frame only, then apply
# the same scaling to the test frame. Re-fitting on the test frame would put
# the two sets on different scales. Test values outside the training range
# map outside [0, 1], which is expected.
# Both names are rebound to NumPy arrays from here on.
train_df = minmax.fit_transform(X=train_df)
test_df = minmax.transform(X=test_df)

train_df

array([[1.        , 0.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        0.        ],
       [1.        , 1.        , 0.        , ..., 0.9220137 , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 0.33333333, ..., 0.9220137 , 1.        ,
        1.        ],
       [1.        , 1.        , 0.66666667, ..., 0.9220137 , 1.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.9220137 , 0.        ,
        0.5       ]])

# Building the Model