# Home Loan Approval Predictive Analysis (Classification)

### One-Hot Encoding - Used for categorical features that have no intrinsic ranking; it indicates the presence of each possible value in the original dataset.
### XGBoost - will be used as the predictive algorithm.

## Quadri Alli

In [131]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [57]:
# Read the training CSV and the test CSV (in that order) from the working directory.
X_train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('loan_sanction_train.csv', 'loan_sanction_test.csv')
)

In [58]:
# Preview the training frame as loaded (pandas truncates long frames in the display).
X_train_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


### Loan_Status is my target variable, but it's stored as text. I'm going to convert it to a numeric value for the sake of the algorithm, so "Y" becomes 1 and "N" becomes 0.

In [59]:
# Preview the test frame as loaded; it has the same feature columns but no Loan_Status.
test_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


In [60]:
# Count the missing (NaN) entries in each column of the test data.
test_data.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [61]:
# List the column names of the training data (includes the Loan_Status target).
X_train_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [62]:
# List the column names of the test data for comparison with the training columns.
test_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [63]:
# Number of distinct values per training column; NaN is excluded from the count by default.
X_train_data.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [64]:
# Inspect the raw Dependents categories (string labels, including "3+", plus NaN).
X_train_data['Dependents'].unique()

array(['0', '1', '2', '3+', nan], dtype=object)

In [65]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
X_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


## Preprocessing Data

In [88]:
# Gender: swap the text labels for integer codes (Male -> 1, Female -> 2) in both
# frames. Entries that match neither label (e.g. NaN) are left untouched.
gender_codes = {"Male": 1, "Female": 2}
for frame in (X_train_data, test_data):
    frame['Gender'] = frame['Gender'].replace(gender_codes)

In [89]:
# Married: swap the text labels for integer codes (Yes -> 1, No -> 0) in both
# frames. Entries that match neither label (e.g. NaN) are left untouched.
married_codes = {"Yes": 1, "No": 0}
for frame in (X_train_data, test_data):
    frame['Married'] = frame['Married'].replace(married_codes)

In [90]:
# Dependents
# "3+" is the only non-numeric category. Map it to "3" and then convert the whole
# column to a numeric dtype. Replacing "3+" with the int 3 alone would leave an
# object column that mixes strings ('0', '1', '2') with an int and relies on the
# estimator to coerce it later. Missing values stay NaN, and re-running the cell
# is harmless because the column is already numeric by then.
X_train_data['Dependents'] = pd.to_numeric(X_train_data['Dependents'].replace("3+", "3"))
test_data['Dependents'] = pd.to_numeric(test_data['Dependents'].replace("3+", "3"))

In [91]:
# Education: swap the text labels for integer codes (Graduate -> 1,
# Not Graduate -> 2) in both frames.
education_codes = {"Graduate": 1, "Not Graduate": 2}
for frame in (X_train_data, test_data):
    frame['Education'] = frame['Education'].replace(education_codes)

In [92]:
# Self_Employed: swap the text labels for integer codes (Yes -> 1, No -> 0) in both
# frames. Entries that match neither label (e.g. NaN) are left untouched.
self_employed_codes = {"Yes": 1, "No": 0}
for frame in (X_train_data, test_data):
    frame['Self_Employed'] = frame['Self_Employed'].replace(self_employed_codes)

In [93]:
# Property_Area: swap the text labels for integer codes
# (Rural -> 0, Urban -> 1, Semiurban -> 2) in both frames.
property_area_codes = {"Rural": 0, "Urban": 1, "Semiurban": 2}
for frame in (X_train_data, test_data):
    frame['Property_Area'] = frame['Property_Area'].replace(property_area_codes)

In [72]:
# Display the training frame again to check the categorical columns after encoding.
X_train_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1.0,0.0,0,1,0.0,5849,0.0,,360.0,1.0,1,Y
1,LP001003,1.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,1,Y
3,LP001006,1.0,1.0,0,2,0.0,2583,2358.0,120.0,360.0,1.0,1,Y
4,LP001008,1.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,1,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,2.0,0.0,0,1,0.0,2900,0.0,71.0,360.0,1.0,0,Y
610,LP002979,1.0,1.0,3,1,0.0,4106,0.0,40.0,180.0,1.0,0,Y
611,LP002983,1.0,1.0,1,1,0.0,8072,240.0,253.0,360.0,1.0,1,Y
612,LP002984,1.0,1.0,2,1,0.0,7583,0.0,187.0,360.0,1.0,1,Y


In [73]:
# Build a separate frame that keeps only the fully populated training rows;
# X_train_data itself is left unchanged.
X_train_data_copy = X_train_data.dropna(axis=0, how='any').copy()
X_train_data_copy

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,1.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,0,N
2,LP001005,1.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,1,Y
3,LP001006,1.0,1.0,0,2,0.0,2583,2358.0,120.0,360.0,1.0,1,Y
4,LP001008,1.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,1,Y
5,LP001011,1.0,1.0,2,1,1.0,5417,4196.0,267.0,360.0,1.0,1,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,2.0,0.0,0,1,0.0,2900,0.0,71.0,360.0,1.0,0,Y
610,LP002979,1.0,1.0,3,1,0.0,4106,0.0,40.0,180.0,1.0,0,Y
611,LP002983,1.0,1.0,1,1,0.0,8072,240.0,253.0,360.0,1.0,1,Y
612,LP002984,1.0,1.0,2,1,0.0,7583,0.0,187.0,360.0,1.0,1,Y


In [74]:
# Features are every column except the Loan_ID identifier and the target;
# the target is Loan_Status. Hold out 20% of the rows for validation with a fixed seed.
X = X_train_data_copy.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = X_train_data_copy['Loan_Status']
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    random_state=42,
)

## Random Forest Classifier

In [75]:
# Random forest with 100 trees and a fixed seed: fit on the training split,
# then predict the validation split.
model_1 = RandomForestClassifier(random_state=42, n_estimators=100).fit(x_train, y_train)
preds = model_1.predict(x_val)
preds

array(['Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y'], dtype=object)

In [97]:
# Validation accuracy of the random forest (fraction of correctly predicted labels).
print(accuracy_score(y_val, model_1.predict(x_val)))

0.8125


## Decision Tree Classifier

In [95]:
# Single decision tree (depth capped at 50, fixed seed): fit on the training split,
# then predict the validation split.
model_2 = DecisionTreeClassifier(random_state=0, max_depth=50).fit(x_train, y_train)
predict = model_2.predict(x_val)
predict

array(['Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y',
       'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y'], dtype=object)

In [96]:
# Validation accuracy of the decision tree (fraction of correctly predicted labels).
print(accuracy_score(y_val, model_2.predict(x_val)))

0.6979166666666666


## Gradient Boosting Classifier

In [100]:
# Gradient boosting (500 stages, learning rate 0.1, fixed seed): fit on the training
# split, then predict the validation split.
model_3 = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.1,
    random_state=0,
).fit(x_train, y_train)
prediction = model_3.predict(x_val)
prediction

array(['Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y'], dtype=object)

In [101]:
# Validation accuracy of the gradient boosting model (fraction of correctly predicted labels).
print(accuracy_score(y_val, model_3.predict(x_val)))

0.78125


# Using trained Model to predict test_data

In [105]:
# Remove test rows that contain any missing value (this edits test_data itself),
# then build the feature frame by leaving out the Loan_ID identifier column.
test_data.dropna(axis=0, how='any', inplace=True)
test_data_copy = test_data.drop('Loan_ID', axis=1)
test_data_copy

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1.0,1,0,1,0.0,5720,0,110.0,360.0,1.0,1
1,1.0,1,1,1,0.0,3076,1500,126.0,360.0,1.0,1
2,1.0,1,2,1,0.0,5000,1800,208.0,360.0,1.0,1
4,1.0,0,0,2,0.0,3276,0,78.0,360.0,1.0,1
5,1.0,1,0,2,1.0,2165,3422,152.0,360.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
361,1.0,1,1,1,0.0,2269,2167,99.0,360.0,1.0,2
362,1.0,1,3,2,1.0,4009,1777,113.0,360.0,1.0,1
363,1.0,1,0,1,0.0,4158,709,115.0,360.0,1.0,1
365,1.0,1,0,1,0.0,5000,2393,158.0,360.0,1.0,0


### In this case, our test_data has no Loan_Status, so we can't verify against it or calculate how accurate our model is. However, what we've done here is predict what the loan status is going to be for the test_data, using a model fitted on our training data set.

In [120]:
# Fit a fresh random forest (default settings, seed 1) on the training split and
# predict Loan_Status for the prepared test rows. This reuses the name `prediction`.
model_4 = RandomForestClassifier(random_state=1).fit(x_train, y_train)
prediction = model_4.predict(test_data_copy)
prediction

array(['Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y',
       'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N