# AN APPROACH FOR PREDICTION OF LOAN ELIGIBILITY IN BANKING SYSTEM USING MACHINE LEARNING CLASSIFICATION ALGORITHMS

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

import os
print(os.listdir("."))

In [None]:
# Load the training and test CSV files into DataFrames.
# Both paths are relative to the notebook's working directory.
train_csv = "../input/loan-prediction-data/train.csv"
test_csv = "../input/loan-prediction-data/test.csv"
train, test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
train.shape, test.shape

In [None]:
train.head(3)

In [None]:
test.head(3)

# Dropping unnecessary columns

In [None]:
# Remove the Loan_ID column from both frames (it is dropped as an unnecessary
# column). errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
train = train.drop(columns=['Loan_ID'], errors='ignore')
test = test.drop(columns=['Loan_ID'], errors='ignore')

In [None]:
train.columns

In [None]:
test.columns

In [None]:
train.describe()

In [None]:
train.info()

In [None]:
train.Loan_Status.value_counts(normalize = True)

In [None]:
# Bar chart of how many rows fall in each Loan_Status class.
# The column is passed through the x=/data= keywords: recent seaborn releases
# treat the first positional argument as `data`, not as the x variable.
sns.countplot(x='Loan_Status', data=train);

In [None]:
# Bar chart of how many rows fall in each Gender category.
# The column is passed through the x=/data= keywords: recent seaborn releases
# treat the first positional argument as `data`, not as the x variable.
sns.countplot(x='Gender', data=train);

In [None]:
### Data Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Columns plotted as categories against the target.
categorical_columns = [
    'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'Property_Area', 'Credit_History', 'Loan_Amount_Term',
]

# 4x2 grid filled row by row: one count plot per column, bars split by Loan_Status.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12, 15))
for panel, feature in zip(axes.flat, categorical_columns):
    sns.countplot(data=train, x=feature, hue='Loan_Status', ax=panel)

# Add vertical space between the rows of panels.
plt.subplots_adjust(hspace=1)

# Imputation of missing values

In [None]:
train.isnull().sum()

In [None]:
# Fill missing values in these columns with each column's most frequent value
# (the first mode). The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on a column selection: that chained form emits a
# FutureWarning on pandas 2.x and does not modify `train` under Copy-on-Write.
mode_imputed_columns = [
    "Gender", "Married", "Dependents",
    "Self_Employed", "Credit_History", "Loan_Amount_Term",
]
for column in mode_imputed_columns:
    train[column] = train[column].fillna(train[column].mode()[0])

In [None]:
train.isnull().sum()

In [None]:
train["Loan_Amount_Term"].mode()[0]

In [None]:
train["Loan_Amount_Term"].value_counts()

Now we will see the LoanAmount variable. As it is a numerical variable, we can use mean or median to impute the missing values.

We will use the median to fill the null values because, as we saw earlier, the loan amount has outliers, and the mean is highly affected by the presence of outliers.

In [None]:
# Fill missing LoanAmount values with the column median. The result is assigned
# back to the column: fillna(..., inplace=True) on a column selection is chained
# assignment and does not modify `train` under pandas Copy-on-Write.
train["LoanAmount"] = train["LoanAmount"].fillna(train["LoanAmount"].median())

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
# Imputation of missing values for test data

# Mode for the categorical / discrete columns, median for LoanAmount.
# Results are assigned back to each column: fillna(..., inplace=True) on a
# column selection is chained assignment and does not modify `test` under
# pandas Copy-on-Write.
# NOTE(review): the fill values are computed from the test set itself; consider
# reusing the statistics of the training set so both frames are imputed alike.
test_mode_imputed_columns = [
    "Gender", "Married", "Dependents",
    "Self_Employed", "Credit_History", "Loan_Amount_Term",
]
for column in test_mode_imputed_columns:
    test[column] = test[column].fillna(test[column].mode()[0])
test["LoanAmount"] = test["LoanAmount"].fillna(test["LoanAmount"].median())
test.head()

In [None]:
test.isnull().sum()

# New feature derivation

Total_Income = ApplicantIncome + CoapplicantIncome
Income_Loan_Amount_Ratio = Total_Income / LoanAmount

In [None]:
# Derived features for the training set:
#   Total_Income             = ApplicantIncome + CoapplicantIncome
#   Income_Loan_Amount_Ratio = Total_Income / LoanAmount
train_total_income = train["ApplicantIncome"] + train["CoapplicantIncome"]
train["Total_Income"] = train_total_income
train["Income_Loan_Amount_Ratio"] = train_total_income / train["LoanAmount"]
train.columns

In [None]:
# Numeric columns (raw and derived) to compare across the two Loan_Status classes.
numerical_columns = [
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
    'Total_Income', 'Income_Loan_Amount_Ratio',
]

# One row of five panels: a box plot per numeric column, grouped by Loan_Status.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(17, 5))
for panel, feature in zip(axes, numerical_columns):
    sns.boxplot(data=train, x='Loan_Status', y=feature, ax=panel)

# Summary statistics for the same columns, printed as plain text.
print(train[numerical_columns].describe())
plt.subplots_adjust(hspace=1)

In [None]:
# Derived features for the test set, computed the same way as for the training set:
#   Total_Income             = ApplicantIncome + CoapplicantIncome
#   Income_Loan_Amount_Ratio = Total_Income / LoanAmount
test_total_income = test["ApplicantIncome"] + test["CoapplicantIncome"]
test["Total_Income"] = test_total_income
test["Income_Loan_Amount_Ratio"] = test_total_income / test["LoanAmount"]
test.columns

# Encoding

In [None]:
# One-hot encode the training frame with pandas' default column selection.
# drop_first=True drops the first level of each encoded column, so a k-level
# column yields k-1 indicator columns.
train_encoded = pd.get_dummies(data=train, drop_first=True)
train_encoded.head(n=5)


In [None]:
# One-hot encode the test frame, then align it to the training feature columns.
# The two frames are encoded independently, so a category level that appears in
# only one of them would otherwise give the frames different columns and make
# the later `predict(test_encoded)` calls fail. Columns missing from the test
# frame are added as all-zero indicators; columns unknown to the training frame
# are dropped.
test_encoded = pd.get_dummies(test, drop_first=True)
feature_columns = train_encoded.columns.drop('Loan_Status_Y')
test_encoded = test_encoded.reindex(columns=feature_columns, fill_value=0)
test_encoded.head()

# Splitting data

In [None]:
################# Splitting into Train -Test Data #######
from sklearn.model_selection import train_test_split

########## Split Features and Target Variable ############
# X holds every encoded column except the target indicator.
X = train_encoded.drop(columns='Loan_Status_Y')
# y is selected as a 1-D Series (single brackets). A one-column DataFrame makes
# scikit-learn estimators emit a "column-vector y was passed" warning on fit.
y = train_encoded['Loan_Status_Y']

# 80/20 split, stratified on the target so both parts keep the class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
y.head()

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score

# Fit on the training split. np.ravel guarantees a 1-D target even when
# y_train is a one-column DataFrame.
LR_model = LogisticRegression(random_state=1)
LR_model.fit(X_train, np.ravel(y_train))
LR_prediction_result = LR_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
LR_score_logistic = accuracy_score(y_test, LR_prediction_result) * 100
LR_f1_logistic = f1_score(y_test, LR_prediction_result) * 100

# These scores are computed on X_test / y_test (the held-out 20% split),
# not on the data the model was trained on, so they are labelled accordingly.
print("Hold-out Data Set Accuracy: ", LR_score_logistic)
print("Hold-out Data F1 Score ", LR_f1_logistic)

In [None]:
# LR with Stratified K-fold

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev

LR_model = LogisticRegression(random_state=42)

# Create StratifiedKFold object: 10 shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lst_accu_stratified = []

for train_index, test_index in skf.split(X, np.ravel(y)):
    # split() yields positional row numbers, so select with .iloc; .loc would
    # only be correct while the frame keeps a default 0..n-1 index.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    LR_model.fit(X_train_fold, np.ravel(y_train_fold))
    # A fold-local name keeps LR_prediction_result (the hold-out predictions
    # from the previous cell) from being overwritten.
    fold_prediction = LR_model.predict(X_test_fold)
    lst_accu_stratified.append(accuracy_score(y_test_fold, fold_prediction) * 100)

# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified), '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified), '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified), '%')
print('\nStandard Deviation:',
      stdev(lst_accu_stratified))

# Refit on all labelled rows. Without this, LR_model would be whatever the last
# fold happened to fit, and that is the model later used to predict the test set.
LR_model.fit(X, np.ravel(y))

In [None]:
# Predict on the test set. The encoded test frame is first aligned to the
# feature columns used for fitting (same names, same order; absent indicator
# columns become 0), because the two frames were one-hot encoded independently.
LR_prediction_result_test = LR_model.predict(
    test_encoded.reindex(columns=X.columns, fill_value=0)
)

In [None]:
LR_prediction_result_test

# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split and score it on the held-out split.
# np.ravel guarantees a 1-D target even when y_train is a one-column DataFrame.
DT_model = DecisionTreeClassifier(random_state=1)
DT_model.fit(X_train, np.ravel(y_train))
DT_prediction_result = DT_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
DT_score = accuracy_score(y_test, DT_prediction_result) * 100
DT_score

In [None]:
# Predict on the test set. The encoded test frame is first aligned to the
# feature columns used for fitting (same names, same order; absent indicator
# columns become 0), because the two frames were one-hot encoded independently.
DT_prediction_result_test = DT_model.predict(
    test_encoded.reindex(columns=X.columns, fill_value=0)
)

In [None]:
DT_prediction_result_test

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 50-tree forest (depth capped at 10) on the training split and score it
# on the held-out split. np.ravel guarantees a 1-D target; a one-column
# DataFrame makes the forest emit a "column-vector y" warning.
RF_model = RandomForestClassifier(random_state=1, max_depth=10, n_estimators=50)
RF_model.fit(X_train, np.ravel(y_train))
RF_prediction_result = RF_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
RF_score = accuracy_score(y_test, RF_prediction_result) * 100
RF_score

In [None]:
# Predict on the test set. The encoded test frame is first aligned to the
# feature columns used for fitting (same names, same order; absent indicator
# columns become 0), because the two frames were one-hot encoded independently.
RF_prediction_result_test = RF_model.predict(
    test_encoded.reindex(columns=X.columns, fill_value=0)
)

In [None]:
RF_prediction_result_test

In [None]:
# Horizontal bar chart of the fitted forest's feature importances,
# one bar per feature column of X.
importances = pd.Series(data=RF_model.feature_importances_, index=X.columns)
importances.plot.barh(figsize=(12, 8))

# K- Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbours classifier on the training split and score it on
# the held-out split. np.ravel guarantees a 1-D target; a one-column DataFrame
# makes the estimator emit a "column-vector y" warning.
# NOTE(review): the features are not scaled anywhere before this point, so
# large-valued columns may dominate the distance — confirm this is intended.
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, np.ravel(y_train))
KNN_prediction_result = knn_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
KNN_score = accuracy_score(y_test, KNN_prediction_result) * 100
KNN_score

In [None]:
# Predict on the test set. The encoded test frame is first aligned to the
# feature columns used for fitting (same names, same order; absent indicator
# columns become 0), because the two frames were one-hot encoded independently.
KNN_prediction_result_test = knn_model.predict(
    test_encoded.reindex(columns=X.columns, fill_value=0)
)

In [None]:
KNN_prediction_result_test

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split and score it on the held-out
# split. np.ravel guarantees a 1-D target even when y_train is a one-column
# DataFrame.
NB_model = GaussianNB()
NB_model.fit(X_train, np.ravel(y_train))
NB_prediction_result = NB_model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
NB_score = accuracy_score(y_test, NB_prediction_result) * 100
NB_score

In [None]:
# Predict on the test set with the Naive Bayes model. The original cell called
# knn_model.predict here, so the "Naive Bayes" test predictions were really the
# KNN ones. The encoded test frame is also aligned to the feature columns used
# for fitting (same names, same order; absent indicator columns become 0).
NB_prediction_result_test = NB_model.predict(
    test_encoded.reindex(columns=X.columns, fill_value=0)
)

In [None]:
NB_prediction_result_test