In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook, I am going to show you how different ML & DL models perform on a credit risk modeling dataset, where we will predict which customers will have their loan approved.

This is going to be a binary classification problem where the model will learn to predict the Loan_Status of a person, based on information available.

The dataset has 614 rows and 13 different features, including the target variable(Loan_Status). The data contains following features in it:

**Loan_ID**: A unique loan id

**Gender**: Male/Female

**Married**: Yes/No

**Dependents**: Number of people depending on the applicant

**Education**: Applicant's education -- Graduate/Not Graduate

**Self_Employed**:Yes/No

**ApplicantIncome**: Income of applicant($)

**CoapplicantIncome**:Income of co-applicant($)

**LoanAmount**:Loan amount($ thousands)

**Loan_Amount_Term**:Term for borrowing money(weeks)

**Credit_History**:Applicant's credit history

**Property_Area**:Urban/Rural/Semi

**Loan_Status**:Loan Approved (Yes/No)

First I am going to import some important libraries, then I will do some exploratory data analysis, then a bit of feature engineering followed by creating models and evaluating them on test set.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,classification_report
from sklearn.model_selection import StratifiedShuffleSplit,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from tensorflow.keras import models,layers

# Exploratory Data Analysis

In [None]:
# Load the loan-prediction training CSV from the Kaggle input directory
# and preview its first rows.
train=pd.read_csv('../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')
train.head()

In [None]:
# Print the (rows, columns) shape of the training frame.
print(train.shape)

In [None]:
# Summary statistics for the columns of the training frame.
train.describe()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train.info()

The following illustration illustrates that from the data provided almost 69% of the loan applications were approved.

In [None]:
# Bar chart of the target classes, followed by the share of each class in percent.
plt.figure(figsize=(10,7))
sns.countplot(x='Loan_Status',data=train)
# value_counts() sorts by frequency (descending). Use explicit positional access
# (.iloc) instead of [0]/[1]: integer keys on a string-labelled Series rely on a
# deprecated positional fallback in pandas.
# NOTE(review): this treats the most frequent class as "approved" — confirm
# against the plot above.
status_pct=train.Loan_Status.value_counts()/len(train)*100
print('Proportion of loan applications approved: ',status_pct.iloc[0])
print('Proportion of loan applications rejected: ',status_pct.iloc[1])

**The following two distribution plots indicate that the majority of applicants have an income between 0 and 10,000 ($).**

In [None]:
# Density-scaled histogram with a KDE overlay of applicant income.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement and draws the same histogram + density curve.
plt.figure(figsize=(10,7))
sns.histplot(train['ApplicantIncome'],kde=True,stat='density',color='Blue')

In [None]:
# Count histogram of applicant income, drawn on an explicitly created axes.
fig,ax=plt.subplots(figsize=(10,7))
sns.histplot(data=train,x='ApplicantIncome',color='Red',ax=ax)

**The following distribution plots show that majority of people wanted a loan between 100,000 & 200,000($).**

In [None]:
# Density-scaled histogram with a KDE overlay of the requested loan amount.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement and draws the same histogram + density curve.
plt.figure(figsize=(10,7))
sns.histplot(train['LoanAmount'],kde=True,stat='density',color='purple')

In [None]:
# Count histogram of the requested loan amount, drawn on an explicitly created axes.
fig,ax=plt.subplots(figsize=(10,7))
sns.histplot(x='LoanAmount',data=train,color='pink',ax=ax)

The following scatterplot of applicant's income against loan amount indicates that most people wanted a smaller loan, as shown by the concentration of points at the bottom left of the figure. We can also see that loan status does not appear to be affected by the loan amount or the applicant's income; it varies throughout the data.

In [None]:
# Scatter of applicant income against loan amount, coloured by loan status.
scatter_grid=sns.relplot(data=train,x='ApplicantIncome',y='LoanAmount',hue='Loan_Status',height=7.0)
# The grid has a single facet, so labelling through the grid labels that one axes.
scatter_grid.set_axis_labels('Applicant Income ','Loan Amount (in thousands)')
plt.show()

**The following illustration indicates that there were more Male candidates for loan application as compared to Females, but the loan status is not much affected by it as there is almost same proportion of rejections in both the cases.**

In [None]:
# Applicant counts per gender, split by loan status.
fig,ax=plt.subplots(figsize=(10,7))
sns.countplot(data=train,x='Gender',hue='Loan_Status',palette='flare_r',ax=ax)

**The following illustration illustrates that mainly those applicant's application for loan was approved who were not married. So if someone is not married, that person has greater chance for loan to be approved.**

In [None]:
# One count plot of marital status per loan-status facet.
# FacetGrid.map calls countplot once per facet; without a shared `order` each
# facet may arrange the categories differently while the facets share tick
# labels, which can mislabel bars. Fix the order explicitly.
married_order=sorted(train['Married'].dropna().unique())
grid=sns.FacetGrid(data=train,col='Loan_Status',height=5.0)
grid.map(sns.countplot,'Married',order=married_order,palette='jet_r')

**The following figure indicates that those who had no dependents got their loan approved compared to those who had. So if someone has no dependents, that person has greater chance for loan to be approved.**

In [None]:
# Applicant counts per number of dependents, split by loan status.
plt.figure(figsize=(10,7))
# Pass the column as x=...: recent seaborn releases accept only `data`
# positionally, so countplot('Dependents', data=train) raises a TypeError.
sns.countplot(x='Dependents',data=train,hue='Loan_Status',palette='magma')

**The following plot gives an idea about the people's education. It explains that those people who were graduate, had greater chance for their loan to be approved.**

In [None]:
# Applicant counts per education level, split by loan status.
plt.figure(figsize=(10,7))
# Pass the column as x=...: recent seaborn releases accept only `data`
# positionally, so countplot('Education', data=train) raises a TypeError.
sns.countplot(x='Education',data=train,hue='Loan_Status',palette='inferno_r')

**The following figure suggests that people who were not self employed were given more preference over those who were self employed. If someone is not self employed, that person has more chance for getting loan approved.**

In [None]:
# One count plot of self-employment status per loan-status facet.
# FacetGrid.map calls countplot once per facet; without a shared `order` each
# facet may arrange the categories differently while the facets share tick
# labels, which can mislabel bars. Fix the order explicitly.
self_employed_order=sorted(train['Self_Employed'].dropna().unique())
grid=sns.FacetGrid(data=train,col='Loan_Status',height=5.0)
grid.map(sns.countplot,'Self_Employed',order=self_employed_order,palette='crest_r')

**The following plot suggests that loan applications with a term of 360 (weeks) were accepted more often than those with a shorter term. So if one borrows for this period, that person has a greater chance of having the loan approved.**

In [None]:
# Applicant counts per loan term, split by loan status.
plt.figure(figsize=(15,8))
# Pass the column as x=...: recent seaborn releases accept only `data`
# positionally, so countplot('Loan_Amount_Term', data=train) raises a TypeError.
sns.countplot(x='Loan_Amount_Term',data=train,hue='Loan_Status',palette='Dark2')
plt.xlabel('Loan_Amount_Term (weeks)')
plt.show()

**The following plot shows the relationship between credit history and loan status. It suggests that those who had a bad credit history mostly didn't get their loan application approved, compared to those who had a good credit history. So if anyone has a bad credit history, that person might have to face disappointment.**

In [None]:
# One count plot of credit history per loan-status facet.
# FacetGrid.map calls countplot once per facet; without a shared `order` each
# facet may arrange the categories differently while the facets share tick
# labels, which can mislabel bars. Fix the order explicitly.
credit_history_order=sorted(train['Credit_History'].dropna().unique())
grid=sns.FacetGrid(data=train,col='Loan_Status',height=5.0)
grid.map(sns.countplot,'Credit_History',order=credit_history_order,palette='gist_earth');

**This count plot illustrates the property area of the applicants. This feature does not show any clear relation with loan status, as people from every property area got their applications approved. Property area alone cannot distinguish whose application has a better chance of being approved.**

In [None]:
# One count plot of property area per loan-status facet.
# FacetGrid.map calls countplot once per facet; without a shared `order` each
# facet may arrange the categories differently while the facets share tick
# labels, which can mislabel bars. Fix the order explicitly.
property_area_order=sorted(train['Property_Area'].dropna().unique())
grid=sns.FacetGrid(data=train,col='Loan_Status',height=5.0)
grid.map(sns.countplot,'Property_Area',order=property_area_order,palette='CMRmap_r');

**Having a look at heatmap gives relationship between different integer/float datatype features. At this moment, it is not showing other features as they have object datatype.Later in this notebook we will see it in more detail.**

In [None]:
# Correlation heatmap of the numeric columns only. The frame still holds
# object-typed columns here, and DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0, so select the numeric columns explicitly first.
sns.heatmap(train.select_dtypes('number').corr(),annot=True)

# Cleaning Data
There are a lot of null values in the dataset, so we will have to deal with them.

In [None]:
# Number of missing values in each column.
train.isnull().sum()

**For the categorical features, and for Loan_Amount_Term and Credit_History, I filled the null values with the mode of the column. For LoanAmount, I filled the null values with the median, because the feature may have outliers and the mean could be skewed by them, which could divert our model from predicting correctly.**

In [None]:
# Impute missing values: the mode for categorical / discrete columns and the
# median for LoanAmount.
# Fill through the frame with a per-column mapping instead of
# `train.<col>.fillna(..., inplace=True)`: in-place fills on a column selected
# from the frame are chained assignment, which is deprecated and does not
# modify `train` when pandas copy-on-write is active.
fill_values={
    'Gender':train['Gender'].mode()[0],
    'Married':train['Married'].mode()[0],
    'Dependents':train['Dependents'].mode()[0],
    'Self_Employed':train['Self_Employed'].mode()[0],
    'LoanAmount':train['LoanAmount'].median(),
    'Loan_Amount_Term':train['Loan_Amount_Term'].mode()[0],
    'Credit_History':train['Credit_History'].mode()[0],
}
train=train.fillna(value=fill_values)

In [None]:
# Re-count missing values per column to check the result of the fills above.
train.isnull().sum()

**Our model only accepts tensors/numeric data, so I will use LabelEncoder module of sklearn library to encode the object datatype features. This will change their datatype to integer as well.**

In [None]:
# Column dtypes; the object-typed columns are the ones that get label-encoded next.
train.dtypes

In [None]:
# Label-encode every object-typed column in place (this includes Loan_ID and
# the Loan_Status target, which are still in the frame at this point).
object_col=train.select_dtypes('object').columns
le=LabelEncoder()
for col in object_col:
    # The loop body must be indented; the encoder is re-fitted for each column,
    # so after the loop `le` only holds the classes of the last column.
    train[col]=le.fit_transform(train[col])

In [None]:
# Column dtypes again, to check the result of the label encoding.
train.dtypes

In [None]:
# Preview the first rows of the encoded frame.
train.head()

**Now we will have a detailed look at the heatmap. It contains all features now, because we have just encoded the object-typed columns and they have been converted to integers.
The heatmap suggests that there is a high positive correlation between applicant's income and loan amount. The higher the applicant's income, the larger the loan amount requested.
It also shows a high positive correlation between loan status and credit history. The better a person's credit history, the higher the chance of the loan being approved. We also found this insight previously in the count plot of credit history and loan status.**

**The darker the color, the more negative the correlation; the lighter the color, the more positive the correlation between the features.**


In [None]:
# Correlation heatmap over all columns of the encoded frame.
fig,ax=plt.subplots(figsize=(12,7))
sns.heatmap(train.corr(),annot=True,ax=ax)

# Feature Engineering
**From the data, I am going to remove the Loan_ID and Property_Area features, because the heatmap suggests that these two features don't have a strong relationship with any other feature.**

In [None]:
# Remove the identifier and the property-area feature in a single in-place drop.
train.drop(columns=['Loan_ID','Property_Area'],inplace=True)

In [None]:
# Pull the target column out of the frame; what remains is the feature matrix.
# pop() returns the column and removes it from `train` in place.
y_train=train.pop('Loan_Status')
x_train=train

**Now I am going to split my data into train data (for training models) and test data (for evaluating models) with the help of sklearn's StratifiedShuffleSplit. I did not use train_test_split because, by default, it does not stratify the split on the target variable. StratifiedShuffleSplit will make sure that both train and test data have the same proportion of each target class. I will be splitting the data in a 70/30 ratio: 70% for training and 30% for testing (test_size=0.3).**

In [None]:
# Single stratified 70/30 shuffle split of the features and target.
sss=StratifiedShuffleSplit(n_splits=1,test_size=0.3,random_state=42)
# n_splits=1 yields exactly one (train indices, test indices) pair. Take it with
# next() and give the index arrays their own names, so they do not overwrite the
# `train` DataFrame the way `for train,test in ...` did.
train_idx,test_idx=next(sss.split(x_train,y_train))
# Both right-hand sides are evaluated on the full data before x_train / y_train
# are rebound. The cell is not idempotent: re-running it splits the split.
x_train,x_test=x_train.iloc[train_idx],x_train.iloc[test_idx]
y_train,y_test=y_train.iloc[train_idx],y_train.iloc[test_idx]

print(f'Shape of x_train is {x_train.shape}')
print(f'Shape of y_train is {y_train.shape}')
print(f'Shape of x_test is {x_test.shape}')
print(f'Shape of y_test is {y_test.shape}')

**We can see that both datasets have the same proportion of each target class. The split preserves the original class distribution (it does not balance the classes), so scores on the test data are representative of the full dataset.**

In [None]:
# Share of each encoded class (0 and 1) in the train and test targets,
# printed as: class 0 for train/test, then class 1 for train/test.
for label in (0,1):
    for split_name,target in (('y_train',y_train),('y_test',y_test)):
        print(f'Proportion of {label} in {split_name} :',target.value_counts()[label]/ len(target))

# Creating ML/DL models
Logistic Regression

In [None]:
# Logistic regression: fit on the training split, 5-fold cross-validate on the
# same split, then report accuracy, confusion matrix and classification report
# on the held-out test split.
model=LogisticRegression(max_iter=200)
model.fit(x_train,y_train)
# cross_val_score fits clones of the estimator, so it does not disturb the fit above.
cross_val=cross_val_score(model,x_train,y_train,cv=5)
y_pred=model.predict(x_test)
cf_r=classification_report(y_test,y_pred)
print('Mean cross validation score ',np.mean(cross_val))
print('Accuracy on test data ', model.score(x_test,y_test))
print('Accuracy on train data ', model.score(x_train,y_train))
# Draw the confusion matrix directly: wrapping the call in print() only added
# the display object's repr to the cell output.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
# use ConfusionMatrixDisplay.from_estimator(model, x_test, y_test) instead.
plot_confusion_matrix(model,x_test,y_test,values_format='0.3g')
print('Classification Report :\n',cf_r)

# KNeighborsClassifier

Here I have used a loop to get the best value for the number of nearest neighbors.

In [None]:
# k-nearest-neighbours: choose n_neighbors by mean 5-fold cross-validation
# accuracy on the training split, then refit the best model and evaluate it on
# the held-out test split.
# The original loop reset `best_n` on every iteration and never selected
# anything, so the reported model was always the last one tried (k=99).
# Selection uses the CV score, not the test score, to keep the test split unseen.
k_values=list(range(1,100))
cv_by_k=[cross_val_score(KNeighborsClassifier(n_neighbors=k),x_train,y_train,cv=5) for k in k_values]
best_n=[np.mean(scores) for scores in cv_by_k]  # mean CV accuracy per candidate k
best_idx=int(np.argmax(best_n))  # ties resolve to the smallest k
best_k=k_values[best_idx]
cross_val=cv_by_k[best_idx]

model=KNeighborsClassifier(n_neighbors=best_k)
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
print('Best n_neighbors ',best_k)
print('Mean cross validation score ',np.mean(cross_val))
print('Accuracy on test data ', model.score(x_test,y_test))
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
# use ConfusionMatrixDisplay.from_estimator(model, x_test, y_test) instead.
plot_confusion_matrix(model,x_test,y_test,values_format='0.3g')
print('Accuracy on train data ', model.score(x_train,y_train))

In [None]:
# Decision tree: fit on the training split, 5-fold cross-validate on the same
# split, then report accuracy, confusion matrix and classification report on
# the held-out test split.
model=DecisionTreeClassifier(random_state=42)
model.fit(x_train,y_train)
# cross_val_score fits clones of the estimator, so it does not disturb the fit above.
cross_val=cross_val_score(model,x_train,y_train,cv=5)
y_pred=model.predict(x_test)
cf_r=classification_report(y_test,y_pred)
print('Mean cross validation score ',np.mean(cross_val))
print('Accuracy on test data ', model.score(x_test,y_test))
print('Accuracy on train data ', model.score(x_train,y_train))
# Draw the confusion matrix directly: wrapping the call in print() only added
# the display object's repr to the cell output.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
# use ConfusionMatrixDisplay.from_estimator(model, x_test, y_test) instead.
plot_confusion_matrix(model,x_test,y_test,values_format='0.3g')
print('Classification Report :\n',cf_r)

In [None]:
# XGBoost classifier with default settings: fit on the training split, 5-fold
# cross-validate on the same split, then report accuracy, confusion matrix and
# classification report on the held-out test split.
model=XGBClassifier()
model.fit(x_train,y_train)
# cross_val_score fits clones of the estimator, so it does not disturb the fit above.
cross_val=cross_val_score(model,x_train,y_train,cv=5)
y_pred=model.predict(x_test)
cf_r=classification_report(y_test,y_pred)
print('Mean cross validation score ',np.mean(cross_val))
print('Accuracy on test data ', model.score(x_test,y_test))
print('Accuracy on train data ', model.score(x_train,y_train))
# Draw the confusion matrix directly: wrapping the call in print() only added
# the display object's repr to the cell output.
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
# use ConfusionMatrixDisplay.from_estimator(model, x_test, y_test) instead.
plot_confusion_matrix(model,x_test,y_test,values_format='0.3g')
print('Classification Report :\n',cf_r)

In [None]:
# Small fully connected binary classifier: 32 -> 16 -> 8 -> 1 (sigmoid),
# with dropout after the first and third hidden layers.
model=models.Sequential()
# input_shape is the shape of ONE sample (n_features,), without the batch axis.
# Passing x_train.shape also included the number of rows, which declares a 2-D
# input per sample and does not match the (batch, n_features) data fed to fit().
model.add(layers.Dense(32,activation='relu',input_shape=(x_train.shape[1],)))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(8,activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1,activation='sigmoid'))
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
model.fit(x_train,y_train,epochs=300,batch_size=2,verbose=0)

In [None]:
# Evaluate the network on the held-out test split.
# The model is compiled with one metric ('acc'), so evaluate() returns
# [loss, accuracy]; unpack it so the "Accuracy" line shows only the accuracy
# instead of the whole list.
test_loss,test_acc=model.evaluate(x_test,y_test)
print('Loss on test data ', test_loss)
print('Accuracy on test data ', test_acc)
y_pred=model.predict(x_test)
# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) column to 1-D labels.
y_pred=np.where(y_pred>0.5,1,0).ravel()
c_m=confusion_matrix(y_test,y_pred)
print('Confusion Matrix : \n',c_m)

# Conclusion
Out of all the ML/DL models I have tried, Logistic Regression was the one which performed the best, with a test accuracy of 85%. So anyone who is not married, is a graduate, has no dependents, is not self-employed, has a good income and has a good credit history will have a good chance of their loan application being accepted.