# This notebook is a basic demonstration of data insights from a loan applicants dataset.
## For example: what is the loan approval ratio, and which factors are most important for getting a loan approved?
## Finally, we create a predictive model that predicts which applicants get a loan.

In [1]:
import pandas as pd
import seaborn as sb
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

### Some of the Questions :

1. Find % of total applicants for each unique value of dependents (for data biasing)
2. Find the average number of dependents per applicant (dependents refers to family members)
3. Find the % of applications approved for self-employed applicants
4. What is the % of rejections for married male applicants?
5. Which property area has the maximum approval ratio
6. Create a simple predictive model to assess whether a loan application will be approved or rejected and provide the accuracy score



# Importing Data

In [1]:
import os

# Walk the Kaggle input directory and echo every file found. The last file
# visited is the one loaded below, so the walk order decides which file is
# used when several are present.
dataset = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        dataset = os.path.join(dirname, filename)
        print(dataset)

# Fail with a clear message instead of a NameError when no input file exists.
if dataset is None:
    raise FileNotFoundError("No input files found under '/kaggle/input'")

data = pd.read_json(dataset)
# Independent copy of the freshly loaded frame; copying avoids parsing the
# same file a second time.
data1 = data.copy()

## The data has no null values

In [1]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
data.info()

In [1]:
# Peek at the final five records of the frame.
data.tail(n=5)

# Plotting different features with respect to Application status(Approved or not)
## data insights

In [1]:
import matplotlib.pyplot as plt

# Features to compare against the application outcome, one subplot each.
status_features = ['Gender', 'Married', 'Dependents', 'Education',
                   'Self_Employed', 'Credit_History', 'Property_Area', 'Income']
fig, axes = plt.subplots(4, 2, figsize=(13, 20))
# axes.flat walks the 4x2 grid row by row, so no manual row/column arithmetic
# is needed and no loop variable shadows the built-in `id`.
for ax, feature in zip(axes.flat, status_features):
    sb.countplot(x=feature, data=data, hue='Application_Status', ax=ax)
plt.subplots_adjust(hspace=1)

# Ques 1. Find % of total applicants for each unique value of dependents

## The purpose is to see how the data is biased toward one category
## As you can see, most applications come from applicants with 0 dependents

In [1]:
# Group the applicants by their Dependents value and count the rows per group.
applicants_by_dependents = data.groupby(data.Dependents)
applicants_by_dependents.count()

In [1]:
# Percentage of applicants in each Dependents category, computed from the
# data itself so the figures cannot drift from the loaded dataset
# (the previous version hard-coded the counts and the total).
dependents_share = data['Dependents'].value_counts(normalize=True).sort_index() * 100
for dependents, share in dependents_share.items():
    # float() guarantees round() returns a plain int regardless of numpy version.
    print(round(float(share)), "% of total applicats are with", dependents, "-dependents")

# 2. Find the % of applications approved for self-employed applicants
## About 9% of all applicants (46 of 511) are self-employed applicants whose loan was approved

In [1]:
# Row counts for every (Self_Employed, Application_Status) combination.
self_employed_status_groups = data.groupby(by=['Self_Employed', 'Application_Status'])
self_employed_status_groups.count()

In [1]:
# Application outcome as a percentage *within* each Self_Employed category.
# Each row sums to 100, so the approval rate of self-employed applicants is
# read from their own row. The previous hard-coded 46/511 divided by all
# applicants, which gives a share of the whole dataset rather than an
# approval rate for the group.
self_employed_rates = pd.crosstab(
    data['Self_Employed'], data['Application_Status'], normalize='index'
) * 100
self_employed_rates.round(1)

# 3. What is the % of rejections for married male applicants
## About 17% of all applicants (87 of 511) are married males whose loan was rejected

In [1]:
# Row counts for every (Married, Gender, Application_Status) combination.
married_gender_status_groups = data.groupby(by=['Married', 'Gender', 'Application_Status'])
married_gender_status_groups.count()

In [1]:
# Application outcome as a percentage *within* each (Married, Gender) group.
# Each row sums to 100, so the rejection rate of married male applicants is
# read from their own row. The previous hard-coded 87/511 divided by all
# applicants instead of by the married male applicants only.
married_gender_rates = pd.crosstab(
    [data['Married'], data['Gender']], data['Application_Status'], normalize='index'
) * 100
married_gender_rates.round(1)

> Also, married males have the highest chance of getting their loan approved

# 4. Which property area has the maximum approval ratio
## = Semiurban

In [1]:
# Row counts for every (Property_Area, Application_Status) combination.
property_area_status_groups = data.groupby(by=['Property_Area', 'Application_Status'])
property_area_status_groups.count()

## Data insights on basis of dependents

In [1]:
# Count plot of each feature (and of the outcome), split by number of dependents.
dependents_features = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                       'Credit_History', 'Property_Area', 'Income', 'Application_Status']
fig, axes = plt.subplots(5, 2, figsize=(13, 20))
# axes.flat walks the grid row by row; this also avoids a loop variable
# named `id`, which would shadow the built-in.
for ax, feature in zip(axes.flat, dependents_features):
    sb.countplot(x=feature, data=data, hue='Dependents', ax=ax)
# Nine plots on a 5x2 grid leave the last panel unused; hide the empty axes.
for unused_ax in axes.flat[len(dependents_features):]:
    unused_ax.set_visible(False)
plt.subplots_adjust(hspace=1)

# 5. Find average dependents per income group
## Low-salary applicants have 0 dependents

In [1]:
# Applicants per income group, with one bar colour per Dependents value.
sb.countplot(data=data, x='Income', hue='Dependents')

In [1]:
# Row counts for every (Income, Dependents) combination.
# TODO: the average number of dependents for high-income applicants was never filled in.
income_dependents_groups = data.groupby(by=['Income', 'Dependents'])
income_dependents_groups.count()

## Most people applying for a loan are in the low income group


In [1]:
# Row counts per income group.
income_groups = data.groupby(by=['Income'])
income_groups.count()

## Approval on basis of credit history:
## - applicants with a credit history have higher chance of approval.

In [1]:
# Row counts for every (Application_Status, Credit_History) combination.
status_credit_groups = data.groupby(by=['Application_Status', 'Credit_History'])
status_credit_groups.count()

# 6.) Create a simple predictive model to assess whether a loan application will be approved or rejected and provide the accuracy score

In [1]:
# Show the first five records before preparing the data for modelling.
data.head(n=5)

In [1]:
# Remove the Application_ID column (presumably a row identifier; it is not
# among the model features used below).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: re-running it no longer raises KeyError.
data = data.drop(columns=['Application_ID'], errors='ignore')

### Convert the categorical data to numeric (machine-readable) form

In [1]:
from sklearn.preprocessing import LabelEncoder

# Columns (features and target) to replace with integer codes.
column = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
          'Credit_History', 'Property_Area', 'Income', 'Application_Status']
# One fitted encoder per column, kept so codes can be mapped back to the
# original labels (encoders[name].classes_ / .inverse_transform). The previous
# version reused a single encoder stored in a variable named `all`, which
# shadowed the built-in and discarded every column's mapping but the last.
# NOTE: LabelEncoder assigns codes in sorted label order, so the codes of
# ordered categories do not necessarily follow their natural order.
encoders = {}
for name in column:
    encoders[name] = LabelEncoder()
    data[name] = encoders[name].fit_transform(data[name])

In [1]:
# Show the first five records after label encoding.
data.head(n=5)

In [1]:
# List the column labels currently present in the frame.
data.columns

In [1]:
# Feature matrix and target vector for the first model.
feature_names = ['Gender', 'Married', 'Dependents', 'Education',
                 'Self_Employed', 'Credit_History', 'Property_Area', 'Income']
x = data[feature_names]
y = data['Application_Status']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=7
)

In [1]:
# Fit a default logistic regression on the training split, then report the
# accuracy on the held-out split as a percentage.
model = LogisticRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
accuracy_percent = accuracy_score(y_test, predictions) * 100
print(accuracy_percent)

### accuracy=89 %

# Confusion Matrix

## 12 applications are wrongly accepted by our model
## (they should not have been accepted) — these are false positives, since positive = 1 = accepted
## we need to decrease false positives

In [1]:
from sklearn.metrics import confusion_matrix  # imported here but not used by this cell
# Cross-tabulate true labels against model predictions. Both axes are named
# explicitly so rows (actual) and columns (predicted) cannot be mixed up
# when reading off false positives and false negatives.
pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'])
#positive=1 : application accepted

# Heatmap to check correlation between attributes
## Credit history is highly correlated with application status

In [1]:
# Pairwise correlations of the encoded columns, drawn as an annotated clustered heatmap.
correlations = data.corr()
sb.clustermap(correlations, cmap='Blues', annot=True)
#credit history is correlated with application status

# other models

In [1]:
from sklearn.model_selection import train_test_split

# Feature matrix and target for the model comparison, built from `data` in
# this cell. The previous version first bound the *full* data to
# X_train/Y_train, then immediately overwrote them with a split of `x`/`y`
# left over from an earlier cell; the resulting split is the same, but it no
# longer relies on that hidden state.
X = data[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
          'Credit_History', 'Property_Area', 'Income']]
Y = data.Application_Status

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=10)

In [1]:
# Estimator classes used in this comparison.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# 1) Logistic regression
classifier1 = LogisticRegression(random_state=0)
classifier1.fit(X_train, Y_train)

# 2) k-nearest neighbours: 5 neighbours, Minkowski metric with p=2
classifier2 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier2.fit(X_train, Y_train)

# 3) Support vector machine with a linear kernel
classifier3 = SVC(kernel='linear', random_state=0)
classifier3.fit(X_train, Y_train)

# 4) Support vector machine with an RBF kernel
classifier4 = SVC(kernel='rbf', random_state=1)
classifier4.fit(X_train, Y_train)

# 5) Gaussian naive Bayes
classifier5 = GaussianNB()
classifier5.fit(X_train, Y_train)

# 6) Decision tree using the entropy criterion
classifier6 = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier6.fit(X_train, Y_train)

In [1]:
# Predict the held-out split with each fitted classifier, in the same order
# in which the classifiers were defined.
Y_pred1, Y_pred2, Y_pred3, Y_pred4, Y_pred5, Y_pred6 = (
    fitted.predict(X_test)
    for fitted in (classifier1, classifier2, classifier3,
                   classifier4, classifier5, classifier6)
)

In [1]:

# Print the test accuracy of each classifier, one per line, in classifier order.
for Y_pred in (Y_pred1, Y_pred2, Y_pred3, Y_pred4, Y_pred5, Y_pred6):
    print(accuracy_score(Y_test, Y_pred))

# Logistic Regression did a decent job in prediction