### **Import Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

### Import Dataset

In [None]:
# Read the loan training CSV into the DataFrame `data` that the rest of the notebook works on.
data = pd.read_csv('../input/loan-predication/train_u6lujuX_CVtuZ9i (1).csv')

### Now Exploring the dataset

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Summary statistics of the dataset's columns.
data.describe()

In [None]:
# Number of rows in the dataset.
print("Length of Data is: ", len(data))

In [None]:
# For every column, report whether it contains at least one missing value.
print("Null Values in a Data: ")
data.isna().any()

In [None]:
# Count the non-null entries of every column within each Gender group.
data.groupby('Gender').count()

So here we can see that male applicants applied for more loans than female applicants.

In [None]:
# Number of loan IDs recorded for each Married value.
data['Loan_ID'].groupby(data['Married']).count()

In [None]:
# How many rows fall into each Property_Area value.
data['Property_Area'].value_counts()

In [None]:
# Histogram of ApplicantIncome, drawn as a separate panel for each Education value.
data.hist(column='ApplicantIncome', by='Education')

Here we can see that graduates make up the largest share of loan applicants, while people who did not graduate submitted fewer loan applications.

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# The same per-column missing-value count, computed one column at a time with apply.
data.apply(lambda column: column.isnull().sum(), axis=0)

In [None]:
data.shape # (rows, columns) tuple; the second entry is the number of columns

In [None]:
# Number of distinct values in each column; low counts hint at categorical features.
data.nunique()

So we can see there are many categorical features

In [None]:
# Column dtypes and non-null counts.
data.info()

Most columns are of object (string) dtype.

In [None]:
# Distinct values present in Property_Area.
data['Property_Area'].unique()

In [None]:
# Convert the listed columns to pandas' category dtype in a single astype call.
catList = ['Gender', 'Married', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area', 'Loan_Status']
data = data.astype(dict.fromkeys(catList, 'category'))

In [None]:
# Inspect the column dtypes and non-null counts again.
data.info()

There are two options for handling the missing values:
1) Drop the rows that contain missing values
2) Fill the missing values using forward fill

We are going with the ffill method.

In [None]:
# Forward-fill the gaps in these columns: each missing entry takes the value
# of the closest preceding row.
# The result is assigned back to the column on purpose. Calling
# fillna(..., inplace=True) on `data[col]` mutates a temporary Series, which
# leaves `data` unchanged under pandas Copy-on-Write, and the `method=`
# argument of fillna is deprecated in favour of ffill().
# NOTE: a gap in the very first row has no predecessor and stays NaN.
fillList = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
for fill in fillList:
    data[fill] = data[fill].ffill()

In [None]:
# Missing values remaining per column.
data.isna().sum()

Two numeric columns still contain NaN values, so we have to decide whether to fill them with the median, mode, or mean.

In [None]:
# Box plot of LoanAmount to inspect its spread and outliers.
data.boxplot(column='LoanAmount')
plt.show()

It has a lot of outliers, so it is better to use the median.

In [None]:
# Box plot of Loan_Amount_Term to inspect its spread and outliers.
data.boxplot(column='Loan_Amount_Term')
plt.show()

In [None]:
# Replace the missing values of the two numeric loan columns with each
# column's median.
# The filled Series is assigned back to the column: fillna(..., inplace=True)
# on `data[col]` operates on a temporary object and does not update `data`
# under pandas Copy-on-Write.
medianList = ['LoanAmount', 'Loan_Amount_Term']
for med in medianList:
    data[med] = data[med].fillna(data[med].median())

In [None]:
# Missing values remaining per column.
data.isna().sum()

In [None]:
# For visualize the data
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Histogram of the Credit_History values.
data.Credit_History.hist()
plt.show()

Most people have a good credit history, so now we can check whether they were able to get a loan or not.

In [None]:
# Number of non-null Loan_Status entries for each Credit_History value.
data.groupby('Credit_History')['Loan_Status'].count()

In [None]:
# Number of non-null Credit_History entries for each Loan_Status value.
data['Credit_History'].groupby(data['Loan_Status']).count()

So 385 of the 475 people with a good credit history got the loan, while people with a bad credit history still have some chance of getting a loan.

In [None]:
# Sum the numeric columns per (Loan_Status, Gender) group.
# numeric_only=True keeps the aggregation to numeric columns: the frame also
# holds string columns (which sum() would concatenate) and category columns
# (for which sum() raises a TypeError on recent pandas versions).
data.groupby(['Loan_Status','Gender']).sum(numeric_only=True)

In [None]:
# Sum the numeric columns per (Loan_Status, Education) group.
# numeric_only=True keeps the aggregation to numeric columns: the frame also
# holds string columns (which sum() would concatenate) and category columns
# (for which sum() raises a TypeError on recent pandas versions).
data.groupby(['Loan_Status', 'Education']).sum(numeric_only=True)

In [None]:
# Sum the numeric columns per (Loan_Status, Education, Gender) group.
# numeric_only=True keeps the aggregation to numeric columns: the frame also
# holds string columns (which sum() would concatenate) and category columns
# (for which sum() raises a TypeError on recent pandas versions).
data.groupby(['Loan_Status', 'Education', 'Gender']).sum(numeric_only=True)

In [None]:
# Distinct values present in Credit_History.
data['Credit_History'].unique()

In [None]:
# Build one boolean mask per condition, then count the Self_Employed values of
# the rows that satisfy all four.
# NOTE: despite its name, `isMarried` is True where Married == 'No' (unmarried).
greaterMeanIncome = data['ApplicantIncome'].gt(data['ApplicantIncome'].mean())  # income above the mean
isGraduate = data['Education'].eq('Graduate')
isMarried = data['Married'].eq('No')
loanStatus = data['Loan_Status'].eq('Y')
data.loc[isMarried & isGraduate & greaterMeanIncome & loanStatus, 'Self_Employed'].value_counts()

So here we can see that there is less chance of getting a loan if the person is a graduate, unmarried, and has an above-average income.

In [None]:
# Same count as before, with the education and marital conditions inverted.
# NOTE: here `isGraduate` is True for non-graduates and `isMarried` is True
# where Married is anything other than 'No'.
greaterMeanIncome = data['ApplicantIncome'].gt(data['ApplicantIncome'].mean())  # income above the mean
isGraduate = data['Education'].ne('Graduate')
isMarried = data['Married'].ne('No')
loanStatus = data['Loan_Status'].eq('Y')
data.loc[isMarried & isGraduate & greaterMeanIncome & loanStatus, 'Self_Employed'].value_counts()

See the difference: the filter above selects people who are not graduates and are married (`Married != 'No'`), and the counts can be compared with the graduate, unmarried group from the previous cell.

In [None]:
# Bar chart of the number of Loan_Status entries per (Gender, Education) group.
data.groupby(['Gender', 'Education'])['Loan_Status'].count().plot.bar()

In [None]:
# Bar chart of the number of Loan_Status entries per (Gender, Education, Married) group.
data.groupby(['Gender', 'Education', 'Married'])['Loan_Status'].count().plot.bar()

In [None]:
# Bar chart of the number of Loan_Status entries per
# (Gender, Education, Married, Self_Employed) group.
data.groupby(['Gender', 'Education', 'Married', 'Self_Employed'])['Loan_Status'].count().plot.bar()

Here, if a person is male, a graduate, married, and not self-employed, that person has a higher chance of having a Yes in Loan_Status.

In [None]:
# Replace the string labels with integer codes for faster computation
# (e.g. Loan_Status: Y=1, N=0).
# Any value that is not a key of its mapping becomes NaN, so running this cell
# a second time on the already-encoded columns would blank them out.
data = data.assign(
    Loan_Status=data['Loan_Status'].map({'Y':1,'N':0}),
    Gender=data['Gender'].map({'Male': 1, 'Female': 0}),
    Married=data['Married'].map({'Yes': 1, 'No': 0}),
    Education=data['Education'].map({'Graduate': 1, 'Not Graduate': 0}),
    Property_Area=data['Property_Area'].map({'Urban':1,'Rural':2,'Semiurban':3}),
    Self_Employed=data['Self_Employed'].map({'Yes': 1, 'No': 0}),
)

In [None]:
# Categorical values could also be one-hot encoded with pd.get_dummies and the
# resulting dummy columns concatenated back onto the frame;
# this notebook applies sklearn's OneHotEncoder in a later cell instead.

In [None]:
# Use Loan_ID as the row index so it is no longer treated as a feature column.
data = data.set_index('Loan_ID')
data.head()

### Now Breaking into X and y

In [None]:
# Features are every column except the last; the target is the last column.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [None]:
# Report the feature/target shapes, then store the target as 64-bit integers.
print(f"X Shape {X.shape}")
print(f"Y Shape {y.shape}")
y = y.astype('int64')

In [None]:
# Inspect dtypes and non-null counts of the feature columns (every column except the last).
data.iloc[:, :-1].info()

In [None]:
# Wrap the feature values in a DataFrame; no column names are passed here, they are set separately.
X = pd.DataFrame(X)

In [None]:
# Label the feature columns, in the order they appear in X.
X.columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]

In [None]:
# Preview the first rows of the feature frame.
X.head()

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise only the genuinely numeric features. Property_Area holds an
# arbitrary nominal code (Urban=1, Rural=2, Semiurban=3), so treating it as a
# magnitude and scaling it is meaningless; it is left for categorical encoding.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the transformation.
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [None]:
# Preview the feature frame again.
X.head()

### One Hot Encoding

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode ONLY the categorical columns and pass the remaining (numeric)
# columns through untouched. Fitting OneHotEncoder on the whole frame would
# turn every distinct income / loan amount into its own dummy column, which
# discards the scaling and makes unseen numeric values unrepresentable.
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
ohe = OneHotEncoder()
# ColumnTransformer fits a clone of `ohe`; the fitted encoder is available as
# encoder.named_transformers_['onehot'].
encoder = ColumnTransformer([('onehot', ohe, categorical_cols)], remainder='passthrough')
X = encoder.fit_transform(X)

#### Split Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the training and test feature matrices.
print(f"Train X Shape {train_X.shape}")
print(f"Test X Shape {test_X.shape}")

### Metrics

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default hyper-parameters on the training split and
# evaluate it on the held-out test split.
# random_state is fixed so the reported numbers are reproducible across runs,
# consistent with the seeded split and random forest in this notebook.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(train_X, train_y)
pred_y = dtc.predict(test_X)
print("Score of Decision Tree: ", dtc.score(test_X, test_y))
print("Accuracy of Decision Tree: ", accuracy_score(test_y, pred_y))
print("Confusion Matrix of Decision Tree: \n", confusion_matrix(test_y, pred_y))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on the training split and report its performance
# on the held-out test split.
rfc = RandomForestClassifier(random_state=42).fit(train_X, train_y)
pred_y = rfc.predict(test_X)
print("Score of Random Forest: ", rfc.score(test_X, test_y))
print("Accuracy of Random Forest: ", accuracy_score(test_y, pred_y))
print("Confusion Matrix of Random Forest: \n", confusion_matrix(test_y, pred_y))

### Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier

# Fit a linear classifier trained with stochastic gradient descent and
# evaluate it on the held-out test split.
# SGD shuffles the training data, so random_state is fixed to make the
# reported numbers reproducible, consistent with the seeded random forest.
sgd = SGDClassifier(random_state=42)
sgd.fit(train_X, train_y)
pred_y = sgd.predict(test_X)
print("Score of SGD: ", sgd.score(test_X, test_y))
print("Accuracy of SGD: ", accuracy_score(test_y, pred_y))
print("Confusion Matrix of SGD: \n", confusion_matrix(test_y, pred_y))