In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides Deprecation/FutureWarnings raised by the
# libraries used below, so API changes will surface only as hard errors.
warnings.filterwarnings('ignore')

<h3>Reading the Dataset</h3>

In [1]:
# Load the loan dataset into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../input/loan-dataset/loan.csv')

In [1]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [1]:
# Preview the last five rows of the freshly loaded frame.
df.tail(n=5)

In [1]:
# List the column labels of the frame as loaded from the CSV.
df.columns

In [1]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

<h3>Data Preprocessing and EDA</h3>

In [1]:
# Print the per-column dtype and non-null count summary.
df.info()

In [1]:
# Summary statistics; with default arguments only numeric columns are described.
df.describe()

In [1]:
# Correlation heatmap of the numeric columns.
# The frame still contains non-numeric columns at this point (the categorical
# features are only label-encoded later), and a bare df.corr() raises on such
# columns in pandas >= 2.0. Selecting numeric dtypes first works on every
# pandas version.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [1]:
# Missing-value count per column, before any imputation.
df.isna().sum()

In [1]:
# Remove Loan_ID (presumably a per-row identifier with no predictive value —
# TODO confirm). Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
df = df.drop(columns=['Loan_ID'], errors='ignore')

Let's replace the missing values in `LoanAmount` with the column mean and in `Credit_History` with the column median, to avoid losing data or misinterpreting it. Rows that still have missing values in other columns are dropped afterwards.

In [1]:
# Impute missing LoanAmount entries with the column mean.
df = df.fillna({'LoanAmount': df['LoanAmount'].mean()})

In [1]:
# Impute missing Credit_History entries with the column median.
df.loc[df['Credit_History'].isna(), 'Credit_History'] = df['Credit_History'].median()

In [1]:
# Re-check the missing-value count per column after the two imputations.
df.isna().sum()

In [1]:
# Discard every row that still has at least one missing value.
df = df.dropna(axis=0, how='any')

In [1]:
# (rows, columns) remaining after dropping Loan_ID and the incomplete rows.
df.shape

<h3>Data Visualization</h3>

In [1]:
# Class balance of the target.
# The column is passed by keyword: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional Series is no longer
# interpreted as the x variable.
sns.countplot(x='Loan_Status', data=df);

In [1]:
# Number of rows per Loan_Status value (the figures behind the count plot).
df['Loan_Status'].value_counts()

In [1]:
# Loan_Status counts within each Married category.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Married', hue='Loan_Status')
plt.show()

In [1]:
# Loan_Status counts within each Dependents category.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Dependents', hue='Loan_Status')
plt.show()

In [1]:
# Loan_Status counts within each Property_Area category.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Property_Area', hue='Loan_Status')
plt.show()

In [1]:
# Loan_Status counts within each Self_Employed category.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Self_Employed', hue='Loan_Status')
plt.show()

In [1]:
# Loan_Status counts within each Education category.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Education', hue='Loan_Status')
plt.show()

1. Most married applicants got a loan.
2. Applicants with 0 dependents have a higher chance of getting a loan.
3. Applicants from a Semiurban Property_Area have a higher chance of getting a loan.

In [1]:
# Pairwise relationships between the variables, coloured by Loan_Status.
sns.pairplot(data=df, hue='Loan_Status', palette='pastel')

In [1]:
# Encode the target as integers: 'N' -> 0, 'Y' -> 1.
# The result is assigned back explicitly. Calling replace(..., inplace=True)
# on df['Loan_Status'] mutates a temporary Series, which leaves df unchanged
# once pandas Copy-on-Write is active.
# The 0/1 pass-through entries keep the cell idempotent on re-run. Any other
# value maps to NaN and makes astype fail loudly instead of passing through.
df['Loan_Status'] = df['Loan_Status'].map({'N': 0, 'Y': 1, 0: 0, 1: 1}).astype('int64')

In [1]:
# Correlation heatmap including the now-numeric Loan_Status target.
# The categorical feature columns are not label-encoded until later, so the
# frame is restricted to numeric dtypes first; a bare df.corr() raises on
# non-numeric columns in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.show()

We can clearly see that Credit_History has the highest correlation with Loan_Status (a positive correlation of 0.54). Our target value is therefore highly dependent on this column.

<h3>Classification Models</h3>

In [1]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Shared encoder instances. Only `le` is used by the cells visible in this
# notebook; `ohe` is created but not referenced afterwards.
le, ohe = LabelEncoder(), OneHotEncoder()

In [1]:
# Discard the income and loan-amount columns in a single call; the models
# below are trained on the remaining columns only.
df = df.drop(
    columns=['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
)

In [1]:
# Check which columns are left after the drop.
df.head(n=5)

In [1]:
# Integer-encode each categorical feature with the shared LabelEncoder.
# The encoder is refit for every column, so afterwards it only remembers the
# classes of the last column in the list.
for col in ['Property_Area', 'Dependents', 'Gender', 'Married', 'Education', 'Self_Employed']:
    df[col] = le.fit_transform(df[col])
df.head()

In [1]:
# Separate the feature matrix (every column except the target) from the target.
X = df.drop(columns='Loan_Status')
y = df['Loan_Status']

In [1]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the accuracies below are reproducible on
# Restart & Run All. stratify=y keeps the Loan_Status class ratio the same in
# the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

<h3>LOGISTIC REGRESSION</h3>

In [1]:
# Fit a default logistic regression on the training split, then predict the
# held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
lryhat = lr.predict(X_test)

In [1]:
# Test accuracy of the logistic regression.
# scikit-learn's signature is accuracy_score(y_true, y_pred). The value is the
# same in either order; keywords make the roles explicit.
accuracy_score(y_true=y_test, y_pred=lryhat)

<h3>SUPPORT VECTOR MACHINES</h3>

In [1]:
# Fit an RBF-kernel support vector classifier, then predict the held-out rows.
cl = SVC(kernel='rbf').fit(X_train, y_train)
clyhat = cl.predict(X_test)

In [1]:
# Test accuracy of the SVM.
# scikit-learn's signature is accuracy_score(y_true, y_pred). The value is the
# same in either order; keywords make the roles explicit.
accuracy_score(y_true=y_test, y_pred=clyhat)

<h3>DECISION TREE</h3>

In [1]:
# Fit a decision tree, then predict the held-out rows.
# random_state fixes the tie-breaking between equally good splits, so the
# fitted tree (and its accuracy) is the same on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
pred1 = clf.predict(X_test)

In [1]:
# Test accuracy of the decision tree.
# scikit-learn's signature is accuracy_score(y_true, y_pred). The value is the
# same in either order; keywords make the roles explicit.
accuracy_score(y_true=y_test, y_pred=pred1)

<h3>XGBOOST</h3>

In [1]:
# Fit a default gradient-boosted tree classifier, then predict the held-out rows.
model = XGBClassifier().fit(X_train, y_train)
modelyhat = model.predict(X_test)

In [1]:
# Test accuracy of the XGBoost model.
# scikit-learn's signature is accuracy_score(y_true, y_pred). The value is the
# same in either order; keywords make the roles explicit.
accuracy_score(y_true=y_test, y_pred=modelyhat)

<h3>KNN MODEL</h3>

In [1]:
# k-nearest-neighbours classifier with k = 8 (n_neighbors is k); fit on the
# training split, then predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
knnyhat = knn.predict(X_test)

In [1]:
# Test accuracy of the KNN model.
# scikit-learn's signature is accuracy_score(y_true, y_pred). The value is the
# same in either order; keywords make the roles explicit.
accuracy_score(y_true=y_test, y_pred=knnyhat)

<h3>CONCLUSION</h3>

We see that the highest accuracy is achieved by Logistic Regression and SVM.