In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the training CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv("../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv")
df.head()

In [None]:
df.describe()

In [None]:
df.info()

# Dealing with missing values

In [None]:
df.isnull().sum()

In [None]:
#1 First, fill in the missing categorical values.

In [None]:
# Impute each categorical column with its most frequent value.
for cat_col in ('Gender', 'Married', 'Self_Employed'):
    most_common = df[cat_col].mode()[0]
    df[cat_col] = df[cat_col].fillna(most_common)

In [None]:
# Count the Dependents categories to understand the data before filling in the missing values.

In [None]:
sns.countplot(x ='Dependents', data = df)

In [None]:
# As we can see, filling with the mode makes sense here.

In [None]:
df['Dependents'] = df["Dependents"].fillna(df['Dependents'].mode()[0])

In [None]:
#2 filling numerical values

In [None]:
#we should check for outliers before filling up numerical values.

In [None]:
df['Credit_History'].unique()

In [None]:
sns.countplot(x ='Credit_History', data = df)

In [None]:
df['Credit_History']=df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [None]:
splot = sns.countplot(x='Loan_Amount_Term', data=df)
# Label every bar with its count, placed slightly above the bar's top edge.
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        format(bar_height, '.2f'),
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

In [None]:
# We can see that 360 has a count of 512, so replacing missing Loan_Amount_Term values with the mode is the smarter choice.

In [None]:
df['Loan_Amount_Term']=df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])

In [None]:
# Wide canvas: LoanAmount has many distinct values on the x axis.
plt.figure(figsize=(25, 8))
splot = sns.countplot(x='LoanAmount', data=df)
# Label every bar with its count, placed slightly above the bar's top edge.
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        format(bar_height, '.2f'),
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

In [None]:
# We cannot fill LoanAmount with the mode; the mean or the median is a better choice here.
# Before choosing between mean and median we have to check for outliers,
# because the mean is affected by outliers.

In [None]:
sns.boxplot(x="LoanAmount", data=df)

In [None]:
# There are outliers, so quantify them with the IQR rule.

In [None]:
# First and third quartiles of LoanAmount and the interquartile range between them;
# later cells use these to derive the outlier limits.
Q1 = df['LoanAmount'].quantile(0.25)
Q3 = df['LoanAmount'].quantile(0.75)
IQR = Q3 - Q1

In [None]:
# Outlier limits sit 1.5 interquartile ranges beyond the quartiles.
fence_width = 1.5 * IQR
low_lim, up_lim = Q1 - fence_width, Q3 + fence_width
for label, bound in (('low_limit is', low_lim), ('up_limit is', up_lim)):
    print(label, bound)

In [None]:
# Select every LoanAmount outside [low_lim, up_lim]; missing values fail both
# comparisons and are therefore never reported as outliers.
loan_amounts = df['LoanAmount']
beyond_limits = (loan_amounts > up_lim) | (loan_amounts < low_lim)
outlier = loan_amounts[beyond_limits].tolist()
print(' outlier in the dataset is', outlier)

In [None]:
len(outlier)

In [None]:
# We will not remove the outliers because there are 39 of them among 592 values, i.e. about 6.6% of the data.

In [None]:
# We will use the median to replace the missing values,
# because the median is not affected by outliers.

In [None]:
df['LoanAmount']=df['LoanAmount'].fillna(df['LoanAmount'].median())

In [None]:
df.isnull().sum()
# now there is no missing values.

# Checking for data imbalance

In [None]:
# Name the column by keyword: newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
sns.countplot(x='Loan_Status', data=df)
# Look each class up by its label rather than by position in the frequency-sorted
# counts, and scale the fraction by 100 so the printed value really is a percentage.
class_counts = df['Loan_Status'].value_counts()
print('The percentage of Y class : %.2f' % (100 * class_counts['Y'] / len(df)))
print('The percentage of N class : %.2f' % (100 * class_counts['N'] / len(df)))

# The classes are close to balanced, so we don't need to worry about imbalance.

#  EDA

In [None]:
# Encode the target as 0/1. Assign the result back instead of calling
# replace(..., inplace=True) on a column selection, which does not update `df`
# when pandas Copy-on-Write is active. The explicit cast keeps an integer dtype,
# and the cell stays safe to re-run (already-encoded values pass through).
df['Loan_Status'] = df['Loan_Status'].replace({'N': 0, 'Y': 1}).astype(int)

In [None]:
#Credit history vs loan status
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn); a fixed `order` keeps bar positions consistent across facets.
grid = sns.FacetGrid(df, col='Loan_Status', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Credit_History', order=sorted(df['Credit_History'].unique()))

#people having credit history have easy time getting loan

In [None]:
#Gender vs loan status
sns.countplot(x ='Gender', data = df)


In [None]:
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn); a fixed `order` keeps bar positions consistent across facets.
grid = sns.FacetGrid(df, col='Gender', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].unique()))

# Female applicants appear to get loans more easily than male applicants.
# Loan status clearly depends on gender.

In [None]:
#Married vs loan status
sns.countplot(x='Married', hue='Loan_Status', data=df)

#people who are married have better chance at loan approval

In [None]:
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn); a fixed `order` keeps bar positions consistent across facets.
grid = sns.FacetGrid(df, col='Married', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].unique()))

In [None]:
#Dependents vs loan status

#sns.barplot(x='Dependents', y='Loan_Status', data=df)
sns.countplot(x="Dependents", hue="Loan_Status", data=df)

In [None]:
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn); a fixed `order` keeps bar positions consistent across facets.
grid = sns.FacetGrid(df, col='Dependents', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].unique()))

In [None]:
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn). Without a fixed `order`, each facet could arrange the
# Dependents categories differently under the shared x axis.
grid = sns.FacetGrid(df, col='Loan_Status', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Dependents', order=sorted(df['Dependents'].unique()))

#we should drop the dependents as it has no relation with loan status

In [None]:
#loan status vs Education
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn); a fixed `order` keeps bar positions consistent across facets.
grid = sns.FacetGrid(df, col='Education', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].unique()))

In [None]:
sns.countplot(x="Education", hue="Loan_Status", data=df)
# In both groups people are getting loans, but graduates get them more easily than non-graduates.

In [None]:
#Self_Employed vs loan status
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn); a fixed `order` keeps bar positions consistent across facets.
grid = sns.FacetGrid(df, col='Loan_Status', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Self_Employed', order=sorted(df['Self_Employed'].unique()))

#people having job got loan easily

In [None]:
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn). Without a fixed `order`, each facet could arrange the
# Property_Area categories differently under the shared x axis.
grid = sns.FacetGrid(df, col='Loan_Status', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Property_Area', order=sorted(df['Property_Area'].unique()))

In [None]:
# `height` is the current name of the facet-size keyword (the old `size` is rejected
# by newer seaborn); a fixed `order` keeps bar positions consistent across facets.
grid = sns.FacetGrid(df, col='Property_Area', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].unique()))

# property area has impact on loan status

In [None]:
df['Total_Income'] = df['ApplicantIncome'] + df['CoapplicantIncome']
df.head()

In [None]:
plt.figure(figsize=(8,10))
sns.boxplot(x="Loan_Status",y="Total_Income", data=df)

In [None]:
df['Loan_Amount_Term'].unique()

In [None]:
plt.figure(figsize=(15,15))
sns.countplot(x="Loan_Amount_Term", hue="Loan_Status", data=df)
#no pattern

In [None]:
df['LoanAmount'].unique()

In [None]:
plt.figure(figsize=(8,10))
sns.boxplot(x="Loan_Status",y="LoanAmount", data=df)
#no pattern

# Drop features that are not used

In [None]:
# Columns excluded from modelling. Each name is listed once (the original list
# repeated 'CoapplicantIncome'), and `columns=` alone already selects the axis.
cols = ['ApplicantIncome', 'CoapplicantIncome', "LoanAmount", "Loan_Amount_Term", "Total_Income", 'Loan_ID', 'Dependents']
df = df.drop(columns=cols)
df.head()

# Label Encoding for categorical

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = ['Gender',"Married","Education",'Self_Employed',"Property_Area"]
# Replace every listed text column with its integer codes; the single encoder is
# refitted for each column in turn, so it ends up fitted on the last one.
df = df.assign(**{name: le.fit_transform(df[name]) for name in cols})


In [None]:
df.head()

In [None]:
# Separate the target column (y) from the predictor columns (X).
y = df['Loan_Status']
X = df.drop(columns='Loan_Status')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# classify function
from sklearn.model_selection import cross_val_score
def classify(model, x, y, test_size=0.25, random_state=42, cv=5):
    """Fit `model` on a hold-out split of (x, y) and print two accuracy estimates.

    Parameters
    ----------
    model : estimator with `fit` and `score`, usable by `cross_val_score`.
    x : feature table to split and cross-validate on.
    y : target values aligned with `x`.
    test_size : share of rows held out for the test part (default 0.25).
    random_state : seed for the hold-out split (default 42).
    cv : number of cross-validation folds (default 5).

    Returns
    -------
    None. Both scores are printed as percentages, and `model` has been fitted
    on the training part of the hold-out split.
    """
    # Split the arguments that were passed in, not the notebook-level `X`,
    # so the function evaluates whatever data the caller supplies.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    print("Accuracy is", model.score(x_test, y_test)*100)
    # cross validation - it is used for better validation of model
    # eg: cv-5, train-4, test-1
    score = cross_val_score(model, x, y, cv=cv)
    print("Cross validation is",np.mean(score)*100)

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classify(model, X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the printed scores and the confusion matrix computed from
# this model are the same on every Restart & Run All.
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

# Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# `model` is the estimator most recently passed to classify(), which fitted it on a
# split made with the same test_size and random_state as the notebook-level
# x_train/x_test split, so x_test here is data the model was not trained on.
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Integer format: the default annotation format can render counts of 100 or more
# in scientific notation.
sns.heatmap(cm, annot=True, fmt='d')