# Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Loading the Dataset

In [None]:
# Read the training CSV into a DataFrame; the bare `df` on the last line
# renders the frame as the cell output.
df = pd.read_csv('../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')
df

# Dataset Info

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
df.describe()

# Preprocessing of the dataset

In [None]:
# Count the missing (null) values in every column of the dataset.
df.isnull().sum()

In [None]:
# Fill the missing values of the categorical columns with each column's
# most frequent value (its mode).
for col in ['Gender', 'Married', 'Self_Employed', 'Dependents']:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
# List the distinct values (including NaN, if any) found in Credit_History.
df['Credit_History'].unique()

In [None]:
# Bar chart of how often each Credit_History value occurs.
# The column must be passed as `x=`: recent seaborn releases make the plot
# variables keyword-only, so a positional column name is no longer accepted.
sns.countplot(x="Credit_History", data=df)

In [None]:
# We replace the NaN values of Credit_History with the mode because

# Credit_History has only 2 unique values, so filling with the most frequent one is the better idea.

In [None]:
# Impute the missing Credit_History entries with the column's most frequent value.
credit_history_mode = df['Credit_History'].mode()[0]
df['Credit_History'] = df['Credit_History'].fillna(credit_history_mode)

In [None]:
# Count of rows per Loan_Amount_Term, with every bar labelled by its height.
splot = sns.countplot(data=df, x='Loan_Amount_Term')
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    # Place the label 10 points above the top-centre of the bar.
    splot.annotate(
        f'{bar_height:.2f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

In [None]:
# We are replacing the NaN values of Loan_Amount_Term with the mode because
# 360 has the highest number of occurrences.

In [None]:
# Impute the missing Loan_Amount_Term entries with the column's most frequent value.
loan_term_mode = df['Loan_Amount_Term'].mode()[0]
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(loan_term_mode)

In [None]:
# Count of rows per distinct LoanAmount value, with every bar labelled by its height.
splot = sns.countplot(data=df, x='LoanAmount')
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    # Place the label 10 points above the top-centre of the bar.
    splot.annotate(
        f'{bar_height:.2f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

In [None]:
# Here we cannot replace the null values with the mode.
# Here the mean or the median will be a better option.
# We have to check for outliers before replacing NaN values with the mean or the median.

In [None]:
# Checking for outliers in "LoanAmount":
# first and third quartile, and the interquartile range between them.
Q1, Q3 = df['LoanAmount'].quantile([0.25, 0.75])
IQR = Q3 - Q1

In [None]:
# Outlier fences: 1.5 * IQR below the first / above the third quartile.
fence_width = 1.5 * IQR
low_lim, up_lim = Q1 - fence_width, Q3 + fence_width
print('low_limit is', low_lim)
print('up_limit is', up_lim)

In [None]:
# Collect every LoanAmount that lies outside the [low_lim, up_lim] fences.
# NaN entries compare False on both sides and are therefore never selected.
loan_amounts = df['LoanAmount']
outside_fences = (loan_amounts > up_lim) | (loan_amounts < low_lim)
outlier = loan_amounts[outside_fences].tolist()
print('Outlier in the dataset is', outlier)

In [None]:
# Number of LoanAmount values flagged as outliers.
len(outlier)

In [None]:
# 6.5% of the data are outliers, so we will not remove the outliers from the dataset.
# We will replace the NaN values with the median because the median is not affected by outliers.

In [None]:
# Impute the missing LoanAmount entries with the column median.
loan_amount_median = df['LoanAmount'].median()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_median)

In [None]:
# Combined income of applicant and co-applicant as a new feature column.
df['Total_Income'] = df['ApplicantIncome'].add(df['CoapplicantIncome'])
df

# Checking for Data Imbalance 

In [None]:
# Bar chart of the target classes plus the share of each class in percent.
plt.figure(figsize=(8,6))
# Pass the column by keyword: current seaborn does not accept a positional vector.
sns.countplot(x='Loan_Status', data=df);

# Look the classes up by label instead of by position in value_counts():
# positional access depends on which class happens to be the most frequent.
# normalize=True yields fractions, so scale by 100 to print real percentages.
class_share = df['Loan_Status'].value_counts(normalize=True) * 100
print('The percentage of Y class : %.2f' % class_share.get('Y', 0.0))
print('The percentage of N class : %.2f' % class_share.get('N', 0.0))


# Exploring the individual features

In [None]:
# Encode the target as integers: 'N' -> 0, 'Y' -> 1.
# Assign the result back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# and leaves `df` untouched when pandas Copy-on-Write is active.
# Any value other than 'N'/'Y' becomes NaN.
df['Loan_Status'] = df['Loan_Status'].map({'N': 0, 'Y': 1})

In [None]:
# Credit_History counts, one panel per Loan_Status value.
# `height` is the current name of the former `size` keyword of FacetGrid.
# An explicit `order` keeps each bar at the same x position in every panel.
grid = sns.FacetGrid(df, col='Loan_Status', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Credit_History', order=sorted(df['Credit_History'].dropna().unique()));
# Those applicants who have a credit history are more likely to get a loan.

In [None]:
# Loan_Status counts, one panel per Gender.
# `height` is the current name of the former `size` keyword of FacetGrid.
# An explicit `order` keeps each bar at the same x position in every panel.
grid = sns.FacetGrid(df, col='Gender', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].dropna().unique()));

# Author's observation: the chances of getting a loan are higher for female applicants.
# NOTE(review): the panels show raw counts, not approval rates - confirm this claim.

In [None]:
# Married vs. not married applicants, bars split by Loan_Status.
sns.countplot(data=df, x='Married', hue='Loan_Status');
# Married applicants have higher chances of getting a loan.

In [None]:
# Loan_Status counts, one panel per Dependents value.
# `height` is the current name of the former `size` keyword of FacetGrid.
# An explicit `order` keeps each bar at the same x position in every panel.
grid = sns.FacetGrid(df, col='Dependents', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].dropna().unique()));

In [None]:
# Dependents counts, one panel per Loan_Status value.
# `height` is the current name of the former `size` keyword of FacetGrid.
# Without an explicit `order` each panel would order the Dependents categories
# by first appearance in its own subset, so the bars could be mislabelled.
grid = sns.FacetGrid(df, col='Loan_Status', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Dependents', order=sorted(df['Dependents'].dropna().unique()));

In [None]:
# Loan_Status counts, one panel per Self_Employed value.
# `height` is the current name of the former `size` keyword of FacetGrid.
# An explicit `order` keeps each bar at the same x position in every panel.
grid = sns.FacetGrid(df, col='Self_Employed', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].dropna().unique()));
# Author's observation: an applicant who has a job has higher chances of getting a loan.

In [None]:
# Loan_Status counts, one panel per Property_Area.
# `height` is the current name of the former `size` keyword of FacetGrid.
# An explicit `order` keeps each bar at the same x position in every panel.
grid = sns.FacetGrid(df, col='Property_Area', height=3.2, aspect=1.6)
grid.map(sns.countplot, 'Loan_Status', order=sorted(df['Loan_Status'].dropna().unique()));
# Semiurban applicants have higher chances.

In [None]:
# Education level of the applicants, bars split by Loan_Status.
sns.countplot(data=df, x='Education', hue='Loan_Status');
# Graduate applicants have higher chances of getting a loan.

In [None]:
# Distribution of Total_Income for each Loan_Status value.
plt.figure(figsize=(8,6))
# x and y must be given by keyword: current seaborn releases do not accept
# the plot variables as positional arguments.
sns.boxplot(x='Loan_Status', y='Total_Income', data=df);
# Total income does not affect the loan status.


In [None]:
# Loan_Amount_Term counts split by Loan_Status, drawn on a 15x15 inch figure.
plt.figure(figsize=(15, 15))
sns.countplot(data=df, x="Loan_Amount_Term", hue="Loan_Status")
# No correlation.

In [None]:
# Distribution of LoanAmount for each Loan_Status value.
plt.figure(figsize=(8, 10))
sns.boxplot(data=df, x="Loan_Status", y="LoanAmount")

In [None]:
# Remove the columns that are not used as model features.
cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    "LoanAmount",
    "Loan_Amount_Term",
    "Total_Income",
    'Loan_ID',
    'Dependents',
]
# `columns=` already selects the column axis, so no `axis` argument is needed.
df = df.drop(columns=cols)
df.head()

# Label Encoding to categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Categorical columns to be converted into integer codes.
cols = ['Gender',"Married","Education",'Self_Employed',"Property_Area"]
le = LabelEncoder()
for col in cols:
    # fit_transform re-fits the single encoder on every column, so after the
    # loop `le` only holds the classes of the last column in `cols`.
    df[col] = le.fit_transform(df[col])

In [None]:
# Show the first rows to inspect the encoded columns.
df.head()

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# classifier function
from sklearn.model_selection import cross_val_score, train_test_split


def classify(model, x, y):
    """Fit ``model`` on a hold-out split of ``x``/``y`` and print two scores.

    Parameters
    ----------
    model : estimator providing ``fit`` and ``score``
        Fitted in place on the training part of the split.
    x : feature data passed to ``train_test_split`` and ``cross_val_score``.
    y : target values aligned with ``x``.

    Prints the accuracy on the 25% test split and the mean 5-fold
    cross-validation score, both scaled to percent. Returns ``None``.
    """
    # Split the data that was passed in. The previous version split the
    # notebook-global ``X`` here and silently ignored the ``x`` argument.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=42
    )
    model.fit(x_train, y_train)
    print("Accuracy is", model.score(x_test, y_test)*100)
    # cross validation - it is used for better validation of model
    # eg: cv-5, train-4, test-1
    score = cross_val_score(model, x, y, cv=5)
    print("Cross validation is",np.mean(score)*100)

In [None]:
from sklearn.linear_model import LogisticRegression
# Evaluate a logistic regression with its default settings on the full X / y.
model = LogisticRegression()
classify(model, X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Evaluate a random forest with its default settings; no random_state is set
# on the estimator here.
model = RandomForestClassifier()
classify(model, X, y)

In [None]:
from sklearn.metrics import confusion_matrix
# `model` is whatever the most recently executed model cell fitted inside
# classify(); `x_test` / `y_test` come from the notebook-level
# train_test_split cell, which uses the same test_size and random_state as
# classify().
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Heatmap of the confusion matrix. The cells hold integer counts, so format
# the annotations as integers; the default format shows values of 100 or
# more in scientific notation (e.g. 1e+02).
sns.heatmap(cm, annot=True, fmt='d')