#                "Loan Approval Prediction using Machine Learning"

# Business Objectives: 


# Type of Machine Learning Problem:

# Performance Metrics:

# Features:


In [None]:
#Importing the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Reading the training csv file using pandas

train_data=pd.read_csv(r'../input/loan-prediction-analytics-vidhya/train_ctrUa4K.csv')

In [None]:
#Viewing the first five rows of the training dataset

train_data.head()

In [None]:
train_data.shape

Observation : " The number of rows and columns in the training dataset are 614 and 13 respectively. "

In [None]:
#Names of all the columns in the training dataset

train_data.columns

In [None]:
#Data type of every column in the training dataset

train_data.dtypes

Observation : " Loan_Status is stored as text (Y/N). Since most models require a numerical target variable, we will convert Y to 1 and N to 0. "

In [None]:
status={'Y': 1 , 'N': 0}
train_data['Loan_Status']=train_data['Loan_Status'].map(status)

In [None]:
#Viewing the first five rows after converting Yes to 1 and No to 0 in the Loan_Status column

train_data.head()

In [None]:
#Converting the datatype of the Loan_Status feature to integer

train_data["Loan_Status"]=train_data["Loan_Status"].astype(int)

In [None]:
#Splitting the column names by dtype: object columns are treated as categorical,
#everything else as numerical. Both groups are printed together with their sizes.

cat_feat = train_data.select_dtypes(include=[object]).columns
num_feat = train_data.select_dtypes(exclude=[object]).columns

print("No. of numerical features:", len(num_feat))
print(num_feat)
print("No. of categorical features:", len(cat_feat))
print(cat_feat)

Observation : " We have 6 numerical features and 7 categorical features in the dataset. "

In [None]:
#Summarizing the overall statistics of the numerical features

train_data.describe()

In [None]:
#Checking for duplicate values in the Loan_ID column

unique=len(set(train_data['Loan_ID']))
total=len(train_data['Loan_ID'])
dup=total - unique
print("No of duplicate ID values in the training dataset :",dup)

Observation : " There are no duplicate values in the Loan_ID column. Since Loan_ID is only a unique identifier and carries no predictive information, we will drop that column. "

In [None]:
train_data=train_data.drop("Loan_ID",axis=1)

In [None]:
#Checking the distribution of target variable

target=train_data['Loan_Status'].value_counts(normalize=True)
print(target*100)

Observation : " Around 69% of the people who applied for loans were approved, whereas the remaining 31% were not. Hence, the classes are imbalanced. "

# Univariate Analysis:

In [None]:
dist=train_data['Loan_Status'].value_counts()
print(dist)
train_data['Loan_Status'].value_counts(normalize=True).plot.bar()

In [None]:
train_data['Credit_History'].value_counts(normalize=True).plot.bar()

In [None]:
sns.distplot(train_data['ApplicantIncome'])

In [None]:
train_data['ApplicantIncome'].plot.box()

In [None]:
sns.distplot(train_data['CoapplicantIncome'])

In [None]:
train_data['CoapplicantIncome'].plot.box()

In [None]:
sns.distplot(train_data['LoanAmount'])

In [None]:
train_data['LoanAmount'].plot.box()

# Bivariate Analysis:

In [None]:
sns.barplot(x='Loan_Status',y='LoanAmount',data=train_data,ci=None)

#Loan approval status does not depend much on the Loan amount.

In [None]:
sns.barplot(x='Loan_Status',y='Loan_Amount_Term',data=train_data,ci=None)

#Loan approval status does not depend much on the Loan amount term.

In [None]:
sns.barplot(x='Loan_Status',y='ApplicantIncome',data=train_data,ci=None)

#Loan approval status does not depend on the Applicant's income

In [None]:
sns.barplot(x='Loan_Status',y='CoapplicantIncome',data=train_data,ci=None)

#Applicants with higher coapplicants income have lesser chance of loan approval, which means that most of the applicants have
#no coapplicant. Also, 25 percentile of coapplicant income is 0. Thus, loan approval does not depend on the coapplicants'
#income. 

In [None]:
sns.barplot(x='Gender',y='Loan_Status',data=train_data,ci=None)

### "ci=None" - ci means Size of confidence intervals to draw around estimated values. If None, error bars will not be drawn.

#The male and female applicants have almost similar chances of getting their loan approved.

In [None]:
sns.barplot(x='Married',y='Loan_Status',data=train_data,ci=None)

#Married applicants have higher chances of getting their loan approved.

In [None]:
sns.barplot(x='Dependents',y='Loan_Status',data=train_data,ci=None)

#Applicants with 2 dependents have highest chances of having their loan approved. Whereas, with 1 or 3+ dependents,their 
#approval chances remain the same.

In [None]:
sns.barplot(x='Education',y='Loan_Status',data=train_data,ci=None)

#Applicants with graduation are more likely to get their loan approved.

In [None]:
sns.barplot(x='Self_Employed',y='Loan_Status',data=train_data,ci=None)

#The applicants, whether self employed or not, have similar chances of loan approval.

In [None]:
sns.barplot(x='Property_Area',y='Loan_Status',data=train_data,ci=None)

#Applicants from rural and urban areas have lesser chances of getting their loan approved than applicants from semiurban
#areas.

In [None]:
#Correlation between the numerical features

#Only numeric columns are passed to corr(): since pandas 2.0, DataFrame.corr() raises an
#error when the frame still contains text columns (Gender, Married, ...).
corr=train_data.select_dtypes(include='number').corr()
#Rows are ordered by their correlation with the target, weakest first
corr.sort_values('Loan_Status',ascending=True,inplace=True)
print(corr['Loan_Status'])
sns.heatmap(corr,annot=True)

# Missing Values Imputation:

In [None]:
train_data.isnull().sum()

In [None]:
#Imputing the numerical features with their median

train_data=train_data.fillna(train_data.median())

In [None]:
#train_data['Gender'].value_counts()        = Male
#train_data['Married'].value_counts()       = Yes
#train_data['Dependents'].value_counts()    = 0
#train_data['Self_Employed'].value_counts() = No

In [None]:
#Handling missing values in categorical features one by one with their mode

train_data['Gender']=train_data['Gender'].fillna(train_data['Gender'].mode()[0])

In [None]:
train_data['Married']=train_data['Married'].fillna(train_data['Married'].mode()[0])

In [None]:
train_data['Dependents']=train_data['Dependents'].fillna(train_data['Dependents'].mode()[0])

In [None]:
train_data['Self_Employed']=train_data['Self_Employed'].fillna(train_data['Self_Employed'].mode()[0])

In [None]:
sns.heatmap(train_data.isnull(),cbar=False)

In [None]:
#Standardization of the features

from sklearn.preprocessing import StandardScaler

X=train_data.drop("Loan_Status",axis=1)
Y=train_data['Loan_Status']
X=pd.get_dummies(X)
X.shape

In [None]:
X.head()

# Modeling:

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

#Grid search over K Nearest Neighbors, scored by 5-fold cross-validated F1
model=KNeighborsClassifier()

#'minkowski' is not searched separately: with KNeighborsClassifier's default p=2 it is exactly
#the Euclidean distance, so it only repeated the 'euclidean' candidates.
metric=['manhattan','euclidean']
n_neighbors=range(1,21,2)      #odd values of k from 1 to 19
weights=['uniform','distance']
param_grid=dict(metric=metric,n_neighbors=n_neighbors,weights=weights)
grid_search=GridSearchCV(model,param_grid,scoring='f1',cv=5,n_jobs=-1)
result=grid_search.fit(X,Y)
print("Best score using K Nearest Neighbor: %f with %s"%(result.best_score_*100,result.best_params_))

In [None]:
from sklearn.linear_model import LogisticRegression

#Grid search over Logistic Regression, scored by 5-fold cross-validated F1.
#The result is kept in f_result because the final model is chosen from it later.
model=LogisticRegression()

c=[0.01,0.1,1,10,100]
#Only solver/penalty pairs that scikit-learn accepts are searched:
#  - 'lbfgs' supports the 'l2' penalty but not 'l1'
#  - 'liblinear' supports both 'l1' and 'l2'
#The previous grid also contained the misspelled penalty 'elaticnet' (and 'elasticnet' itself needs
#solver='saga' plus an l1_ratio), so those candidates could never be fitted and only produced
#failed fits; the set of successfully evaluated candidates is unchanged.
param_grid=[
    dict(solver=['lbfgs'],penalty=['l2'],C=c),
    dict(solver=['liblinear'],penalty=['l1','l2'],C=c),
]
grid_search=GridSearchCV(model,param_grid,scoring='f1',cv=5,n_jobs=-1)
f_result=grid_search.fit(X,Y)
print("Best score using Logistic Regression: %f with %s"%(f_result.best_score_*100,f_result.best_params_))


In [None]:
from sklearn.tree import DecisionTreeClassifier

#Grid search over a Decision Tree, scored by 5-fold cross-validated F1.
#random_state is fixed so that the reported score is reproducible between runs.
model=DecisionTreeClassifier(random_state=42)

criterion=['gini','entropy']
max_depth=[2,4,6,8,10]
param_grid=dict(criterion=criterion,max_depth=max_depth)
grid_search = GridSearchCV(model,param_grid,scoring='f1',n_jobs=-1,cv=5)
result=grid_search.fit(X,Y)
print("Best score using Decision Tree: %f with %s"%(result.best_score_*100,result.best_params_))


In [None]:
from sklearn.ensemble import RandomForestClassifier

#Grid search over a Random Forest, scored by 5-fold cross-validated F1.
#random_state is fixed because the forest is built from random bootstrap samples and feature
#subsets; without a seed the best score and parameters change from run to run.
model=RandomForestClassifier(random_state=42)

n_estimators=[10,50,100,150,200,250,300]
max_features=['sqrt','log2']
param_grid=dict(n_estimators=n_estimators,max_features=max_features)
grid_search=GridSearchCV(model,param_grid,scoring='f1',n_jobs=-1,cv=5)
result=grid_search.fit(X,Y)
print("Best score using RandomForest: %f with %s"%(result.best_score_*100,result.best_params_))


In [None]:
import xgboost
from sklearn.model_selection import RandomizedSearchCV

#Randomized search over XGBoost, scored by 5-fold cross-validated F1.
#Both the model and the search are seeded: RandomizedSearchCV samples only a subset of the
#candidate settings, so without a seed a different subset is evaluated on every run.
model=xgboost.XGBClassifier(random_state=42)

booster=['gbtree','gblinear']
learning_rate=[0.001,0.01,0.1,0.2,0.5]
n_estimators=[50,100,150,200,250,300]
max_depth=[2,4,6,8]
param_grid=dict(booster=booster,max_depth=max_depth,n_estimators=n_estimators,learning_rate=learning_rate)

random_search=RandomizedSearchCV(model,param_grid,scoring='f1',n_jobs=-1,cv=5,random_state=42)
result=random_search.fit(X,Y)
print("Best score using XGBoost: %f with %s"%(result.best_score_*100,result.best_params_))


Since Logistic Regression has the highest f1 score with 87.67%, we will be using this as our final model to predict the class labels.

In [None]:
f_result.best_estimator_

In [None]:
final_model = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
final_model.fit(X,Y)

In [None]:
test_data=pd.read_csv(r'../input/loan-prediction-analytics-vidhya/test_lAUu6dG.csv')

In [None]:
test_data.head()

In [None]:
test_data.isnull().sum()

In [None]:
test_data=test_data.drop('Loan_ID',axis=1)

In [None]:
test_data=test_data.fillna(test_data.median())

In [None]:
test_data['Gender']=test_data['Gender'].fillna(test_data['Gender'].mode()[0])

In [None]:
test_data['Dependents']=test_data['Dependents'].fillna(test_data['Dependents'].mode()[0])

In [None]:
test_data['Self_Employed']=test_data['Self_Employed'].fillna(test_data['Self_Employed'].mode()[0])

In [None]:
sns.heatmap(test_data.isnull(),cbar=False)

In [None]:
test_data=pd.get_dummies(test_data)

In [None]:
test_data.shape

In [None]:
test_data.head()

In [None]:
y_pred=final_model.predict(test_data)

In [None]:
sample_submission=pd.read_csv(r'../input/loan-prediction-analytics-vidhya/sample_submission_49d68Cx.csv')

In [None]:
sample_submission['Loan_Status']=y_pred

In [None]:
status={1: 'Y', 0: 'N'}
sample_submission['Loan_Status']=sample_submission['Loan_Status'].map(status)

In [None]:
sample_submission.head()

In [None]:
sample_submission.to_csv(r'C:\Users\Dipanjan Dey Sarkar\OneDrive\Documents\Loan_Prediction.csv',index=False)