In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Objective of the Problem**

To identify customers who are worthy of a bank loan using logistic regression classification. In this problem we predict whether a person's loan should be approved or not, based on a set of independent variables. This is a binary classification problem; automating the manual process of identifying creditworthy customers would save banks a lot of time.

**Steps Followed**

1. Loading, Cleaning and Understanding the Data
2. Exploratory Data Analysis (EDA)
3. Logistic Regression Model Building
4. Model Performance Analysis


**Loading packages**

In [None]:
import seaborn as sns                  # For data visualization 
import matplotlib.pyplot as plt        # For plotting graphs 
%matplotlib inline 
import warnings   # Used below to silence warning output in the notebook
# NOTE(review): an unqualified "ignore" filter hides every warning category for the
# rest of the session, including library deprecation and model-convergence warnings.
# Consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

**Loading data**

* **train_data:** This data is used for training the model. It contains attributes as well as target variable
* **test_data:** This data is used for testing the model. It contains the attributes but no target variables. The target variables are to be predicted by the  model.

In [None]:
# Read the training set first, then the test set, from the Kaggle input folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/bank-data/train_data.csv',
        '../input/bank-data/test_data.csv',
    )
)

**Retaining a copy of the original data**

In [None]:
# Keep independent snapshots of the raw frames.
# Plain assignment (`train_org = train_data`) only creates a second name for the
# same DataFrame, so any later in-place cleaning would silently alter the
# "original" as well. `.copy()` gives a real, separate copy.
train_org = train_data.copy()
test_org = test_data.copy()

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Column labels of the training data (attributes plus the target).
train_data.columns

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Column labels of the test data.
test_data.columns

In [None]:
# Data type pandas inferred for each training column.
train_data.dtypes

In [None]:
# (rows, columns) of the training frame followed by the test frame.
train_data.shape,test_data.shape

**Evaluating the target variable (Loan_Status) in training data**



**Preparing the data**

* Replacing missing values (Numerical Values: Median, Categorical Values: Mode)

In [None]:
# Impute missing values in the training data.
# Results are assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment, which is
# deprecated and does not modify the frame when pandas Copy-on-Write is active.
mode_imputed_columns = [
    'Gender', 'Married', 'Dependents', 'Self_Employed',
    'Credit_History', 'Loan_Amount_Term',
]
for column in mode_imputed_columns:
    # Most frequent value; mode() can return ties, so take the first.
    train_data[column] = train_data[column].fillna(train_data[column].mode()[0])

# LoanAmount is continuous, so use the median instead of the mode.
train_data['LoanAmount'] = train_data['LoanAmount'].fillna(train_data['LoanAmount'].median())

# Remaining missing values per column.
train_data.isnull().sum()

In [None]:
# Impute missing values in the test data (same rules as for the training data).
# Results are assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment, which is
# deprecated and does not modify the frame when pandas Copy-on-Write is active.
mode_imputed_columns = [
    'Gender', 'Married', 'Dependents', 'Self_Employed',
    'Credit_History', 'Loan_Amount_Term',
]
for column in mode_imputed_columns:
    # Most frequent value; mode() can return ties, so take the first.
    test_data[column] = test_data[column].fillna(test_data[column].mode()[0])

# LoanAmount is continuous, so use the median instead of the mode.
test_data['LoanAmount'] = test_data['LoanAmount'].fillna(test_data['LoanAmount'].median())

# Remaining missing values per column.
test_data.isnull().sum()

In [None]:
# Relative frequency of each Loan_Status value in the training data.
train_data['Loan_Status'].value_counts(normalize=True)

In [None]:
# Bar chart of the absolute number of applications per Loan_Status value.
loan_status_counts = train_data['Loan_Status'].value_counts()
loan_status_counts.plot.bar(color="c")

**Inferences**

* Out of 614 loan applications about 69% of them were approved.

Types of variables in the dataset are categorical, ordinal and numerical.

* Categorical: Gender, Married, Self_Employed, Credit_History,Loan_Status
* Ordinal: Dependents, Education, Property_Area
* Numerical: ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term

**EDA (Categorical Attributes)**

In [None]:
# 2x2 grid of normalised bar charts, one per categorical attribute.
# Only the first panel passes figsize, which sizes the shared figure.
plt.figure(1)
categorical_panels = [
    (221, 'Gender', {'figsize': (20, 10)}),
    (222, 'Married', {}),
    (223, 'Self_Employed', {}),
    (224, 'Credit_History', {}),
]
for position, column, extra_options in categorical_panels:
    plt.subplot(position)
    shares = train_data[column].value_counts(normalize=True)
    shares.plot.bar(title=column, color="c", **extra_options)
plt.show()


**Inferences**

* 80% Male; 20% Female
* 65% Married; 35% Unmarried
* 15% Self employed.
* 85% Repaid their debts.

**EDA (Ordinal Attributes)**

In [None]:
# 1x3 row of normalised bar charts, one per ordinal attribute.
# Only the first panel passes figsize, which sizes the shared figure.
plt.figure(1)
ordinal_panels = [
    (131, 'Dependents', {'figsize': (24, 6)}),
    (132, 'Education', {}),
    (133, 'Property_Area', {}),
]
for position, column, extra_options in ordinal_panels:
    plt.subplot(position)
    shares = train_data[column].value_counts(normalize=True)
    shares.plot.bar(title=column, color="c", **extra_options)
plt.show()

**Inferences**

* Majority of applicants do not have dependents.
* About 80% are graduates
* Most applicants were from the semi urban area.



**EDA (Numerical Attributes)**

In [None]:
# Distribution of ApplicantIncome: density plot, box plot, and box plot split by Education.
# The calls rely on pyplot's implicit "current figure/axes", so their order matters.
plt.figure(1) 
plt.subplot(131) 
# NOTE(review): seaborn documents distplot as deprecated — consider histplot/displot; confirm installed version.
sns.distplot(train_data['ApplicantIncome']); 
plt.subplot(132) 
train_data['ApplicantIncome'].plot.box(figsize=(16,5)) 
# No ax is passed here; presumably pandas draws the grouped box plot on a figure of its own — TODO confirm.
train_data.boxplot(column='ApplicantIncome', by = 'Education') 
# Blank out the figure-level title of the current figure.
plt.suptitle("")


**Inferences**

* Income distribution is not normal: most values are concentrated towards the left with a long tail of high incomes (i.e. it is right-skewed), hence normalisation would be required for better performance of the model.
* Box plot shows that there are quite a few outliers who are mostly "Graduate"

**EDA (Target Variable compared to Independent Variables)**

In [None]:
# Cross-tabulate each categorical attribute against Loan_Status, then draw one
# stacked bar chart per attribute showing the within-category share of each outcome.
crosstab_columns = [
    'Gender', 'Married', 'Dependents', 'Education',
    'Self_Employed', 'Credit_History', 'Property_Area',
]
status_by_attribute = {
    attribute: pd.crosstab(train_data[attribute], train_data['Loan_Status'])
    for attribute in crosstab_columns
}

# Keep the individual tables available under their attribute names.
(Gender, Married, Dependents, Education,
 Self_Employed, Credit_History, Property_Area) = (
    status_by_attribute[attribute] for attribute in crosstab_columns
)

for counts in status_by_attribute.values():
    # Divide each row by its total so the stacked bars sum to 1.
    row_totals = counts.sum(1).astype(float)
    counts.div(row_totals, axis=0).plot(kind="bar", stacked=True, figsize=(5,5))
    plt.show()


**Inferences**
* Proportion of male and female for approved and unapproved loans is same and insignificant for decision making
* Married applicant with successfully approved loans is marginally higher than unmarried applicants
* People with credit history as 1 have greater loan approvals
* Greater number of loans are approved in semi-urban areas followed by urban and then rural areas.

In [None]:
def _plot_bucket_shares(source_column, bucket_column, edges, labels):
    """Bucket a numeric training column and plot the Loan_Status share per bucket.

    Adds `bucket_column` to `train_data` (via pd.cut with the given edges and
    labels), draws a stacked bar chart of the row-normalised crosstab against
    Loan_Status, labels the axes, and returns the raw crosstab.
    """
    train_data[bucket_column] = pd.cut(train_data[source_column], edges, labels=labels)
    counts = pd.crosstab(train_data[bucket_column], train_data['Loan_Status'])
    counts.div(counts.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
    plt.xlabel(source_column)
    plt.ylabel('Percentage')
    return counts


# Applicant income buckets.
bins_income = [0, 2000, 5000, 8000, 81000]
group = ['Low', 'Average', 'High', 'Very high']
Income_buckets = _plot_bucket_shares('ApplicantIncome', 'Income_buckets', bins_income, group)

# Loan amount buckets.
bins_loan_amount = [0, 100, 200, 700]
group = ['Low', 'Average', 'High']
LoanAmount_buckets = _plot_bucket_shares('LoanAmount', 'LoanAmount_buckets', bins_loan_amount, group)
plt.show()

**Inferences**

* Loan wasn't approved for more than 60% of the applicants with low income.
* More loans are approved for Low or Average earning candidates

**For Logistic Regression, the attributes need to be converted into numerical values and the insignificant attributes need to be dropped; the following code does that**

* The Dependents attribute value "3+" is converted to 3
* The label attribute (Loan_Status) is converted into a numeric attribute with "Y"=1 and "N"=0

In [None]:
# Drop identifier columns and the EDA-only bucket columns; they are not model features.
train_data = train_data.drop(['Loan_ID','Customer_ID','Income_buckets', 'LoanAmount_buckets'], axis=1)
test_data = test_data.drop(['Loan_ID','Customer_ID'], axis=1)

# Results are assigned back rather than using `df[col].replace(..., inplace=True)`:
# that form is chained assignment, which is deprecated and does not modify the
# frame when pandas Copy-on-Write is active.
train_data['Dependents'] = train_data['Dependents'].replace('3+', 3)
test_data['Dependents'] = test_data['Dependents'].replace('3+', 3)

# Encode the target as integers (N -> 0, Y -> 1). An explicit mapping plus astype(int)
# guarantees a numeric dtype and fails immediately on missing or unexpected labels.
train_data['Loan_Status'] = train_data['Loan_Status'].map({'N': 0, 'Y': 1}).astype(int)

**Evaluating correlation between variables through heat map**

In [None]:
# Correlation heat map of the numeric attributes.
# Non-numeric columns are excluded explicitly: DataFrame.corr() no longer drops
# them silently in pandas >= 2.0 and raises on string columns instead.
matrix = train_data.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(matrix, vmax=.8, square=True, cmap="YlOrRd");

**Building the Logistic Regression Model (using scikit-learn (sklearn) : open source library for Python)**

**Sklearn requires the target variable in a separate dataset. So, we will drop our target variable from the train dataset and save it in another dataset.**

In [None]:
# Separate the feature matrix (x) from the target vector (y).
# `columns=` is used because passing the axis positionally (`drop('Loan_Status', 1)`)
# is no longer accepted by pandas >= 2.0.
x = train_data.drop(columns='Loan_Status')
y = train_data['Loan_Status']

Making dummy variables to store numerical values for Categorical variable as Logistic Regression takes only numerical inputs.

e.g. 'Gender' variable is transformed to 'Gender_Male' and 'Gender_Female' which hold values of either 0/1 (Yes/No)

This is achieved by the following code

In [None]:
# One-hot encode the categorical columns.
x = pd.get_dummies(x)
train_data = pd.get_dummies(train_data)

# Align the test features to the training feature columns (same set, same order).
# A category that is absent from one of the two files would otherwise give the
# frames different columns and break or silently misalign model.predict().
# Dummy columns missing from the test data are filled with 0.
test_data = pd.get_dummies(test_data).reindex(columns=x.columns, fill_value=0)

**Logistic Regression using Stratified k-fold cross validation and Prediction for test data**

In [None]:
from sklearn.linear_model import LogisticRegression as lr
from sklearn.model_selection import StratifiedKFold as skf
from sklearn.metrics import accuracy_score as acc

# 10-fold stratified cross-validation of a logistic regression model.
kfolds = skf(n_splits=10, random_state=1, shuffle=True)
fold_scores = []
for i, (train_i, test_i) in enumerate(kfolds.split(x, y), start=1):
    print('\n kfold {} / {}'.format(i, kfolds.n_splits))
    # split() yields positional row indices, so select with .iloc; label-based
    # .loc / y[...] only works by accident when the index is 0..n-1.
    xtrain, xval = x.iloc[train_i], x.iloc[test_i]
    ytrain, yval = y.iloc[train_i], y.iloc[test_i]
    model = lr(random_state=1)
    model.fit(xtrain, ytrain)
    pred_test = model.predict(xval)
    score = acc(yval, pred_test)
    fold_scores.append(score)
    print('Accuracy:', score)

# Summary across folds.
print('\nMean accuracy over {} folds: {:.4f}'.format(len(fold_scores), sum(fold_scores) / len(fold_scores)))

# NOTE: `model`, `xval` and `yval` here are those of the LAST fold only. The test
# predictions and the validation probabilities used for the ROC curve therefore
# come from a model trained on 9/10 of the training data.
prediction_test_data = model.predict(test_data)
prediction = model.predict_proba(xval)[:, 1]

**ROC Curve for the Model**

In [None]:
from sklearn import metrics as m

# ROC curve and AUC on the validation fold, using the predicted probabilities.
fpr, tpr, _ = m.roc_curve(yval, prediction)
auc = m.roc_auc_score(yval, prediction)

roc_fig, roc_ax = plt.subplots(figsize=(10,8))
roc_ax.plot(fpr, tpr, label="validation, auc="+str(auc))
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc=4)
plt.show()

**Displaying Prediction Results for the Test Data**

In [None]:
# Attach the predicted class to the retained test frame (which still has the ID
# columns that were dropped from test_data) and display the IDs with the prediction.
test_org['Prediction']=prediction_test_data
test_org[['Loan_ID','Customer_ID','Prediction']]
