In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Predicting Loan Application
This notebook is a further development of code by [vipin kumar](https://www.kaggle.com/vipin20/loan-prediction-problem), whose work gave me the idea to improve on it. Several additional data-preprocessing steps were added to improve the quality of the data and, with it, the quality of the predictions (accuracy, precision, recall and F1 score). 
The objective of the analysis is to find the model that best predicts whether a loan will be approved (Yes) or rejected (No) based on the applicant's attributes: Gender, Status (Married, Dependents), Education, Job (employee or self-employed), Income (Total Income), Loan Amount, Loan Term, Credit History, and Property Area. 
The analysis will be divided to several steps
1. Data Overview
2. Data Cleaning & Transform
   * 2.1 Drop unnecessary columns
   * 2.2 Transform 'Total_Income' data from Object into Float
   * 2.3 Review the Data Statistics to Understand the Distribution
   * 2.4  Checking of Missing Data
   * 2.5 Checking the Skewness and Kurtosis of Data
3. Build Model Select the suitable Model
   * 3.1 Dividing Dataframe
   * 3.2 Splitting the Data of x and y 
   * 3.3 Testing Several Model
   * 3.4 Chosen Model, Threshold and Scores

We are going to use the [Loan Application Data](https://www.kaggle.com/vipin20/loan-application-data) dataset.
Before we start processing the data, we import the modules the notebook needs.

In [None]:
#import module
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import norm
import warnings as wr
wr.filterwarnings('ignore')

## 1. Data Overview
We will load the dataset and review it: the number of rows and columns, the attributes, and the quality of the data. From this we make a preliminary judgement about what to do in the next steps.

In [None]:
# Read the loan-application CSV into a DataFrame and preview its first rows
loans_csv_path = '../input/loan-application-data/df1_loan.csv'
loans = pd.read_csv(loans_csv_path)
loans.head()

In [None]:
# (rows, columns) of the dataset as loaded
loans.shape

In [None]:
# Data type pandas inferred for each column
loans.dtypes


The dataset has 15 columns and 500 rows of data: 6 numerical columns (float and integer) and 9 object columns. The Total_Income column is identified as "object (string)", so we need to transform it to a numerical type later in the process.


## 2. DATA CLEANING
After reviewing the data, we have an idea of what the data is about and of its quality. To ensure data integrity, we now start the data cleaning process.
### 2.1 Drop unnecessary columns 
'Unnamed: 0' and 'Loan_ID' are not important for our analysis, so we drop them. 

In [None]:
# Remove the row-index and loan-identifier columns from the frame
loans = loans.drop(columns=['Unnamed: 0', 'Loan_ID'])

### 2.2 Transform 'Total_Income' data from Object into Float
Total_Income is a key variable for the analysis. We need to transform 'Total_Income' into numeric data so that we can process it further.

In [None]:
# Strip the currency symbol from 'Total_Income' so the column can be parsed as a number.
# regex=False makes '$' a literal character: as a regular expression '$' is the
# end-of-string anchor, and the default value of `regex` differs between pandas versions.
# The symbol is replaced with an empty string (not a space) and any surrounding
# whitespace is trimmed so only the numeric text remains.
loans['Total_Income'] = (
    loans['Total_Income']
    .str.replace('$', '', regex=False)
    .str.strip()
)
loans.head()

In [None]:
# Convert 'Total_Income' from object (string) to float, then list the column dtypes
loans = loans.astype({'Total_Income': float})
loans.dtypes

In [None]:
# (rows, columns) of `loans` at this point of the cleaning
loans.shape


### 2.3 Review the Data Statistics to Understand the Distribution
Knowing the data distribution is an important step in data analysis. We need to avoid data that can skew our dataset. This can be achieved by viewing the statistical description of the data and a boxplot.

#### 2.3.1 Dataset Statistic Description

In [None]:
# Summary statistics of the numeric columns: count, mean, std, min, quartiles (incl. median) and max
loans.describe()

#### 2.3.2 Boxplot


In [None]:
# Box plots of the three income columns, drawn side by side
income_columns = ['ApplicantIncome', 'CoapplicantIncome', 'Total_Income']
boxplot = loans.boxplot(column=income_columns)

From the statistical description and the box plot, we will focus on "Total_Income". We can use both to remove outliers that could skew the dataset and the analysis result. We compute the upper limit of the normal range and filter out the rows that lie above it.
Formula:
$\text{Threshold\_Max\_range} = Q_{75} + 1.5\,(Q_{75} - Q_{25}) = 7495.25 + 1.5 \times (7495.25 - 4166)$


In [None]:
# Upper outlier fence for 'Total_Income': Q3 + 1.5 * IQR.
# The quartiles are computed from the data rather than copied by hand from
# describe(), so the fence stays correct if the dataset or cleaning changes.
q25, q75 = loans['Total_Income'].quantile([0.25, 0.75])
Threshold_Max_Total_Income = q75 + 1.5 * (q75 - q25)

# Keep only rows below the fence. .copy() gives loans1 its own data, so later
# modifications of loans1 never act on a view of `loans`.
loans1 = loans[loans['Total_Income'] < Threshold_Max_Total_Income].copy()

# Same box plot as before, now on the filtered data
boxplot = loans1.boxplot(column=['ApplicantIncome', 'CoapplicantIncome', 'Total_Income' ])

In [None]:
# (rows, columns) remaining after the 'Total_Income' outlier filter
loans1.shape

The new box plot shows the data after about 41 outlier rows were removed from the dataset. The histogram below shows the resulting distribution.

#### 2.3.3 Histogram Graphic

In [None]:
# Histogram of 'Total_Income' after outlier removal, using the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 7))
ax.hist(
    loans1['Total_Income'],
    bins=20,
    align='right',
    color='blue',
    edgecolor='black',
)
ax.set_ylabel("frequency")
ax.set_xlabel("Total Income")
ax.set_title(' Histogram of Total Income ')
plt.show()

### 2.4  Checking of Missing Data
To improve data integrity, we find the missing data and fill it in. Missing numerical values are filled with the column mean, and missing categorical (object) values with the column's most frequent value (mode).

In [None]:
# Number of missing values per column, largest first
loans1.isnull().sum().sort_values(ascending=False) 

In [None]:
# Missing values per column as a count and as a percentage of all rows.
# The null mask is computed once and reused.
null_mask = loans1.isnull()
total_null = null_mask.sum().sort_values(ascending=False)
# The mean of a boolean mask is the fraction of True values. Scale to percent
# first and round afterwards, so two decimals are kept (e.g. 8.5 rather than 8 or 9).
percentage = (null_mask.mean() * 100).round(2)
# concat aligns both Series on the column names; row order follows total_null
missing_data = pd.concat([total_null, percentage], axis=1, keys=['Total', 'Percentage'])
missing_data.head(10)

The highest share of missing data is 8.5% of 459 observations, which is considered low. Filling in this amount of data is considered safe and should not skew the dataset.
The missing data will be filled in using the existing data.

#### 2.4.1 Filling numerical and categorical missing values


In [None]:
# Work on an independent copy so the column assignments below never write
# into a view of another DataFrame.
loans1 = loans1.copy()

# Numeric columns, found through the public select_dtypes API
num_column = loans1.select_dtypes(include='number').columns.tolist()
# Every remaining column is treated as categorical; a list keeps the frame's column order
cat_column = [col for col in loans1.columns if col not in num_column]

# Fill numeric gaps with the column mean.
# The result is assigned back to the column: `loans1[col].fillna(..., inplace=True)`
# acts on a temporary Series and does not reliably update loans1
# (it has no effect when pandas Copy-on-Write is enabled).
for col in num_column:
    loans1[col] = loans1[col].fillna(loans1[col].mean())

# Fill categorical gaps with the most frequent value (mode) of the column
for col in cat_column:
    loans1[col] = loans1[col].fillna(loans1[col].mode()[0])

# Verification: every count should now be zero
loans1.isnull().sum()

### 2.4 Finding the Duplicate Data
Duplicate data can skew the analysis. We check whether any duplicate rows exist in the dataset.

In [None]:
# Count rows that are exact duplicates of an earlier row
loans1.duplicated().sum() 
# NOTE(review): the author recorded that a previous run showed no duplicates; re-check the output after re-running

### 2.5 Checking the Skewness and Kurtosis of Data
Skewness measures the asymmetry of a distribution, and kurtosis measures how heavy its tails are.
Guidance for skewness:
 1. The data is fairly symmetrical when the skewness is between -0.5 and 0.5
 2. Moderate skewness is between -1 and -0.5 or between 0.5 and 1
 3. High skewness is below -1 or above 1

Guidance for kurtosis:
 1. Leptokurtic: the distribution is tall and thin (K > 3)
 2. Platykurtic: the distribution is flat and the values are spread out (K < 3)
 3. Mesokurtic: the distribution is in between (K = 3)

Note: pandas `Series.kurt()` reports *excess* kurtosis (a normal distribution gives 0), so the value printed below should be compared against 0 rather than 3.


In [None]:
# Distribution, skewness and kurtosis of 'Total_Income'.
# histplot(kde=True, stat='density') draws a density-scaled histogram with a KDE curve;
# it replaces the deprecated sns.distplot.
sns.histplot(loans1['Total_Income'], kde=True, stat='density')
# '%.2f' formats to two decimals directly (plain '%f' always prints six).
# pandas kurt() reports excess kurtosis, i.e. a normal distribution gives 0.
print("Skewness coeff. is: %.2f" % loans1['Total_Income'].skew())
print("Kurtosis coeff. is: %.2f" % loans1['Total_Income'].kurt())

The distribution plot shows that the data is moderately skewed and moderately spread out (platykurtic), which means the data is within a good distribution range.

After data Cleaning and Transform, we can start build model

## 3. Build Model 
To build the models, we need to import the necessary modules.


In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import roc_curve, auc,roc_auc_score
from sklearn.preprocessing import binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### 3.1 Dividing Dataframe
We divide the dataframe into x and y to build the model. For x, we use all attributes except 'Loan_Status'. For y, we use 'Loan_Status'. String data in the dataframe will be transformed into binary format (1 and 0). Loan_Status (y), which previously contained the strings 'Yes' and 'No', will likewise be transformed into 1 and 0.

In [None]:
# Separate the target from the predictors and one-hot encode the string predictors
target_column = 'Loan_Status'
y = loans1[target_column]
x = pd.get_dummies(loans1.drop(columns=[target_column]))
x.head()

In [None]:
# Encode the two string labels of the target as 0 / 1.
# LabelBinarizer returns a column vector of shape (n_samples, 1); ravel() flattens
# it to the 1-D array that scikit-learn classifiers and metrics expect for a
# single target. `lb.classes_` lists the original labels in the order 0, 1.
lb = LabelBinarizer()
y = lb.fit_transform(y).ravel()
print(y[:5])

### 3.2 Splitting the Data of x and y 
We split the data into train and test sets, taking 30% of the dataset for testing. The data is split into x_train, y_train, x_test, and y_test. The train data is used to build the model, and the test data is used to test it.

In [None]:
# Split into 70% train / 30% test.
# random_state fixes the shuffle so the scores reported below are reproducible
# on every run; stratify=y keeps the approved/rejected ratio the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

### 3.3 Testing Several Model
We will look for the most suitable model, i.e. the one that gives the most accurate predictions, among four models: Decision Tree Classifier, Gaussian NB, K Neighbors Classifier, and Random Forest Classifier.

In [None]:
# Fit the four candidate classifiers on the training data.
# np.ravel guarantees a 1-D target even if y_train is a column vector.
y_train_1d = np.ravel(y_train)

# Decision Tree Classifier (random_state makes the fitted tree reproducible)
dtf = DecisionTreeClassifier(random_state=42)
dtf.fit(x_train, y_train_1d)

# Gaussian NB
n_b = GaussianNB()
n_b.fit(x_train, y_train_1d)

# K Neighbors Classifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train_1d)

# Random Forest Classifier (random_state makes the fitted forest reproducible)
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train_1d)

We test the accuracy of model using data test in the model.

In [None]:
# Test-set accuracy of every fitted model.
# The built-in round() is used instead of the `.round()` method because the score
# may be a plain Python float, which has no such method.
fitted_models = {
    "Decision Tree Classifier": dtf,
    "Gaussian NB": n_b,
    "K Neighbors Classifier": knn,
    "Random Forest Classifier": rfc,
}
for model_name, model in fitted_models.items():
    print(f"{model_name} Score:", round(model.score(x_test, y_test), 2))

The Random Forest Classifier shows a promising accuracy score of 0.82. However, to improve the model further, we will analyse it using the confusion matrix and the Receiver Operating Characteristic (ROC) curve, and then find a suitable threshold. 

#### 3.3.1 Confusion Matrix
 With the confusion matrix, we find the True Positive (TP), True Negative (TN), False Positive (FP), and False Negative (FN) counts.

In [None]:
# Class predictions of the random forest on the test set and their confusion matrix
y_predict = rfc.predict(x_test)
CM = confusion_matrix(y_true=y_test, y_pred=y_predict)
# Rows are the true class, columns the predicted class: [[TN, FP], [FN, TP]].
# One recorded run gave TP = 89, TN = 20, FP = 23, FN = 6.
print(CM)



#### 3.3.2 Receiver Operating Characteristic (ROC)
The ROC curve is a visual representation of how well the classification model works. We will compute the True Positive Rate (TPR), the False Positive Rate (FPR), and the thresholds.

In [None]:
# Predicted probability of the positive class (second column of predict_proba)
positive_proba = rfc.predict_proba(x_test)[:, 1]

# ROC curve and its area, both computed from the same probability scores.
# Scoring the hard 0/1 predictions instead would measure a single operating
# point and report an area that does not belong to the plotted curve.
fpr, tpr, threshold = roc_curve(y_test, positive_proba)
roc_auc = roc_auc_score(y_test, positive_proba)

# graphic plot
plt.figure()
lw = 2  # line width, also reused by the threshold plot further down
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)'%roc_auc )
# Diagonal: the ROC curve of a classifier that guesses at random
plt.plot([0,1],[0,1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.1])
plt.ylim([0.0,1.1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()


#### 3.3.3 Finding Threshold
Choosing a threshold between 0 and 1 is a judgement call. The higher the threshold, the more precise the predictions (fewer false positives). Since we are handling loan-approval data, we want people to have a chance to get a loan, so the right threshold depends on the need. For this case, we evaluate a set of trial thresholds and compute the accuracy, precision, recall, and F1 score for each. 

In [None]:
# Evaluate accuracy, precision, recall and F1 for a set of trial decision thresholds
y_predict_proba = rfc.predict_proba(x_test)
threshold_dummy = [0, 0.2, 0.4, 0.6, 0.8, 1]
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
for i in threshold_dummy:
    # `threshold` is passed by keyword because binarize() does not accept it
    # positionally in current scikit-learn. Values strictly greater than the
    # threshold become 1, all others 0.
    y_pred_class = binarize(y_predict_proba, threshold=i)
    # Second column = decision for the positive class
    y_pred_class1 = y_pred_class[:, 1].astype(int)
    accuracy_scores.append(accuracy_score(y_test, y_pred_class1))
    # At threshold 1 nothing is predicted positive, so precision is undefined
    # and scikit-learn reports it as 0.
    precision_scores.append(precision_score(y_test, y_pred_class1))
    recall_scores.append(recall_score(y_test, y_pred_class1))
    f1_scores.append(f1_score(y_test, y_pred_class1))

In [None]:
from pandas import DataFrame

# One row per trial threshold, one column per metric, rounded to two decimals
TL = DataFrame(
    {
        'threshold': threshold_dummy,
        'accuracy_scores': accuracy_scores,
        'precision_scores': precision_scores,
        'recall_scores': recall_scores,
        'f1_scores': f1_scores,
    }
).round(2)
TL.head(6)

In [None]:
# Plot every metric against the decision threshold: (column in TL, line colour, legend label)
metric_curves = [
    ('accuracy_scores', 'darkorange', 'accuracy '),
    ('precision_scores', 'red', 'precision '),
    ('recall_scores', 'blue', 'recall'),
    ('f1_scores', 'yellow', 'f1_scores '),
]
for metric_column, line_colour, legend_label in metric_curves:
    plt.plot(TL['threshold'], TL[metric_column], color=line_colour, lw=lw, label=legend_label)

# Vertical dashed marker at the threshold of 0.4
plt.plot([0.4, 0.4], [0, 1], color='black', lw=lw, label='threshold =0.4', linestyle='--')

plt.xlim([0.0, 1.1])
plt.ylim([0.0, 1.1])
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Threshold vs Score')
plt.legend(loc="lower left")
plt.show()

We found the optimal threshold to be about 0.4, where the scores are best overall for the random forest classifier.

### 3.4 Chosen Model, Threshold and Scores
To summarise, we use the random forest classifier because it shows better accuracy than the other models. The optimal threshold is 0.4. The model is used below to make predictions and compute the scores.

In [None]:
# Apply the chosen threshold of 0.4 to the predicted probabilities
y_predict_proba = rfc.predict_proba(x_test)
# `threshold` is passed by keyword because binarize() does not accept it
# positionally in current scikit-learn.
y_pred_class = binarize(y_predict_proba, threshold=0.4)
# Second column = decision for the positive class
y_pred_class2 = y_pred_class[:, 1].astype(int)

# Confusion matrix. labels=[0, 1] guarantees a 2x2 result, so the indexing below
# is valid even if one class happens to be absent. Rows are the true class and
# columns the predicted class.
results = confusion_matrix(y_test, y_pred_class2, labels=[0, 1])
print('Confusion Matrix :', results)
print('Data Value :')
print('True Positif Value (TP):', results[1,1])
print('True Negative Value (TN):', results[0,0])
print('False Positive Value (FP):', results[0,1])
print('False Negative Value (FN):', results[1,0])

In [None]:
# Final metrics at the chosen threshold.
# The built-in round() is used instead of the `.round()` method because the metric
# functions may return a plain Python float, which has no such method.
print("accuracy:", round(accuracy_score(y_test, y_pred_class2), 2))
print("precision:", round(precision_score(y_test, y_pred_class2), 2))
print("recall:", round(recall_score(y_test, y_pred_class2), 2))
print("f1 score:", round(f1_score(y_test, y_pred_class2), 2))