In [1]:
# As usual importing the modules we'll be working with.
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
myData = pd.read_csv('loan.csv')
myData.head()

Unnamed: 0,age,ed_new,employ,address,income,debtinc,creddebt,othdebt,Rloan
0,41,2,17,12,176,9.3,11.359392,5.008608,0
1,27,1,10,6,31,17.3,1.362202,4.000798,1
2,40,1,15,14,55,5.5,0.856075,2.168925,1
3,41,1,15,14,120,2.9,2.65872,0.82128,1
4,24,1,2,0,28,17.3,1.787436,3.056564,0


It would be easier to look at Rloan as "no" and "yes" values rather than 0-1, so let's change that real quick.

In [3]:
myData['Rloan'] = np.where(myData['Rloan']==0, 'no', 'yes')
myData.head()

Unnamed: 0,age,ed_new,employ,address,income,debtinc,creddebt,othdebt,Rloan
0,41,2,17,12,176,9.3,11.359392,5.008608,no
1,27,1,10,6,31,17.3,1.362202,4.000798,yes
2,40,1,15,14,55,5.5,0.856075,2.168925,yes
3,41,1,15,14,120,2.9,2.65872,0.82128,yes
4,24,1,2,0,28,17.3,1.787436,3.056564,no


In [4]:
# Checking for missing data
np.unique(myData.isnull())

array([False])

### Splitting the Data (train & test)<br>

Ok, so up until now we really just kind of did a quick review of at the data. It's time to dig a little deeper in order to get much more accurate and reliable conclusions. <br>

First off, we will split our data into a training & testing sets (70/30) in order to avoid over fitting.

In [5]:
x_train, x_test, y_train, y_test = train_test_split(myData.drop(['Rloan'], 1), myData['Rloan'], test_size=0.3, random_state=1234)

### Decision Tree<br>
Ok, we're done fiddling around with the data, time to run the tree model on our training set!

In [6]:
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
predicted = tree.predict(x_train)
cmatTrain = pd.crosstab(index=y_train, columns=predicted, margins=True)
cmatTrain

col_0,no,yes,All
Rloan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,125,0,125
yes,0,365,365
All,125,365,490


This is our confusion matrix(cmat) for our training set. Let's explore the results.<br>

So as you can see, we have a few aspects to our cmat (from left to right).<br>
<b>TN</b> - True Negatives = 63<br>
<b>FP</b> - False Positives = 62<br>
<b>FN</b> - False Negatives = 35<br>
<b>TP</b> - True Positives = 330<br>

As we continue we want to figure out the following:<br>
<b>Sensitivity</b> = TP/(TP + FN)  -> Sensitivity is the ratio of all the correctly predicted positive examples (True Positive) to all the positive examples in the data.<br>
<b>Specificity</b> = TN/(TN + FP) -> Specificity measures the true negative rate: the proportion of negatives that are correctly identified.<br>
<b>Precision</b> = TP/(TP+FP) -> Precision (positive predictive value) measures the accuracy of the classifier, when it predicts an example to be positive (True Positive).<br>
<b>Accuracy</b> = (TN + TP)/(TN+TP+FN+FP) -> The ratio of correct predictions, (both positive and negative) to all predictions.<br>

This will detirmine how strong our model is.


In [8]:
TN = cmatTrain.iloc[0,0]
FP = cmatTrain.iloc[0,1]
FN = cmatTrain.iloc[1,0]
TP = cmatTrain.iloc[1,1]

sensitivity = TP/(TP + FN)
specificity = TN/(TN + FP)
precision = TP/(TP+FP)
accuracy = (TN + TP)/(TN+TP+FN+FP)

print('Sensitivity = {se}, Specificity = {sp}, Precision = {p}, Accuracy = {a}'.format(se=sensitivity, sp=specificity, 
                                                                                       p=precision, a=accuracy))

Sensitivity = 1.0, Specificity = 1.0, Precision = 1.0, Accuracy = 1.0


Based on the training set, we can see that we have 80%-90% reliance. Not bad, now let's see how well our model does when confronted with new data it has yet seen (the test set). In other words, let's see what it's really worth.

In [9]:
predicted = tree.predict(x_test)
cmatTest = pd.crosstab(index=y_test, columns=predicted, margins=True)
cmatTest

col_0,no,yes,All
Rloan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,27,31,58
yes,35,117,152
All,62,148,210


In [10]:
TN = cmatTest.iloc[0,0]
FP = cmatTest.iloc[0,1]
FN = cmatTest.iloc[1,0]
TP = cmatTest.iloc[1,1]

sensitivity = TP/(TP + FN)
specificity = TN/(TN + FP)
precision = TP/(TP+FP)
accuracy = (TN + TP)/(TN+TP+FN+FP)

print('Sensitivity = {se}, Specificity = {sp}, Precision = {p}, Accuracy = {a}'.format(se=sensitivity, sp=specificity, 
                                                                                       p=precision, a=accuracy))

Sensitivity = 0.7697368421052632, Specificity = 0.46551724137931033, Precision = 0.7905405405405406, Accuracy = 0.6857142857142857


Seems that our model still holds a 79%-93% efficiancy of prediction, which is not the best.<br> 
Its weak spot is the abillity to accuratly identify False Positives as our Specificity is 37.79% accurate.