In [1]:
import pandas as pd

from sklearn import tree
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

import graphviz 

import util

In [2]:
# Apply the display defaults defined in the project-local `util` module.
# NOTE(review): the helper's body is not visible here; presumably it sets
# pandas display options for the tables rendered below -- confirm in util.py.
util.set_default_pandas_options()

In [3]:
# Read the loan-risk observations; the first CSV column is used as the row index.
loan_risk_csv = '../data/loan-risk.csv'
df = pd.read_csv(loan_risk_csv, index_col=0)
df

Unnamed: 0_level_0,Income,CreditRating,LoanRisk
Observation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,23,High,High
1,17,Low,High
2,43,Low,High
3,68,High,Low
4,32,Moderate,Low
5,20,High,High


In [4]:
# One-hot (one-of-K) encoding of CreditRating into three 0/1 indicator columns:
# CreditRatingLow, CreditRatingModerate, CreditRatingHigh (created in that order).
#
# Each column is built with a vectorised comparison rather than a row loop that
# reads and writes df.loc[i, ...] for i in range(len(df)); .loc is label-based,
# so that loop only works while the index labels happen to be exactly 0..n-1.
# Rows whose rating is none of the three levels get 0 in every indicator column.
for rating in ('Low', 'Moderate', 'High'):
    # int64 matches the dtype of a column initialised with `df[col] = 0`.
    df['CreditRating' + rating] = (df['CreditRating'] == rating).astype('int64')

df

Unnamed: 0_level_0,Income,CreditRating,LoanRisk,CreditRatingLow,CreditRatingModerate,CreditRatingHigh
Observation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,23,High,High,0,0,1
1,17,Low,High,1,0,0
2,43,Low,High,1,0,0
3,68,High,Low,0,0,1
4,32,Moderate,Low,0,1,0
5,20,High,High,0,0,1


In [5]:
# Same one-hot encoding done by pandas itself: one indicator column per distinct
# CreditRating value, each named with the 'CreditRating' prefix.
credit_rating_dummies = pd.get_dummies(df['CreditRating'], prefix='CreditRating')
credit_rating_dummies

Unnamed: 0_level_0,CreditRating_High,CreditRating_Low,CreditRating_Moderate
Observation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,True,False,False
1,False,True,False
2,False,True,False
3,True,False,False
4,False,False,True
5,True,False,False


In [6]:
# Same one-hot encoding done with scikit-learn's OneHotEncoder.
ohe = OneHotEncoder()
# The encoder is given a 2-D input, so the single column is reshaped to (n, 1).
credit_rating_2d = df['CreditRating'].values.reshape(-1, 1)
credit_rating_encoded = ohe.fit_transform(credit_rating_2d)
# Convert the encoder's result to a dense array for display.
credit_rating_encoded.toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [7]:
# The categorical column has been replaced by its indicator columns, so drop it.
df = df.drop(columns=['CreditRating'])
df

Unnamed: 0_level_0,Income,LoanRisk,CreditRatingLow,CreditRatingModerate,CreditRatingHigh
Observation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,23,High,0,0,1
1,17,High,1,0,0
2,43,High,1,0,0
3,68,Low,0,0,1
4,32,Low,0,1,0
5,20,High,0,0,1


In [8]:
# Features are every column except the target. The tree is fit on plain numpy
# arrays, so the column order of `independent_variables` is the feature order.
independent_variables = df.drop('LoanRisk', axis=1)

x = independent_variables.values
y = df['LoanRisk'].values

# random_state pins the feature permutation scikit-learn applies at each split
# (it is applied even with splitter='best'), so equally good candidate splits
# are resolved the same way on every run and the fitted tree is reproducible.
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', random_state=0)
clf = clf.fit(x, y)

# NOTE: predictions are made on the same rows the tree was trained on, so the
# figure printed below is training accuracy, not a held-out estimate.
y_pred = clf.predict(x)

print('Accuracy = {}'.format(metrics.accuracy_score(y, y_pred)))

Accuracy = 1.0


In [9]:
# Confusion matrix of training predictions, with rows/columns ordered Low, High.
confusion = metrics.confusion_matrix(y, y_pred, labels=['Low','High'])
print('Confusion = \n{}'.format(confusion))

Confusion = 
[[2 0]
 [0 4]]


In [10]:
# Export the fitted tree to Graphviz DOT and render it next to the data file.
#
# Feature and class labels are taken from the objects the model was fitted on
# instead of being typed out by hand: export_graphviz matches names to features
# and classes purely by position, so a hand-written list that drifts out of
# order would mislabel the nodes without raising any error.
feature_names = list(independent_variables.columns)
# clf.classes_ holds the target labels in the order the classifier uses them.
class_names = [str(label) for label in clf.classes_]

dot_data = tree.export_graphviz(clf, out_file=None, feature_names=feature_names,
                                class_names=class_names, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("../data/loan-risk")

'..\\data\\loan-risk.pdf'