In [29]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [128]:
# Load the credit-worthiness dataset from the sibling data directory and
# preview the first rows.
csv_path = os.path.join("../data", "CreditWorthiness.csv")
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Cbal,Cdur,Chist,Cpur,Camt,Sbal,Edur,InRate,MSG,Oparties,...,Prop,age,inPlans,Htype,NumCred,JobType,Ndepend,telephone,foreign,creditScore
0,0 <= Rs. < 2000,9,all settled till now,Business,13790,Rs. < 1000,1 to 4 years,2,married or widowed male,no one,...,real estate,27,bank,own,1,employee with official position,1,yes,no,good
1,0 <= Rs. < 2000,15,dues not paid earlier,electronics,15250,no savings account,more than 7 years,4,single male,"yes, guarantor",...,real estate,50,none,own,2,employee with official position,1,yes,no,good
2,0 <= Rs. < 2000,36,none taken/all settled,Business,19410,Rs. < 1000,more than 7 years,4,single male,no one,...,Unknown,61,none,free,1,"employed either in management, self or in high...",1,yes,no,bad
3,0 <= Rs. < 2000,48,none taken/all settled,Business,144090,Rs. < 1000,1 to 4 years,2,single male,no one,...,Other cars etc.,25,none,own,1,employee with official position,1,yes,no,bad
4,no checking account,24,all settled till now,electronics,31690,Rs. < 1000,less than 1 year,4,divorced or separated or married female,no one,...,life insurance/building society,26,none,own,1,employee with official position,1,yes,no,good


In [129]:
# The decision tree needs numeric inputs, so every categorical column listed
# below is replaced in place by integer codes.
categorical_variables = ['Chist', 'Cpur', 'Cbal', 'Sbal', 'Edur', 'MSG', 'Oparties', 'Prop', 'inPlans', 'Htype', 'JobType', 'telephone', 'foreign']

# Keep one fitted encoder per column. Refitting a single shared LabelEncoder on
# every iteration leaves only the last column's mapping available, so the codes
# of all other columns could no longer be translated back to their labels
# (e.g. when reading split thresholds in the exported tree).
# NOTE(review): LabelEncoder assigns codes by sorted label text, not by any
# domain ordering of the categories.
label_encoders = {}
for var in categorical_variables:
    label_encoder = LabelEncoder()
    data[var] = label_encoder.fit_transform(data[var])
    # label_encoders[var].classes_[code] gives the original label for a code.
    label_encoders[var] = label_encoder

data.head(20)

Unnamed: 0,Cbal,Cdur,Chist,Cpur,Camt,Sbal,Edur,InRate,MSG,Oparties,...,Prop,age,inPlans,Htype,NumCred,JobType,Ndepend,telephone,foreign,creditScore
0,1,9,1,0,13790,2,0,2,2,0,...,3,27,0,1,1,1,1,1,0,good
1,1,15,2,3,15250,4,3,4,3,2,...,3,50,1,1,2,1,1,1,0,good
2,1,36,3,0,19410,2,3,4,3,0,...,1,61,1,0,1,0,1,1,0,bad
3,1,48,3,0,144090,2,0,2,3,0,...,0,25,1,1,1,1,1,1,0,bad
4,3,24,1,3,31690,2,2,4,1,0,...,2,26,1,1,1,1,1,1,0,good
5,3,27,1,7,51780,4,3,4,3,0,...,2,48,1,1,4,1,2,1,0,good
6,3,12,1,3,21590,2,2,2,1,0,...,0,29,0,1,1,1,1,0,0,good
7,1,12,1,9,9950,3,0,4,2,0,...,3,22,1,1,1,1,1,0,0,good
8,3,36,1,2,18070,2,0,4,3,0,...,1,37,2,0,1,1,1,1,0,bad
9,3,36,1,3,23820,4,0,4,1,0,...,0,25,1,1,1,1,1,0,0,good


In [111]:
# Pairwise correlations between all numeric columns, including the target.
# creditScore is still a string column at this point, so numeric_only=True
# would silently drop it from the matrix. Encode it on a temporary copy
# (labels sorted alphabetically -> 0, 1, ...) so the target column shows up
# without mutating `data`, which later cells expect to hold the string labels.
corr_input = data.assign(creditScore=data['creditScore'].astype('category').cat.codes)
corr_input.corr(numeric_only=True)

Unnamed: 0,Cbal,Cdur,Chist,Cpur,Camt,Sbal,Edur,InRate,MSG,Oparties,Prop,age,inPlans,Htype,NumCred,JobType,Ndepend,telephone,foreign,creditScore
Cbal,1.0,-0.072013,0.097283,-0.065453,-0.042705,0.074004,-0.027191,-0.00528,0.058828,-0.127737,-0.024044,0.059751,0.036032,-0.022424,0.076005,-0.037925,-0.014145,0.066296,-0.026758,0.350847
Cdur,-0.072013,1.0,0.007298,-0.116174,0.624984,0.020843,0.003824,0.074749,0.093721,-0.02449,-0.245655,-0.036136,0.003559,-0.157049,-0.011284,-0.215438,-0.023834,0.164718,-0.138196,-0.214927
Chist,0.097283,0.007298,1.0,0.005368,0.059325,0.034317,0.043585,-0.014551,0.062729,-0.063088,-0.003051,0.099398,0.021876,-0.017945,0.464907,-0.018078,-0.000965,0.035913,0.042139,0.087274
Cpur,-0.065453,-0.116174,0.005368,1.0,0.010598,0.025498,0.064084,-0.063221,0.032152,0.006207,0.013064,0.071636,-0.021345,-0.04168,0.013718,0.033065,0.106238,-0.027714,0.144694,-0.039573
Camt,-0.042705,0.624984,0.059325,0.010598,1.0,0.070127,0.038756,-0.271316,0.102481,-0.027832,-0.22455,0.032716,-0.020224,-0.135632,0.020795,-0.261139,0.017142,0.276995,-0.05005,-0.154739
Sbal,0.074004,0.020843,0.034317,0.025498,0.070127,1.0,0.056865,0.03294,0.045916,-0.030214,0.05313,0.09476,0.000726,-0.032711,0.015568,-0.040662,0.023693,0.075988,0.012566,0.103133
Edur,-0.027191,0.003824,0.043585,0.064084,0.038756,0.056865,1.0,0.074664,0.048592,-0.067221,-0.070799,0.289741,-0.05246,-0.126529,0.085495,-0.142279,0.028019,0.110568,-0.06476,-0.008932
InRate,-0.00528,0.074749,-0.014551,-0.063221,-0.271316,0.03294,0.074664,1.0,0.138389,-0.011398,-0.039353,0.058266,0.041423,-0.089405,0.021669,-0.07809,-0.071207,0.014413,-0.090024,-0.072404
MSG,0.058828,0.093721,0.062729,0.032152,0.102481,0.045916,0.048592,0.138389,1.0,0.026691,-0.0412,0.147954,0.003949,-0.225034,0.111867,-0.025732,0.256475,0.066474,0.046226,0.095055
Oparties,-0.127737,-0.02449,-0.063088,0.006207,-0.027832,-0.030214,-0.067221,-0.011398,0.026691,1.0,0.17359,-0.029873,-0.07784,0.065889,-0.025447,0.062588,0.0204,-0.075035,0.117999,0.025137


In [130]:
# Goal: let the model relate credit history (Chist), checking balance (Cbal),
# savings balance (Sbal) and age to the applicant's credit score.
feature_columns = ['Chist', 'Cbal', 'Sbal', 'age']
X = data[feature_columns]
y = data['creditScore']

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=36, test_size=0.1
)

In [131]:
# Fit a decision tree on the training split.
# random_state is pinned because the tree builder randomly permutes the
# candidate features at each split; without a seed, ties between equally good
# splits can be broken differently on every run, changing the fitted tree and
# the reported metrics. The value mirrors the seed used for the train/test split.
model = DecisionTreeClassifier(random_state=36)
model.fit(X_train, y_train)

In [132]:
# export_graphviz pairs class_names with the model's classes by position and
# expects them in ascending class order, i.e. the order stored in
# model.classes_. Series.unique() returns labels in order of first appearance
# instead, which can label the leaves of the exported tree with swapped classes.
unique_class_names = [str(label) for label in model.classes_]

# Write the fitted tree to 'tree.dot'. Feature names are taken from the
# training frame so they always match the columns the model was fitted on.
export_graphviz(model, out_file='tree.dot',
                feature_names=list(X_train.columns),
                class_names=unique_class_names)

In [133]:
# Score the held-out rows and print per-class precision, recall and F1.
predictions = model.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         bad       0.43      0.57      0.49        28
        good       0.81      0.71      0.76        72

    accuracy                           0.67       100
   macro avg       0.62      0.64      0.62       100
weighted avg       0.70      0.67      0.68       100

