# Credit Data Analysis
## Predict Defaulters Using Decision Tree

### 1. Initialize libraries and load data

In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

In [21]:
credit = pd.read_csv("Data/credit.csv")
credit.sample(5)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
451,unknown,6,good,furniture/appliances,518,< 100 DM,1 - 4 years,3,1,29,none,own,1,skilled,1,no,no
91,< 0 DM,12,critical,car,1409,< 100 DM,> 7 years,4,3,54,none,own,1,skilled,1,no,no
171,unknown,12,good,furniture/appliances,763,< 100 DM,1 - 4 years,4,1,26,none,own,1,skilled,1,yes,no
246,unknown,12,critical,furniture/appliances,717,< 100 DM,> 7 years,4,4,52,none,own,3,skilled,1,no,no
320,1 - 200 DM,30,critical,car,4249,< 100 DM,unemployed,4,2,28,none,own,2,management,1,no,yes


### 2. Encode and scale data

In [22]:
# encode the data
credit['checking_balance'] = credit['checking_balance'].replace(['< 0 DM', '1 - 200 DM','> 200 DM','unknown'],[1, 2, 3, 0])
credit['credit_history'] = credit['credit_history'].replace(['critical','poor','good','very good', 'perfect'],[0,1,2,3,4])
credit['purpose'] = credit['purpose'].replace(['furniture/appliances', 'education', 'car', 'car0', 'business', 'renovations'],[0,1,2,3,4,5])
credit['savings_balance'] = credit['savings_balance'].replace(['unknown', '< 100 DM', '100 - 500 DM', '500 - 1000 DM', '> 1000 DM'],[0,1,2,3,4])
credit['employment_duration'] = credit['employment_duration'].replace(['unemployed', '< 1 year', '1 - 4 years', '4 - 7 years', '> 7 years'],[0, 1, 2, 3, 4])

In [23]:
from sklearn.preprocessing import LabelEncoder
enc1 = LabelEncoder()
credit['other_credit']=enc1.fit_transform(credit['other_credit'])
credit['housing']=enc1.fit_transform(credit['housing'])
credit['job']=enc1.fit_transform(credit['job'])
credit['phone']=enc1.fit_transform(credit['phone'])
credit['default']=enc1.fit_transform(credit['default'])
credit.sample(5)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
131,1,36,1,1,6887,1,2,4,3,29,2,1,1,1,1,1,1
545,1,24,1,2,1333,1,0,4,2,43,1,0,2,1,2,0,1
595,2,6,3,2,931,2,1,1,1,32,2,1,1,3,1,0,1
327,0,24,2,2,1525,4,3,4,3,34,1,1,1,1,2,1,0
932,0,9,0,2,1224,1,2,3,1,30,1,1,2,1,1,0,0


In [45]:
X=credit.iloc[:,0:16].values
y=credit.iloc[:,-1].values
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
X[0,:]



array([-1.04541732e-03, -1.23647786e+00, -1.34401408e+00, -8.93292771e-01,
       -7.45131413e-01, -1.23139326e+00,  1.33807849e+00,  9.18477168e-01,
        1.04698668e+00,  2.76645648e+00,  2.18345684e-01, -1.33710455e-01,
        1.02707891e+00, -2.89639297e-01, -4.28289566e-01,  1.21459768e+00])

### 3. Train model and predict

In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.30)
y_train.shape

(700,)

In [70]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [71]:
y_pred = dt.predict(X_test)
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))

0.6566666666666666
[[159  50]
 [ 53  38]]


### 4. Improving the model with random forests

In [72]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(criterion='entropy', n_estimators=20)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [73]:
y_pred = rf.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test, y_pred))

0.7266666666666667
[[182  27]
 [ 55  36]]
