In [7]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Load the credit dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='credit.csv')

In [9]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [10]:
# Summarise column dtypes and non-null counts before any encoding is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   o

# Transform all object data types into integer data types

In [11]:
# Identify the object-typed (string) columns up front, then overwrite each one
# with the integer codes pandas assigns to its categories.
object_columns = [column for column in df.columns if df[column].dtype == 'object']
for feature in object_columns:
    df[feature] = pd.Categorical(df[feature]).codes

In [12]:
# Re-check the dtypes to confirm the former object columns are now integer-coded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   checking_balance      1000 non-null   int8 
 1   months_loan_duration  1000 non-null   int64
 2   credit_history        1000 non-null   int8 
 3   purpose               1000 non-null   int8 
 4   amount                1000 non-null   int64
 5   savings_balance       1000 non-null   int8 
 6   employment_duration   1000 non-null   int8 
 7   percent_of_income     1000 non-null   int64
 8   years_at_residence    1000 non-null   int64
 9   age                   1000 non-null   int64
 10  other_credit          1000 non-null   int8 
 11  housing               1000 non-null   int8 
 12  existing_loans_count  1000 non-null   int64
 13  job                   1000 non-null   int8 
 14  dependents            1000 non-null   int64
 15  phone                 1000 non-null   int8 
 16  default

# Separate the data into dependent and independent variables

In [13]:
# Independent variables: every column except the target.
x = df.drop('default', axis=1)
# Dependent variable: read the column rather than pop()-ing it, so `df` is left
# intact and this cell can be re-run without a KeyError on 'default'.
y = df['default']

# Split the data into Testing and Training

## CART Model - Version 1

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, train_labels, test_lables = train_test_split(
    x,
    y,
    test_size=.30,
    random_state=1,
)

In [16]:
# Unpruned CART baseline using Gini impurity.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so equally good splits could otherwise be picked differently
# from one run to the next.
dt_model = DecisionTreeClassifier(criterion='gini', random_state=1)

In [17]:
# Fit the baseline tree on the training split.
dt_model.fit(X=x_train, y=train_labels)

DecisionTreeClassifier()

In [18]:
from sklearn import tree

In [19]:
# Display names for the target classes, passed as class_names when exporting the trees.
train_char_label = [
    'No',
    'Yes',
]

In [20]:
# Open the .dot output for the baseline tree.
# The raw string keeps the Windows path literal: '\c' is not a valid escape
# sequence and triggers a warning on current Python versions.
# NOTE(review): the handle stays open until the later close() cell runs.
credit_tree_file = open(r'c:\credit_tree.dot', 'w')

In [21]:
# Write the fitted baseline tree to the open .dot file in Graphviz format.
# Per the scikit-learn docs the DOT string is only returned when out_file is
# None, so dot_data ends up as None here.
dot_data = tree.export_graphviz(
    dt_model,
    out_file=credit_tree_file,
    feature_names=list(x_train.columns),
    class_names=train_char_label,
)

In [22]:
# Close the handle so the exported .dot content is flushed to disk.
credit_tree_file.close()

## CART Model - Version 2

In [23]:
# min_samples_leaf  - every leaf must keep at least 10 training samples (checked after a split).
# min_samples_split - a node needs at least 30 samples before it may be split (checked before a split).
# Relation used here: min_samples_split = 3 * min_samples_leaf.
# random_state is pinned so ties between equally good splits resolve the same way on every run.

reg_dt_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=7,
    min_samples_leaf=10,
    min_samples_split=30,
    random_state=1,
)

In [24]:
# Fit the regularized tree on the same training split as the baseline.
reg_dt_model.fit(X=x_train, y=train_labels)

DecisionTreeClassifier(max_depth=7, min_samples_leaf=10, min_samples_split=30)

In [None]:
# Export the regularized tree to a Graphviz .dot file.
# - Mode 'w' (not 'a'): appending would stack another digraph onto the file
#   each time the cell is re-run; 'w' also matches the Version 1 export.
# - Raw string: '\c' is not a valid escape sequence in a normal string literal.
# - with-block: the handle is closed even if the export raises.
with open(r'c:\credit_tree_regularized.dot', 'w') as credit_tree_regularized:
    dot_data = tree.export_graphviz(
        reg_dt_model,
        out_file=credit_tree_regularized,
        feature_names=list(x_train),
        class_names=train_char_label,
    )