In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz

In [2]:
# Load the credit dataset from the CSV file in the working directory.
df = pd.read_csv('credit.csv')

# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_loan_duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
percent_of_income,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
years_at_residence,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_loans_count,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [3]:
# Preview the first rows of the freshly loaded frame (text columns still hold labels).
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 17)

In [5]:
# Column dtypes and non-null counts before any encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   checking_balance      1000 non-null   object
 1   months_loan_duration  1000 non-null   int64 
 2   credit_history        1000 non-null   object
 3   purpose               1000 non-null   object
 4   amount                1000 non-null   int64 
 5   savings_balance       1000 non-null   object
 6   employment_duration   1000 non-null   object
 7   percent_of_income     1000 non-null   int64 
 8   years_at_residence    1000 non-null   int64 
 9   age                   1000 non-null   int64 
 10  other_credit          1000 non-null   object
 11  housing               1000 non-null   object
 12  existing_loans_count  1000 non-null   int64 
 13  job                   1000 non-null   object
 14  dependents            1000 non-null   int64 
 15  phone                 1000 non-null   o

In [6]:
# Replace every text column with integer category codes so scikit-learn
# estimators can consume the frame. This also encodes the "default" target.
#
# The dtype test uses the pandas type helpers instead of `dtype == 'object'`:
# newer pandas versions may load text as the dedicated string dtype, which
# the equality check would silently skip, leaving raw strings in the frame.
#
# pd.Categorical sorts the distinct values before numbering them, so codes
# follow sorted label order (e.g. 'no' -> 0, 'yes' -> 1). A missing value
# would be coded as -1.
for col in df.columns:
    col_is_text = (
        pd.api.types.is_object_dtype(df[col])
        or pd.api.types.is_string_dtype(df[col])
    )
    if col_is_text:
        df[col] = pd.Categorical(df[col]).codes

In [7]:
# Re-check dtypes after encoding: the former text columns should now be integer codes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   checking_balance      1000 non-null   int8 
 1   months_loan_duration  1000 non-null   int64
 2   credit_history        1000 non-null   int8 
 3   purpose               1000 non-null   int8 
 4   amount                1000 non-null   int64
 5   savings_balance       1000 non-null   int8 
 6   employment_duration   1000 non-null   int8 
 7   percent_of_income     1000 non-null   int64
 8   years_at_residence    1000 non-null   int64
 9   age                   1000 non-null   int64
 10  other_credit          1000 non-null   int8 
 11  housing               1000 non-null   int8 
 12  existing_loans_count  1000 non-null   int64
 13  job                   1000 non-null   int8 
 14  dependents            1000 non-null   int64
 15  phone                 1000 non-null   int8 
 16  default

In [8]:
# Preview the encoded frame: category labels have been replaced by their integer codes.
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,1,6,0,4,1169,4,3,4,4,67,1,1,2,1,1,1,0
1,0,48,1,4,5951,2,0,2,2,22,1,1,1,1,1,0,1
2,3,12,0,3,2096,2,1,2,3,49,1,1,1,3,2,0,0
3,1,42,1,4,7882,2,1,2,4,45,1,0,1,1,2,0,0
4,1,24,3,1,4870,2,0,3,4,53,1,0,2,1,2,0,1


In [9]:
# Separate the independent variables (features) from the dependent variable (target).
# X: every column except "default".
# y: the "default" column, kept as a one-column DataFrame.
X = df.drop(columns="default")
y = df.loc[:, ['default']]

In [10]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [11]:
# Fit an unconstrained decision tree on the training split.
# criterion='entropy' overrides scikit-learn's default split criterion ('gini').
# random_state is pinned because the estimator permutes the features at every
# split; without a seed, equally good splits can be chosen differently on each
# run, so importances, scores and the confusion matrix would not be reproducible.
model = DecisionTreeClassifier(criterion='entropy', random_state=1)
model.fit(X_train, y_train)

In [12]:
# Raw importance scores of the fitted tree, one value per feature column of X_train.
model.feature_importances_

array([0.13184506, 0.06034792, 0.06406769, 0.06091627, 0.20980847,
       0.08084632, 0.05008851, 0.04585947, 0.04047746, 0.10772077,
       0.0457864 , 0.00799638, 0.02872537, 0.03213315, 0.02425817,
       0.00912258])

In [13]:
# Print each feature name next to its importance score.
for feature_name, importance in zip(X_train.columns, model.feature_importances_):
    print(f"{feature_name} --> {importance}")

checking_balance --> 0.13184506251491765
months_loan_duration --> 0.06034791930479947
credit_history --> 0.0640676898621352
purpose --> 0.060916270319567256
amount --> 0.20980847237844696
savings_balance --> 0.08084631747008027
employment_duration --> 0.05008851013938227
percent_of_income --> 0.04585947483465482
years_at_residence --> 0.040477456079817335
age --> 0.10772077398865675
other_credit --> 0.04578640240986358
housing --> 0.00799637580898829
existing_loans_count --> 0.028725370333770806
job --> 0.032133151143185885
dependents --> 0.0242581709953123
phone --> 0.009122582416421233


In [14]:
# The same name/importance pairing as a labelled table: one row per feature.
pd.DataFrame(
    {'Feature_Importance': model.feature_importances_},
    index=X_train.columns,
)

Unnamed: 0,Feature_Importance
checking_balance,0.131845
months_loan_duration,0.060348
credit_history,0.064068
purpose,0.060916
amount,0.209808
savings_balance,0.080846
employment_duration,0.050089
percent_of_income,0.045859
years_at_residence,0.040477
age,0.107721


In [15]:
# Write the fitted tree to a Graphviz .dot file; open or render that file to
# view the diagram. class_names are given in ascending class order
# (code 0 -> 'No', code 1 -> 'Yes').
export_graphviz(
    model,
    out_file='credit_tree.dot',
    feature_names=X_train.columns.tolist(),
    class_names=['No', 'Yes'],
)

In [16]:
# Accuracy on the training split, then on the held-out test split.
# The unconstrained tree scores highest on the data it was fitted on; the
# lower test score shows how much of that fit does not generalize.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, target))

1.0
0.66


In [17]:
y_predict = model.predict(X_test)
# Rows are the true class, columns the predicted class. In the output recorded
# below, 163 of the 214 non-defaults (class 0) are predicted correctly, but
# only 35 of the 86 defaults (class 1) are, so recall for defaults is poor.
confusion_matrix(y_test,y_predict)

array([[163,  51],
       [ 51,  35]], dtype=int64)

Let's regularize the decision tree by limiting `max_depth` to 7.

In [18]:
# Fit a depth-limited (regularized) tree: max_depth=7 caps how deep the tree
# may grow, trading training accuracy for better generalization.
# random_state is pinned because the estimator permutes the features at every
# split; without a seed, equally good splits can be chosen differently on each
# run, so the reported scores would not be reproducible.
model_reg = DecisionTreeClassifier(max_depth=7, criterion='entropy', random_state=1)
model_reg.fit(X_train, y_train)

In [19]:
# Feature importances of the depth-limited tree, one row per feature.
pd.DataFrame(
    {'Importance': model_reg.feature_importances_},
    index=X_train.columns,
)

Unnamed: 0,Importance
checking_balance,0.202099
months_loan_duration,0.06928
credit_history,0.085207
purpose,0.072193
amount,0.169812
savings_balance,0.075849
employment_duration,0.024542
percent_of_income,0.038249
years_at_residence,0.028817
age,0.110439


In [20]:
# Write the depth-limited tree to its own Graphviz .dot file.
# class_names are given in ascending class order (code 0 -> 'No', code 1 -> 'Yes').
export_graphviz(
    model_reg,
    out_file='credit_tree_regularized.dot',
    feature_names=X_train.columns.tolist(),
    class_names=['No', 'Yes'],
)

In [21]:
# Accuracy of the depth-limited tree on the training split, then on the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(model_reg.score(features, target))

0.86
0.7


In [22]:
# Class predictions of the depth-limited tree for the held-out test rows.
y_reg_predict = model_reg.predict(X_test)

In [23]:
# Confusion matrix for the depth-limited tree: rows are the true class,
# columns the predicted class.
confusion_matrix(y_true=y_test, y_pred=y_reg_predict)

array([[172,  42],
       [ 48,  38]], dtype=int64)

In [24]:
# Show the first rows of the encoded frame again for reference.
df.head()

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,1,6,0,4,1169,4,3,4,4,67,1,1,2,1,1,1,0
1,0,48,1,4,5951,2,0,2,2,22,1,1,1,1,1,0,1
2,3,12,0,3,2096,2,1,2,3,49,1,1,1,3,2,0,0
3,1,42,1,4,7882,2,1,2,4,45,1,0,1,1,2,0,0
4,1,24,3,1,4870,2,0,3,4,53,1,0,2,1,2,0,1
