In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text


In [2]:
# Load the raw credit-scoring table; the path is relative to the notebook's working directory.
df = pd.read_csv('CreditScoring-master/CreditScoring.csv')

In [3]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4455 entries, 0 to 4454
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Status     4455 non-null   int64
 1   Seniority  4455 non-null   int64
 2   Home       4455 non-null   int64
 3   Time       4455 non-null   int64
 4   Age        4455 non-null   int64
 5   Marital    4455 non-null   int64
 6   Records    4455 non-null   int64
 7   Job        4455 non-null   int64
 8   Expenses   4455 non-null   int64
 9   Income     4455 non-null   int64
 10  Assets     4455 non-null   int64
 11  Debt       4455 non-null   int64
 12  Amount     4455 non-null   int64
 13  Price      4455 non-null   int64
dtypes: int64(14)
memory usage: 487.4 KB


In [7]:
# Normalise the header to lower case so columns can be referenced uniformly.
df.columns = df.columns.map(str.lower)
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [8]:
# Decode the integer category codes into readable string labels.
# In every mapping, code 0 stands for an unknown value ('unk').
status_mapping = {0:'unk', 1:'ok', 2:'default'}

home_mapping = {0:'unk', 1:'rent', 2:'owner', 3:'private', 4:'ignore', 5:'parents', 6:'other'}

marital_mapping = {0:'unk', 1:'single', 2:'married', 3:'widow', 4:'separated', 5:'divorced'}

records_mapping = {0:'unk', 1:'no', 2:'yes'}

job_mapping = {0:'unk', 1:'fixed', 2:'parttime', 3:'freelance', 4:'others'}

# Column name -> code/label dictionary, so all columns are decoded the same way.
categorical_mappings = {
    'status': status_mapping,
    'home': home_mapping,
    'marital': marital_mapping,
    'records': records_mapping,
    'job': job_mapping,
}

# Series.map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time on already-decoded (string) columns would
# wipe them out. Only decode columns that still hold the raw numeric codes,
# which makes the cell safe to re-run.
for column, mapping in categorical_mappings.items():
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(mapping)

In [9]:
# Preview the first rows to confirm the categorical codes now show as labels.
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [10]:
# Summary statistics of the numeric columns, rounded to whole numbers for readability.
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [12]:
# 99999999 is used as a placeholder in these three columns; turn it into NaN
# so it no longer distorts the summary statistics.
sentinel_columns = ['income', 'assets', 'debt']
df[sentinel_columns] = df[sentinel_columns].replace(to_replace=99999999, value=np.nan)
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [13]:
## `status` is the target variable: check how many rows fall into each class.
df['status'].value_counts()

ok         3200
default    1254
unk           1
Name: status, dtype: int64

In [14]:
# Keep only rows with a known outcome; rows whose status is 'unk' cannot be used as labels.
df = df.loc[df['status'].ne('unk')]

In [17]:
# Two-stage hold-out split producing 60% train / 20% validation / 20% test.
# Stage 1 sets aside 20% of all rows as the test set.
# Stage 2 takes 25% of the remaining 80% (= 20% of all rows) as validation.
split_seed = 23295
df_train_val, df_test = train_test_split(df, test_size=0.20, random_state=split_seed)
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=split_seed)

tuple(len(part) for part in (df_train, df_val, df_test))


(2672, 891, 891)

In [18]:
# Binary target: True where the client defaulted, False otherwise.
y_train = (df_train['status'] == 'default').to_numpy()
y_val = (df_val['status'] == 'default').to_numpy()

# Build the feature frames as new objects instead of mutating the frames
# returned by train_test_split in place (`del` / `inplace=True`): the target
# column is dropped so it cannot leak into the features, and missing values
# are imputed with 0.
df_train = df_train.drop(columns='status').fillna(0)
df_val = df_val.drop(columns='status').fillna(0)

In [20]:
# One {column: value} dict per row -- the input format DictVectorizer consumes.
dict_train, dict_val = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)

In [32]:
# Learn the feature layout from the training rows only, then encode both
# splits with that same fitted vectorizer (dense output).
dict_vec = DictVectorizer(sparse=False).fit(dict_train)
X_train, X_val = (dict_vec.transform(rows) for rows in (dict_train, dict_val))

In [40]:
# Names of the columns in the encoded matrices (string-valued fields appear as 'field=value').
dict_vec.get_feature_names_out()

array(['age', 'amount', 'assets', 'debt', 'expenses', 'home=ignore',
       'home=other', 'home=owner', 'home=parents', 'home=private',
       'home=rent', 'home=unk', 'income', 'job=fixed', 'job=freelance',
       'job=others', 'job=parttime', 'marital=divorced',
       'marital=married', 'marital=separated', 'marital=single',
       'marital=widow', 'price', 'records=no', 'records=yes', 'seniority',
       'time'], dtype=object)

In [46]:
# Baseline: a decision tree with no depth or leaf-size constraint.
# random_state is fixed (same seed as the data split) because tree fitting is
# randomised when candidate splits tie; without it the scores change from one
# run of the notebook to the next.
dt = DecisionTreeClassifier(random_state=23295)
dt.fit(X_train, y_train)

In [47]:
# AUC on the data the tree was fitted on; column 1 of predict_proba is the
# probability of the positive class (True = default).
y_pred = dt.predict_proba(X_train)[:,1]
roc_auc_score(y_train, y_pred)

1.0

In [48]:
# AUC on the held-out validation split, for comparison with the training AUC.
y_pred = dt.predict_proba(X_val)[:,1]
roc_auc_score(y_val, y_pred)

0.6473878540725229

In [50]:
# The unconstrained tree has clearly overfit the training set
# (training AUC far above validation AUC).
# Hyperparameters that help improve generalization:
# 1) max_depth

# random_state is fixed so the fitted tree is the same on every run.
dt = DecisionTreeClassifier(max_depth=2, random_state=23295)
dt.fit(X_train, y_train)

In [56]:
# Print the fitted tree's split rules as indented text, labelling each split
# with the vectorizer's feature names.
feature_labels = dict_vec.feature_names_
tree_text = export_text(dt, feature_names=feature_labels)
print(tree_text)

|--- seniority <= 2.50
|   |--- records=no <= 0.50
|   |   |--- class: True
|   |--- records=no >  0.50
|   |   |--- class: False
|--- seniority >  2.50
|   |--- records=no <= 0.50
|   |   |--- class: False
|   |--- records=no >  0.50
|   |   |--- class: False



In [58]:
# Score the current tree on both splits; similar values indicate little overfitting.
for label, X_part, y_part in (
    ('train_auc: ', X_train, y_train),
    ('validation auc: ', X_val, y_val),
):
    y_pred = dt.predict_proba(X_part)[:, 1]
    auc = roc_auc_score(y_part, y_pred)
    print(label, auc)

train_auc:  0.7179886004740519
validation auc:  0.7157132885117014


In [66]:
# Tune max_depth: fit one tree per candidate depth and report its validation AUC.
# None means "no depth limit".
max_depth_range = list(range(1, 21)) + [None]
for depth in max_depth_range:
    # Same seed for every candidate, so differences in AUC come from the depth
    # and not from run-to-run randomness in tree fitting.
    dt = DecisionTreeClassifier(max_depth=depth, random_state=23295)
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print('%4s -> %.3f' % (depth, auc))

   1 -> 0.626
   2 -> 0.716
   3 -> 0.758
   4 -> 0.773
   5 -> 0.768
   6 -> 0.756
   7 -> 0.734
   8 -> 0.697
   9 -> 0.672
  10 -> 0.656
  11 -> 0.641
  12 -> 0.648
  13 -> 0.642
  14 -> 0.648
  15 -> 0.651
  16 -> 0.648
  17 -> 0.638
  18 -> 0.647
  19 -> 0.637
  20 -> 0.640
None -> 0.644


In [67]:
## Next, tune min_samples_leaf (minimum number of samples per leaf)
## for the most promising depths.
best_max_depth = [3,4,5,6]
min_leaf_size_range = [1,5,10,15,20,50,100,200]
for m in best_max_depth:
    print('depth: %s' % m)
    for leaf_size in min_leaf_size_range:
        # Same seed for every candidate, so the grid compares hyperparameters
        # rather than run-to-run randomness in tree fitting.
        dt = DecisionTreeClassifier(
            max_depth=m,
            min_samples_leaf=leaf_size,
            random_state=23295,
        )
        dt.fit(X_train, y_train)
        y_pred = dt.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        print('%4s -> %.3f' % (leaf_size, auc))
    print()
    

depth: 3
   1 -> 0.758
   5 -> 0.758
  10 -> 0.758
  15 -> 0.758
  20 -> 0.758
  50 -> 0.758
 100 -> 0.760
 200 -> 0.751

depth: 4
   1 -> 0.773
   5 -> 0.773
  10 -> 0.771
  15 -> 0.771
  20 -> 0.771
  50 -> 0.772
 100 -> 0.770
 200 -> 0.762

depth: 5
   1 -> 0.768
   5 -> 0.759
  10 -> 0.767
  15 -> 0.768
  20 -> 0.774
  50 -> 0.777
 100 -> 0.777
 200 -> 0.765

depth: 6
   1 -> 0.756
   5 -> 0.754
  10 -> 0.762
  15 -> 0.766
  20 -> 0.771
  50 -> 0.790
 100 -> 0.782
 200 -> 0.766



In [69]:
## Train the final model with the selected hyperparameters.
# random_state is fixed so the final tree is reproducible across runs.
# NOTE(review): df_test is never scored in the visible cells -- evaluate the
# final model on it before reporting performance.
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=50, random_state=23295)
dt.fit(X_train, y_train)
