# Decision Tree and Ensemble Learning

## Credit Risk Scoring

* Build a model, that the bank can use to take a decision on whether they give a credit or not
* The model gives the risk, that a customer won't pay back the credit ("Risk of Defaulting")

In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

from sklearn.tree import plot_tree, export_text

## Data Cleaning and Preparation
* Dataset source (https://github.com/gastonstat/CreditScoring)
* Re-encoding catagorical variables
* Doing Train, Validation and Test split

In [1]:
# import wget

# url = 'https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv'

# wget.download(url)

'CreditScoring.csv'

In [2]:
!head CreditScoring.csv

"Status","Seniority","Home","Time","Age","Marital","Records","Job","Expenses","Income","Assets","Debt","Amount","Price"
1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
1,0,1,36,26,1,1,1,46,107,0,0,310,910
1,1,2,60,36,2,1,1,75,214,3500,0,650,1645
1,29,2,60,44,2,1,1,75,125,10000,0,1600,1800
1,9,5,12,27,1,1,1,35,80,0,0,200,1093
1,0,2,60,32,2,1,3,90,107,15000,0,1200,1957


### Data Features:
* (Target) **Status**: credit status
    ```
    1: 'ok',
    2: 'default',
    0: 'unk'
    ```
* **Seniority**: job seniority (years)
* **Home**: type of home ownership
    ```
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
    ```
* **Time**: time of requested loan
* **Age**: client's age
* **Marital**: marital status
    ```
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
    ```
* **Records**: existance of records
    ```
    1: 'no',
    2: 'yes',
    0: 'unk'
    ```
* **Job**: type of job
    ```
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
    ```
* **Expenses**: amount of expenses
* **Income**: amount of income
* **Assets**: amount of assets
* **Debt**: amount of debt
* **Amount**: amount requested of loan
* **Price**: price of good

In [5]:
df = pd.read_csv('CreditScoring.csv')

In [7]:
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [8]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [9]:
status_values = {
    1: 'ok',
    2: 'default',
    0: 'unk'
}

df.status = df.status.map(status_values)

home_values = {
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
    0: 'unk'
}

df.home = df.home.map(home_values)

marital_values = {
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
    0: 'unk'
}

df.marital = df.marital.map(marital_values)

records_values = {
    1: 'no',
    2: 'yes',
    0: 'unk'
}

df.records = df.records.map(records_values)

job_values = {
    1: 'fixed',
    2: 'partime',
    3: 'freelance',
    4: 'others',
    0: 'unk'
}

df.job = df.job.map(job_values)

In [10]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,ok,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,ok,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,default,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,ok,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,ok,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [11]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,763317.0,1060341.0,404382.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,8703625.0,10217569.0,6344253.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [12]:
for c in ['income', 'assets', 'debt']:
    df[c] = df[c].replace(to_replace=99999999, value=np.nan)

In [13]:
df.isnull().sum()

status        0
seniority     0
home          0
time          0
age           0
marital       0
records       0
job           0
expenses      0
income       34
assets       47
debt         18
amount        0
price         0
dtype: int64

In [14]:
df.describe().round()

Unnamed: 0,seniority,time,age,expenses,income,assets,debt,amount,price
count,4455.0,4455.0,4455.0,4455.0,4421.0,4408.0,4437.0,4455.0,4455.0
mean,8.0,46.0,37.0,56.0,131.0,5403.0,343.0,1039.0,1463.0
std,8.0,15.0,11.0,20.0,86.0,11573.0,1246.0,475.0,628.0
min,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1118.0
50%,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [15]:
df.status.value_counts()

ok         3200
default    1254
unk           1
Name: status, dtype: int64

In [16]:

df = df[df.status != 'unk']

In [17]:
df.isnull().sum()

status        0
seniority     0
home          0
time          0
age           0
marital       0
records       0
job           0
expenses      0
income       34
assets       47
debt         18
amount        0
price         0
dtype: int64

In [19]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)

In [20]:
y_train = (df_train.status == 'default').values
y_val = (df_val.status == 'default').values

In [21]:
del df_train['status']
del df_val['status']

In [22]:

len(df_train), len(df_val), len(df_test)

(2672, 891, 891)

In [44]:
categorical = list(df_train.dtypes[df_train.dtypes == 'object'].index)
categorical

['home', 'marital', 'records', 'job']

In [65]:
# encoder = ColumnTransformer(
#     [('number1', OneHotEncoder(sparse=False), categorical)],
#     remainder="passthrough"
# )
# # enc = OneHotEncoder(sparse=False, categories = categorical)
# X_train = encoder.fit_transform(df_train.fillna(0))

In [66]:
# X_train.shape

In [67]:
# X_train

In [74]:
dict_train = df_train.fillna(0).to_dict(orient='records')
dict_val = df_val.fillna(0).to_dict(orient='records')

In [75]:
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

## Modeling (Decision Tree)

In [77]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [83]:
y_pred = dt.predict_proba(X_train)[:,1]
roc_auc_score(y_pred, y_train)

1.0

In [85]:
y_pred = dt.predict_proba(X_val)[:,1]
roc_auc_score(y_pred, y_val)

0.6668311849185151

In [103]:
dt = DecisionTreeClassifier(max_depth=7)
dt.fit(X_train, y_train)

y_pred = dt.predict_proba(X_train)[:, 1]
auc = roc_auc_score(y_train, y_pred)
print('train auc: %.3f' % auc)

y_pred = dt.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
print('val auc: %.3f' % auc)

train auc: 0.900
val auc: 0.755


In [106]:
# print(plot_tree(dt))

In [105]:
print(export_text(dt, feature_names = dv.get_feature_names()))

|--- records=yes <= 0.50
|   |--- job=partime <= 0.50
|   |   |--- income <= 74.50
|   |   |   |--- assets <= 4250.00
|   |   |   |   |--- income <= 20.00
|   |   |   |   |   |--- seniority <= 1.50
|   |   |   |   |   |   |--- home=parents <= 0.50
|   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |--- home=parents >  0.50
|   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |--- seniority >  1.50
|   |   |   |   |   |   |--- assets <= 2450.00
|   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |   |--- assets >  2450.00
|   |   |   |   |   |   |   |--- class: False
|   |   |   |   |--- income >  20.00
|   |   |   |   |   |--- expenses <= 71.00
|   |   |   |   |   |   |--- debt <= 900.00
|   |   |   |   |   |   |   |--- class: False
|   |   |   |   |   |   |--- debt >  900.00
|   |   |   |   |   |   |   |--- class: True
|   |   |   |   |   |--- expenses >  71.00
|   |   |   |   |   |   |--- income <= 54.00
|   |   |   |   |   |   |   |--- cla

## Decision trees parameter tuning
* `max_depth`
* `min_sample_leaf`

In [108]:
for d in [1,2,3,4,5,6,10,15,20, None]:
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    
    y_pred = dt.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_pred, y_val)
    
    print('%4s -> %.3f' % (d, auc))    

   1 -> 0.669
   2 -> 0.652
   3 -> 0.672
   4 -> 0.684
   5 -> 0.683
   6 -> 0.670
  10 -> 0.663
  15 -> 0.674
  20 -> 0.678
None -> 0.670


In [None]:
for