In [11]:
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.model_selection import KFold
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Show small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Remove pandas' display truncation (columns, rows and cell width).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

from IPython.display import clear_output

import json

import warnings

from sklearn import preprocessing

In [12]:
# Decision tree hyper-parameters, shared by every DecisionTreeClassifier
# created in this notebook (fixed seed, depth capped at 7).
TREE_PARAMS = dict(random_state=0, max_depth=7)

In [13]:
# Candidate locations of the dataset; the first one that exists wins.
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    # Keep the historical fallback, but make a missing dataset visible here
    # instead of surfacing later as an obscure read error.
    DATASET_DIR = path2
    if not path2.is_dir():
        warnings.warn(
            f"Dataset directory not found: neither {path1} nor {path2} exists; "
            f"falling back to {path2}."
        )

In [14]:
# Load the five pre-processed tables from the "4. FillNA" stage, in the same
# order as the target names on the left-hand side.
app_train, bureau, pa_pos, pa_ip, pa_cc = (
    pd.read_csv(DATASET_DIR / "4. FillNA" / csv_name)
    for csv_name in (
        "application.csv",
        "bureau.csv",
        "pa_pos.csv",
        "pa_ip.csv",
        "pa_cc.csv",
    )
)

In [15]:
# Key every table by the applicant id so the tables can be aligned on the index.
app_train, bureau, pa_pos, pa_ip, pa_cc = (
    table.set_index("SK_ID_CURR")
    for table in (app_train, bureau, pa_pos, pa_ip, pa_cc)
)

In [16]:
# File names of the per-table model metadata JSON files (read from the
# "Models" sub-directory of the dataset directory).
(
    MODEL_APPLICATION,
    MODEL_BUREAU,
    MODEL_PA_POS,
    MODEL_PA_IP,
    MODEL_PA_CC,
) = (
    "2.1. Esembler_DecisionTree_Application.json",
    "2.2. Esembler_DecisionTree_bureau.json",
    "2.3. Esembler_DecisionTree_pa_pos.json",
    "2.4. Esembler_DecisionTree_pa_ip.json",
    "2.5. Esembler_DecisionTree_pa_cc.json",
)

In [17]:
def _load_model_metadata(file_name):
    """Parse one model metadata JSON file from ``DATASET_DIR / "Models"``.

    Parameters
    ----------
    file_name : str
        Name of the JSON file inside the ``Models`` directory.

    Returns
    -------
    object
        The decoded JSON document. Later cells read its ``"keep"`` entry.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # Read explicitly as UTF-8: without it, open() uses the locale's code page
    # (e.g. cp1252 on Windows), which can garble non-ASCII content.
    with open(DATASET_DIR / "Models" / file_name, "r", encoding="utf-8") as file:
        return json.load(file)


model_application_data = _load_model_metadata(MODEL_APPLICATION)
model_bureau_data = _load_model_metadata(MODEL_BUREAU)
model_pa_pos_data = _load_model_metadata(MODEL_PA_POS)
model_pa_ip_data = _load_model_metadata(MODEL_PA_IP)
model_pa_cc_data = _load_model_metadata(MODEL_PA_CC)

In [19]:
# One independent, identically configured decision tree per source table.
(
    model_application,
    model_bureau,
    model_pa_pos,
    model_pa_ip,
    model_pa_cc,
) = (DecisionTreeClassifier(**TREE_PARAMS) for _ in range(5))

In [20]:
# Start the result frame with only the label column; the cells below append
# each base model's predicted probabilities to it.
df = app_train.loc[:, ["TARGET"]]
df.head()

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
100002,1.0
100003,0.0
100004,0.0
100006,0.0
100007,0.0


In [21]:
# application prediction

In [23]:
# Feature matrix: only the application columns listed under "keep".
x = app_train.loc[:, model_application_data["keep"]]
x.head()

Unnamed: 0_level_0,A_EXT_SOURCE_3,A_AMT_REQ_CREDIT_BUREAU_DAY,A_DAYS_BIRTH,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_LANDAREA_AVG,A_AMT_REQ_CREDIT_BUREAU_HOUR,A_AMT_INCOME_TOTAL,A_DAYS_ID_PUBLISH,A_CODE_GENDER,A_OCCUPATION_TYPE,A_ORGANIZATION_TYPE,A_NAME_INCOME_TYPE,A_REGION_RATING_CLIENT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100002,0.139376,0.0,-9461.0,0.083037,0.262949,0.0369,0.0,202500.0,-2120.0,0.0,0.0,0.0,0.0,2.0
100003,0.361809,0.0,-16765.0,0.311267,0.622246,0.013,0.0,270000.0,-291.0,1.0,1.0,1.0,1.0,1.0
100004,0.729567,0.0,-19046.0,0.550253,0.555912,0.071173,0.0,67500.0,-2531.0,0.0,0.0,2.0,0.0,2.0
100006,0.565391,0.005328,-19005.0,0.664796,0.650442,0.071143,0.007517,135000.0,-2437.0,1.0,0.0,0.0,0.0,2.0
100007,0.565607,0.0,-19932.0,0.557628,0.322738,0.053231,0.0,121500.0,-3458.0,0.0,1.0,3.0,0.0,2.0


In [24]:
# Labels for the application model. Selecting the column directly keeps `y`
# row-aligned with `x` (both come from app_train) and avoids re-indexing the
# whole frame by its own index, which copies every column and would multiply
# rows if the index contained duplicate ids.
y = app_train["TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100006    0.0
100007    0.0
Name: TARGET, dtype: float64

In [25]:
# Train the application-level tree on all rows of x / y.
model_application.fit(X=x, y=y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [26]:
# Class probabilities for the rows the model was fitted on, keyed by applicant id.
# NOTE(review): the column names assume predict_proba's column 0 is TARGET == 0
# (payback) and column 1 is TARGET == 1 (default) - confirm via model.classes_.
predicted = pd.DataFrame(
    model_application.predict_proba(x),
    index=app_train.index,
    columns=["A_PAYBACK", "A_DEFAULT"],
)

In [27]:
# Append the application-model probabilities to the result frame.
# Columns left over from an earlier run of this cell are dropped first, so a
# re-run replaces them instead of creating suffixed "_x"/"_y" duplicates.
df = pd.merge(
    df.drop(columns=predicted.columns, errors="ignore"),
    predicted,
    how="left",
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100002,1.0,0.573145,0.426855
100003,0.0,0.780943,0.219057
100004,0.0,0.977739,0.022261
100006,0.0,0.99635,0.00365
100007,0.0,0.962839,0.037161


In [28]:
# bureau prediction

In [30]:
# Feature matrix: only the bureau columns listed under "keep".
x = bureau.loc[:, model_bureau_data["keep"]]
x.head()

Unnamed: 0_level_0,B_Closed,B_Active
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100002,6.0,2.0
100003,3.0,1.0
100004,2.0,0.0
100007,1.0,0.0
100008,2.0,1.0


In [31]:
# Labels for the applicants present in `bureau`, in bureau's row order.
# A single .loc[rows, column] lookup replaces the chained .loc[rows]["TARGET"],
# which first copied every application column for those rows.
y = app_train.loc[bureau.index, "TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100007    0.0
100008    0.0
Name: TARGET, dtype: float64

In [32]:
# Train the bureau-level tree on all rows of x / y.
model_bureau.fit(X=x, y=y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [33]:
# Class probabilities for the rows the model was fitted on, keyed by applicant id.
# NOTE(review): the column names assume predict_proba's column 0 is TARGET == 0
# (payback) and column 1 is TARGET == 1 (default) - confirm via model.classes_.
predicted = pd.DataFrame(
    model_bureau.predict_proba(x),
    index=bureau.index,
    columns=["B_PAYBACK", "B_DEFAULT"],
)

In [34]:
# Append the bureau-model probabilities; the left join leaves NaN for
# applicants without a row in `predicted`.
# Columns left over from an earlier run of this cell are dropped first, so a
# re-run replaces them instead of creating suffixed "_x"/"_y" duplicates.
df = pd.merge(
    df.drop(columns=predicted.columns, errors="ignore"),
    predicted,
    how="left",
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100002,1.0,0.573145,0.426855,0.944348,0.055652
100003,0.0,0.780943,0.219057,0.938875,0.061125
100004,0.0,0.977739,0.022261,0.945874,0.054126
100006,0.0,0.99635,0.00365,,
100007,0.0,0.962839,0.037161,0.9301,0.0699


In [35]:
# pa_pos prediction

In [36]:
# Feature matrix: only the pa_pos columns listed under "keep".
x = pa_pos.loc[:, model_pa_pos_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_AMT_ANNUITY,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_POS_CNT_DPD,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_AMT_APPLICATION,PA_AMT_DOWN_PAYMENT,PA_RATE_DOWN_PAYMENT,PA_AMT_GOODS_PRICE,PA_POS_CNT_PAYMENTS_LEFT,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_CNT_CREDITS,PA_CNT_PRODUCT_COMBINATION_Card Street,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_NAME_PORTFOLIO_Cards,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
158271,6404.31,0.0,0.0,0.0,58905.0,0.0,0.0,58905.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
252457,4951.35,0.0,0.0,1.0,47056.275,4.275,8.8e-05,47056.275,9.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0
260094,97839.945,1.0,0.0,1.0,2023469.37,8765.37,0.312513,2023469.37,70.0,4.0,1.0,3.0,0.0,9.0,0.0,1.0,0.0,4.0
176456,14713.605,0.0,0.0,0.0,123486.075,12349.575,0.101388,123486.075,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
256657,60139.575,7.0,0.0,5.0,1115806.5,10246.286731,0.699443,1115806.5,28.0,0.0,1.0,2.0,2.0,3.0,1.0,0.0,2.0,0.0


In [37]:
# Labels for the applicants present in `pa_pos`, in pa_pos' row order.
# A single .loc[rows, column] lookup replaces the chained .loc[rows]["TARGET"],
# which first copied every application column for those rows.
y = app_train.loc[pa_pos.index, "TARGET"]
y.head()

SK_ID_CURR
158271    0.0
252457    0.0
260094    0.0
176456    0.0
256657    0.0
Name: TARGET, dtype: float64

In [38]:
# Train the pa_pos-level tree on all rows of x / y.
model_pa_pos.fit(X=x, y=y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [39]:
# Class probabilities for the rows the model was fitted on, keyed by applicant id.
# NOTE(review): the column names assume predict_proba's column 0 is TARGET == 0
# (payback) and column 1 is TARGET == 1 (default) - confirm via model.classes_.
predicted = pd.DataFrame(
    model_pa_pos.predict_proba(x),
    index=pa_pos.index,
    columns=["PA_POS_PAYBACK", "PA_POS_DEFAULT"],
)

In [40]:
# Append the pa_pos-model probabilities; the left join leaves NaN for
# applicants without a row in `predicted`.
# Columns left over from an earlier run of this cell are dropped first, so a
# re-run replaces them instead of creating suffixed "_x"/"_y" duplicates.
df = pd.merge(
    df.drop(columns=predicted.columns, errors="ignore"),
    predicted,
    how="left",
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,1.0,0.573145,0.426855,0.944348,0.055652,0.913482,0.086518
100003,0.0,0.780943,0.219057,0.938875,0.061125,0.958792,0.041208
100004,0.0,0.977739,0.022261,0.945874,0.054126,0.912298,0.087702
100006,0.0,0.99635,0.00365,,,0.941321,0.058679
100007,0.0,0.962839,0.037161,0.9301,0.0699,0.926242,0.073758


In [41]:
# pa_ip prediction

In [42]:
# Feature matrix: only the pa_ip columns listed under "keep".
x = pa_ip.loc[:, model_pa_ip_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_AMT_ANNUITY,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_IP_AMT_DIFF_180,PA_IP_DAYS_DIFF_90,PA_IP_PAYMENT_120,PA_IP_DAYS_DIFF_30,PA_IP_DAYS_DIFF_60,PA_IP_AMT_DIFF_60,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_IP_PAYMENT_90,PA_AMT_APPLICATION,PA_IP_PAYMENT_60,PA_CNT_NAME_PORTFOLIO_Cards,PA_CNT_NAME_CLIENT_TYPE_New,PA_IP_AMT_DIFF_120,PA_AMT_DOWN_PAYMENT,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
252457,1.0,4951.35,0.0,0.0,-17.0,0.912871,0.0,-15.0,0.0,0.0,0.9128709,47056.275,0.9128709,1.0,0.0,0.0,4.275,1.0,0.0
256657,5.0,49054.77,7.0,0.0,-12.0,0.204124,-6.0,-13.0,0.0,2.0,0.2041241,978570.0,0.2041241,2.0,1.0,0.0,15714.0,2.0,0.0
198678,0.0,51759.551912,0.0,0.0,-6.5,-0.000407,-20.0,-21.5,0.0,0.0,4.540312e-08,921721.5,4.540312e-08,0.0,1.0,0.0,13347.0,0.0,0.0
394447,1.0,2250.0,0.0,0.0,-1.0,-0.867506,0.0,-1.0,0.0,0.0,-1.044958,45000.0,0.814828,1.0,2.0,0.0,0.0,1.0,0.0
156331,0.0,40866.795,0.0,0.0,0.0,-1.114368,0.0,0.0,0.0,1.0,0.01416426,793035.0,0.01416426,1.0,1.0,0.0,0.0,2.0,0.0


In [43]:
# Labels for the applicants present in `pa_ip`, in pa_ip's row order.
# A single .loc[rows, column] lookup replaces the chained .loc[rows]["TARGET"],
# which first copied every application column for those rows.
y = app_train.loc[pa_ip.index, "TARGET"]
y.head()

SK_ID_CURR
252457    0.0
256657    0.0
198678    0.0
394447    0.0
156331    0.0
Name: TARGET, dtype: float64

In [44]:
# Train the pa_ip-level tree on all rows of x / y.
model_pa_ip.fit(X=x, y=y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [45]:
# Class probabilities for the rows the model was fitted on, keyed by applicant id.
# NOTE(review): the column names assume predict_proba's column 0 is TARGET == 0
# (payback) and column 1 is TARGET == 1 (default) - confirm via model.classes_.
predicted = pd.DataFrame(
    model_pa_ip.predict_proba(x),
    index=pa_ip.index,
    columns=["PA_IP_PAYBACK", "PA_IP_DEFAULT"],
)

In [46]:
# Append the pa_ip-model probabilities; the left join leaves NaN for
# applicants without a row in `predicted`.
# Columns left over from an earlier run of this cell are dropped first, so a
# re-run replaces them instead of creating suffixed "_x"/"_y" duplicates.
df = pd.merge(
    df.drop(columns=predicted.columns, errors="ignore"),
    predicted,
    how="left",
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100002,1.0,0.573145,0.426855,0.944348,0.055652,0.913482,0.086518,0.936357,0.063643
100003,0.0,0.780943,0.219057,0.938875,0.061125,0.958792,0.041208,,
100004,0.0,0.977739,0.022261,0.945874,0.054126,0.912298,0.087702,,
100006,0.0,0.99635,0.00365,,,0.941321,0.058679,0.92772,0.07228
100007,0.0,0.962839,0.037161,0.9301,0.0699,0.926242,0.073758,0.897748,0.102252


In [47]:
# pa_cc prediction

In [48]:
# Feature matrix: only the pa_cc columns listed under "keep".
x = pa_cc.loc[:, model_pa_cc_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CC_BALANCE_60,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CC_BALANCE_90,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_CC_PAYMENT_180,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CC_BALANCE_150,PA_CC_DPD_30,PA_CC_DPD_90,PA_CC_PAYMENT_30,PA_CC_DPD_180,PA_CC_DPD_120,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_CREDITS
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
394447,1.434002,1.0,0.992963,0.0,1.318045,0.0,-0.415697,0.0,0.0,-1.389108,0.0,0.0,0.0,1.0
361282,0.828945,1.0,0.861611,0.0,-1.152838,0.0,-1.07437,0.0,0.0,0.850318,0.0,0.0,0.0,1.0
436351,0.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0
181153,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
309691,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [49]:
# Labels for the applicants present in `pa_cc`, in pa_cc's row order.
# A single .loc[rows, column] lookup replaces the chained .loc[rows]["TARGET"],
# which first copied every application column for those rows.
y = app_train.loc[pa_cc.index, "TARGET"]
y.head()

SK_ID_CURR
394447    0.0
361282    0.0
436351    0.0
181153    0.0
309691    0.0
Name: TARGET, dtype: float64

In [50]:
# Train the pa_cc-level tree on all rows of x / y.
model_pa_cc.fit(X=x, y=y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [51]:
# Class probabilities for the rows the model was fitted on, keyed by applicant id.
# NOTE(review): the column names assume predict_proba's column 0 is TARGET == 0
# (payback) and column 1 is TARGET == 1 (default) - confirm via model.classes_.
predicted = pd.DataFrame(
    model_pa_cc.predict_proba(x),
    index=pa_cc.index,
    columns=["PA_CC_PAYBACK", "PA_CC_DEFAULT"],
)

In [52]:
# Append the pa_cc-model probabilities; the left join leaves NaN for
# applicants without a row in `predicted`.
# Columns left over from an earlier run of this cell are dropped first, so a
# re-run replaces them instead of creating suffixed "_x"/"_y" duplicates.
df = pd.merge(
    df.drop(columns=predicted.columns, errors="ignore"),
    predicted,
    how="left",
    left_index=True,
    right_index=True,
)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.573145,0.426855,0.944348,0.055652,0.913482,0.086518,0.936357,0.063643,,
100003,0.0,0.780943,0.219057,0.938875,0.061125,0.958792,0.041208,,,,
100004,0.0,0.977739,0.022261,0.945874,0.054126,0.912298,0.087702,,,,
100006,0.0,0.99635,0.00365,,,0.941321,0.058679,0.92772,0.07228,0.950272,0.049728
100007,0.0,0.962839,0.037161,0.9301,0.0699,0.926242,0.073758,0.897748,0.102252,,


In [53]:
# Replace every missing value (applicants with no row in a given source table)
# with 0.5 for both the payback and the default probability.
# NOTE(review): fillna is applied to the whole frame, so a missing TARGET would
# also become 0.5 - presumably TARGET has no gaps; confirm.
df = df.fillna(value=0.5)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.573145,0.426855,0.944348,0.055652,0.913482,0.086518,0.936357,0.063643,0.5,0.5
100003,0.0,0.780943,0.219057,0.938875,0.061125,0.958792,0.041208,0.5,0.5,0.5,0.5
100004,0.0,0.977739,0.022261,0.945874,0.054126,0.912298,0.087702,0.5,0.5,0.5,0.5
100006,0.0,0.99635,0.00365,0.5,0.5,0.941321,0.058679,0.92772,0.07228,0.950272,0.049728
100007,0.0,0.962839,0.037161,0.9301,0.0699,0.926242,0.073758,0.897748,0.102252,0.5,0.5


In [54]:
# Final model

In [55]:
# 5-fold cross-validation of the final decision tree, which is trained on the
# base models' predicted probabilities collected in `df`.
#
# NOTE(review): each base model was fitted on all of its rows and then scored
# those same rows, so the features of every test fold were produced by models
# that had already seen that fold's labels. The AUC reported here is therefore
# optimistically biased; out-of-fold base predictions would be needed for an
# unbiased estimate.
model = DecisionTreeClassifier(**TREE_PARAMS)

aucs = []

kfold = KFold(5, shuffle=True, random_state=1)
data = df

y = df["TARGET"]
x = df.drop(["TARGET"], axis=1)

for ID_TRAIN, ID_TEST in kfold.split(data):

    x_train, y_train = x.iloc[ID_TRAIN], y.iloc[ID_TRAIN]
    x_test, y_test = x.iloc[ID_TEST], y.iloc[ID_TEST]

    # fit() retrains from scratch, so reusing one estimator across folds is safe.
    model.fit(x_train, y_train)

    # Named `fold_auc` (not `auc`) so the `auc` function imported from
    # sklearn.metrics is not overwritten by a float.
    fold_auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
    aucs.append(fold_auc)

    print('train: %s, test: %s, auc: %s' % (ID_TRAIN, ID_TEST, fold_auc))

# Use enough decimals that the spread between folds does not round to zero.
print("\n")
print("Durchschnitt: %.4f" % (np.mean(aucs)))
print("Standardabw.: %.4f" % (np.std(aucs)))
print("Varianz:      %.6f" % (np.var(aucs)))

train: [     0      2      3 ... 307508 307509 307510], test: [     1      4      6 ... 307493 307495 307501], auc: 0.8197617399157294
train: [     0      1      2 ... 307508 307509 307510], test: [     3     18     22 ... 307499 307503 307504], auc: 0.8137417827031361
train: [     0      1      2 ... 307508 307509 307510], test: [     9     10     15 ... 307489 307492 307502], auc: 0.8096337211388106
train: [     0      1      2 ... 307506 307507 307508], test: [    12     16     23 ... 307505 307509 307510], auc: 0.8148293386745673
train: [     1      3      4 ... 307505 307509 307510], test: [     0      2      5 ... 307506 307507 307508], auc: 0.8052350680266736


Durchschnitt: 0.81
Standardabw.: 0.00
Varianz:      0.00
