In [1]:
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.model_selection import KFold
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from IPython.display import clear_output

import json

import warnings

from sklearn import preprocessing

In [2]:
# Entscheidungsbaum
TREE_PARAMS = {
    "random_state" : 0,
    "max_depth": 7,
}

In [3]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [4]:
app_train = pd.read_csv(DATASET_DIR / "4. FillNA" / "application.csv")
bureau = pd.read_csv(DATASET_DIR / "4. FillNA" / "bureau.csv")
pa_pos = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_pos.csv")
pa_ip = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_ip.csv")
pa_cc = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_cc.csv")

In [5]:
app_train = app_train.set_index("SK_ID_CURR")
bureau = bureau.set_index("SK_ID_CURR")
pa_pos = pa_pos.set_index("SK_ID_CURR")
pa_ip = pa_ip.set_index("SK_ID_CURR")
pa_cc = pa_cc.set_index("SK_ID_CURR")

In [6]:
MODEL_APPLICATION = "2.1. Esembler_DecisionTree_Application.json"
MODEL_BUREAU = "2.2. Esembler_DecisionTree_bureau.json"
MODEL_PA_POS = "2.3. Esembler_DecisionTree_pa_pos.json"
MODEL_PA_IP = "2.4. Esembler_DecisionTree_pa_ip.json"
MODEL_PA_CC = "2.5. Esembler_DecisionTree_pa_cc.json"

In [7]:
with open(DATASET_DIR / "Models" / MODEL_APPLICATION, 'r') as file:
    model_application_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_BUREAU, 'r') as file:
    model_bureau_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_POS, 'r') as file:
    model_pa_pos_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_IP, 'r') as file:
    model_pa_ip_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_CC, 'r') as file:
    model_pa_cc_data = json.load(file)

In [8]:
print(len(model_application_data["keep"]))
print(len(model_bureau_data["keep"]))
print(len(model_pa_pos_data["keep"]))
print(len(model_pa_ip_data["keep"]))
print(len(model_pa_cc_data["keep"]))

12
6
12
16
14


In [9]:
model_application = DecisionTreeClassifier(**TREE_PARAMS)
model_bureau = DecisionTreeClassifier(**TREE_PARAMS)
model_pa_pos = DecisionTreeClassifier(**TREE_PARAMS)
model_pa_ip = DecisionTreeClassifier(**TREE_PARAMS)
model_pa_cc = DecisionTreeClassifier(**TREE_PARAMS)

In [10]:
df = app_train[["TARGET"]]
df.head()

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
100002,1.0
100003,0.0
100004,0.0
100006,0.0
100007,0.0


In [11]:
# application prediction

In [12]:
x = app_train[model_application_data["keep"]]
x.head()

Unnamed: 0_level_0,A_EXT_SOURCE_3,A_AMT_REQ_CREDIT_BUREAU_DAY,A_DAYS_BIRTH,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_AMT_REQ_CREDIT_BUREAU_QRT,A_AMT_REQ_CREDIT_BUREAU_WEEK,A_BASEMENTAREA_AVG,A_LANDAREA_AVG,A_AMT_ANNUITY,A_AMT_INCOME_TOTAL,A_ORGANIZATION_TYPE
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100002,0.139376,0.0,-9461.0,0.083037,0.262949,0.0,0.0,0.0369,0.0369,24700.5,202500.0,0.0
100003,0.344844,0.0,-16765.0,0.311267,0.622246,0.0,0.0,0.0529,0.013,35698.5,270000.0,1.0
100004,0.729567,0.0,-19046.0,0.550824,0.555912,0.0,0.0,0.075009,0.067504,6750.0,67500.0,2.0
100006,0.565898,0.005653,-19005.0,0.666039,0.650442,0.250421,0.044626,0.076735,0.068598,29686.5,135000.0,0.0
100007,0.57139,0.0,-19932.0,0.559619,0.322738,0.0,0.0,0.071043,0.048938,21865.5,121500.0,3.0


In [13]:
y = app_train.loc[app_train.index]["TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100006    0.0
100007    0.0
Name: TARGET, dtype: float64

In [14]:
model_application.fit(x,y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [15]:
predicted = pd.DataFrame(model_application.predict_proba(x), index=app_train.index)
predicted.columns = ["A_PAYBACK", "A_DEFAULT"]

In [16]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100002,1.0,0.682299,0.317701
100003,0.0,0.775732,0.224268
100004,0.0,0.977798,0.022202
100006,0.0,0.995996,0.004004
100007,0.0,0.954607,0.045393


In [17]:
# bureau prediction

In [18]:
x = bureau[model_bureau_data["keep"]]
x.head()

Unnamed: 0_level_0,B_Closed,B_Active,B_AMT_CREDIT_SUM,B_DAYS_CREDIT_ENDDATE,B_AMT_CREDIT_SUM_DEBT,CNT_BURAEU
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100002,6.0,2.0,638235.0,309.0,245781.0,8.0
100003,3.0,1.0,810000.0,1216.0,0.0,4.0
100004,2.0,0.0,792471.023437,1172.145688,353526.556281,2.0
100007,1.0,0.0,635755.340213,1163.754146,291016.898931,1.0
100008,2.0,1.0,267606.0,471.0,240057.0,3.0


In [19]:
y = app_train.loc[bureau.index]["TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100007    0.0
100008    0.0
Name: TARGET, dtype: float64

In [20]:
model_bureau.fit(x,y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [21]:
predicted = pd.DataFrame(model_bureau.predict_proba(x), index=bureau.index)
predicted.columns = ["B_PAYBACK", "B_DEFAULT"]

In [22]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100002,1.0,0.682299,0.317701,0.939873,0.060127
100003,0.0,0.775732,0.224268,0.95992,0.04008
100004,0.0,0.977798,0.022202,0.949971,0.050029
100006,0.0,0.995996,0.004004,,
100007,0.0,0.954607,0.045393,0.929822,0.070178


In [23]:
# pa_pos prediction

In [24]:
x = pa_pos[model_pa_pos_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_ANNUITY,PA_POS_CNT_DPD,PA_AMT_DOWN_PAYMENT,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_AMT_CREDIT,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_New,PA_POS_CNT_PAYMENTS_LEFT,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_CNT_CREDITS
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
158271,0.0,6404.31,0.0,0.0,0.0,65124.0,0.0,1.0,0.0,0.0,0.0,1.0
252457,0.0,4951.35,0.0,4.275,1.0,52641.0,0.0,0.0,9.0,0.0,1.0,1.0
260094,1.0,97839.945,0.0,8765.37,1.0,2294779.5,4.0,1.0,70.0,3.0,0.0,9.0
176456,0.0,14713.605,0.0,12349.575,0.0,120307.5,0.0,1.0,0.0,0.0,0.0,1.0
256657,7.0,60139.575,0.0,78714.0,5.0,1277523.0,0.0,1.0,28.0,2.0,2.0,3.0


In [25]:
y = app_train.loc[pa_pos.index]["TARGET"]
y.head()

SK_ID_CURR
158271    0.0
252457    0.0
260094    0.0
176456    0.0
256657    0.0
Name: TARGET, dtype: float64

In [26]:
model_pa_pos.fit(x,y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [27]:
predicted = pd.DataFrame(model_pa_pos.predict_proba(x), index=pa_pos.index)
predicted.columns = ["PA_POS_PAYBACK", "PA_POS_DEFAULT"]

In [28]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,1.0,0.682299,0.317701,0.939873,0.060127,0.90922,0.09078
100003,0.0,0.775732,0.224268,0.95992,0.04008,0.95664,0.04336
100004,0.0,0.977798,0.022202,0.949971,0.050029,0.90922,0.09078
100006,0.0,0.995996,0.004004,,,0.945975,0.054025
100007,0.0,0.954607,0.045393,0.929822,0.070178,0.928786,0.071214


In [29]:
# pa_ip prediction

In [30]:
x = pa_ip[model_pa_ip_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_IP_DAYS_DIFF_30,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_IP_DAYS_DIFF_180,PA_AMT_ANNUITY,PA_IP_DAYS_DIFF_60,PA_IP_AMT_DIFF_180,PA_IP_DAYS_DIFF_150,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_RATE_DOWN_PAYMENT,PA_IP_PAYMENT_60,PA_IP_AMT_DIFF_120,PA_CNT_CODE_REJECT_REASON_HC,PA_AMT_DOWN_PAYMENT,PA_CNT_PRODUCT_COMBINATION_Card Street,PA_AMT_APPLICATION
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
252457,1.0,0.0,0.0,0.0,4951.35,-15.0,0.0,0.0,0.0,8.8e-05,0.9128709,0.0,0.0,4.275,1.0,47056.275
256657,5.0,-6.0,7.0,-1.5,49054.77,-13.0,0.0,-13.0,2.0,0.217818,0.2041241,0.0,5.0,15714.0,1.0,978570.0
198678,0.0,-20.0,0.0,-0.5,98806.05,-21.5,0.0,0.5,0.0,0.193358,4.540312e-08,0.0,0.0,13347.0,0.0,921721.5
394447,1.0,-0.5,0.0,0.0,17141.445,-7.0,0.0,-8.5,0.0,0.0,0.6115381,0.0,0.0,0.0,1.0,129132.0
156331,0.0,0.0,0.0,-5.0,40866.795,0.0,0.0,-12.5,1.0,0.0,0.01416426,0.0,0.0,0.0,1.0,793035.0


In [31]:
y = app_train.loc[pa_ip.index]["TARGET"]
y.head()

SK_ID_CURR
252457    0.0
256657    0.0
198678    0.0
394447    0.0
156331    0.0
Name: TARGET, dtype: float64

In [32]:
model_pa_ip.fit(x,y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [33]:
predicted = pd.DataFrame(model_pa_ip.predict_proba(x), index=pa_ip.index)
predicted.columns = ["PA_IP_PAYBACK", "PA_IP_DEFAULT"]

In [34]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100002,1.0,0.682299,0.317701,0.939873,0.060127,0.90922,0.09078,0.934897,0.065103
100003,0.0,0.775732,0.224268,0.95992,0.04008,0.95664,0.04336,,
100004,0.0,0.977798,0.022202,0.949971,0.050029,0.90922,0.09078,,
100006,0.0,0.995996,0.004004,,,0.945975,0.054025,0.926338,0.073662
100007,0.0,0.954607,0.045393,0.929822,0.070178,0.928786,0.071214,0.902996,0.097004


In [35]:
# pa_cc prediction

In [36]:
x = pa_cc[model_pa_cc_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CC_BALANCE_60,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CC_BALANCE_90,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_CC_PAYMENT_180,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CC_BALANCE_150,PA_CC_DPD_30,PA_CC_DPD_90,PA_CC_PAYMENT_30,PA_CC_DPD_180,PA_CC_DPD_120,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_CREDITS
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
394447,1.434002,1.0,0.992963,0.0,1.318045,0.0,-0.415697,0.0,0.0,-1.389108,0.0,0.0,0.0,1.0
361282,0.828945,1.0,0.861611,0.0,-1.152838,0.0,-1.07437,0.0,0.0,0.850318,0.0,0.0,0.0,1.0
436351,0.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0
181153,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
309691,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [37]:
y = app_train.loc[pa_cc.index]["TARGET"]
y.head()

SK_ID_CURR
394447    0.0
361282    0.0
436351    0.0
181153    0.0
309691    0.0
Name: TARGET, dtype: float64

In [38]:
model_pa_cc.fit(x,y)

DecisionTreeClassifier(max_depth=7, random_state=0)

In [39]:
predicted = pd.DataFrame(model_pa_cc.predict_proba(x), index=pa_cc.index)
predicted.columns = ["PA_CC_PAYBACK", "PA_CC_DEFAULT"]

In [40]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.682299,0.317701,0.939873,0.060127,0.90922,0.09078,0.934897,0.065103,,
100003,0.0,0.775732,0.224268,0.95992,0.04008,0.95664,0.04336,,,,
100004,0.0,0.977798,0.022202,0.949971,0.050029,0.90922,0.09078,,,,
100006,0.0,0.995996,0.004004,,,0.945975,0.054025,0.926338,0.073662,0.950272,0.049728
100007,0.0,0.954607,0.045393,0.929822,0.070178,0.928786,0.071214,0.902996,0.097004,,


In [41]:
df = df.fillna(0.5)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.682299,0.317701,0.939873,0.060127,0.90922,0.09078,0.934897,0.065103,0.5,0.5
100003,0.0,0.775732,0.224268,0.95992,0.04008,0.95664,0.04336,0.5,0.5,0.5,0.5
100004,0.0,0.977798,0.022202,0.949971,0.050029,0.90922,0.09078,0.5,0.5,0.5,0.5
100006,0.0,0.995996,0.004004,0.5,0.5,0.945975,0.054025,0.926338,0.073662,0.950272,0.049728
100007,0.0,0.954607,0.045393,0.929822,0.070178,0.928786,0.071214,0.902996,0.097004,0.5,0.5


In [42]:
# finales Modell

In [43]:
model = DecisionTreeClassifier(**TREE_PARAMS)

aucs = []

kfold = KFold(5, shuffle=True, random_state=1)
data = df

y = df["TARGET"]
x = df.drop(["TARGET"], axis=1)

for ID_TRAIN, ID_TEST in kfold.split(data):
    
    x_train = x.iloc[ID_TRAIN]
    y_train = y.iloc[ID_TRAIN]
    x_test = x.iloc[ID_TEST]
    y_test = y.iloc[ID_TEST]
    
    model.fit(x_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(x_test)[:,1])
    aucs.append(auc)
    
    print('train: %s, test: %s, auc: %s' % (ID_TRAIN, ID_TEST, auc))

print("\n")
print("Durchschnitt: %.4f" % (np.mean(aucs)))
print("Standardabw.: %.2f" % (np.std(aucs)))
print("Varianz:      %.2f" % (np.var(aucs)))

train: [     0      2      3 ... 307508 307509 307510], test: [     1      4      6 ... 307493 307495 307501], auc: 0.815773280476733
train: [     0      1      2 ... 307508 307509 307510], test: [     3     18     22 ... 307499 307503 307504], auc: 0.8130569176748729
train: [     0      1      2 ... 307508 307509 307510], test: [     9     10     15 ... 307489 307492 307502], auc: 0.8071248658519812
train: [     0      1      2 ... 307506 307507 307508], test: [    12     16     23 ... 307505 307509 307510], auc: 0.8151383043048673
train: [     1      3      4 ... 307505 307509 307510], test: [     0      2      5 ... 307506 307507 307508], auc: 0.8050609075051435


Durchschnitt: 0.8112
Standardabw.: 0.00
Varianz:      0.00
