In [1]:
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.model_selection import KFold
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from IPython.display import clear_output

import json

import warnings

from sklearn import preprocessing

In [2]:
# Random Forest
TREE_PARAMS = {
    "random_state" : 0,
    "max_depth": 7,
    "n_jobs" : -1
}

In [3]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [4]:
app_train = pd.read_csv(DATASET_DIR / "4. FillNA" / "application.csv")
bureau = pd.read_csv(DATASET_DIR / "4. FillNA" / "bureau.csv")
pa_pos = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_pos.csv")
pa_ip = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_ip.csv")
pa_cc = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_cc.csv")

In [5]:
app_train = app_train.set_index("SK_ID_CURR")
bureau = bureau.set_index("SK_ID_CURR")
pa_pos = pa_pos.set_index("SK_ID_CURR")
pa_ip = pa_ip.set_index("SK_ID_CURR")
pa_cc = pa_cc.set_index("SK_ID_CURR")

In [6]:
MODEL_APPLICATION = "3.1. Esembler_RandomForest_Application.json"
MODEL_BUREAU = "3.2. Esembler_RandomForest_bureau.json"
MODEL_PA_POS = "3.3. Esembler_RandomForest_pa_pos.json"
MODEL_PA_IP = "3.4. Esembler_RandomForest_pa_ip.json"
MODEL_PA_CC = "3.5. Esembler_RandomForest_pa_cc.json"

In [7]:
with open(DATASET_DIR / "Models" / MODEL_APPLICATION, 'r') as file:
    model_application_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_BUREAU, 'r') as file:
    model_bureau_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_POS, 'r') as file:
    model_pa_pos_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_IP, 'r') as file:
    model_pa_ip_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_CC, 'r') as file:
    model_pa_cc_data = json.load(file)

In [8]:
print(len(model_application_data["keep"]))
print(len(model_bureau_data["keep"]))
print(len(model_pa_pos_data["keep"]))
print(len(model_pa_ip_data["keep"]))
print(len(model_pa_cc_data["keep"]))

7
6
16
28
21


In [9]:
model_application = RandomForestClassifier(**TREE_PARAMS)
model_bureau = RandomForestClassifier(**TREE_PARAMS)
model_pa_pos = RandomForestClassifier(**TREE_PARAMS)
model_pa_ip = RandomForestClassifier(**TREE_PARAMS)
model_pa_cc = RandomForestClassifier(**TREE_PARAMS)

In [10]:
df = app_train[["TARGET"]]
df.head()

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
100002,1.0
100003,0.0
100004,0.0
100006,0.0
100007,0.0


In [11]:
# application prediction

In [12]:
x = app_train[model_application_data["keep"]]
x.head()

Unnamed: 0_level_0,A_EXT_SOURCE_3,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_AMT_REQ_CREDIT_BUREAU_DAY,A_BASEMENTAREA_AVG,A_DAYS_BIRTH,A_LANDAREA_AVG
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,0.139376,0.083037,0.262949,0.0,0.0369,-9461.0,0.0369
100003,0.361809,0.311267,0.622246,0.0,0.0529,-16765.0,0.013
100004,0.729567,0.550253,0.555912,0.0,0.07827,-19046.0,0.071173
100006,0.565391,0.664796,0.650442,0.005328,0.078591,-19005.0,0.071143
100007,0.565607,0.557628,0.322738,0.0,0.073712,-19932.0,0.053231


In [13]:
y = app_train.loc[app_train.index]["TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100006    0.0
100007    0.0
Name: TARGET, dtype: float64

In [14]:
model_application.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [15]:
predicted = pd.DataFrame(model_application.predict_proba(x), index=app_train.index)
predicted.columns = ["A_PAYBACK", "A_DEFAULT"]

In [16]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100002,1.0,0.621288,0.378712
100003,0.0,0.799484,0.200516
100004,0.0,0.963516,0.036484
100006,0.0,0.985105,0.014895
100007,0.0,0.93007,0.06993


In [17]:
# bureau prediction

In [18]:
x = bureau[model_bureau_data["keep"]]
x.head()

Unnamed: 0_level_0,B_Active,B_AMT_CREDIT_SUM,B_Closed,B_AMT_CREDIT_SUM_DEBT,B_DAYS_CREDIT_ENDDATE,B_CREDIT_DAY_OVERDUE
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100002,2.0,212745.0,6.0,81927.0,309.0,0.0
100003,1.0,810000.0,3.0,0.0,1216.0,0.0
100004,0.0,414243.226314,2.0,168078.269668,2294.569372,0.000598
100007,0.0,414219.927736,1.0,168159.053569,2323.521925,0.000844
100008,1.0,267606.0,2.0,240057.0,471.0,0.0


In [19]:
y = app_train.loc[bureau.index]["TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100007    0.0
100008    0.0
Name: TARGET, dtype: float64

In [20]:
model_bureau.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [21]:
predicted = pd.DataFrame(model_bureau.predict_proba(x), index=bureau.index)
predicted.columns = ["B_PAYBACK", "B_DEFAULT"]

In [22]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100002,1.0,0.621288,0.378712,0.939745,0.060255
100003,0.0,0.799484,0.200516,0.947487,0.052513
100004,0.0,0.963516,0.036484,0.944023,0.055977
100006,0.0,0.985105,0.014895,,
100007,0.0,0.93007,0.06993,0.930533,0.069467


In [23]:
# pa_pos prediction

In [24]:
x = pa_pos[model_pa_pos_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_ANNUITY,PA_AMT_GOODS_PRICE,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_AMT_DOWN_PAYMENT,PA_POS_CNT_DPD,PA_AMT_CREDIT,PA_RATE_DOWN_PAYMENT,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_NAME_YIELD_GROUP_high,PA_POS_CNT_PAYMENTS_LEFT,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_PRODUCT_COMBINATION_POS industry with interest
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
158271,0.0,6404.31,58905.0,0.0,0.0,0.0,65124.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
252457,0.0,4951.35,47056.275,1.0,4.275,0.0,52641.0,8.8e-05,0.0,0.0,9.0,0.0,1.0,0.0,0.0,0.0
260094,1.0,97839.945,2023469.37,1.0,8765.37,0.0,2294779.5,0.312513,1.0,4.0,70.0,1.0,0.0,3.0,4.0,0.0
176456,0.0,14713.605,123486.075,0.0,12349.575,0.0,120307.5,0.101388,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
256657,7.0,60139.575,1115806.5,5.0,10246.286731,0.0,1277523.0,0.699443,5.0,0.0,28.0,1.0,2.0,2.0,0.0,0.0


In [25]:
y = app_train.loc[pa_pos.index]["TARGET"]
y.head()

SK_ID_CURR
158271    0.0
252457    0.0
260094    0.0
176456    0.0
256657    0.0
Name: TARGET, dtype: float64

In [26]:
model_pa_pos.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [27]:
predicted = pd.DataFrame(model_pa_pos.predict_proba(x), index=pa_pos.index)
predicted.columns = ["PA_POS_PAYBACK", "PA_POS_DEFAULT"]

In [28]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,1.0,0.621288,0.378712,0.939745,0.060255,0.921243,0.078757
100003,0.0,0.799484,0.200516,0.947487,0.052513,0.955714,0.044286
100004,0.0,0.963516,0.036484,0.944023,0.055977,0.898008,0.101992
100006,0.0,0.985105,0.014895,,,0.939268,0.060732
100007,0.0,0.93007,0.06993,0.930533,0.069467,0.927158,0.072842


In [29]:
# pa_ip prediction

In [30]:
x = pa_ip[model_pa_ip_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_IP_DAYS_DIFF_90,PA_IP_DAYS_DIFF_120,PA_IP_DAYS_DIFF_60,PA_AMT_ANNUITY,PA_IP_DAYS_DIFF_30,PA_AMT_CREDIT,PA_IP_PAYMENT_180,PA_AMT_APPLICATION,PA_CNT_PRODUCT_COMBINATION_Card Street,PA_AMT_GOODS_PRICE,PA_IP_AMT_DIFF_180,PA_IP_PAYMENT_60,PA_IP_AMT_DIFF_60,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_IP_AMT_DIFF_90,PA_IP_PAYMENT_120,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_IP_PAYMENT_30,PA_CNT_NAME_PORTFOLIO_Cards,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_AMT_DOWN_PAYMENT,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_CNT_CREDITS
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
252457,1.0,0.0,-17.0,-14.0,-15.0,4951.35,0.0,52641.0,-0.912871,47056.275,1.0,47056.275,0.0,0.9128709,0.0,0.0,1.0,0.0,0.912871,0.0,-0.9128709,1.0,0.0,0.0,2.0,4.275,0.0,1.0
256657,5.0,7.0,-12.0,-12.5,-13.0,49054.77,-6.0,1198062.0,-1.020621,978570.0,1.0,978570.0,0.0,0.2041241,0.0,2.0,2.0,0.0,0.204124,0.0,0.2041241,2.0,1.0,0.0,0.0,15714.0,0.0,2.0
198678,0.0,0.0,-6.5,-1.0,-21.5,51759.551912,-20.0,941661.0,0.000204,921721.5,0.0,921721.5,0.0,4.540312e-08,0.0,0.0,0.0,0.0,-0.000407,0.0,4.540312e-08,0.0,1.0,0.0,0.0,13347.0,0.0,2.0
394447,1.0,0.0,-1.0,-2.0,-1.0,2250.0,0.0,45000.0,1.215377,45000.0,1.0,45000.0,0.0,0.814828,0.0,0.0,1.0,0.0,-0.867506,0.0,0.6508755,1.0,2.0,0.0,0.0,0.0,0.0,1.0
156331,0.0,0.0,0.0,-1.0,0.0,40866.795,0.0,793035.0,0.616385,793035.0,1.0,793035.0,0.0,0.01416426,0.0,1.0,2.0,0.0,-1.114368,0.0,-0.7381905,1.0,1.0,3.0,0.0,0.0,0.0,2.0


In [31]:
y = app_train.loc[pa_ip.index]["TARGET"]
y.head()

SK_ID_CURR
252457    0.0
256657    0.0
198678    0.0
394447    0.0
156331    0.0
Name: TARGET, dtype: float64

In [32]:
model_pa_ip.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [33]:
predicted = pd.DataFrame(model_pa_ip.predict_proba(x), index=pa_ip.index)
predicted.columns = ["PA_IP_PAYBACK", "PA_IP_DEFAULT"]

In [34]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100002,1.0,0.621288,0.378712,0.939745,0.060255,0.921243,0.078757,0.93927,0.06073
100003,0.0,0.799484,0.200516,0.947487,0.052513,0.955714,0.044286,,
100004,0.0,0.963516,0.036484,0.944023,0.055977,0.898008,0.101992,,
100006,0.0,0.985105,0.014895,,,0.939268,0.060732,0.936439,0.063561
100007,0.0,0.93007,0.06993,0.930533,0.069467,0.927158,0.072842,0.915559,0.084441


In [35]:
# pa_cc prediction

In [36]:
x = pa_cc[model_pa_cc_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CC_BALANCE_60,PA_CC_BALANCE_30,PA_CC_BALANCE_90,PA_CC_BALANCE_180,PA_CC_BALANCE_120,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CC_PAYMENT_60,PA_CC_PAYMENT_180,PA_CC_PAYMENT_120,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_CREDIT,PA_AMT_APPLICATION,PA_AMT_GOODS_PRICE,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_CC_DPD_180
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
394447,1.434002,-1.206367,0.992963,-0.455389,-0.349513,1.0,1.008387,1.318045,-0.292196,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
361282,0.828945,0.946004,0.861611,-1.16908,-0.39311,1.0,1.109609,-1.152838,-1.267179,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
436351,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,3.0,2.0,0.0,3.0,6.0,1.0,0.0,0.0
181153,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,0.0
309691,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,45000.0,45000.0,45000.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [37]:
y = app_train.loc[pa_cc.index]["TARGET"]
y.head()

SK_ID_CURR
394447    0.0
361282    0.0
436351    0.0
181153    0.0
309691    0.0
Name: TARGET, dtype: float64

In [38]:
model_pa_cc.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [39]:
predicted = pd.DataFrame(model_pa_cc.predict_proba(x), index=pa_cc.index)
predicted.columns = ["PA_CC_PAYBACK", "PA_CC_DEFAULT"]

In [40]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.621288,0.378712,0.939745,0.060255,0.921243,0.078757,0.93927,0.06073,,
100003,0.0,0.799484,0.200516,0.947487,0.052513,0.955714,0.044286,,,,
100004,0.0,0.963516,0.036484,0.944023,0.055977,0.898008,0.101992,,,,
100006,0.0,0.985105,0.014895,,,0.939268,0.060732,0.936439,0.063561,0.952863,0.047137
100007,0.0,0.93007,0.06993,0.930533,0.069467,0.927158,0.072842,0.915559,0.084441,,


In [41]:
df = df.fillna(0.5)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.621288,0.378712,0.939745,0.060255,0.921243,0.078757,0.93927,0.06073,0.5,0.5
100003,0.0,0.799484,0.200516,0.947487,0.052513,0.955714,0.044286,0.5,0.5,0.5,0.5
100004,0.0,0.963516,0.036484,0.944023,0.055977,0.898008,0.101992,0.5,0.5,0.5,0.5
100006,0.0,0.985105,0.014895,0.5,0.5,0.939268,0.060732,0.936439,0.063561,0.952863,0.047137
100007,0.0,0.93007,0.06993,0.930533,0.069467,0.927158,0.072842,0.915559,0.084441,0.5,0.5


In [42]:
# finales Modell

In [43]:
model = RandomForestClassifier(**TREE_PARAMS)

aucs = []

kfold = KFold(5, shuffle=True, random_state=1)
data = df

y = df["TARGET"]
x = df.drop(["TARGET"], axis=1)

for ID_TRAIN, ID_TEST in kfold.split(data):
    
    x_train = x.iloc[ID_TRAIN]
    y_train = y.iloc[ID_TRAIN]
    x_test = x.iloc[ID_TEST]
    y_test = y.iloc[ID_TEST]
    
    model.fit(x_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(x_test)[:,1])
    aucs.append(auc)
    
    print('train: %s, test: %s, auc: %s' % (ID_TRAIN, ID_TEST, auc))

print("\n")
print("Durchschnitt: %.2f" % (np.mean(aucs)))
print("Standardabw.: %.2f" % (np.std(aucs)))
print("Varianz:      %.2f" % (np.var(aucs)))

train: [     0      2      3 ... 307508 307509 307510], test: [     1      4      6 ... 307493 307495 307501], auc: 0.834300087045187
train: [     0      1      2 ... 307508 307509 307510], test: [     3     18     22 ... 307499 307503 307504], auc: 0.8285808454843269
train: [     0      1      2 ... 307508 307509 307510], test: [     9     10     15 ... 307489 307492 307502], auc: 0.8270255318746165
train: [     0      1      2 ... 307506 307507 307508], test: [    12     16     23 ... 307505 307509 307510], auc: 0.8318118067988715
train: [     1      3      4 ... 307505 307509 307510], test: [     0      2      5 ... 307506 307507 307508], auc: 0.8271904176669792


Durchschnitt: 0.83
Standardabw.: 0.00
Varianz:      0.00


In [44]:
model = RandomForestClassifier(**TREE_PARAMS)
model = model.fit(x, y)

In [45]:
app_test = pd.read_csv(DATASET_DIR / "4. FillNA" / "app_test.csv")
bureau = pd.read_csv(DATASET_DIR / "4. FillNA" / "bureau_all.csv")
pa_pos = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_pos_all.csv")
pa_ip = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_ip_all.csv")
pa_cc = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_cc_all.csv")

In [46]:
app_test = app_test.set_index("SK_ID_CURR")
bureau = bureau.set_index("SK_ID_CURR")
pa_pos = pa_pos.set_index("SK_ID_CURR")
pa_ip = pa_ip.set_index("SK_ID_CURR")
pa_cc = pa_cc.set_index("SK_ID_CURR")

In [47]:
df = pd.DataFrame(index=app_test.index)
df.head()

100001
100005
100013
100028
100038


In [48]:
# application prediction

In [49]:
x = app_test[model_application_data["keep"]]
x.head()

Unnamed: 0_level_0,A_EXT_SOURCE_3,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_AMT_REQ_CREDIT_BUREAU_DAY,A_BASEMENTAREA_AVG,A_DAYS_BIRTH,A_LANDAREA_AVG
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100001,0.15952,0.752614,0.789654,0.0,0.059,-19241.0,0.018102
100005,0.432962,0.56499,0.291656,0.0,0.086861,-18064.0,0.061702
100013,0.610991,0.719399,0.699787,0.0,0.091161,-20038.0,0.074275
100028,0.612704,0.525734,0.509677,0.0,0.1974,-13976.0,0.2042
100038,0.412998,0.202145,0.425687,0.008519,0.057558,-13040.0,0.08481


In [52]:
predicted = pd.DataFrame(model_application.predict_proba(x), index=app_test.index)
predicted.columns = ["A_PAYBACK", "A_DEFAULT"]

In [53]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100001,0.931503,0.068497
100005,0.896679,0.103321
100013,0.986201,0.013799
100028,0.970504,0.029496
100038,0.795132,0.204868


In [54]:
# bureau prediction

In [55]:
x = bureau[model_bureau_data["keep"]]
x.head()

Unnamed: 0_level_0,B_Active,B_AMT_CREDIT_SUM,B_Closed,B_AMT_CREDIT_SUM_DEBT,B_DAYS_CREDIT_ENDDATE,B_CREDIT_DAY_OVERDUE
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100001,3.0,290936.25,4.0,149171.625,728.0,0.0
100002,2.0,212745.0,6.0,81927.0,309.0,0.0
100003,1.0,810000.0,3.0,0.0,1216.0,0.0
100004,0.0,592024.411371,2.0,278490.012728,2401.338443,-0.099348
100005,2.0,219042.0,1.0,189469.5,439.333333,0.0


In [56]:
predicted = pd.DataFrame(model_bureau.predict_proba(x), index=bureau.index)
predicted.columns = ["B_PAYBACK", "B_DEFAULT"]

In [57]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100001,0.931503,0.068497,0.929709,0.070291
100005,0.896679,0.103321,0.912473,0.087527
100013,0.986201,0.013799,0.92119,0.07881
100028,0.970504,0.029496,0.897605,0.102395
100038,0.795132,0.204868,,


In [58]:
# pa_pos prediction

In [59]:
x = pa_pos[model_pa_pos_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_ANNUITY,PA_AMT_GOODS_PRICE,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_AMT_DOWN_PAYMENT,PA_POS_CNT_DPD,PA_AMT_CREDIT,PA_RATE_DOWN_PAYMENT,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_NAME_YIELD_GROUP_high,PA_POS_CNT_PAYMENTS_LEFT,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_PRODUCT_COMBINATION_POS industry with interest
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
100001,0.0,3951.0,24835.5,0.0,2520.0,0.0,23787.0,0.104326,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
100002,0.0,9251.775,179055.0,0.0,0.0,0.0,179055.0,0.0,0.0,0.0,6.0,1.0,0.0,1.0,0.0,0.0
100003,0.0,169661.97,1306309.5,0.0,6885.0,0.0,1452573.0,0.100061,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
100004,0.0,5357.25,24282.0,0.0,4860.0,0.0,20106.0,0.212008,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
100005,0.0,4813.2,44617.5,0.0,4464.0,0.0,40153.5,0.108964,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [60]:
predicted = pd.DataFrame(model_pa_pos.predict_proba(x), index=pa_pos.index)
predicted.columns = ["PA_POS_PAYBACK", "PA_POS_DEFAULT"]

In [61]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100001,0.931503,0.068497,0.929709,0.070291,0.915474,0.084526
100005,0.896679,0.103321,0.912473,0.087527,0.891178,0.108822
100013,0.986201,0.013799,0.92119,0.07881,0.945056,0.054944
100028,0.970504,0.029496,0.897605,0.102395,0.932968,0.067032
100038,0.795132,0.204868,,,0.900141,0.099859


In [62]:
# pa_ip prediction

In [63]:
x = pa_ip[model_pa_ip_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_IP_DAYS_DIFF_90,PA_IP_DAYS_DIFF_120,PA_IP_DAYS_DIFF_60,PA_AMT_ANNUITY,PA_IP_DAYS_DIFF_30,PA_AMT_CREDIT,PA_IP_PAYMENT_180,PA_AMT_APPLICATION,PA_CNT_PRODUCT_COMBINATION_Card Street,PA_AMT_GOODS_PRICE,PA_IP_AMT_DIFF_180,PA_IP_PAYMENT_60,PA_IP_AMT_DIFF_60,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_IP_AMT_DIFF_90,PA_IP_PAYMENT_120,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_IP_PAYMENT_30,PA_CNT_NAME_PORTFOLIO_Cards,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_AMT_DOWN_PAYMENT,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_CNT_CREDITS
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
252457,1.0,0.0,-17.0,-14.0,-15.0,4951.35,0.0,52641.0,-0.912871,47056.275,1.0,47056.275,0.0,0.9128709,0.0,0.0,1.0,0.0,0.912871,0.0,-0.9128709,1.0,0.0,0.0,2.0,4.275,0.0,1.0
256657,5.0,7.0,-12.0,-12.5,-13.0,49054.77,-6.0,1198062.0,-1.020621,978570.0,1.0,978570.0,0.0,0.2041241,0.0,2.0,2.0,0.0,0.204124,0.0,0.2041241,2.0,1.0,0.0,0.0,15714.0,0.0,2.0
198678,0.0,0.0,-6.5,-1.0,-21.5,98806.05,-20.0,941661.0,0.000204,921721.5,0.0,921721.5,0.0,4.540312e-08,0.0,0.0,0.0,0.0,-0.000407,0.0,4.540312e-08,0.0,1.0,0.0,0.0,13347.0,0.0,2.0
310743,0.0,1.0,0.0,0.0,0.0,7002.72,0.0,75024.0,2.041241,67864.23,0.0,67864.23,0.0,-0.4082483,0.0,2.0,0.0,0.0,-0.408248,1.0,-0.4082483,0.0,1.0,0.0,0.0,4.23,0.0,1.0
394447,1.0,0.0,-7.0,-6.5,-7.0,17141.445,-0.5,129132.0,-0.412932,129132.0,1.0,129132.0,0.0,0.6115381,0.0,0.0,1.0,0.0,-0.229629,0.0,0.5295619,1.0,2.0,0.0,0.0,0.0,0.0,2.0


In [64]:
predicted = pd.DataFrame(model_pa_ip.predict_proba(x), index=pa_ip.index)
predicted.columns = ["PA_IP_PAYBACK", "PA_IP_DEFAULT"]

In [65]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100001,0.931503,0.068497,0.929709,0.070291,0.915474,0.084526,,
100005,0.896679,0.103321,0.912473,0.087527,0.891178,0.108822,,
100013,0.986201,0.013799,0.92119,0.07881,0.945056,0.054944,0.872121,0.127879
100028,0.970504,0.029496,0.897605,0.102395,0.932968,0.067032,0.9211,0.0789
100038,0.795132,0.204868,,,0.900141,0.099859,,


In [66]:
# pa_cc prediction

In [67]:
x = pa_cc[model_pa_cc_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CC_BALANCE_60,PA_CC_BALANCE_30,PA_CC_BALANCE_90,PA_CC_BALANCE_180,PA_CC_BALANCE_120,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CC_PAYMENT_60,PA_CC_PAYMENT_180,PA_CC_PAYMENT_120,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_CREDIT,PA_AMT_APPLICATION,PA_AMT_GOODS_PRICE,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_CC_DPD_180
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
394447,1.434002,-1.206367,0.992963,-0.455389,-0.349513,1.0,1.008387,1.318045,-0.292196,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
361282,0.828945,0.946004,0.861611,-1.16908,-0.39311,1.0,1.109609,-1.152838,-1.267179,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
131335,0.719445,0.722068,0.791575,-1.619869,0.204483,1.0,-0.427508,2.030866,-0.215249,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
436351,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,3.0,2.0,0.0,3.0,6.0,1.0,0.0,0.0
181153,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,0.0


In [68]:
predicted = pd.DataFrame(model_pa_cc.predict_proba(x), index=pa_cc.index)
predicted.columns = ["PA_CC_PAYBACK", "PA_CC_DEFAULT"]

In [69]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100001,0.931503,0.068497,0.929709,0.070291,0.915474,0.084526,,,,
100005,0.896679,0.103321,0.912473,0.087527,0.891178,0.108822,,,,
100013,0.986201,0.013799,0.92119,0.07881,0.945056,0.054944,0.872121,0.127879,,
100028,0.970504,0.029496,0.897605,0.102395,0.932968,0.067032,0.9211,0.0789,0.894884,0.105116
100038,0.795132,0.204868,,,0.900141,0.099859,,,,


In [70]:
df = df.fillna(0.5)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100001,0.931503,0.068497,0.929709,0.070291,0.915474,0.084526,0.5,0.5,0.5,0.5
100005,0.896679,0.103321,0.912473,0.087527,0.891178,0.108822,0.5,0.5,0.5,0.5
100013,0.986201,0.013799,0.92119,0.07881,0.945056,0.054944,0.872121,0.127879,0.5,0.5
100028,0.970504,0.029496,0.897605,0.102395,0.932968,0.067032,0.9211,0.0789,0.894884,0.105116
100038,0.795132,0.204868,0.5,0.5,0.900141,0.099859,0.5,0.5,0.5,0.5


In [99]:
TARGET = model.predict_proba(df)[:,1]

In [100]:
solution = pd.DataFrame(TARGET, index=df.index)
solution.columns = ["TARGET"]

In [101]:
solution.head()

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
100001,0.05889
100005,0.121421
100013,0.014724
100028,0.027561
100038,0.210529


In [102]:
solution.to_csv(DATASET_DIR / "solution.csv")