In [1]:
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.model_selection import KFold
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import auc

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from IPython.display import clear_output

import json

import warnings

from sklearn import preprocessing

In [2]:
# Random Forest
TREE_PARAMS = {
    "random_state" : 0,
    "max_depth": 7,
    "n_jobs" : -1
}

In [3]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [4]:
app_train = pd.read_csv(DATASET_DIR / "4. FillNA" / "application.csv")
bureau = pd.read_csv(DATASET_DIR / "4. FillNA" / "bureau.csv")
pa_pos = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_pos.csv")
pa_ip = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_ip.csv")
pa_cc = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_cc.csv")

In [5]:
app_train = app_train.set_index("SK_ID_CURR")
bureau = bureau.set_index("SK_ID_CURR")
pa_pos = pa_pos.set_index("SK_ID_CURR")
pa_ip = pa_ip.set_index("SK_ID_CURR")
pa_cc = pa_cc.set_index("SK_ID_CURR")

In [6]:
MODEL_APPLICATION = "3.1. Esembler_RandomForest_Application.json"
MODEL_BUREAU = "3.2. Esembler_RandomForest_bureau.json"
MODEL_PA_POS = "3.3. Esembler_RandomForest_pa_pos.json"
MODEL_PA_IP = "3.4. Esembler_RandomForest_pa_ip.json"
MODEL_PA_CC = "3.5. Esembler_RandomForest_pa_cc.json"

In [7]:
with open(DATASET_DIR / "Models" / MODEL_APPLICATION, 'r') as file:
    model_application_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_BUREAU, 'r') as file:
    model_bureau_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_POS, 'r') as file:
    model_pa_pos_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_IP, 'r') as file:
    model_pa_ip_data = json.load(file)
    
with open(DATASET_DIR / "Models" / MODEL_PA_CC, 'r') as file:
    model_pa_cc_data = json.load(file)

In [8]:
print(len(model_application_data["keep"]))
print(len(model_bureau_data["keep"]))
print(len(model_pa_pos_data["keep"]))
print(len(model_pa_ip_data["keep"]))
print(len(model_pa_cc_data["keep"]))

10
7
17
27
21


In [9]:
model_application = RandomForestClassifier(**model_application_data["params"])
model_bureau = RandomForestClassifier(**TREE_PARAMS)
model_pa_pos = RandomForestClassifier(**TREE_PARAMS)
model_pa_ip = RandomForestClassifier(**TREE_PARAMS)
model_pa_cc = RandomForestClassifier(**TREE_PARAMS)

In [10]:
df = app_train[["TARGET"]]
df.head()

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
100002,1.0
100003,0.0
100004,0.0
100006,0.0
100007,0.0


In [11]:
# application prediction

In [12]:
x = app_train[model_application_data["keep"]]
x.head()

Unnamed: 0_level_0,A_EXT_SOURCE_3,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_AMT_REQ_CREDIT_BUREAU_DAY,A_AMT_REQ_CREDIT_BUREAU_HOUR,A_YEARS_BEGINEXPLUATATION_AVG,A_DAYS_EMPLOYED,A_DAYS_BIRTH,A_BASEMENTAREA_AVG,A_LANDAREA_AVG
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100002,0.139376,0.083037,0.262949,0.0,0.0,0.9722,-637.0,-9461.0,0.0369,0.0369
100003,0.344844,0.311267,0.622246,0.0,0.0,0.9851,-1188.0,-16765.0,0.0529,0.013
100004,0.729567,0.550824,0.555912,0.0,0.0,0.981667,-225.0,-19046.0,0.075009,0.067504
100006,0.565898,0.666039,0.650442,0.005653,0.008068,0.979744,-3039.0,-19005.0,0.076735,0.068598
100007,0.57139,0.559619,0.322738,0.0,0.0,0.982023,-3038.0,-19932.0,0.071043,0.048938


In [13]:
y = app_train.loc[app_train.index]["TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100006    0.0
100007    0.0
Name: TARGET, dtype: float64

In [14]:
model_application.fit(x,y)

RandomForestClassifier(max_depth=15, max_features=5, min_samples_leaf=12,
                       n_estimators=900, n_jobs=-1, random_state=0)

In [15]:
predicted = pd.DataFrame(model_application.predict_proba(x), index=app_train.index)
predicted.columns = ["A_PAYBACK", "A_DEFAULT"]

In [16]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100002,1.0,0.41187,0.58813
100003,0.0,0.854116,0.145884
100004,0.0,0.974147,0.025853
100006,0.0,0.998703,0.001297
100007,0.0,0.934973,0.065027


In [17]:
# bureau prediction

In [18]:
x = bureau[model_bureau_data["keep"]]
x.head()

Unnamed: 0_level_0,B_Active,B_Closed,B_AMT_CREDIT_SUM,B_AMT_CREDIT_SUM_DEBT,B_DAYS_CREDIT_ENDDATE,B_AMT_CREDIT_SUM_OVERDUE,B_AMT_ANNUITY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,2.0,6.0,638235.0,245781.0,309.0,0.0,0.0
100003,1.0,3.0,810000.0,0.0,1216.0,0.0,0.0
100004,0.0,2.0,792471.023437,353526.556281,1172.145688,0.001962,2735.621154
100007,0.0,1.0,635755.340213,291016.898931,1163.754146,0.002387,2634.913696
100008,1.0,2.0,267606.0,240057.0,471.0,0.0,0.0


In [19]:
y = app_train.loc[bureau.index]["TARGET"]
y.head()

SK_ID_CURR
100002    1.0
100003    0.0
100004    0.0
100007    0.0
100008    0.0
Name: TARGET, dtype: float64

In [20]:
model_bureau.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [21]:
predicted = pd.DataFrame(model_bureau.predict_proba(x), index=bureau.index)
predicted.columns = ["B_PAYBACK", "B_DEFAULT"]

In [22]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100002,1.0,0.41187,0.58813,0.939494,0.060506
100003,0.0,0.854116,0.145884,0.9502,0.0498
100004,0.0,0.974147,0.025853,0.940761,0.059239
100006,0.0,0.998703,0.001297,,
100007,0.0,0.934973,0.065027,0.929543,0.070457


In [23]:
# pa_pos prediction

In [24]:
x = pa_pos[model_pa_pos_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_ANNUITY,PA_AMT_GOODS_PRICE,PA_AMT_DOWN_PAYMENT,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_POS_CNT_DPD,PA_AMT_CREDIT,PA_RATE_DOWN_PAYMENT,PA_CNT_NAME_YIELD_GROUP_high,PA_POS_CNT_PAYMENTS_LEFT,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_CNT_CREDITS,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
158271,0.0,6404.31,58905.0,0.0,0.0,0.0,65124.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
252457,0.0,4951.35,47056.275,4.275,1.0,0.0,52641.0,8.8e-05,0.0,9.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0
260094,1.0,97839.945,2023469.37,8765.37,1.0,0.0,2294779.5,0.312513,4.0,70.0,1.0,3.0,0.0,9.0,4.0,1.0,0.0
176456,0.0,14713.605,123486.075,12349.575,0.0,0.0,120307.5,0.101388,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
256657,7.0,60139.575,1115806.5,78714.0,5.0,0.0,1277523.0,0.699443,0.0,28.0,1.0,2.0,2.0,3.0,0.0,0.0,0.0


In [25]:
y = app_train.loc[pa_pos.index]["TARGET"]
y.head()

SK_ID_CURR
158271    0.0
252457    0.0
260094    0.0
176456    0.0
256657    0.0
Name: TARGET, dtype: float64

In [26]:
model_pa_pos.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [27]:
predicted = pd.DataFrame(model_pa_pos.predict_proba(x), index=pa_pos.index)
predicted.columns = ["PA_POS_PAYBACK", "PA_POS_DEFAULT"]

In [28]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100002,1.0,0.41187,0.58813,0.939494,0.060506,0.923838,0.076162
100003,0.0,0.854116,0.145884,0.9502,0.0498,0.955988,0.044012
100004,0.0,0.974147,0.025853,0.940761,0.059239,0.900602,0.099398
100006,0.0,0.998703,0.001297,,,0.948088,0.051912
100007,0.0,0.934973,0.065027,0.929543,0.070457,0.922725,0.077275


In [29]:
# pa_ip prediction

In [30]:
x = pa_ip[model_pa_ip_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_IP_DAYS_DIFF_60,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_IP_DAYS_DIFF_30,PA_AMT_ANNUITY,PA_IP_DAYS_DIFF_120,PA_IP_DAYS_DIFF_90,PA_IP_DAYS_DIFF_180,PA_AMT_CREDIT,PA_IP_PAYMENT_180,PA_AMT_APPLICATION,PA_IP_AMT_DIFF_60,PA_IP_AMT_DIFF_30,PA_IP_DAYS_DIFF_150,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_IP_PAYMENT_60,PA_IP_AMT_DIFF_180,PA_IP_AMT_DIFF_90,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_IP_AMT_DIFF_150,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_PORTFOLIO_Cards,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_RATE_DOWN_PAYMENT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
252457,1.0,-15.0,0.0,0.0,4951.35,-14.0,-17.0,0.0,52641.0,-0.912871,47056.275,0.0,0.0,0.0,0.0,0.0,0.9128709,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,8.8e-05
256657,5.0,-13.0,7.0,-6.0,49054.77,-12.5,-12.0,-1.5,1198062.0,-1.020621,978570.0,0.0,0.0,-13.0,2.0,0.0,0.2041241,0.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,0.0,0.217818
198678,0.0,-21.5,0.0,-20.0,98806.05,-1.0,-6.5,-0.5,941661.0,0.000204,921721.5,0.0,0.0,0.5,0.0,0.0,4.540312e-08,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.193358
394447,1.0,-7.0,0.0,-0.5,17141.445,-6.5,-7.0,0.0,129132.0,-0.412932,129132.0,0.0,0.0,-8.5,0.0,0.0,0.6115381,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
156331,0.0,0.0,0.0,0.0,40866.795,-1.0,0.0,-5.0,793035.0,0.616385,793035.0,0.0,0.0,-12.5,1.0,0.0,0.01416426,0.0,0.0,2.0,0.0,1.0,3.0,1.0,0.0,0.0,0.0


In [31]:
y = app_train.loc[pa_ip.index]["TARGET"]
y.head()

SK_ID_CURR
252457    0.0
256657    0.0
198678    0.0
394447    0.0
156331    0.0
Name: TARGET, dtype: float64

In [32]:
model_pa_ip.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [33]:
predicted = pd.DataFrame(model_pa_ip.predict_proba(x), index=pa_ip.index)
predicted.columns = ["PA_IP_PAYBACK", "PA_IP_DEFAULT"]

In [34]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100002,1.0,0.41187,0.58813,0.939494,0.060506,0.923838,0.076162,0.934588,0.065412
100003,0.0,0.854116,0.145884,0.9502,0.0498,0.955988,0.044012,,
100004,0.0,0.974147,0.025853,0.940761,0.059239,0.900602,0.099398,,
100006,0.0,0.998703,0.001297,,,0.948088,0.051912,0.939385,0.060615
100007,0.0,0.934973,0.065027,0.929543,0.070457,0.922725,0.077275,0.912314,0.087686


In [35]:
# pa_cc prediction

In [36]:
x = pa_cc[model_pa_cc_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CC_BALANCE_60,PA_CC_BALANCE_30,PA_CC_BALANCE_90,PA_CC_BALANCE_180,PA_CC_BALANCE_120,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CC_PAYMENT_60,PA_CC_PAYMENT_180,PA_CC_PAYMENT_120,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_CREDIT,PA_AMT_APPLICATION,PA_AMT_GOODS_PRICE,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_CC_DPD_180
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
394447,1.434002,-1.206367,0.992963,-0.455389,-0.349513,1.0,1.008387,1.318045,-0.292196,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
361282,0.828945,0.946004,0.861611,-1.16908,-0.39311,1.0,1.109609,-1.152838,-1.267179,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
436351,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,3.0,2.0,0.0,3.0,6.0,1.0,0.0,0.0
181153,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,0.0
309691,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,45000.0,45000.0,45000.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [37]:
y = app_train.loc[pa_cc.index]["TARGET"]
y.head()

SK_ID_CURR
394447    0.0
361282    0.0
436351    0.0
181153    0.0
309691    0.0
Name: TARGET, dtype: float64

In [38]:
model_pa_cc.fit(x,y)

RandomForestClassifier(max_depth=7, n_jobs=-1, random_state=0)

In [39]:
predicted = pd.DataFrame(model_pa_cc.predict_proba(x), index=pa_cc.index)
predicted.columns = ["PA_CC_PAYBACK", "PA_CC_DEFAULT"]

In [40]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.41187,0.58813,0.939494,0.060506,0.923838,0.076162,0.934588,0.065412,,
100003,0.0,0.854116,0.145884,0.9502,0.0498,0.955988,0.044012,,,,
100004,0.0,0.974147,0.025853,0.940761,0.059239,0.900602,0.099398,,,,
100006,0.0,0.998703,0.001297,,,0.948088,0.051912,0.939385,0.060615,0.952863,0.047137
100007,0.0,0.934973,0.065027,0.929543,0.070457,0.922725,0.077275,0.912314,0.087686,,


In [41]:
df = df.fillna(0.5)
df.head()

Unnamed: 0_level_0,TARGET,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100002,1.0,0.41187,0.58813,0.939494,0.060506,0.923838,0.076162,0.934588,0.065412,0.5,0.5
100003,0.0,0.854116,0.145884,0.9502,0.0498,0.955988,0.044012,0.5,0.5,0.5,0.5
100004,0.0,0.974147,0.025853,0.940761,0.059239,0.900602,0.099398,0.5,0.5,0.5,0.5
100006,0.0,0.998703,0.001297,0.5,0.5,0.948088,0.051912,0.939385,0.060615,0.952863,0.047137
100007,0.0,0.934973,0.065027,0.929543,0.070457,0.922725,0.077275,0.912314,0.087686,0.5,0.5


In [42]:
# finales Modell

In [43]:
model = RandomForestClassifier(**TREE_PARAMS)

aucs = []

kfold = KFold(5, shuffle=True, random_state=1)
data = df

y = df["TARGET"]
x = df.drop(["TARGET"], axis=1)

for ID_TRAIN, ID_TEST in kfold.split(data):
    
    x_train = x.iloc[ID_TRAIN]
    y_train = y.iloc[ID_TRAIN]
    x_test = x.iloc[ID_TEST]
    y_test = y.iloc[ID_TEST]
    
    model.fit(x_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(x_test)[:,1])
    aucs.append(auc)
    
    print('train: %s, test: %s, auc: %s' % (ID_TRAIN, ID_TEST, auc))

print("\n")
print("Durchschnitt: %.2f" % (np.mean(aucs)))
print("Standardabw.: %.2f" % (np.std(aucs)))
print("Varianz:      %.2f" % (np.var(aucs)))

train: [     0      2      3 ... 307508 307509 307510], test: [     1      4      6 ... 307493 307495 307501], auc: 0.9394438215801807
train: [     0      1      2 ... 307508 307509 307510], test: [     3     18     22 ... 307499 307503 307504], auc: 0.9367866336499592
train: [     0      1      2 ... 307508 307509 307510], test: [     9     10     15 ... 307489 307492 307502], auc: 0.9353776828992423
train: [     0      1      2 ... 307506 307507 307508], test: [    12     16     23 ... 307505 307509 307510], auc: 0.9374553568825879
train: [     1      3      4 ... 307505 307509 307510], test: [     0      2      5 ... 307506 307507 307508], auc: 0.9343673666153791


Durchschnitt: 0.94
Standardabw.: 0.00
Varianz:      0.00


In [44]:
model = RandomForestClassifier(**TREE_PARAMS)
model = model.fit(x, y)

In [46]:
app_test = pd.read_csv(DATASET_DIR / "4. FillNA" / "app_test.csv")
bureau = pd.read_csv(DATASET_DIR / "4. FillNA" / "bureau_all.csv")
pa_pos = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_pos_all.csv")
pa_ip = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_ip_all.csv")
pa_cc = pd.read_csv(DATASET_DIR / "4. FillNA" / "pa_cc_all.csv")

In [47]:
app_test = app_test.set_index("SK_ID_CURR")
bureau = bureau.set_index("SK_ID_CURR")
pa_pos = pa_pos.set_index("SK_ID_CURR")
pa_ip = pa_ip.set_index("SK_ID_CURR")
pa_cc = pa_cc.set_index("SK_ID_CURR")

In [48]:
df = pd.DataFrame(index=app_test.index)
df.head()

100001
100005
100013
100028
100038


In [49]:
# application prediction

In [50]:
x = app_test[model_application_data["keep"]]
x.head()

Unnamed: 0_level_0,A_EXT_SOURCE_3,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_AMT_REQ_CREDIT_BUREAU_DAY,A_AMT_REQ_CREDIT_BUREAU_HOUR,A_YEARS_BEGINEXPLUATATION_AVG,A_DAYS_EMPLOYED,A_DAYS_BIRTH,A_BASEMENTAREA_AVG,A_LANDAREA_AVG
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100001,0.15952,0.752614,0.789654,0.0,0.0,0.9732,-2329.0,-19241.0,0.059,0.020985
100005,0.432962,0.56499,0.291656,0.0,0.0,0.977818,-4469.0,-18064.0,0.083623,0.057886
100013,0.610991,0.716539,0.699787,0.0,0.0,0.982704,-4458.0,-20038.0,0.087005,0.06754
100028,0.612704,0.525734,0.509677,0.0,0.0,0.997,-1866.0,-13976.0,0.1974,0.2042
100038,0.41858,0.202145,0.425687,0.007371,0.006616,0.979896,-2191.0,-13040.0,0.050552,0.076152


In [51]:
predicted = pd.DataFrame(model_application.predict_proba(x), index=app_test.index)
predicted.columns = ["A_PAYBACK", "A_DEFAULT"]

In [52]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100001,0.961833,0.038167
100005,0.888497,0.111503
100013,0.996241,0.003759
100028,0.96757,0.03243
100038,0.890788,0.109212


In [53]:
# bureau prediction

In [54]:
x = bureau[model_bureau_data["keep"]]
x.head()

Unnamed: 0_level_0,B_Active,B_Closed,B_AMT_CREDIT_SUM,B_AMT_CREDIT_SUM_DEBT,B_DAYS_CREDIT_ENDDATE,B_AMT_CREDIT_SUM_OVERDUE,B_AMT_ANNUITY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100001,3.0,4.0,290936.25,149171.625,728.0,0.0,6204.375
100002,2.0,6.0,212745.0,81927.0,309.0,0.0,0.0
100003,1.0,3.0,810000.0,0.0,1216.0,0.0,0.0
100004,0.0,2.0,592024.042525,278490.216669,2401.546966,7.541775,5861.850739
100005,2.0,1.0,219042.0,189469.5,439.333333,0.0,1420.5


In [55]:
predicted = pd.DataFrame(model_bureau.predict_proba(x), index=bureau.index)
predicted.columns = ["B_PAYBACK", "B_DEFAULT"]

In [56]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100001,0.961833,0.038167,0.925268,0.074732
100005,0.888497,0.111503,0.919031,0.080969
100013,0.996241,0.003759,0.931967,0.068033
100028,0.96757,0.03243,0.910811,0.089189
100038,0.890788,0.109212,,


In [57]:
# pa_pos prediction

In [58]:
x = pa_pos[model_pa_pos_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_ANNUITY,PA_AMT_GOODS_PRICE,PA_AMT_DOWN_PAYMENT,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_POS_CNT_DPD,PA_AMT_CREDIT,PA_RATE_DOWN_PAYMENT,PA_CNT_NAME_YIELD_GROUP_high,PA_POS_CNT_PAYMENTS_LEFT,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_CNT_CREDITS,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
100001,0.0,3951.0,24835.5,2520.0,0.0,0.0,23787.0,0.104326,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
100002,0.0,9251.775,179055.0,0.0,0.0,0.0,179055.0,0.0,0.0,6.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
100003,0.0,169661.97,1306309.5,6885.0,0.0,0.0,1452573.0,0.100061,0.0,0.0,0.0,1.0,0.0,3.0,1.0,2.0,1.0
100004,0.0,5357.25,24282.0,4860.0,0.0,0.0,20106.0,0.212008,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
100005,0.0,4813.2,44617.5,4464.0,0.0,0.0,40153.5,0.108964,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [59]:
predicted = pd.DataFrame(model_pa_pos.predict_proba(x), index=pa_pos.index)
predicted.columns = ["PA_POS_PAYBACK", "PA_POS_DEFAULT"]

In [60]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100001,0.961833,0.038167,0.925268,0.074732,0.915258,0.084742
100005,0.888497,0.111503,0.919031,0.080969,0.892115,0.107885
100013,0.996241,0.003759,0.931967,0.068033,0.943434,0.056566
100028,0.96757,0.03243,0.910811,0.089189,0.931185,0.068815
100038,0.890788,0.109212,,,0.896059,0.103941


In [61]:
# pa_ip prediction

In [62]:
x = pa_ip[model_pa_ip_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_IP_DAYS_DIFF_60,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_IP_DAYS_DIFF_30,PA_AMT_ANNUITY,PA_IP_DAYS_DIFF_120,PA_IP_DAYS_DIFF_90,PA_IP_DAYS_DIFF_180,PA_AMT_CREDIT,PA_IP_PAYMENT_180,PA_AMT_APPLICATION,PA_IP_AMT_DIFF_60,PA_IP_AMT_DIFF_30,PA_IP_DAYS_DIFF_150,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_IP_PAYMENT_60,PA_IP_AMT_DIFF_180,PA_IP_AMT_DIFF_90,PA_CNT_NAME_CONTRACT_TYPE_Revolving loans,PA_IP_AMT_DIFF_150,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_PORTFOLIO_Cards,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_RATE_DOWN_PAYMENT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
252457,1.0,-15.0,0.0,0.0,4951.35,-14.0,-17.0,0.0,52641.0,-0.912871,47056.275,0.0,0.0,0.0,0.0,0.0,0.9128709,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,8.8e-05
256657,5.0,-13.0,7.0,-6.0,49054.77,-12.5,-12.0,-1.5,1198062.0,-1.020621,978570.0,0.0,0.0,-13.0,2.0,0.0,0.2041241,0.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,0.0,0.217818
198678,0.0,-21.5,0.0,-20.0,98806.05,-1.0,-6.5,-0.5,941661.0,0.000204,921721.5,0.0,0.0,0.5,0.0,0.0,4.540312e-08,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.193358
310743,0.0,0.0,1.0,0.0,7002.72,0.0,0.0,-5.0,75024.0,2.041241,67864.23,0.0,0.0,0.0,2.0,1.0,-0.4082483,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.1e-05
394447,1.0,-7.0,0.0,-0.5,17141.445,-6.5,-7.0,0.0,129132.0,-0.412932,129132.0,0.0,0.0,-8.5,0.0,0.0,0.6115381,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0


In [63]:
predicted = pd.DataFrame(model_pa_ip.predict_proba(x), index=pa_ip.index)
predicted.columns = ["PA_IP_PAYBACK", "PA_IP_DEFAULT"]

In [64]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100001,0.961833,0.038167,0.925268,0.074732,0.915258,0.084742,,
100005,0.888497,0.111503,0.919031,0.080969,0.892115,0.107885,,
100013,0.996241,0.003759,0.931967,0.068033,0.943434,0.056566,0.844992,0.155008
100028,0.96757,0.03243,0.910811,0.089189,0.931185,0.068815,0.920394,0.079606
100038,0.890788,0.109212,,,0.896059,0.103941,,


In [65]:
# pa_cc prediction

In [66]:
x = pa_cc[model_pa_cc_data["keep"]]
x.head()

Unnamed: 0_level_0,PA_CC_BALANCE_60,PA_CC_BALANCE_30,PA_CC_BALANCE_90,PA_CC_BALANCE_180,PA_CC_BALANCE_120,PA_CNT_NAME_PRODUCT_TYPE_walk-in,PA_CC_PAYMENT_60,PA_CC_PAYMENT_180,PA_CC_PAYMENT_120,PA_CNT_NAME_CONTRACT_STATUS_Refused,PA_AMT_CREDIT,PA_AMT_APPLICATION,PA_AMT_GOODS_PRICE,PA_CNT_CODE_REJECT_REASON_HC,PA_CNT_NAME_YIELD_GROUP_high,PA_CNT_NAME_CLIENT_TYPE_New,PA_CNT_PRODUCT_COMBINATION_Cash X-Sell: low,PA_CNT_NAME_YIELD_GROUP_low_normal,PA_CNT_NAME_CLIENT_TYPE_Refreshed,PA_CNT_PRODUCT_COMBINATION_POS industry with interest,PA_CC_DPD_180
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
394447,1.434002,-1.206367,0.992963,-0.455389,-0.349513,1.0,1.008387,1.318045,-0.292196,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
361282,0.828945,0.946004,0.861611,-1.16908,-0.39311,1.0,1.109609,-1.152838,-1.267179,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
131335,0.719445,0.722068,0.791575,-1.619869,0.204483,1.0,-0.427508,2.030866,-0.215249,0.0,45000.0,45000.0,45000.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
436351,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,3.0,2.0,0.0,3.0,6.0,1.0,0.0,0.0
181153,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,45000.0,45000.0,45000.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,0.0


In [67]:
predicted = pd.DataFrame(model_pa_cc.predict_proba(x), index=pa_cc.index)
predicted.columns = ["PA_CC_PAYBACK", "PA_CC_DEFAULT"]

In [68]:
df = pd.merge(df, predicted, how="left", left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100001,0.961833,0.038167,0.925268,0.074732,0.915258,0.084742,,,,
100005,0.888497,0.111503,0.919031,0.080969,0.892115,0.107885,,,,
100013,0.996241,0.003759,0.931967,0.068033,0.943434,0.056566,0.844992,0.155008,,
100028,0.96757,0.03243,0.910811,0.089189,0.931185,0.068815,0.920394,0.079606,0.894884,0.105116
100038,0.890788,0.109212,,,0.896059,0.103941,,,,


In [69]:
df = df.fillna(0.5)
df.head()

Unnamed: 0_level_0,A_PAYBACK,A_DEFAULT,B_PAYBACK,B_DEFAULT,PA_POS_PAYBACK,PA_POS_DEFAULT,PA_IP_PAYBACK,PA_IP_DEFAULT,PA_CC_PAYBACK,PA_CC_DEFAULT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100001,0.961833,0.038167,0.925268,0.074732,0.915258,0.084742,0.5,0.5,0.5,0.5
100005,0.888497,0.111503,0.919031,0.080969,0.892115,0.107885,0.5,0.5,0.5,0.5
100013,0.996241,0.003759,0.931967,0.068033,0.943434,0.056566,0.844992,0.155008,0.5,0.5
100028,0.96757,0.03243,0.910811,0.089189,0.931185,0.068815,0.920394,0.079606,0.894884,0.105116
100038,0.890788,0.109212,0.5,0.5,0.896059,0.103941,0.5,0.5,0.5,0.5


In [70]:
TARGET = model.predict_proba(df)[:,1]

In [71]:
solution = pd.DataFrame(TARGET, index=df.index)
solution.columns = ["TARGET"]

In [72]:
solution.head()

Unnamed: 0_level_0,TARGET
SK_ID_CURR,Unnamed: 1_level_1
100001,0.011435
100005,0.093632
100013,0.00605
100028,0.010159
100038,0.074113


In [73]:
solution.to_csv(DATASET_DIR / "solution.csv")