In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc
# NOTE: plot_roc_curve was removed in scikit-learn 1.2; on newer versions this
# import fails and RocCurveDisplay.from_estimator is the replacement.
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Display configuration: numpy prints floats in fixed-point notation, and
# pandas shows every column, every row and the full cell content.
np.set_printoptions(suppress=True)

for _option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(_option, None)

from IPython.display import clear_output

In [2]:
# Hyper-parameters unpacked into every RandomForestClassifier in this notebook:
# a fixed seed and a maximum depth per tree.
TREE_PARAMS = dict(
    random_state=0,
    max_depth=7,
)

In [3]:
# Locate the Kaggle Home Credit dataset directory.
#
# Search order:
#   1. the directory named by the HOME_CREDIT_DATASET_DIR environment variable
#      (optional; lets the notebook run on machines other than the two below),
#   2. path1,
#   3. path2.
# The first candidate that exists as a directory is used.
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

_env_dir = os.environ.get("HOME_CREDIT_DATASET_DIR")
_candidates = ([Path(_env_dir)] if _env_dir else []) + [path1, path2]

# When no candidate exists, path2 is kept as the fallback so that the later
# read_csv calls report the missing files.
DATASET_DIR = next((_dir for _dir in _candidates if _dir.is_dir()), path2)

In [4]:
# Load the five prepared tables from the "Datenaufbereitung" sub-directory;
# each file is named "<table>_mets.csv" and is indexed by SK_ID_CURR.
_prepared_dir = DATASET_DIR / "Datenaufbereitung"

app_train, bureau, pa, ip, pos = (
    pd.read_csv(_prepared_dir / f"{_table}_mets.csv", index_col="SK_ID_CURR")
    for _table in ("app_train", "bureau", "pa", "ip", "pos")
)

In [5]:
# Remove the TARGET column from the instalment (ip) and POS tables; app_train
# supplies TARGET for the modelling cells (presumably the column would
# otherwise collide in the merge -- TODO confirm).
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
ip = ip.drop(columns="TARGET", errors="ignore")
pos = pos.drop(columns="TARGET", errors="ignore")

In [6]:
# Preview the first five rows of the POS table.
pos.head(n=5)

Unnamed: 0_level_0,CNT_CREDITS_PAST,CNT_CREDITS_CURR,SUM_PAYCOUNTS_MONTHS,SUM_DPD
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
182943,1.0,1.0,15.0,0
367990,2.0,0.0,0.0,0
397406,7.0,1.0,1.0,4110
269225,5.0,1.0,39.0,0
334279,4.0,3.0,27.0,0


In [7]:
# Check IDs overlap

In [8]:
# Number of rows in the application table.
app_train.shape[0]

307511

In [9]:
def _count_missing(left_index, right_index):
    """Return how many entries of ``left_index`` do not occur in ``right_index``.

    Uses the vectorized ``Index.isin`` instead of a Python-level membership
    loop over every index entry.
    """
    return int((~left_index.isin(right_index)).sum())


# IDs of app_train without bureau data, IDs of app_train without previous
# applications (pa), and bureau IDs without previous applications.
print(_count_missing(app_train.index, bureau.index))
print(_count_missing(app_train.index, pa.index))
print(_count_missing(bureau.index, pa.index))

44020
16454
13984


In [10]:
# join TARGET

In [11]:
# bureau = pd.merge(bureau, app_train[["TARGET"]], left_index=True, right_index=True)
# pa = pd.merge(pa, app_train[["TARGET"]], left_index=True, right_index=True)

In [12]:
# Join all tables. CAUTION: this keeps only the intersection of the IDs (inner join)! Missing data should be prepared beforehand.

In [13]:
# Attach the aggregated tables to app_train one after another, matching on the
# shared index. The join is an inner join, so only IDs present in every table
# survive.
for _table in (bureau, pa, ip, pos):
    app_train = app_train.merge(_table, how="inner", left_index=True, right_index=True)

In [14]:
# Preview the first five rows of the merged table.
app_train.head(n=5)

Unnamed: 0_level_0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,YEARS_BEGINEXPLUATATION_AVG,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,DAYS_CREDIT_0.0,DAYS_CREDIT_0.25,DAYS_CREDIT_0.5,DAYS_CREDIT_0.75,DAYS_CREDIT_1.0,CREDIT_DAY_OVERDUE_0.0,CREDIT_DAY_OVERDUE_0.25,CREDIT_DAY_OVERDUE_0.5,CREDIT_DAY_OVERDUE_0.75,CREDIT_DAY_OVERDUE_1.0,DAYS_CREDIT_ENDDATE_0.0,DAYS_CREDIT_ENDDATE_0.25,DAYS_CREDIT_ENDDATE_0.5,DAYS_CREDIT_ENDDATE_0.75,DAYS_CREDIT_ENDDATE_1.0,AMT_CREDIT_MAX_OVERDUE_0.0,AMT_CREDIT_MAX_OVERDUE_0.25,AMT_CREDIT_MAX_OVERDUE_0.5,AMT_CREDIT_MAX_OVERDUE_0.75,AMT_CREDIT_MAX_OVERDUE_1.0,CNT_CREDIT_PROLONG_0.0,CNT_CREDIT_PROLONG_0.25,CNT_CREDIT_PROLONG_0.5,CNT_CREDIT_PROLONG_0.75,CNT_CREDIT_PROLONG_1.0,AMT_CREDIT_SUM_0.0,AMT_CREDIT_SUM_0.25,AMT_CREDIT_SUM_0.5,AMT_CREDIT_SUM_0.75,AMT_CREDIT_SUM_1.0,AMT_CREDIT_SUM_DEBT_0.0,AMT_CREDIT_SUM_DEBT_0.25,AMT_CREDIT_SUM_DEBT_0.5,AMT_CREDIT_SUM_DEBT_0.75,AMT_CREDIT_SUM_DEBT_1.0,AMT_CREDIT_SUM_LIMIT_0.0,AMT_CREDIT_SUM_LIMIT_0.25,AMT_CREDIT_SUM_LIMIT_0.5,AMT_CREDIT_SUM_LIMIT_0.75,AMT_CREDIT_SUM_LIMIT_1.0,AMT_CREDIT_SUM_OVERDUE_0.0,AMT_CREDIT_SUM_OVERDUE_0.25,AMT_CREDIT_SUM_OVERDUE_0.5,AMT_CREDIT_SUM_OVERDUE_0.75,AMT_CREDIT_SUM_OVERDUE_1.0,AMT_DOWN_PAYMENT_0.0,AMT_DOWN_PAYMENT_0.25,AMT_DOWN_PAYMENT_0.5,AMT_DOWN_PAYMENT_0.75,AMT_DOWN_PAYMENT_1.0,RATE_DOWN_PAYMENT_0.0,RATE_DOWN_PAYMENT_0.25,RATE_DOWN_PAYMENT_0.5,RATE_DOWN_PAYMENT_0.75,RATE_DOWN_PAYMENT_1.0,DAYS_DECISION_0.0,DAYS_DECISION_0.25,DAYS_DECISION_0.5,DAYS_DECISION_0.75,DAYS_DECISION_1.0,SELLERPLACE_AREA_0.0,SELLERPLACE_AREA_0.25,SELLERPLACE_AREA_0.5,SELLERPLACE_AREA_0.75,SELLERPLACE_AREA_1.0,CNT_PAYMENT_0.0,CNT_PAYMENT_0.25,CNT_PAYMENT_0.5,CNT_PAYMENT_0.75,CNT_PAYMENT_1.0,DAYS_FIRST_DRAWING_0.0,DA
YS_FIRST_DRAWING_0.25,DAYS_FIRST_DRAWING_0.5,DAYS_FIRST_DRAWING_0.75,DAYS_FIRST_DRAWING_1.0,DAYS_FIRST_DUE_0.0,DAYS_FIRST_DUE_0.25,DAYS_FIRST_DUE_0.5,DAYS_FIRST_DUE_0.75,DAYS_FIRST_DUE_1.0,DAYS_LAST_DUE_1ST_VERSION_0.0,DAYS_LAST_DUE_1ST_VERSION_0.25,DAYS_LAST_DUE_1ST_VERSION_0.5,DAYS_LAST_DUE_1ST_VERSION_0.75,DAYS_LAST_DUE_1ST_VERSION_1.0,AMT_APPLICATION_0.0,AMT_APPLICATION_0.25,AMT_APPLICATION_0.5,AMT_APPLICATION_0.75,AMT_APPLICATION_1.0,DAYS_TERMINATION_0.0,DAYS_TERMINATION_0.25,DAYS_TERMINATION_0.5,DAYS_TERMINATION_0.75,DAYS_TERMINATION_1.0,MIN_TIMEDIFF,MIN_AMTDIFF,MEAN_TIMEDIFF,MEAN_AMTDIFF,MAX_TIMEDIFF,MAX_AMTDIFF,CNT_CREDITS_PAST,CNT_CREDITS_CURR,SUM_PAYCOUNTS_MONTHS,SUM_DPD
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1
100002,1,0,202500.0,406597.5,0.018801,-9461,-637,-3648.0,-2120,0.083037,0.262949,0.139376,0.9722,0.0149,2.0,2.0,0.0,0.0,0.0,0.0,0.0,-1437.0,-1122.0,-1042.5,-602.75,-103.0,0.0,0.0,0.0,0.0,0.0,-1072.0,-1006.25,-424.5,79.25,780.0,0.0,0.0,40.5,3321.0,5043.645,0.0,0.0,0.0,0.0,0.0,0.0,28759.17375,54130.5,124301.25,450000.0,0.0,0.0,0.0,0.0,245781.0,0.0,0.0,0.0,7997.14125,31988.565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-606.0,-606.0,-606.0,-606.0,-606.0,500.0,500.0,500.0,500.0,500.0,24.0,24.0,24.0,24.0,24.0,365243.0,365243.0,365243.0,365243.0,365243.0,-565.0,-565.0,-565.0,-565.0,-565.0,125.0,125.0,125.0,125.0,125.0,179055.0,179055.0,179055.0,179055.0,179055.0,-17.0,-17.0,-17.0,-17.0,-17.0,12.0,0.0,20.421053,0.0,31.0,0.0,0.0,1.0,6.0,0
100003,0,0,270000.0,1293502.5,0.003541,-16765,-1188,-1186.0,-291,0.311267,0.622246,0.510853,0.9851,0.0714,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-2586.0,-1873.5,-1205.5,-732.75,-606.0,0.0,0.0,0.0,0.0,0.0,-2434.0,-1013.5,-480.0,-11.0,1216.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22248.0,60051.375,92576.25,286875.0,810000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,202500.0,810000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3442.5,6885.0,0.0,0.0,0.0,0.05003,0.100061,-2341.0,-1584.5,-828.0,-787.0,-746.0,-1.0,99.5,200.0,800.0,1400.0,6.0,9.0,12.0,12.0,12.0,365243.0,365243.0,365243.0,365243.0,365243.0,-2310.0,-1553.5,-797.0,-756.5,-716.0,-1980.0,-1313.5,-647.0,-516.5,-386.0,68809.5,203154.75,337500.0,618750.0,900000.0,-1976.0,-1307.5,-639.0,-583.0,-527.0,1.0,0.0,7.16,0.0,14.0,0.0,2.0,1.0,1.0,0
100004,0,0,67500.0,135000.0,0.010032,-19046,-225,-4260.0,-2531,0.50213,0.555912,0.729567,0.977735,0.102547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1326.0,-1096.5,-867.0,-637.5,-408.0,0.0,0.0,0.0,0.0,0.0,-595.0,-541.75,-488.5,-435.25,-382.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94500.0,94509.45,94518.9,94528.35,94537.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4860.0,4860.0,4860.0,4860.0,4860.0,0.212008,0.212008,0.212008,0.212008,0.212008,-815.0,-815.0,-815.0,-815.0,-815.0,30.0,30.0,30.0,30.0,30.0,4.0,4.0,4.0,4.0,4.0,365243.0,365243.0,365243.0,365243.0,365243.0,-784.0,-784.0,-784.0,-784.0,-784.0,-694.0,-694.0,-694.0,-694.0,-694.0,24282.0,24282.0,24282.0,24282.0,24282.0,-714.0,-714.0,-714.0,-714.0,-714.0,3.0,0.0,7.666667,0.0,11.0,0.0,1.0,0.0,0.0,0
100007,0,0,121500.0,513000.0,0.028663,-19932,-3038,-4311.0,-3458,0.50213,0.322738,0.510853,0.977735,0.102547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1149.0,-1149.0,-1149.0,-1149.0,-1149.0,0.0,0.0,0.0,0.0,0.0,-783.0,-783.0,-783.0,-783.0,-783.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146250.0,146250.0,146250.0,146250.0,146250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2328.75,3676.5,0.0,0.0,0.0,0.075107,0.21889,-2357.0,-1602.5,-986.5,-865.5,-374.0,-1.0,0.25,28.5,913.25,1200.0,10.0,12.0,15.0,22.5,48.0,0.0,365243.0,365243.0,365243.0,365243.0,-2326.0,-1571.0,-955.0,-466.5,0.0,-2056.0,-1241.0,-535.0,-81.0,346.0,17176.5,68253.75,191250.0,219375.0,247500.0,-2041.0,-1232.5,-543.0,-86.75,365243.0,-12.0,-22655.655,3.636364,-452.384318,31.0,0.0,3.0,2.0,14.0,0
100008,0,0,99000.0,490495.5,0.035792,-16941,-1588,-4970.0,-477,0.50213,0.354225,0.621226,0.977735,0.102547,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1097.0,-1097.0,-1097.0,-587.5,-78.0,0.0,0.0,0.0,0.0,0.0,-853.0,-822.5,-792.0,-160.5,471.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95134.5,100419.75,105705.0,186655.5,267606.0,0.0,0.0,0.0,120028.5,240057.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4500.0,12145.5,0.0,0.0,0.0,0.108909,0.110243,-2536.0,-2333.0,-639.0,-370.0,-82.0,-1.0,50.0,96.0,110.0,110.0,6.0,10.0,10.0,16.051936,30.0,0.0,365243.0,365243.0,365243.0,365243.0,-2491.0,-2299.0,-609.0,-339.0,0.0,-2341.0,-2029.0,-69.0,0.0,261.0,0.0,44455.5,121455.0,162598.5,450000.0,-2334.0,-703.0,-388.0,-66.0,0.0,-1317.0,-11758.995,-26.114286,-342.461571,28.0,0.0,4.0,0.0,0.0,28142


In [15]:
# Determine the feature order (features ranked by random-forest importance)

In [16]:
# Separate the feature matrix (every column except TARGET) from the label.
x = app_train.drop(columns=["TARGET"])
y = app_train["TARGET"]

In [17]:
# Split the training data into a training half and a validation half
# (50/50, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.5,
    random_state=0,
)

In [18]:
# Random forest (not a logistic regression) fitted on the training half; its
# feature importances are read in a later cell.
model = RandomForestClassifier(**TREE_PARAMS)
model.fit(X=x_train, y=y_train)

RandomForestClassifier(max_depth=20, random_state=0)

In [19]:
# Score the validation half. The probabilities (column 1 of predict_proba) are
# computed once and reused for both the AUC and the ROC curve.
test_proba = model.predict_proba(x_test)[:, 1]

# NOTE: this rebinds `auc`, shadowing the `auc` function imported from
# sklearn.metrics at the top of the notebook.
auc = roc_auc_score(y_test, test_proba)
fpr, tpr, threshold = roc_curve(y_test, test_proba)
print(auc)

0.7256384118448649


In [20]:
# Map every feature name to its random-forest feature importance.
coef_dict = dict(zip(x.columns.values, model.feature_importances_))

In [21]:
# Rank the features by importance, highest first; `order` holds the names only.
_ranked = sorted(coef_dict.items(), key=lambda pair: pair[1], reverse=True)
d = dict(_ranked)
order = [feature for feature in d]

In [22]:
# Random forest: greedy forward feature selection (cross-validated, adjusted AUC)

In [23]:
# Greedy forward feature selection.
#
# Features are tried in the order given by `order`. A candidate is kept only
# when adding it raises the adjusted AUC above the best value seen so far.
# The adjusted AUC is the 2-fold cross-validated AUC penalised for the number
# of predictors:
#     auc_adj = 1 - (1 - auc) * (n - 1) / (n - p - 1)
#
# Results:
#   data       -- per accepted feature: auc, auc_adj, p (predictors), n (rows)
#   heads_gain -- accepted features, in order of acceptance
#   drop       -- rejected features
data = {
        "auc":[],
        "auc_adj":[],
        "p" : [],
        "n": []
       }

auc_temp = 0.5  # best adjusted AUC so far; a candidate must beat this value
heads_gain = []
drop = []

HEADS = order
df = app_train

# The splitter does not depend on the candidate. With a fixed integer
# random_state every call to split() yields the same folds, so it is built
# once instead of once per feature.
kfold = KFold(2, shuffle=True, random_state=1)

for index, h in enumerate(HEADS):

    # Feature set under test: everything accepted so far plus the candidate.
    x = df[heads_gain + [h]]
    y = df["TARGET"]

    n = len(x)
    # Number of predictors only; the TARGET column is not counted.
    p = x.shape[1]

    aucs = []

    for train, test in kfold.split(x):

        # A fresh estimator per fold. No fit on the full data set is done
        # beforehand, because the per-fold fit would discard it anyway.
        model = RandomForestClassifier(**TREE_PARAMS)
        model.fit(x.iloc[train], y.iloc[train])
        aucs.append(roc_auc_score(y.iloc[test], model.predict_proba(x.iloc[test])[:,1]))

    mean_auc = np.mean(aucs)
    auc_adj = 1-(1-mean_auc)*(n-1)/(n-p-1)

    if auc_adj > auc_temp:

        heads_gain.append(h)

        data["auc"].append(mean_auc)
        data["auc_adj"].append(auc_adj)
        data["p"].append(p)
        data["n"].append(n)

        auc_temp = auc_adj

    else:
        drop.append(h)

    # Overwrite the previous progress line instead of flooding the output.
    clear_output(wait=True)
    print(auc_adj, index+1, len(HEADS), drop)

# Leave x / y describing exactly the accepted feature set, so cells that read
# them afterwards do not see a rejected last candidate.
x = df[heads_gain]
y = df["TARGET"]

0.6596538669546141 1 125 []


KeyboardInterrupt: 

In [None]:
# Tabulate the accepted selection steps, print the step(s) with the highest
# adjusted AUC and plot the adjusted AUC over the steps.
result = pd.DataFrame(data)

_is_best = result["auc_adj"].eq(result["auc_adj"].max())
print(result.loc[_is_best])
result["auc_adj"].plot()

# Cell output: the list of accepted features.
heads_gain

In [None]:
# 5-fold cross-validation of the random forest on the selected features.
model = RandomForestClassifier(**TREE_PARAMS)

aucs = []

kfold = KFold(5, shuffle=True, random_state=1)
# NOTE: `data` is rebound from the results dict of the selection cell to the
# selected feature matrix; the name is kept for backward compatibility.
data = df[heads_gain]
target = df["TARGET"]

# Fit and score on the same frame the folds are drawn from. Relying on the
# x / y variables left over from the selection loop could include a rejected
# candidate or, after an interrupted run, only part of the selection.
for train, test in kfold.split(data):

    model.fit(data.iloc[train], target.iloc[train])
    fold_auc = roc_auc_score(target.iloc[test], model.predict_proba(data.iloc[test])[:,1])
    aucs.append(fold_auc)

    print('train: %s, test: %s, auc: %s' % (train, test, fold_auc))

# Mean, standard deviation and variance of the fold AUCs.
print("\n")
print("Durchschnitt: %.2f" % (np.mean(aucs)))
print("Standardabw.: %.2f" % (np.std(aucs)))
print("Varianz:      %.2f" % (np.var(aucs)))