In [13]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from IPython.display import display, Markdown

In [14]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [15]:
app_train = pd.read_csv(DATASET_DIR / "application_train.csv")
bureau = pd.read_csv(DATASET_DIR / "bureau.csv")
description = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

In [4]:
# Schnittmenge zwischen app_train und bureau
# Ergänzt die Variable "TARGET" in dem bureau-Datensatz
bureau = pd.merge(bureau, app_train[["SK_ID_CURR","TARGET"]] ,on="SK_ID_CURR")

# Austausch der numerischen Kategorien durch sprechende, interpretierbare Variablen
bureau["TARGET"].replace(
    {
        0: "Payback",
        1: "Default"
    }, inplace = True
)

# Unterteilung des Datensatzen nach Payback und Default
payback = bureau[bureau["TARGET"] == "Payback"]
default = bureau[bureau["TARGET"] == "Default"]

In [16]:
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


# Gruppierungen

In [17]:
for head in bureau.columns:
    print(head, bureau[head].dtype.name)

SK_ID_CURR int64
SK_ID_BUREAU int64
CREDIT_ACTIVE object
CREDIT_CURRENCY object
DAYS_CREDIT int64
CREDIT_DAY_OVERDUE int64
DAYS_CREDIT_ENDDATE float64
DAYS_ENDDATE_FACT float64
AMT_CREDIT_MAX_OVERDUE float64
CNT_CREDIT_PROLONG int64
AMT_CREDIT_SUM float64
AMT_CREDIT_SUM_DEBT float64
AMT_CREDIT_SUM_LIMIT float64
AMT_CREDIT_SUM_OVERDUE float64
CREDIT_TYPE object
DAYS_CREDIT_UPDATE int64
AMT_ANNUITY float64


In [18]:
skip = ["SK_ID_CURR", "SK_ID_BUREAU", "TARGET"]
quants = [0, 0.25, 0.5, 0.75, 1]

for index, head in enumerate(bureau.columns):
    
    if head in skip:
        continue
    
    if bureau[head].dtype.name == "object":
        continue
        
    a = bureau[["SK_ID_CURR", head]]
    a = a.groupby(by = ["SK_ID_CURR"]).quantile(quants)
    a = a.unstack(level=-1)
    
    try:
        mets = mets.join(a , on="SK_ID_CURR")
    except (ValueError, NameError):
        mets = a



In [19]:
# Schnittmenge zwischen app_train und bureau
# Ergänzt die Variable "TARGET" in dem bureau-Datensatz
mets = pd.merge(mets, app_train[["SK_ID_CURR","TARGET"]] ,on="SK_ID_CURR")

# Austausch der numerischen Kategorien durch sprechende, interpretierbare Variablen
mets["TARGET"].replace(
    {
        0: "Payback",
        1: "Default"
    }, inplace = True
)

# Unterteilung des Datensatzen nach Payback und Default
payback = mets[mets["TARGET"] == "Payback"]
default = mets[mets["TARGET"] == "Default"]



In [20]:
bureau["CREDIT_DAY_OVERDUE"].head()

0    0
1    0
2    0
3    0
4    0
Name: CREDIT_DAY_OVERDUE, dtype: int64

In [10]:
#bureau[bureau["CREDIT_DAY_OVERDUE"] == 0]["CREDIT_DAY_OVERDUE"] = np.nan

bureau.loc[bureau["CREDIT_DAY_OVERDUE"] == 0] = np.nan

In [11]:
df = bureau
len(df[df["CREDIT_DAY_OVERDUE"] == 0]) / len(df)*100

0.0

In [12]:
df = payback
len(df[df["CREDIT_DAY_OVERDUE"] == 0]) / len(df)*100

KeyError: 'CREDIT_DAY_OVERDUE'

In [None]:
df = default
len(df[df["CREDIT_DAY_OVERDUE"] == 0]) / len(df)*100

In [None]:
mets.head()

In [None]:
payback.columns[1]

In [None]:
for head in payback.columns:
    if head in skip:
        continue
        
    x_1, y_1 = sns.kdeplot(payback[head]).get_lines()[0].get_data()
    x_2, y_2 = sns.kdeplot(default[head]).get_lines()[1].get_data()
    
    plt.show()
    plt.clf()
    
    x = np.array(list(zip(x_1, x_2)))
    y = np.array(list(zip(y_1, y_2)))

    maxi = np.maximum(y[:,0],y[:,1])
    mini = np.minimum(y[:,0],y[:,1])
    diff = np.subtract(maxi,mini)
    
    rate = np.trapz(diff, x_2) / np.trapz(maxi, x_2) * 100
    print(head, rate)

#### Kategorien

In [None]:
skip = ["SK_ID_CURR", "SK_ID_BUREAU" ,"TARGET"]

for index, head in enumerate(bureau.columns):
    
    if head in skip:
        continue
    
    if bureau[head].dtype.name != "object":
        continue
    
    
    a = bureau[["SK_ID_CURR", head]]
    a = a.groupby(["SK_ID_CURR", head]).size()
    a = a.unstack()
    
    try:
        b = b.join(a , on="SK_ID_CURR")
    except (ValueError, NameError):
        b = a

b.head()

In [None]:
for head in b.columns:
    rate = b[head].isna().sum() / len(b[head]) * 100
    if rate < 90:
        print(head, rate)

In [None]:
bureau["DAYS_CREDIT"].head()