# Initialisierung

In [None]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Remove the pandas display limits so that wide/long frames and long cell
# contents are shown without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

from IPython.display import display, Markdown

In [None]:
# Two machine-specific candidate locations of the Kaggle Home Credit dataset.
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

# Prefer the first location when it exists; otherwise fall back to the second
# (the fallback itself is not checked for existence).
DATASET_DIR = path1 if path1.is_dir() else path2

In [None]:
# Load the application table, the POS/cash balance table and the column
# description sheet from the dataset directory.
app_train = pd.read_csv(DATASET_DIR.joinpath("application_train.csv"))
pcb = pd.read_csv(DATASET_DIR.joinpath("POS_CASH_balance.csv"))
description = pd.read_csv(
    DATASET_DIR.joinpath("HomeCredit_columns_description.csv"),
    index_col=0,
    encoding="latin",
)

In [None]:
# Show the description rows that belong to POS_CASH_balance.csv, limited to the
# columns from "Row" through "Special".
is_pcb_row = description["Table"] == "POS_CASH_balance.csv"
description.loc[is_pcb_row, "Row":"Special"]

In [None]:
# Function to draw a Piechart
def draw_piechart(arguments):
    """Draw one pie chart per entry of ``arguments`` in a single-row figure.

    Args:
        arguments: Sized sequence whose entries are indexable as
            ``(labels, sizes, title)``: the wedge labels, the wedge sizes and
            the subplot title. ``sizes`` is handed to ``Axes.pie`` with
            ``normalize=False``, i.e. the values are not rescaled by this
            function.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # squeeze=False always yields a 2-D array of Axes, so one chart and many
    # charts share the same code path. The previous try/except TypeError
    # dispatch also swallowed genuine TypeErrors raised while drawing.
    _, axes = plt.subplots(1, len(arguments), squeeze=False)

    pie_options = {"autopct": '%1.1f%%', "startangle": 90, "normalize": False}
    if len(arguments) > 1:
        # Only the multi-chart layout tightens the label distance; a single
        # chart keeps matplotlib's default, as before.
        pie_options["labeldistance"] = 1.05

    for argument, axis in zip(arguments, axes.flat):
        labels = argument[0]
        sizes = argument[1]
        title = argument[2]

        axis.pie(sizes, labels=labels, **pie_options)
        axis.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
        axis.set_title(title)

    plt.show()

# Datenaufbereitung

In [None]:
# Join POS_CASH_balance with app_train on SK_ID_CURR (pd.merge defaults to an
# inner join, so only rows whose SK_ID_CURR occurs in both frames are kept).
# This adds the "TARGET" column to the pcb frame.
pcb = pd.merge(pcb, app_train[["SK_ID_CURR", "TARGET"]], on="SK_ID_CURR")

# Replace the numeric target codes with readable labels.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the column selection is a chained in-place operation that newer pandas
# versions warn about and that does not update the frame under Copy-on-Write.
pcb["TARGET"] = pcb["TARGET"].replace(
    {
        0: "Payback",
        1: "Default",
    }
)

# Split the data set into the Payback and the Default group.
payback = pcb[pcb["TARGET"] == "Payback"]
default = pcb[pcb["TARGET"] == "Default"]

In [None]:
# Identifier and target columns that are excluded from the analysis.
skip = ["SK_ID_PREV", "SK_ID_CURR", "TARGET"]

# n_heads: every object-dtype column that is not skipped, plus CNT_INSTALMENT,
# which is explicitly added to this group.
n_heads = [
    column
    for column in pcb.columns
    if pcb[column].dtype.name == "object" and column not in skip
]
n_heads += ["CNT_INSTALMENT"]

# m_heads: the remaining non-object columns (not skipped, not already in n_heads).
m_heads = [
    column
    for column in pcb.columns
    if pcb[column].dtype.name != "object"
    and column not in skip
    and column not in n_heads
]

n = pcb[n_heads]
m = pcb[m_heads]

In [None]:
# Preview the first rows of n (the pcb columns selected via n_heads).
n.head()

In [None]:
# Preview the first rows of the pcb frame.
pcb.head()

# Datenanalyse

In [None]:
# Feature names that the next cell compares against the pcb column names.
# NOTE(review): presumably the features rated important by a model trained
# elsewhere -- confirm the origin of this list.
imp = [
    'CNT_PAYMENT_0.0',
    'CODE_GENDER',
    'CREDIT_DAY_OVERDUE_0.0',
    'DAYS_BIRTH',
    'DAYS_CREDIT_1.0',
    'DAYS_FIRST_DRAWING_0.75',
    'DAYS_LAST_DUE_1ST_VERSION_1.0',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'FLAG_DOCUMENT_3',
    'MAX_AMTDIFF',
    'MEAN_AMTDIFF',
    'MEAN_TIMEDIFF',
    'NAME_YIELD_GROUP low_normal',
    'ORGANIZATION_TYPE',
    'PRODUCT_COMBINATION POS industry with interest',
    'SUM_DPD',
]

In [None]:
# Print every m_heads column name once for each entry of imp that contains it
# as a substring, in m_heads order.
matching_heads = (
    column
    for column in m_heads
    for feature in imp
    if column in feature
)
for head in matching_heads:
    print(head)

## Kategoriale Variablen

In [None]:
# For every column of n, compare the category shares of the Payback and the
# Default group and collect each category whose share differs by more than
# 5 percentage points.
result = {
    "head": [],
    "cat": [],
    "payback": [],
    "default": [],
    "diff": [],
}

for head in n.columns.values:
    # Counts of the five most frequent categories in each group.
    df1 = payback[head].value_counts().rename_axis(head).reset_index(name='payback').head()
    df2 = default[head].value_counts().rename_axis(head).reset_index(name='default').head()

    # Convert the counts to percentages.
    # NOTE(review): .head() runs before this normalisation, so the shares are
    # relative to the five retained categories, not to all rows of the group
    # -- confirm that this is intended.
    df1["payback"] = df1["payback"] / df1["payback"].sum() * 100
    df2["default"] = df2["default"] / df2["default"].sum() * 100

    df = df1.merge(df2, how="outer", on=head)
    df["diff"] = df["default"] - df["payback"]
    df = df.sort_values("diff", ascending=False)

    # Keep rows with |diff| > 5. A NaN diff (category present in only one
    # group's top five) compares False and is therefore dropped.
    notable = df[df["diff"].abs() > 5]

    # Iterate over the rows themselves. Looking a row up again by its diff
    # value would report the wrong category whenever two categories share the
    # same diff.
    for cat, payback_share, default_share, diff in zip(
        notable[head], notable["payback"], notable["default"], notable["diff"]
    ):
        result["head"].append(head)
        result["cat"].append(cat)
        result["payback"].append(round(payback_share, 2))
        result["default"].append(round(default_share, 2))
        result["diff"].append(round(diff, 2))

df = pd.DataFrame(result)
df.sort_values("diff", ascending=False)

In [None]:
# Columns for which only values strictly below the group's 95th percentile
# are plotted.
# NOTE(review): the strict "<" removes every value when the 95th percentile
# equals the column minimum, leaving an empty plot -- confirm this is acceptable.
quants = ["SK_DPD", "CNT_INSTALMENT_FUTURE", "SK_DPD_DEF"]

for head in m_heads:

    # Named *_values instead of "df" so the result table of the previous cell
    # is not overwritten by a Series.
    payback_values = payback[head].copy()
    default_values = default[head].copy()

    if head in quants:
        payback_values = payback_values[payback_values < payback_values.quantile(0.95)]
        default_values = default_values[default_values < default_values.quantile(0.95)]

    # Histogram of both groups on the same axes.
    payback_values.hist()
    default_values.hist()

    plt.title(head)
    plt.legend(labels=["Payback", "Default"])
    # Label the x-axis with the plotted column; the former fixed text
    # "Kreditdauer in Jahren" did not describe these columns.
    plt.xlabel(head)
    plt.ylabel("# Kreditnehmer")
    plt.show()

    # Kernel density estimate of both groups on the same axes. The former
    # ".get_lines()[i].get_data()" calls discarded their result and raised
    # IndexError whenever seaborn drew no curve, so they were removed.
    sns.kdeplot(payback_values)
    sns.kdeplot(default_values)

    plt.legend(labels=["Payback", "Default"])
    plt.xlabel(head)
    # The y-axis of a KDE plot is a density, not a count.
    plt.ylabel("Dichte")
    plt.show()