# Initialisierung

In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Lift the pandas display limits (None = no limit) for columns, rows and cell width.
for option_name in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(option_name, None)

from IPython.display import display, Markdown

In [2]:
# Two candidate locations of the dataset; the first one is preferred.
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

# Fall back to the second location when the first is not an existing directory.
# The fallback itself is not checked for existence.
DATASET_DIR = path1 if path1.is_dir() else path2

In [None]:
# Read the raw tables from DATASET_DIR.
app_train = pd.read_csv(DATASET_DIR.joinpath("application_train.csv"))
pcb = pd.read_csv(DATASET_DIR.joinpath("POS_CASH_balance.csv"))

# The column description file is read with the "latin" codec and uses its
# first column as the index.
description = pd.read_csv(
    DATASET_DIR.joinpath("HomeCredit_columns_description.csv"),
    index_col=0,
    encoding="latin",
)

In [None]:
# Show the description entries that belong to POS_CASH_balance.csv,
# restricted to the columns from "Row" through "Special".
is_pos_cash_entry = description["Table"].eq("POS_CASH_balance.csv")
description.loc[is_pos_cash_entry, "Row":"Special"]

In [None]:
def draw_piechart(arguments):
    """Draw one pie chart per entry of ``arguments``, side by side in one figure.

    Parameters
    ----------
    arguments : sequence of (labels, sizes, title)
        One entry per chart. ``sizes`` is handed to ``Axes.pie`` with
        ``normalize=False``, so the values are used as fractions of the full
        circle exactly as given (a partial pie is drawn when they sum to less
        than 1; matplotlib raises ``ValueError`` when they sum to more than 1).

    Raises
    ------
    ValueError
        If ``arguments`` is empty.
    """
    if len(arguments) == 0:
        raise ValueError("draw_piechart needs at least one (labels, sizes, title) entry")

    # squeeze=False always returns a 2-D array of Axes, even for a single
    # chart, so one code path serves both the single- and multi-plot case and
    # no TypeError has to be caught to tell them apart.
    fig, axes = plt.subplots(1, len(arguments), squeeze=False)

    for argument, axis in zip(arguments, axes.ravel()):
        labels = argument[0]
        sizes = argument[1]
        title = argument[2]

        axis.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, normalize=False, labeldistance=1.05)
        axis.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
        axis.set_title(title)

    plt.show()

# Datenaufbereitung

In [None]:
# Attach the TARGET label from app_train to every POS_CASH_balance row.
# The inner join keeps only rows whose SK_ID_CURR also occurs in app_train.
# The guard makes the cell safe to re-run: merging a second time would create
# TARGET_x / TARGET_y columns and break the lookups below.
if "TARGET" not in pcb.columns:
    pcb = pd.merge(
        pcb,
        app_train[["SK_ID_CURR", "TARGET"]],
        on="SK_ID_CURR",
        how="inner",
        # Fail loudly instead of silently duplicating rows should SK_ID_CURR
        # ever not be unique in app_train.
        validate="many_to_one",
    )

# Replace the numeric codes with readable labels. The result is assigned back
# to the column: a chained `pcb["TARGET"].replace(..., inplace=True)` acts on a
# temporary object under pandas Copy-on-Write and would leave pcb unchanged.
pcb["TARGET"] = pcb["TARGET"].replace({0: "Payback", 1: "Default"})

# Split the table into the Payback and the Default group.
payback = pcb[pcb["TARGET"] == "Payback"]
default = pcb[pcb["TARGET"] == "Default"]

In [None]:
# Preview the first five rows of pcb.
pcb.head(n=5)

# Datenanalyse

## Laufzeit vergangener Kredite

In [None]:
# Mean of CNT_INSTALMENT per group. `trailer` reproduces the extra "\n" that
# is printed after the first group only.
for heading, group, trailer in (
    ("### Payback", payback, ("\n",)),
    ("### Default", default, ()),
):
    display(Markdown(heading))
    print(group["CNT_INSTALMENT"].mean(), *trailer)

## Vertragsstatus

In [None]:
COLUMN_NAME = "NAME_CONTRACT_STATUS"
TITLE = "Contract Status"
# Status values to plot, paired position by position with the slice labels in LABELS.
STATUSES = ["Active", "Completed"]
# NOTE(review): the slice for status "Completed" is labelled "Closed" - confirm this is intended.
LABELS = ["Active", "Closed"]


def _contract_status_shares(frame, column, statuses):
    """Return the value counts of ``column`` and the share of each status.

    Shares are relative to the number of rows in ``frame``, so they sum to
    less than 1 whenever values other than ``statuses`` occur. A status that
    does not occur at all gets a share of 0 instead of raising ``KeyError``.
    """
    counts = frame[column].value_counts()
    total = len(frame)
    shares = [counts.get(status, 0) / total if total else 0.0 for status in statuses]
    return counts, shares


# One (labels, sizes, title) entry per group, in the order Payback, Default.
arguments = []
for group_name, group in (("Payback", payback), ("Default", default)):
    count, sizes = _contract_status_shares(group, COLUMN_NAME, STATUSES)

    display(Markdown("### " + group_name))
    print(count, "\n")

    arguments.append((LABELS, sizes, TITLE + " " + group_name))

draw_piechart(arguments)

## Default vergangener Kredite

In [None]:
# Percentage of rows with SK_DPD > 0 in each group.
# Note: the ratio is taken over the rows of the group, not over distinct
# customers (no aggregation by SK_ID_CURR happens here).
# `trailer` reproduces the extra "\n" that is printed after the first group only.
for heading, group, trailer in (
    ("### Payback", payback, ("\n",)),
    ("### Default", default, ()),
):
    display(Markdown(heading))
    overdue_rows = int((group["SK_DPD"] > 0).sum())
    print(overdue_rows / len(group["SK_DPD"]) * 100, *trailer)

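Bei 2,95 % der POS_CASH_balance-Einträge zu vergangenen Krediten von Kunden, die laut Application Train zurückzahlen, war der Kredit überzogen (SK_DPD > 0). Der Anteil bezieht sich auf Einträge, nicht auf einzelne Kunden.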

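Bei 4,1 % der POS_CASH_balance-Einträge zu vergangenen Krediten von Kunden, die laut Application Train ausgefallen sind, war der Kredit überzogen (SK_DPD > 0). Der Anteil bezieht sich auf Einträge, nicht auf einzelne Kunden.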