### Datenanalyse

Dieses Dokument dient zur Einarbeitung in den Datensatz. Ziel ist es ein Verständnis der einzelnen Attribute zu erlangen und ein Gefühl für deren Zusammenspiel zu gewinnen. Weiterhin sollen fehlerhafte Daten identifiziert werden.

**Vorbereitung: Import benötigter Bibliotheken & Einlesen der Daten**

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from IPython.display import display

In [None]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\Robert\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [None]:
app_test = pd.read_csv(DATASET_DIR / "application_test.csv")
app_train = pd.read_csv(DATASET_DIR / "application_train.csv")
bureau = pd.read_csv(DATASET_DIR / "bureau.csv")
bureau_balance = pd.read_csv(DATASET_DIR / "bureau_balance.csv")
credit_card_balance = pd.read_csv(DATASET_DIR / "credit_card_balance.csv")
installments_payments = pd.read_csv(DATASET_DIR / "installments_payments.csv")
pcb = pd.read_csv(DATASET_DIR / "POS_CASH_balance.csv")
previous_application = pd.read_csv(DATASET_DIR / "previous_application.csv")

description = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

**Beschreibungen der Attribute untergliedert nach den gegebenen CSV-Dateien**

In [None]:
description.loc[description['Table']=="application_{train|test}.csv", "Row":"Special"]

In [None]:
description.loc[description['Table']=="bureau.csv", "Row":"Special"]

In [None]:
description.loc[description['Table']=="bureau_balance.csv", "Row":"Special"]

In [None]:
description.loc[description['Table']=="POS_CASH_balance.csv", "Row":"Special"]

In [None]:
description.loc[description['Table']=="credit_card_balance.csv", "Row":"Special"]

In [None]:
description.loc[description['Table']=="previous_application.csv", "Row":"Special"]

In [None]:
description.loc[description['Table']=="installments_payments.csv", "Row":"Special"]

**Anpassung der numerischen Kategorien**

In [None]:
app_train["TARGET"].replace(
    {
        0: "Payback",
        1: "Default"
    }, inplace = True
)

## Untersuchung Application-Train & Application-Test

**Erstellung von Subklassen**

* payback = Kreditnehmer die ihren Kredit zurückzahlten
* default = Kreditnehmer die ihren Kredit nicht zurückzahlten
* nums = numerische Daten
* cats = kategorische Daten

In [None]:
num_heads = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'HOUR_APPR_PROCESS_START', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']
cat_heads = ['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

In [None]:
payback = app_train[app_train["TARGET"] == "Payback"]
default = app_train[app_train["TARGET"] == "Default"]
nums = app_train[num_heads]
cats = app_train[cat_heads]

**Hilfsfunktion zum zeichnen eines Kreisdiagramms**

In [None]:
# Function to draw a Piechart
def draw_piechart(arguments):
    
    fig, ax = plt.subplots(1,len(arguments))

    try:
    # Handle multiple plots
        for argument, a in zip(arguments,ax):
            labels = argument[0]
            sizes = argument[1]
            title = argument[2]

            a.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, normalize=False, labeldistance=1.05)
            a.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            a.set_title(title)
            
    # Handle single plot           
    except TypeError:
        for argument in arguments:
            labels = argument[0]
            sizes = argument[1]
            title = argument[2]
        
            ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, normalize=False)
            ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            ax.set_title(title)
    plt.show()

## Untersuchung der Kreditnehmer - Payback vs. Default

**Verhältnis Payback zu Default im Datensatz**

In [None]:
pb = len(payback.index)
df = len(default.index)
n = pb + df

labels = "Payback", "Default"
sizes = [pb/n,df/n]
title = "Payback vs Default"

arguments = [(labels, sizes, title)]

draw_piechart(arguments)

In [None]:
F,M,other = payback["CODE_GENDER"].value_counts()
n = F+M+other

labels1 = "Female", "Male"
sizes1 = [F/n,M/n]
title1 = "Gender Payback"

F,M = default["CODE_GENDER"].value_counts()
X = default["CODE_GENDER"].value_counts()

n = F+M

labels2 = "Female", "Male"
sizes2 = [F/n,M/n]
title2 = "Gender Default"

arguments = [(labels1, sizes1, title1),(labels2, sizes2, title2)]

draw_piechart(arguments)

In [None]:
count = payback["NAME_EDUCATION_TYPE"].value_counts()

low_sec = count["Lower secondary"]
sec = count["Secondary / secondary special"]
inc_high = count["Incomplete higher"]
high = count["Higher education"]
acad = count["Academic degree"]

n = len(payback["NAME_EDUCATION_TYPE"])

labels1 = "Secondary ", "Higher education", "Incomplete higher", "Lower secondary", "Academic degree"
sizes1 = [sec/n, high/n, inc_high/n, low_sec/n, acad/n]
title1 = "Education Payback"

count = default["NAME_EDUCATION_TYPE"].value_counts()

low_sec = count["Lower secondary"]
sec = count["Secondary / secondary special"]
inc_high = count["Incomplete higher"]
high = count["Higher education"]
acad = count["Academic degree"]

n = len(default["NAME_EDUCATION_TYPE"])

labels2 = "Secondary ", "Higher education", "Incomplete higher", "Lower secondary", "Academic degree"
sizes2 = [sec/n, high/n, inc_high/n, low_sec/n, acad/n]
title2 = "Education Default"


arguments = [(labels1, sizes1, title1),(labels2, sizes2, title2)]

draw_piechart(arguments)

In [None]:
df = cats[cats["TARGET"] == "Default"]
for col in df:
    print(col)
    c = df[col].unique()
    n = len(df[col].index)
    per = {}
    for category in c:
        v = len(df[df[col]==category])
        per[category]=round(v/n*100, 2)
    per = dict(sorted(per.items(), key=lambda item: item[1], reverse=True))
    for k,v in per.items():
        print(k,v)
    print("\n")

In [None]:
df = cats[cats["TARGET"] == "Payback"]
for col in df:
    print(col)
    c = df[col].unique()
    n = len(df[col].index)
    per = {}
    for category in c:
        v = len(df[df[col]==category])
        per[category]=round(v/n*100, 2)
    per = dict(sorted(per.items(), key=lambda item: item[1], reverse=True))
    for k,v in per.items():
        print(k,v)
    print("\n")

In [85]:
dfs = []
for head in cats.columns.values:
    if head in skip:
        continue
    df1 = payback[head].value_counts().rename_axis(head).reset_index(name='payback').head()
    df2 = default[head].value_counts().rename_axis(head).reset_index(name='default').head()
    
    df1["payback"] = df1["payback"]/df1["payback"].sum()*100
    df2["default"] = df2["default"]/df2["default"].sum()*100
    
    df = df1.merge(df2, how="outer", on=head)
    
    df["change"] = (df["default"]/100) / (df["payback"]/100)
    
    dfs.append(df)
    

In [86]:
for df in dfs:
    display(df)

Unnamed: 0,NAME_CONTRACT_TYPE,payback,default,change
0,Cash loans,90.209986,93.538771,1.0369
1,Revolving loans,9.790014,6.461229,0.659982


Unnamed: 0,CODE_GENDER,payback,default,change
0,F,66.603228,57.079557,0.857009
1,M,33.395357,42.920443,1.285222
2,XNA,0.001415,,


Unnamed: 0,FLAG_OWN_CAR,payback,default,change
0,N,65.682418,69.482377,1.057854
1,Y,34.317582,30.517623,0.889271


Unnamed: 0,FLAG_OWN_REALTY,payback,default,change
0,Y,69.451264,68.410876,0.98502
1,N,30.548736,31.589124,1.034057


Unnamed: 0,NAME_TYPE_SUITE,payback,default,change
0,Unaccompanied,81.372269,82.482966,1.01365
1,Family,13.244136,12.203926,0.921459
2,"Spouse, partner",3.735388,3.629948,0.971773
3,Children,1.079073,0.97745,0.905824
4,Other_B,0.569134,0.705711,1.239972


Unnamed: 0,NAME_INCOME_TYPE,payback,default,change
0,Working,50.785576,61.330218,1.207631
1,Commercial associate,23.440612,21.592878,0.921174
2,Pensioner,18.531163,12.013052,0.648262
3,State servant,7.236281,5.031624,0.695333
4,Student,0.006368,,
5,Unemployed,,0.032228,


Unnamed: 0,NAME_EDUCATION_TYPE,payback,default,change
0,Secondary / secondary special,70.34908,78.646526,1.117947
1,Higher education,25.064559,16.149043,0.644298
2,Incomplete higher,3.327013,3.512588,1.055778
3,Lower secondary,1.202394,1.679758,1.397011
4,Academic degree,0.056954,0.012085,0.212183


Unnamed: 0,NAME_FAMILY_STATUS,payback,default,change
0,Married,64.234976,59.818731,0.931249
1,Single / not married,14.499229,17.953676,1.23825
2,Civil marriage,9.485503,11.927492,1.257444
3,Separated,6.420597,6.52568,1.016367
4,Widow,5.359695,3.774421,0.704223


Unnamed: 0,NAME_HOUSING_TYPE,payback,default,change
0,House / apartment,89.328358,85.996119,0.962697
1,With parents,4.652533,7.018111,1.508449
2,Municipal apartment,3.631419,3.86077,1.063157
3,Rented apartment,1.5196,2.429657,1.598879
4,Office apartment,0.868089,0.695343,0.801004


Unnamed: 0,FLAG_MOBIL,payback,default,change
0,1,99.999646,100.0,1.000004
1,0,0.000354,,


Unnamed: 0,FLAG_EMP_PHONE,payback,default,change
0,1,81.465301,87.951662,1.079621
1,0,18.534699,12.048338,0.650042


Unnamed: 0,FLAG_WORK_PHONE,payback,default,change
0,0,80.400869,76.217523,0.947969
1,1,19.599131,23.782477,1.213445


Unnamed: 0,FLAG_CONT_MOBILE,payback,default,change
0,1,99.812867,99.818731,1.000059
1,0,0.187133,0.181269,0.968661


Unnamed: 0,FLAG_PHONE,payback,default,change
0,0,71.576237,75.504532,1.054883
1,1,28.423763,24.495468,0.861795


Unnamed: 0,FLAG_EMAIL,payback,default,change
0,0,94.315955,94.465257,1.001583
1,1,5.684045,5.534743,0.973733


Unnamed: 0,OCCUPATION_TYPE,payback,default,change
0,Laborers,35.065978,41.395448,1.180502
1,Sales staff,20.614088,21.924413,1.063565
2,Core staff,18.355847,12.323619,0.671373
3,Managers,14.242267,9.416436,0.661161
4,Drivers,11.72182,14.940084,1.274553


Unnamed: 0,REGION_RATING_CLIENT,payback,default,change
0,2,73.960861,72.132931,0.975285
1,3,15.198489,21.615307,1.422201
2,1,10.84065,6.251762,0.576696


Unnamed: 0,REGION_RATING_CLIENT_W_CITY,payback,default,change
0,2,74.752199,73.192346,0.979133
1,3,13.746348,20.145015,1.465481
2,1,11.501454,6.662638,0.579287


Unnamed: 0,WEEKDAY_APPR_PROCESS_START,payback,default,change
0,TUESDAY,20.878591,21.564776,1.032865
1,WEDNESDAY,20.158407,20.304714,1.007258
2,MONDAY,19.771265,18.848218,0.953314
3,THURSDAY,19.649967,19.633959,0.999185
4,FRIDAY,19.54177,19.648333,1.005453


Unnamed: 0,REG_REGION_NOT_LIVE_REGION,payback,default,change
0,0,98.505763,98.255791,0.997462
1,1,1.494237,1.744209,1.167291


Unnamed: 0,REG_REGION_NOT_WORK_REGION,payback,default,change
0,0,94.968269,94.408862,0.99411
1,1,5.031731,5.591138,1.111176


Unnamed: 0,LIVE_REGION_NOT_WORK_REGION,payback,default,change
0,0,95.950631,95.746224,0.99787
1,1,4.049369,4.253776,1.050479


Unnamed: 0,REG_CITY_NOT_LIVE_CITY,payback,default,change
0,0,92.535888,88.161128,0.952724
1,1,7.464112,11.838872,1.586106


Unnamed: 0,REG_CITY_NOT_WORK_CITY,payback,default,change
0,0,77.591037,69.707956,0.898402
1,1,22.408963,30.292044,1.351783


Unnamed: 0,LIVE_CITY_NOT_WORK_CITY,payback,default,change
0,0,82.41441,77.83283,0.944408
1,1,17.58559,22.16717,1.26053


Unnamed: 0,ORGANIZATION_TYPE,payback,default,change
0,Business Entity Type 3,35.356408,41.069109,1.161575
1,XNA,30.033081,19.420629,0.646641
2,Self-employed,19.782022,25.383216,1.283146
3,Other,8.833799,8.281372,0.937464
4,Medicine,5.994691,,
5,Business Entity Type 2,,5.845674,


Unnamed: 0,FONDKAPREMONT_MODE,payback,default,change
0,reg oper account,75.873879,76.895522,1.013465
1,reg oper spec account,12.470723,11.820896,0.947892
2,org spec account,5.84648,4.880597,0.834792
3,not specified,5.808918,6.402985,1.102268


Unnamed: 0,HOUSETYPE_MODE,payback,default,change
0,block of flats,98.2766,97.617936,0.993298
1,specific housing,0.945203,1.419897,1.502213
2,terraced house,0.778196,0.962167,1.236407


Unnamed: 0,WALLSMATERIAL_MODE,payback,default,change
0,Panel,45.002947,40.5612,0.901301
1,"Stone, brick",43.669187,46.444122,1.063544
2,Block,6.259869,6.289308,1.004703
3,Wooden,3.523223,5.031447,1.428081
4,Mixed,1.544775,1.673924,1.083603


Unnamed: 0,EMERGENCYSTATE_MODE,payback,default,change
0,No,98.600669,98.031253,0.994225
1,Yes,1.399331,1.968747,1.40692


Unnamed: 0,FLAG_DOCUMENT_2,payback,default,change
0,0,99.996816,99.983887,0.999871
1,1,0.003184,0.016113,5.060956


Unnamed: 0,FLAG_DOCUMENT_3,payback,default,change
0,1,70.406034,77.792548,1.104913
1,0,29.593966,22.207452,0.750405


Unnamed: 0,FLAG_DOCUMENT_4,payback,default,change
0,0,99.991156,100.0,1.000088
1,1,0.008844,,


Unnamed: 0,FLAG_DOCUMENT_5,payback,default,change
0,0,98.487368,98.501511,1.000144
1,1,1.512632,1.498489,0.99065


Unnamed: 0,FLAG_DOCUMENT_6,payback,default,change
0,0,90.954274,93.929507,1.032711
1,1,9.045726,6.070493,0.67109


Unnamed: 0,FLAG_DOCUMENT_7,payback,default,change
0,0,99.98019,99.987915,1.000077
1,1,0.01981,0.012085,0.610026


Unnamed: 0,FLAG_DOCUMENT_8,payback,default,change
0,0,91.797259,92.60423,1.008791
1,1,8.202741,7.39577,0.901622


Unnamed: 0,FLAG_DOCUMENT_9,payback,default,change
0,0,99.602386,99.701913,1.000999
1,1,0.397614,0.298087,0.749688


Unnamed: 0,FLAG_DOCUMENT_10,payback,default,change
0,0,99.997524,100.0,1.000025
1,1,0.002476,,


Unnamed: 0,FLAG_DOCUMENT_11,payback,default,change
0,0,99.600971,99.697885,1.000973
1,1,0.399029,0.302115,0.757124


Unnamed: 0,FLAG_DOCUMENT_12,payback,default,change
0,0,99.999293,100.0,1.000007
1,1,0.000707,,


Unnamed: 0,FLAG_DOCUMENT_13,payback,default,change
0,0,99.627148,99.879154,1.002529
1,1,0.372852,0.120846,0.324112


Unnamed: 0,FLAG_DOCUMENT_14,payback,default,change
0,0,99.691177,99.879154,1.001886
1,1,0.308823,0.120846,0.391311


Unnamed: 0,FLAG_DOCUMENT_15,payback,default,change
0,0,99.872296,99.95569,1.000835
1,1,0.127704,0.04431,0.346977


Unnamed: 0,FLAG_DOCUMENT_16,payback,default,change
0,0,98.973066,99.39577,1.004271
1,1,1.026934,0.60423,0.588382


Unnamed: 0,FLAG_DOCUMENT_17,payback,default,change
0,0,99.9717,99.991944,1.000202
1,1,0.0283,0.008056,0.284679


Unnamed: 0,FLAG_DOCUMENT_18,payback,default,change
0,0,99.165859,99.427996,1.002643
1,1,0.834141,0.572004,0.68574


Unnamed: 0,FLAG_DOCUMENT_19,payback,default,change
0,0,99.939509,99.951662,1.000122
1,1,0.060491,0.048338,0.799098


Unnamed: 0,FLAG_DOCUMENT_20,payback,default,change
0,0,99.949414,99.947633,0.999982
1,1,0.050586,0.052367,1.035195


Unnamed: 0,FLAG_DOCUMENT_21,payback,default,change
0,0,99.968516,99.943605,0.999751
1,1,0.031484,0.056395,1.791237


In [None]:
app_train["AMT_INCOME_TOTAL"]