## <u>2. Datenaufbereitung Credit Card

In diesem Dokument werden für die Untersuchung unwichtige Variablen aus dem CC-Datensatz gelöscht und wichtige gruppiert. Im Gegensatz zu den Application-Daten liegt in dem CC-Datensatz eine N:M-Beziehung vor, da ein Kreditnehmer in der Vergangenheit mehrere Kredite gehabt haben kann und diese monatliche Kreditdaten beinhalten. Dies erfordert die Gruppierung der historischen Daten. Für die Bestimmung der Kreditwürdigkeit werden nur Daten historische Kontodaten benutzt, die maximal ein halbes Jahr in der Vergangenheit liegen. Zur Vergleichbarkeit der Zahlungshöhen im Zeitverlauf werden diese normalisiert.

*Vorgehensweise kategorische Variablen:*
- Gruppierung der Variablen
- Normalisierung der Variablen

## Initialisierung

In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from sklearn.linear_model import LogisticRegression

from IPython.display import display, Markdown

In [2]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [3]:
cc = pd.read_csv(DATASET_DIR / "credit_card_balance.csv")
description = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

In [4]:
des = description.loc[description['Table']=="credit_card_balance.csv", "Row":"Special"]

In [5]:
des

Unnamed: 0,Row,Description,Special
153,SK_ID_PREV,"ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)",hashed
154,SK_ID_CURR,ID of loan in our sample,hashed
155,MONTHS_BALANCE,Month of balance relative to application date (-1 means the freshest balance date),time only relative to the application
156,AMT_BALANCE,Balance during the month of previous credit,
157,AMT_CREDIT_LIMIT_ACTUAL,Credit card limit during the month of the previous credit,
158,AMT_DRAWINGS_ATM_CURRENT,Amount drawing at ATM during the month of the previous credit,
159,AMT_DRAWINGS_CURRENT,Amount drawing during the month of the previous credit,
160,AMT_DRAWINGS_OTHER_CURRENT,Amount of other drawings during the month of the previous credit,
161,AMT_DRAWINGS_POS_CURRENT,Amount drawing or buying goods during the month of the previous credit,
162,AMT_INST_MIN_REGULARITY,Minimal installment for this month of the previous credit,


In [6]:
cc[(cc["SK_ID_PREV"] == 1489084) & (cc["MONTHS_BALANCE"] >= -12)].sort_values("MONTHS_BALANCE")

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
832709,1489084,207482,-12,59059.845,225000,45000.0,45000.0,0.0,0.0,2250.0,2250.0,2250.0,56666.475,57714.345,57714.345,1.0,1,0.0,0.0,16.0,Active,0,0
2062010,1489084,207482,-11,72011.115,225000,9000.0,16567.965,0.0,7567.965,2963.835,6750.0,6750.0,69094.575,70665.615,70665.615,1.0,5,0.0,4.0,17.0,Active,0,0
1088832,1489084,207482,-10,3942.09,225000,0.0,2965.5,0.0,2965.5,3612.33,72270.0,72270.0,2941.965,3942.09,3942.09,0.0,1,0.0,1.0,18.0,Active,0,0
1582318,1489084,207482,-9,135.36,225000,0.0,0.0,0.0,0.0,2250.0,4077.45,4077.45,0.0,0.0,0.0,0.0,0,0.0,0.0,19.0,Active,0,0
277673,1489084,207482,-8,0.0,225000,0.0,0.0,0.0,0.0,135.36,135.36,135.36,0.0,0.0,0.0,0.0,0,0.0,0.0,20.0,Active,0,0
3691020,1489084,207482,-7,225000.0,225000,0.0,225000.0,0.0,225000.0,0.0,0.0,0.0,225000.0,225000.0,225000.0,0.0,1,0.0,1.0,20.0,Active,0,0
2101775,1489084,207482,-6,41915.16,225000,0.0,0.0,0.0,0.0,11250.0,191250.0,191250.0,33750.0,41915.16,41915.16,0.0,0,0.0,0.0,21.0,Active,0,0
1260333,1489084,207482,-5,109375.2,225000,45000.0,66655.8,0.0,21655.8,2250.0,2250.0,2250.0,100405.8,107804.7,107804.7,1.0,9,0.0,8.0,22.0,Active,0,0
517,1489084,207482,-4,146687.445,225000,0.0,39060.0,0.0,39060.0,5481.675,4500.0,4500.0,139465.8,146687.445,146687.445,0.0,1,0.0,1.0,23.0,Active,7,7
11332,1489084,207482,-3,133284.42,148500,0.0,0.0,0.0,0.0,7238.34,9900.0,9900.0,129466.44,133284.42,133284.42,0.0,0,0.0,0.0,24.0,Active,0,0


In [7]:
df = cc.copy()

Aus allen Variablen wurden folgende als wichtig identifiziert:

In [8]:
df = df[["SK_ID_PREV", "MONTHS_BALANCE", "AMT_BALANCE", "AMT_PAYMENT_CURRENT", "SK_DPD_DEF"]]

In [9]:
df = df[df["MONTHS_BALANCE"] >= -6]

In [10]:
df.head()

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_PAYMENT_CURRENT,SK_DPD_DEF
0,2562384,-6,56.97,1800.0,0
1,2582071,-1,63975.555,2250.0,0
3,1389973,-4,236572.11,11925.0,0
4,1891521,-1,453919.455,27000.0,0
6,1079071,-6,353451.645,15750.0,0


# Informationsgehalt:
* 6 Monate Kontostand vor Kreditaufnahme
* 6 Monate Ausgaben vor Kreditaufnahme
* 6 Monate Summe überzogenen Tage

In [11]:
T1 = df[df["MONTHS_BALANCE"] == -1][["SK_ID_PREV", "AMT_BALANCE", "AMT_PAYMENT_CURRENT", "SK_DPD_DEF"]]
T2 = df[df["MONTHS_BALANCE"] == -2][["SK_ID_PREV", "AMT_BALANCE", "AMT_PAYMENT_CURRENT", "SK_DPD_DEF"]]
T3 = df[df["MONTHS_BALANCE"] == -3][["SK_ID_PREV", "AMT_BALANCE", "AMT_PAYMENT_CURRENT", "SK_DPD_DEF"]]
T4 = df[df["MONTHS_BALANCE"] == -4][["SK_ID_PREV", "AMT_BALANCE", "AMT_PAYMENT_CURRENT", "SK_DPD_DEF"]]
T5 = df[df["MONTHS_BALANCE"] == -5][["SK_ID_PREV", "AMT_BALANCE", "AMT_PAYMENT_CURRENT", "SK_DPD_DEF"]]
T6 = df[df["MONTHS_BALANCE"] == -6][["SK_ID_PREV", "AMT_BALANCE", "AMT_PAYMENT_CURRENT", "SK_DPD_DEF"]]

In [12]:
T1 = T1.groupby(by="SK_ID_PREV").sum()
T1.columns = ["BALANCE_30", "PAYMENT_30", "DPD_30"]

T2 = T2.groupby(by="SK_ID_PREV").sum()
T2.columns = ["BALANCE_60", "PAYMENT_60", "DPD_60"]

T3 = T3.groupby(by="SK_ID_PREV").sum()
T3.columns = ["BALANCE_90", "PAYMENT_90", "DPD_90"]

T4 = T4.groupby(by="SK_ID_PREV").sum()
T4.columns = ["BALANCE_120", "PAYMENT_120", "DPD_120"]

T5 = T5.groupby(by="SK_ID_PREV").sum()
T5.columns = ["BALANCE_150", "PAYMENT_150", "DPD_150"]

T6 = T6.groupby(by="SK_ID_PREV").sum()
T6.columns = ["BALANCE_180", "PAYMENT_180", "DPD_180"]

In [13]:
result = pd.DataFrame(index=df.SK_ID_PREV.unique())
result.index.name = "SK_ID_PREV"

In [14]:
result = pd.merge(result, T1, how="left", left_index=True, right_index=True)
result = pd.merge(result, T2, how="left", left_index=True, right_index=True)
result = pd.merge(result, T3, how="left", left_index=True, right_index=True)
result = pd.merge(result, T4, how="left", left_index=True, right_index=True)
result = pd.merge(result, T5, how="left", left_index=True, right_index=True)
result = pd.merge(result, T6, how="left", left_index=True, right_index=True)

In [15]:
result = result.fillna(0)

In [16]:
df = result

In [17]:
df.head()

Unnamed: 0_level_0,BALANCE_30,PAYMENT_30,DPD_30,BALANCE_60,PAYMENT_60,DPD_60,BALANCE_90,PAYMENT_90,DPD_90,BALANCE_120,PAYMENT_120,DPD_120,BALANCE_150,PAYMENT_150,DPD_150,BALANCE_180,PAYMENT_180,DPD_180
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2562384,0.0,0.0,0.0,28.575,1916.91,0.0,1897.875,2250.0,0.0,4036.86,2250.0,0.0,5228.28,841.5,0.0,56.97,1800.0,0.0
2582071,63975.555,2250.0,0.0,61124.625,2250.0,0.0,61615.395,2250.0,0.0,62129.475,4500.0,0.0,64634.985,0.0,1.0,62856.405,4500.0,0.0
1389973,234919.845,11925.0,0.0,235687.86,11925.0,0.0,236129.04,11925.0,0.0,236572.11,11925.0,0.0,237204.945,12150.0,0.0,237884.13,11250.0,0.0
1891521,453919.455,27000.0,0.0,458497.395,27000.0,0.0,461330.505,27000.0,0.0,457896.285,27000.0,0.0,459959.13,27000.0,0.0,457992.72,27000.0,0.0
1079071,322906.14,18000.0,0.0,329464.845,18000.0,0.0,335538.27,18000.0,0.0,342236.34,18000.0,0.0,347929.335,18000.0,0.0,353451.645,15750.0,0.0


### Normalisierung

In [18]:
# BALANCE

In [19]:
balance_heads = [head for head in df.columns.values if "BALANCE" in head]

In [20]:
df["BALANCE_MEAN"] = df[balance_heads].mean(axis=1)
df["BALANCE_STD"] = df[balance_heads].std(axis=1)

In [21]:
for head in balance_heads:
    df[head] = (df[head]-df["BALANCE_MEAN"])/df["BALANCE_STD"]

In [22]:
# PAYMENT

In [23]:
payment_heads = [head for head in df.columns.values if "PAYMENT" in head]

In [24]:
df["PAYMENT_MEAN"] = df[payment_heads].mean(axis=1)
df["PAYMENT_STD"] = df[payment_heads].std(axis=1)

In [25]:
for head in payment_heads:
    df[head] = (df[head]-df["PAYMENT_MEAN"])/df["PAYMENT_STD"]

In [26]:
df = df.drop(["BALANCE_MEAN", "BALANCE_STD", "PAYMENT_MEAN", "PAYMENT_STD"], axis=1)

### Ergebnis

In [27]:
df = df.add_prefix("CC_")

In [28]:
df.head()

Unnamed: 0_level_0,CC_BALANCE_30,CC_PAYMENT_30,CC_DPD_30,CC_BALANCE_60,CC_PAYMENT_60,CC_DPD_60,CC_BALANCE_90,CC_PAYMENT_90,CC_DPD_90,CC_BALANCE_120,CC_PAYMENT_120,CC_DPD_120,CC_BALANCE_150,CC_PAYMENT_150,CC_DPD_150,CC_BALANCE_180,CC_PAYMENT_180,CC_DPD_180
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2562384,-0.819794,-1.672973,0.0,-0.807299,0.4512,0.0,0.010108,0.820305,0.0,0.945442,0.820305,0.0,1.466426,-0.740487,0.0,-0.794882,0.321649,0.0
2582071,0.914866,-0.221404,0.0,-1.167021,-0.221404,0.0,-0.808637,-0.221404,0.0,-0.433231,1.107019,0.0,1.396414,-1.549826,1.0,0.097609,1.107019,0.0
1389973,-1.391888,0.243975,0.0,-0.669504,0.243975,0.0,-0.254537,0.243975,0.0,0.162209,0.243975,0.0,0.757444,0.9759,0.0,1.396276,-1.9518,0.0
1891521,-1.734005,,0.0,0.092348,,0.0,1.222608,,0.0,-0.147463,,0.0,0.675502,,0.0,-0.10899,,0.0
1079071,-1.364959,0.408248,0.0,-0.794077,0.408248,0.0,-0.265434,0.408248,0.0,0.317579,0.408248,0.0,0.813109,0.408248,0.0,1.293782,-2.041241,0.0


### Speichern der metrischen Werte

In [29]:
df.to_csv(DATASET_DIR / "2. Datenaufbereitung" / "cc.csv")