# Datentransformation Bureau

Ziel dieser Datenaufbereitung ist es, alle Variablen so zu transformieren, dass sie leicht analysierbar und interpretierbar sind.

### Initialisierung

In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Lift the pandas display limits so that frames are rendered without
# truncating columns, rows, or cell contents.
for option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(option, None)

from IPython.display import display, Markdown

In [2]:
# Two candidate locations of the dataset directory.
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

# Use the first location when it exists; otherwise fall back to the second
# one (whose existence is not checked here).
DATASET_DIR = path1 if path1.is_dir() else path2

In [3]:
# Load the bureau table and the column description file.
bureau = pd.read_csv(DATASET_DIR / "bureau.csv")
des = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

# Keep only the description rows that belong to bureau.csv. The explicit
# copy makes `des` an independent frame, so that later `.loc` assignments on
# it modify `des` itself and do not trigger a SettingWithCopyWarning.
des = des[des["Table"] == "bureau.csv"].copy()

In [4]:
# Split the column names by dtype: columns stored as "object" are treated as
# nominal (n_heads), every other column as metric (m_heads).
column_dtypes = bureau.dtypes
m_heads = [column for column, dtype in column_dtypes.items() if dtype.name != "object"]
n_heads = [column for column, dtype in column_dtypes.items() if dtype.name == "object"]

In [5]:
# Preview the first rows of the freshly loaded table.
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [6]:
# Work list of columns that still need a transformation decision; the
# following sections remove columns from it as they are handled.
todo = bureau.columns.tolist()

### Keine Datenaufbereitung für Primär- und Fremdschlüssel

In [7]:
# Key columns are not transformed; take them off the work list.
keys = ["SK_ID_CURR", "SK_ID_BUREAU"]
todo = [column for column in todo if column not in keys]

### Keine Datenaufbereitung der nominalen Daten

* 1 == True
* 0 == False

In [8]:
# Preview the columns stored with dtype "object".
bureau[n_heads].head()

Unnamed: 0,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE
0,Closed,currency 1,Consumer credit
1,Active,currency 1,Credit card
2,Active,currency 1,Consumer credit
3,Active,currency 1,Credit card
4,Active,currency 1,Consumer credit


In [9]:
# Nominal columns are left as they are; take them off the work list.
nominal_columns = set(n_heads)
todo = [column for column in todo if column not in nominal_columns]

### Metrische stetige Daten

* Umrechnung von Tagen in Jahre
* Vorzeichenumkehr

In [10]:
# Preview the columns whose dtype is not "object".
bureau[m_heads].head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,-131,
1,215354,5714463,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,-20,
2,215354,5714464,-203,0,528.0,,,0,464323.5,,,0.0,-16,
3,215354,5714465,-203,0,,,,0,90000.0,,,0.0,-16,
4,215354,5714466,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,-21,


In [11]:
# Every metric column whose name contains "DAY" holds a duration in days.
days = [head for head in m_heads if "DAY" in head]

# Convert all of these columns from days to years in one vectorised step.
# NOTE(review): the section heading also lists a sign reversal, but the values
# are only divided by 365 here - confirm whether that is intended.
bureau[days] = bureau[days] / 365

# Reflect the new unit in the column names (DAYS_* -> YEARS_*).
# NOTE(review): a name containing "DAY" but not "DAYS" (CREDIT_DAY_OVERDUE) is
# converted above but keeps its original name - confirm this is intended.
bureau = bureau.rename(columns={head: head.replace("DAYS", "YEARS") for head in days})

In [12]:
# Keep the attribute descriptions in sync with the converted columns.
for head in days:
    # Boolean mask instead of an index lookup: it avoids shadowing the builtin
    # `id` and simply selects nothing when a column has no description row.
    row_mask = des["Row"] == head

    # Apply the same DAYS -> YEARS rule that is used to rename the columns of
    # `bureau`, so every "Row" entry still matches an existing column name
    # (replacing "DAY" would produce CREDIT_YEAR_OVERDUE, which is not a column).
    des.loc[row_mask, "Row"] = head.replace("DAYS", "YEARS")

    # Literal (non-regex) replacement on the matching rows; missing
    # descriptions are left untouched instead of raising.
    des.loc[row_mask, "Description"] = (
        des.loc[row_mask, "Description"].str.replace("day", "year", regex=False)
    )

In [13]:
# Take all metric columns off the work list (not only the converted
# day columns: every name in m_heads is removed).
metric_columns = set(m_heads)
todo = [column for column in todo if column not in metric_columns]

## Datenspeicherung

In [14]:
# Persist the transformed table and its updated column descriptions.
output_dir = DATASET_DIR / "1. Datentransformation"

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
output_dir.mkdir(parents=True, exist_ok=True)

# The index is written as well (pandas default), i.e. it becomes the first
# column of each file.
bureau.to_csv(output_dir / "bureau.csv")
des.to_csv(output_dir / "bureau_des.csv")

In [15]:
# Show the columns that have not been handled; the list is empty once every
# column has been taken off the work list.
todo

[]