## 2. Datenaufbereitung Bureau

In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from sklearn.linear_model import LogisticRegression

from IPython.display import display, Markdown

In [2]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [3]:
app_train = pd.read_csv(DATASET_DIR / "application_train.csv")
bureau = pd.read_csv(DATASET_DIR / "bureau.csv")
description = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

In [4]:
des = description.loc[description['Table']=="bureau.csv", "Row":"Special"]

In [5]:
bureau = pd.merge(bureau, app_train[["SK_ID_CURR","TARGET"]] ,on="SK_ID_CURR")

In [6]:
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,TARGET
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,0
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,0
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,,0
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,,0
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,,0


In [7]:
# Spalten die innerhalb der Aufbereitung nicht verändert werden können
skip = ["TARGET", "SK_ID_CURR", "SK_ID_BUREAU"]

In [8]:
# nominale und metrische Spalten
n_heads = [element for element in bureau.columns if bureau[element].dtype.name == "object"]
m_heads = [element for element in bureau.columns if bureau[element].dtype.name != "object"]

### kategorische Variablen

In [9]:
df = bureau[["SK_ID_CURR", "TARGET"] + n_heads].copy()

In [10]:
len(df)

1465325

In [11]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE
0,215354,0,Closed,currency 1,Consumer credit
1,215354,0,Active,currency 1,Credit card
2,215354,0,Active,currency 1,Consumer credit
3,215354,0,Active,currency 1,Credit card
4,215354,0,Active,currency 1,Consumer credit


### Löschung der Spalten mit weniger als 40% ausgefüllten Daten

In [12]:
result = {
          "header":[],
          "rate":[],
          "des":[]
         }
for key in df.keys():
    if key in skip:
        continue
    rate = df[key].isna().sum() / len(df[key]) * 100
    if rate > 60:
        result["header"].append(key)
        result["rate"].append(rate)
        result["des"].append(des[des["Row"] == key]["Description"])

result = pd.DataFrame(result)
result

Unnamed: 0,header,rate,des


In [13]:
df = df.drop(result.header.values, axis=1)

### Bildung von Korrelationsclustern

In [14]:
c = df.corr(method='spearman') * 100

In [15]:
families = []
for i, row in c.iterrows():
    r = row[row > 70]
    if len(r) > 1 and set(r.index) not in families:
        print(r)
        print("\n")
        
        families.append(set(r.index))

In [16]:
result = {
          "family":[],
          "head":[],
          "r2":[],
          "na":[],
          "rate":[]
         }

for i, family in enumerate(families):
    headers = list(family)
    
    result["family"].append("")
    result["head"].append("")
    result["r2"].append("")
    result["na"].append("")
    result["rate"].append("")
    
    for head in headers:
        d = df[["TARGET"] + [head]]
        na = d[head].isna().sum() / len(d) * 100
        d = d.dropna()
        x = d[[head]]
        y = d[["TARGET"]]
        model = LogisticRegression().fit(x, y.values.ravel())
        r2 = round(model.score(x,y),5)
        
        result["family"].append(i)
        result["head"].append(head)
        result["r2"].append(round(r2,5))
        result["na"].append(round(na,2))
        result["rate"].append(r2/na)
    
result = pd.DataFrame(result)
result       

Unnamed: 0,family,head,r2,na,rate


### Unterscheidbarkeit von mindestens 5pP einer Kategorie

In [17]:
ID_Payback = df[df["TARGET"] == 0].index.values
ID_Default = df[df["TARGET"] == 1].index.values

In [18]:
payback = df.loc[ID_Payback]
default = df.loc[ID_Default]

In [19]:
result = {
    "head" : [],
    "cat" : [],
    "payback" : [],
    "default" : [],
    "diff" : []
}

for head in df.columns.values:
    df1 = payback[head].value_counts().rename_axis(head).reset_index(name='payback').head()
    df2 = default[head].value_counts().rename_axis(head).reset_index(name='default').head()
    
    df1["payback"] = df1["payback"]/df1["payback"].sum()*100
    df2["default"] = df2["default"]/df2["default"].sum()*100
    
    df_ = df1.merge(df2, how="outer", on=head)
    
    df_["diff"] = (df_["default"]-df_["payback"])
    
    df_ = df_.sort_values("diff", ascending=False)
    
    for diff in df_["diff"]:
        if np.isnan(diff):
            continue
        if diff > 5 or diff < -5:
            row = df_.loc[df_["diff"] == diff]
            cat = row[head][row[head].index[0]]
            
            result["head"].append(head)
            result["cat"].append(cat)
            result["payback"].append(round(row["payback"].values[0],2))
            result["default"].append(round(row["default"].values[0],2))
            result["diff"].append(round(diff,2))

result = pd.DataFrame(result)
result.sort_values("diff", ascending=False)

Unnamed: 0,head,cat,payback,default,diff
0,CREDIT_ACTIVE,Active,36.39,43.95,7.56
1,CREDIT_ACTIVE,Closed,63.23,55.54,-7.69


In [20]:
remove = [head for head in df.columns.values if head not in list(result["head"].unique()) + skip]

In [21]:
df = df.drop(remove, axis=1)

In [22]:
df = df.drop(["TARGET"], axis=1)

### Erstellungen der Variablen:
* Anzahl aktive Kredite zum Zeitpunkt der Kreditaufnahme
* Anzahl geschlossener Kredite zum Zeitpunkt der Kreditaufnahme

In [23]:
df["CREDIT_ACTIVE"].value_counts()

Closed      917733
Active      541919
Sold          5653
Bad debt        20
Name: CREDIT_ACTIVE, dtype: int64

In [24]:
closed = df[df["CREDIT_ACTIVE"] == "Closed"]
closed = closed.groupby(by="SK_ID_CURR").count()
closed.columns = ["Closed"]

active = df[df["CREDIT_ACTIVE"] == "Active"]
active = active.groupby(by="SK_ID_CURR").count()
active.columns = ["Active"]

In [25]:
df = pd.DataFrame(index=app_train.SK_ID_CURR)
df.index.name = "SK_ID_CURR"
df = pd.merge(df, closed, how="left", left_index=True, right_index=True)
df = pd.merge(df, active, how="left", left_index=True, right_index=True)
df = df.fillna(0)

In [26]:
df.head()

Unnamed: 0_level_0,Closed,Active
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100002,6.0,2.0
100003,3.0,1.0
100004,2.0,0.0
100006,0.0,0.0
100007,1.0,0.0


### resultierende Variablen

In [27]:
result = {
    "head":[],
    "des":[]
}

for head in df.columns.values:
    if head in skip:
        continue
    result["head"].append(head)
    result["des"].append(des[des["Row"] == head]["Description"])
    
result = pd.DataFrame(result)
result

Unnamed: 0,head,des
0,Closed,"Series([], Name: Description, dtype: object)"
1,Active,"Series([], Name: Description, dtype: object)"


### Speichern der kategorischen Werte

In [28]:
df.to_csv(DATASET_DIR / "2. Datenaufbereitung" / "bureau_cats.csv")

### metrische Variablen

In [29]:
df = bureau[m_heads].copy()

In [30]:
df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,TARGET
0,215354,5714462,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,-131,,0
1,215354,5714463,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,-20,,0
2,215354,5714464,-203,0,528.0,,,0,464323.5,,,0.0,-16,,0
3,215354,5714465,-203,0,,,,0,90000.0,,,0.0,-16,,0
4,215354,5714466,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,-21,,0


### Löschung der Spalten mit weniger als 40% ausgefüllten Daten

In [31]:
result = {
          "header":[],
          "rate":[],
          "des":[]
         }
for key in df.keys():
    if key in skip:
        continue
    rate = df[key].isna().sum() / len(df[key]) * 100
    if rate > 60:
        result["header"].append(key)
        result["rate"].append(rate)
        result["des"].append(des[des["Row"] == key]["Description"])

result = pd.DataFrame(result)
result

Unnamed: 0,header,rate,des
0,AMT_CREDIT_MAX_OVERDUE,64.732738,"133 Maximal amount overdue on the Credit Bureau credit so far (at application date of loan in our sample) Name: Description, dtype: object"
1,AMT_ANNUITY,77.116885,"141 Annuity of the Credit Bureau credit Name: Description, dtype: object"


In [32]:
df = df.drop(result.header.values, axis=1)

### Bildung von Korrelationsclustern

In [33]:
c = df.corr(method='pearson') * 100

In [34]:
families = []
for i, row in c.iterrows():
    r = row[row > 70]
    if len(r) > 1 and set(r.index) not in families:
        print(r)
        print("\n")
        
        families.append(set(r.index))

DAYS_CREDIT          100.000000
DAYS_ENDDATE_FACT     87.529115
Name: DAYS_CREDIT, dtype: float64


DAYS_CREDIT            87.529115
DAYS_ENDDATE_FACT     100.000000
DAYS_CREDIT_UPDATE     74.519239
Name: DAYS_ENDDATE_FACT, dtype: float64


DAYS_ENDDATE_FACT      74.519239
DAYS_CREDIT_UPDATE    100.000000
Name: DAYS_CREDIT_UPDATE, dtype: float64




In [35]:
result = {
          "family":[],
          "head":[],
          "r2":[],
          "na":[],
          "rate":[]
         }

for i, family in enumerate(families):
    headers = list(family)
    
    result["family"].append("")
    result["head"].append("")
    result["r2"].append("")
    result["na"].append("")
    result["rate"].append("")
    
    for head in headers:
        d = df[["TARGET"] + [head]]
        na = d[head].isna().sum() / len(d) * 100
        d = d.dropna()
        x = d[[head]]
        y = d[["TARGET"]]
        model = LogisticRegression().fit(x, y.values.ravel())
        r2 = round(model.score(x,y),5)
        
        result["family"].append(i)
        result["head"].append(head)
        result["r2"].append(round(r2,5))
        result["na"].append(round(na,2))
        result["rate"].append(r2/na)
    
result = pd.DataFrame(result)
result       

  result["rate"].append(r2/na)
  result["rate"].append(r2/na)
  result["rate"].append(r2/na)
  result["rate"].append(r2/na)


Unnamed: 0,family,head,r2,na,rate
0,,,,,
1,0.0,DAYS_ENDDATE_FACT,0.9306,37.17,0.0250358
2,0.0,DAYS_CREDIT,0.92185,0.0,inf
3,,,,,
4,1.0,DAYS_ENDDATE_FACT,0.9306,37.17,0.0250358
5,1.0,DAYS_CREDIT,0.92185,0.0,inf
6,1.0,DAYS_CREDIT_UPDATE,0.92185,0.0,inf
7,,,,,
8,2.0,DAYS_ENDDATE_FACT,0.9306,37.17,0.0250358
9,2.0,DAYS_CREDIT_UPDATE,0.92185,0.0,inf


In [36]:
df = df.drop(["DAYS_ENDDATE_FACT", "DAYS_CREDIT_UPDATE"], axis=1)

In [37]:
df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,TARGET
0,215354,5714462,-497,0,-153.0,0,91323.0,0.0,,0.0,0
1,215354,5714463,-208,0,1075.0,0,225000.0,171342.0,,0.0,0
2,215354,5714464,-203,0,528.0,0,464323.5,,,0.0,0
3,215354,5714465,-203,0,,0,90000.0,,,0.0,0
4,215354,5714466,-629,0,1197.0,0,2700000.0,,,0.0,0


### Betrachtung der Kausalität

In [38]:
result = {
    "head":[],
    "des":[]
}

for head in df.columns.values:
    if head in skip:
        continue
    result["head"].append(head)
    result["des"].append(des[des["Row"] == head]["Description"])
    
result = pd.DataFrame(result)
result

Unnamed: 0,head,des
0,DAYS_CREDIT,"129 How many days before current application did client apply for Credit Bureau credit Name: Description, dtype: object"
1,CREDIT_DAY_OVERDUE,"130 Number of days past due on CB credit at the time of application for related loan in our sample Name: Description, dtype: object"
2,DAYS_CREDIT_ENDDATE,"131 Remaining duration of CB credit (in days) at the time of application in Home Credit Name: Description, dtype: object"
3,CNT_CREDIT_PROLONG,"134 How many times was the Credit Bureau credit prolonged Name: Description, dtype: object"
4,AMT_CREDIT_SUM,"135 Current credit amount for the Credit Bureau credit Name: Description, dtype: object"
5,AMT_CREDIT_SUM_DEBT,"136 Current debt on Credit Bureau credit Name: Description, dtype: object"
6,AMT_CREDIT_SUM_LIMIT,"137 Current credit limit of credit card reported in Credit Bureau Name: Description, dtype: object"
7,AMT_CREDIT_SUM_OVERDUE,"138 Current amount overdue on Credit Bureau credit Name: Description, dtype: object"


In [39]:
df = df.drop(["TARGET"], axis=1)

### Ergebnis

In [40]:
df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE
0,215354,5714462,-497,0,-153.0,0,91323.0,0.0,,0.0
1,215354,5714463,-208,0,1075.0,0,225000.0,171342.0,,0.0
2,215354,5714464,-203,0,528.0,0,464323.5,,,0.0
3,215354,5714465,-203,0,,0,90000.0,,,0.0
4,215354,5714466,-629,0,1197.0,0,2700000.0,,,0.0


### Speichern der metrischen Werte

In [41]:
df.to_csv(DATASET_DIR / "2. Datenaufbereitung" / "bureau_mets.csv")