## 2. Datenaufbereitung Point of Sales

In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from sklearn.linear_model import LogisticRegression

from IPython.display import display, Markdown

In [2]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [3]:
app_train = pd.read_csv(DATASET_DIR / "application_train.csv")
pos = pd.read_csv(DATASET_DIR / "POS_CASH_balance.csv")
description = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

In [4]:
des = description.loc[description['Table']=="POS_CASH_balance.csv", "Row":"Special"]

In [5]:
des

Unnamed: 0,Row,Description,Special
145,SK_ID_PREV,"ID of previous credit in Home Credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)",
146,SK_ID_CURR,ID of loan in our sample,
147,MONTHS_BALANCE,"Month of balance relative to application date (-1 means the information to the freshest monthly snapshot, 0 means the information at application - often it will be the same as -1 as many banks are not updating the information to Credit Bureau regularly )",time only relative to the application
148,CNT_INSTALMENT,Term of previous credit (can change over time),
149,CNT_INSTALMENT_FUTURE,Installments left to pay on the previous credit,
150,NAME_CONTRACT_STATUS,Contract status during the month,
151,SK_DPD,DPD (days past due) during the month of previous credit,
152,SK_DPD_DEF,DPD during the month with tolerance (debts with low loan amounts are ignored) of the previous credit,


In [6]:
pos = pd.merge(pos, app_train[["SK_ID_CURR","TARGET"]] ,on="SK_ID_CURR")

In [7]:
pos.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,TARGET
0,1803195,182943,-31,48.0,45.0,Active,0,0,0
1,1803195,182943,-17,48.0,31.0,Active,0,0,0
2,1803195,182943,-21,48.0,35.0,Active,0,0,0
3,1803195,182943,-8,48.0,21.0,Active,0,0,0
4,1803195,182943,-4,48.0,17.0,Active,0,0,0


In [8]:
# Spalten die innerhalb der Aufbereitung nicht verändert werden können
skip = ["TARGET", "SK_ID_CURR", "SK_ID_PREV"]

In [9]:
# nominale und metrische Spalten
n_heads = [element for element in pos.columns if pos[element].dtype.name == "object"]
m_heads = [element for element in pos.columns if element not in n_heads]

In [10]:
pos[m_heads].head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,TARGET
0,1803195,182943,-31,48.0,45.0,0,0,0
1,1803195,182943,-17,48.0,31.0,0,0,0
2,1803195,182943,-21,48.0,35.0,0,0,0
3,1803195,182943,-8,48.0,21.0,0,0,0
4,1803195,182943,-4,48.0,17.0,0,0,0


### kategorische Variablen

In [11]:
df = pos[["SK_ID_CURR", "TARGET"] + n_heads].copy()

In [12]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_STATUS
0,182943,0,Active
1,182943,0,Active
2,182943,0,Active
3,182943,0,Active
4,182943,0,Active


### Löschung der Spalten mit weniger als 40% ausgefüllten Daten

In [13]:
result = {
          "header":[],
          "rate":[],
          "des":[]
         }
for key in df.keys():
    if key in skip:
        continue
    rate = df[key].isna().sum() / len(df[key]) * 100
    if rate > 60:
        result["header"].append(key)
        result["rate"].append(rate)
        result["des"].append(des[des["Row"] == key]["Description"])

result = pd.DataFrame(result)
result

Unnamed: 0,header,rate,des


In [14]:
df = df.drop(result.header.values, axis=1)

### Unterscheidbarkeit von mindestens 5pP einer Kategorie

In [15]:
ID_Payback = df[df["TARGET"] == 0].index.values
ID_Default = df[df["TARGET"] == 1].index.values

In [16]:
payback = df.loc[ID_Payback]
default = df.loc[ID_Default]

In [17]:
result = {
    "head" : [],
    "cat" : [],
    "payback" : [],
    "default" : [],
    "diff" : []
}

for head in df.columns.values:
    df1 = payback[head].value_counts().rename_axis(head).reset_index(name='payback').head()
    df2 = default[head].value_counts().rename_axis(head).reset_index(name='default').head()
    
    df1["payback"] = df1["payback"]/df1["payback"].sum()*100
    df2["default"] = df2["default"]/df2["default"].sum()*100
    
    df_ = df1.merge(df2, how="outer", on=head)
    
    df_["diff"] = (df_["default"]-df_["payback"])
    
    df_ = df_.sort_values("diff", ascending=False)
    
    for diff in df_["diff"]:
        if np.isnan(diff):
            continue
        if diff > 5 or diff < -5:
            row = df_.loc[df_["diff"] == diff]
            cat = row[head][row[head].index[0]]
            
            result["head"].append(head)
            result["cat"].append(cat)
            result["payback"].append(round(row["payback"].values[0],2))
            result["default"].append(round(row["default"].values[0],2))
            result["diff"].append(round(diff,2))

result = pd.DataFrame(result)
result.sort_values("diff", ascending=False)

Unnamed: 0,head,cat,payback,default,diff


In [18]:
remove = [head for head in df.columns.values if head not in list(result["head"].unique()) + skip]

In [19]:
df = df.drop(remove, axis=1)

### Es wird keine kategorische Variable übernommen

In [20]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET
0,182943,0
1,182943,0
2,182943,0
3,182943,0
4,182943,0


### metrische Variablen

In [21]:
df = pos[m_heads].copy()

In [22]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF,TARGET
0,1803195,182943,-31,48.0,45.0,0,0,0
1,1803195,182943,-17,48.0,31.0,0,0,0
2,1803195,182943,-21,48.0,35.0,0,0,0
3,1803195,182943,-8,48.0,21.0,0,0,0
4,1803195,182943,-4,48.0,17.0,0,0,0


### Löschung der Spalten mit weniger als 40% ausgefüllten Daten

In [23]:
result = {
          "header":[],
          "rate":[],
          "des":[]
         }
for key in df.keys():
    if key in skip:
        continue
    rate = df[key].isna().sum() / len(df[key]) * 100
    if rate > 60:
        result["header"].append(key)
        result["rate"].append(rate)
        result["des"].append(des[des["Row"] == key]["Description"])

result = pd.DataFrame(result)
result

Unnamed: 0,header,rate,des


In [24]:
df = df.drop(result.header.values, axis=1)

### Bildung von Korrelationsclustern

In [25]:
c = df.corr(method='pearson') * 100

In [26]:
families = []
for i, row in c.iterrows():
    r = row[row > 70]
    if len(r) > 1 and set(r.index) not in families:
        print(r)
        print("\n")
        
        families.append(set(r.index))

CNT_INSTALMENT           100.000000
CNT_INSTALMENT_FUTURE     87.374215
Name: CNT_INSTALMENT, dtype: float64




In [27]:
result = {
          "family":[],
          "head":[],
          "r2":[],
          "na":[],
          "rate":[]
         }

for i, family in enumerate(families):
    headers = list(family)
    
    result["family"].append("")
    result["head"].append("")
    result["r2"].append("")
    result["na"].append("")
    result["rate"].append("")
    
    for head in headers:
        d = df[["TARGET"] + [head]]
        na = d[head].isna().sum() / len(d) * 100
        d = d.dropna()
        x = d[[head]]
        y = d[["TARGET"]]
        model = LogisticRegression().fit(x, y.values.ravel())
        r2 = round(model.score(x,y),5)
        
        result["family"].append(i)
        result["head"].append(head)
        result["r2"].append(round(r2,5))
        result["na"].append(round(na,2))
        result["rate"].append(r2/na)
    
result = pd.DataFrame(result)
result       

Unnamed: 0,family,head,r2,na,rate
0,,,,,
1,0.0,CNT_INSTALMENT_FUTURE,0.92648,0.26,3.61791
2,0.0,CNT_INSTALMENT,0.92648,0.26,3.62039


In [28]:
df = df.drop(["CNT_INSTALMENT_FUTURE"], axis=1)

In [29]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,SK_DPD,SK_DPD_DEF,TARGET
0,1803195,182943,-31,48.0,0,0,0
1,1803195,182943,-17,48.0,0,0,0
2,1803195,182943,-21,48.0,0,0,0
3,1803195,182943,-8,48.0,0,0,0
4,1803195,182943,-4,48.0,0,0,0


### Betrachtung der Kausalität

In [30]:
result = {
    "head":[],
    "des":[]
}

for head in df.columns.values:
    if head in skip:
        continue
    result["head"].append(head)
    result["des"].append(des[des["Row"] == head]["Description"])
    
result = pd.DataFrame(result)
result

Unnamed: 0,head,des
0,MONTHS_BALANCE,"147 Month of balance relative to application date (-1 means the information to the freshest monthly snapshot, 0 means the information at application - often it will be the same as -1 as many banks are not updating the information to Credit Bureau regularly ) Name: Description, dtype: object"
1,CNT_INSTALMENT,"148 Term of previous credit (can change over time) Name: Description, dtype: object"
2,SK_DPD,"151 DPD (days past due) during the month of previous credit Name: Description, dtype: object"
3,SK_DPD_DEF,"152 DPD during the month with tolerance (debts with low loan amounts are ignored) of the previous credit Name: Description, dtype: object"


### Ergebnis

In [31]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,SK_DPD,SK_DPD_DEF,TARGET
0,1803195,182943,-31,48.0,0,0,0
1,1803195,182943,-17,48.0,0,0,0
2,1803195,182943,-21,48.0,0,0,0
3,1803195,182943,-8,48.0,0,0,0
4,1803195,182943,-4,48.0,0,0,0


### Speichern der metrischen Werte

In [32]:
df.to_csv(DATASET_DIR / "2. Datenaufbereitung" / "pos_mets.csv")