## 2. Datenaufbereitung Installment Payments

In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

np.set_printoptions(suppress=True)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from sklearn.linear_model import LogisticRegression

from IPython.display import display, Markdown

In [2]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [3]:
app_train = pd.read_csv(DATASET_DIR / "application_train.csv")
ip = pd.read_csv(DATASET_DIR / "installments_payments.csv")
description = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

In [4]:
des = description.loc[description['Table']=="installments_payments.csv", "Row":"Special"]

In [5]:
des

Unnamed: 0,Row,Description,Special
214,SK_ID_PREV,"ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)",hashed
215,SK_ID_CURR,ID of loan in our sample,hashed
216,NUM_INSTALMENT_VERSION,Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed,
217,NUM_INSTALMENT_NUMBER,On which installment we observe payment,
218,DAYS_INSTALMENT,When the installment of previous credit was supposed to be paid (relative to application date of current loan),time only relative to the application
219,DAYS_ENTRY_PAYMENT,When was the installments of previous credit paid actually (relative to application date of current loan),time only relative to the application
220,AMT_INSTALMENT,What was the prescribed installment amount of previous credit on this installment,
221,AMT_PAYMENT,What the client actually paid on previous credit on this installment,


In [6]:
ip = pd.merge(ip, app_train[["SK_ID_CURR","TARGET"]] ,on="SK_ID_CURR")

In [7]:
ip.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0
1,2452854,161674,1.0,21,-546.0,-552.0,11302.605,11302.605,0
2,1054186,161674,1.0,2,-1300.0,-1307.0,6948.36,6948.36,0
3,1682318,161674,1.0,2,-240.0,-243.0,7374.51,7374.51,0
4,2452854,161674,1.0,10,-876.0,-882.0,11302.605,11302.605,0


In [8]:
# Spalten die innerhalb der Aufbereitung nicht verändert werden können
skip = ["TARGET", "SK_ID_CURR", "SK_ID_PREV"]

In [9]:
# nominale und metrische Spalten
n_heads = [element for element in ip.columns if ip[element].dtype.name == "object"]
m_heads = [element for element in ip.columns if element not in n_heads]

### metrische Variablen

In [10]:
df = ip[m_heads].copy()

In [11]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,TARGET
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0
1,2452854,161674,1.0,21,-546.0,-552.0,11302.605,11302.605,0
2,1054186,161674,1.0,2,-1300.0,-1307.0,6948.36,6948.36,0
3,1682318,161674,1.0,2,-240.0,-243.0,7374.51,7374.51,0
4,2452854,161674,1.0,10,-876.0,-882.0,11302.605,11302.605,0


### Löschung der Spalten mit weniger als 40% ausgefüllten Daten

In [12]:
result = {
          "header":[],
          "rate":[],
          "des":[]
         }
for key in df.keys():
    if key in skip:
        continue
    rate = df[key].isna().sum() / len(df[key]) * 100
    if rate > 60:
        result["header"].append(key)
        result["rate"].append(rate)
        result["des"].append(des[des["Row"] == key]["Description"])

result = pd.DataFrame(result)
result

Unnamed: 0,header,rate,des


In [13]:
df = df.drop(result.header.values, axis=1)

### Bildung von Korrelationsclustern

In [14]:
c = df.corr(method='pearson') * 100

In [15]:
families = []
for i, row in c.iterrows():
    r = row[row > 70]
    if len(r) > 1 and set(r.index) not in families:
        print(r)
        print("\n")
        
        families.append(set(r.index))

DAYS_INSTALMENT       100.000000
DAYS_ENTRY_PAYMENT     99.947201
Name: DAYS_INSTALMENT, dtype: float64


AMT_INSTALMENT    100.000000
AMT_PAYMENT        93.741678
Name: AMT_INSTALMENT, dtype: float64




In [16]:
result = {
          "family":[],
          "head":[],
          "r2":[],
          "na":[],
          "rate":[]
         }

for i, family in enumerate(families):
    headers = list(family)
    
    result["family"].append("")
    result["head"].append("")
    result["r2"].append("")
    result["na"].append("")
    result["rate"].append("")
    
    for head in headers:
        d = df[["TARGET"] + [head]]
        na = d[head].isna().sum() / len(d) * 100
        d = d.dropna()
        x = d[[head]]
        y = d[["TARGET"]]
        model = LogisticRegression().fit(x, y.values.ravel())
        r2 = round(model.score(x,y),5)
        
        result["family"].append(i)
        result["head"].append(head)
        result["r2"].append(round(r2,5))
        result["na"].append(round(na,2))
        result["rate"].append(r2/na)
    
result = pd.DataFrame(result)
result       

  result["rate"].append(r2/na)
  result["rate"].append(r2/na)


Unnamed: 0,family,head,r2,na,rate
0,,,,,
1,0.0,DAYS_INSTALMENT,0.92406,0.0,inf
2,0.0,DAYS_ENTRY_PAYMENT,0.9241,0.02,41.4703
3,,,,,
4,1.0,AMT_PAYMENT,0.9241,0.02,41.4703
5,1.0,AMT_INSTALMENT,0.92406,0.0,inf


In [18]:
df = df.drop(["DAYS_ENTRY_PAYMENT", "AMT_PAYMENT"], axis=1)

In [19]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,AMT_INSTALMENT,TARGET
0,1054186,161674,1.0,6,-1180.0,6948.36,0
1,2452854,161674,1.0,21,-546.0,11302.605,0
2,1054186,161674,1.0,2,-1300.0,6948.36,0
3,1682318,161674,1.0,2,-240.0,7374.51,0
4,2452854,161674,1.0,10,-876.0,11302.605,0


### Betrachtung der Kausalität

In [20]:
result = {
    "head":[],
    "des":[]
}

for head in df.columns.values:
    if head in skip:
        continue
    result["head"].append(head)
    result["des"].append(des[des["Row"] == head]["Description"])
    
result = pd.DataFrame(result)
result

Unnamed: 0,head,des
0,NUM_INSTALMENT_VERSION,"216 Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed Name: Description, dtype: object"
1,NUM_INSTALMENT_NUMBER,"217 On which installment we observe payment Name: Description, dtype: object"
2,DAYS_INSTALMENT,"218 When the installment of previous credit was supposed to be paid (relative to application date of current loan) Name: Description, dtype: object"
3,AMT_INSTALMENT,"220 What was the prescribed installment amount of previous credit on this installment Name: Description, dtype: object"


In [23]:
df = df.drop(["TARGET"], axis=1)

### Ergebnis

In [24]:
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,AMT_INSTALMENT
0,1054186,161674,1.0,6,-1180.0,6948.36
1,2452854,161674,1.0,21,-546.0,11302.605
2,1054186,161674,1.0,2,-1300.0,6948.36
3,1682318,161674,1.0,2,-240.0,7374.51
4,2452854,161674,1.0,10,-876.0,11302.605


### Speichern der metrischen Werte

In [25]:
df.to_csv(DATASET_DIR / "2. Datenaufbereitung" / "ip_mets.csv")