In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Print NumPy floats in plain decimal notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Lift pandas' display limits so wide frames and long text cells (e.g. the
# column descriptions) are shown without truncation.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

from IPython.display import display, Markdown

In [2]:
# Two machine-specific candidate locations of the Kaggle Home Credit dataset.
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

# Use path1 when it exists on this machine, otherwise fall back to path2.
# NOTE(review): the fallback is not checked for existence, so a missing
# dataset only surfaces later when the CSV files are read.
DATASET_DIR = path1 if path1.is_dir() else path2

In [3]:
# Load the input tables from the dataset directory:
#   app_train - application table (source of SK_ID_CURR / TARGET)
#   ip        - installment payments of previous credits
#   des       - column descriptions; its first CSV column becomes the index
app_train = pd.read_csv(DATASET_DIR.joinpath("application_train.csv"))
ip = pd.read_csv(DATASET_DIR.joinpath("installments_payments.csv"))
des = pd.read_csv(
    DATASET_DIR.joinpath("HomeCredit_columns_description.csv"),
    index_col=0,
    encoding="latin",
)

In [4]:
# Preview the first rows of the installment payments table.
ip.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [5]:
# Show the documented meaning of every column of installments_payments.csv.
des[des["Table"] == "installments_payments.csv"]

Unnamed: 0,Table,Row,Description,Special
214,installments_payments.csv,SK_ID_PREV,"ID of previous credit in Home credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)",hashed
215,installments_payments.csv,SK_ID_CURR,ID of loan in our sample,hashed
216,installments_payments.csv,NUM_INSTALMENT_VERSION,Version of installment calendar (0 is for credit card) of previous credit. Change of installment version from month to month signifies that some parameter of payment calendar has changed,
217,installments_payments.csv,NUM_INSTALMENT_NUMBER,On which installment we observe payment,
218,installments_payments.csv,DAYS_INSTALMENT,When the installment of previous credit was supposed to be paid (relative to application date of current loan),time only relative to the application
219,installments_payments.csv,DAYS_ENTRY_PAYMENT,When was the installments of previous credit paid actually (relative to application date of current loan),time only relative to the application
220,installments_payments.csv,AMT_INSTALMENT,What was the prescribed installment amount of previous credit on this installment,
221,installments_payments.csv,AMT_PAYMENT,What the client actually paid on previous credit on this installment,


In [6]:
# Number of missing values per column of `ip`, printed one per line in
# column order (the output carries no column labels).
for n_missing in ip.isna().sum():
    print(n_missing)

0
0
0
0
0
2905
0
2905


In [7]:
# Columns of `ip` whose share of missing values exceeds the threshold,
# together with their documented description. The frame is empty when no
# column is that sparse.
MISSING_RATE_THRESHOLD = 60  # percent of rows that are NaN

missing_rate = ip.isna().sum() / len(ip) * 100
high_missing = missing_rate[missing_rate > MISSING_RATE_THRESHOLD]

# `des` describes several tables (it has a "Table" column), so restrict the
# lookup to this table and reduce it to one plain description string per
# column name. The previous version appended a whole Series per column.
ip_descriptions = (
    des.loc[des["Table"] == "installments_payments.csv"]
    .drop_duplicates(subset="Row")
    .set_index("Row")["Description"]
)

result = pd.DataFrame(
    {
        "header": high_missing.index.to_list(),
        "rate": high_missing.to_list(),
        # .get() yields None for a column that has no description entry.
        "des": [ip_descriptions.get(column) for column in high_missing.index],
    }
)
result

Unnamed: 0,header,rate,des


In [8]:
# Inspect installments that have no recorded payment date.
ip[ip["DAYS_ENTRY_PAYMENT"].isna()].head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
3764207,1531600,103793,1.0,7,-668.0,,49741.02,
3764208,1947105,159974,1.0,24,-36.0,,22849.515,
3764209,1843773,167270,1.0,22,-20.0,,48092.355,
3764210,1691592,192536,1.0,5,-2561.0,,7675.425,
3764211,1531299,157088,0.0,11,-1847.0,,67.5,


In [9]:
# Fill missing payment records with the scheduled values: a missing payment
# day becomes the due day and a missing paid amount becomes the prescribed
# amount, so these rows later yield a difference of zero.
# NOTE(review): this treats unrecorded payments as paid on time and in full -
# confirm that this is the intended assumption.
for payment_col, schedule_col in (
    ("DAYS_ENTRY_PAYMENT", "DAYS_INSTALMENT"),
    ("AMT_PAYMENT", "AMT_INSTALMENT"),
):
    ip[payment_col] = ip[payment_col].fillna(ip[schedule_col])

In [10]:
# Work on a copy so the derived columns are not added to `ip` itself.
df = ip.copy()

In [11]:
# Per-installment deviation from the payment schedule.
#   TIMEDIFF = due day - payment day   -> > 0 paid early, < 0 paid late
#   AMTDIFF  = amount paid - amount due -> > 0 overpaid,   < 0 underpaid
df = df.assign(
    TIMEDIFF=ip["DAYS_INSTALMENT"].sub(ip["DAYS_ENTRY_PAYMENT"]),
    AMTDIFF=ip["AMT_PAYMENT"].sub(ip["AMT_INSTALMENT"]),
)

In [12]:
# Keep only the two id columns and the two derived difference columns.
df = df[["SK_ID_PREV","SK_ID_CURR","TIMEDIFF", "AMTDIFF"]]

In [13]:
# Preview `ip` again after the missing payment values were filled.
ip.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [14]:
# Preview the per-installment TIMEDIFF / AMTDIFF values.
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,TIMEDIFF,AMTDIFF
0,1054186,161674,7.0,0.0
1,1330831,151639,0.0,0.0
2,2085231,193053,0.0,0.0
3,2452527,199697,8.0,0.0
4,2714724,167756,-17.0,-4.455


In [15]:
# Most negative TIMEDIFF, i.e. the payment made furthest after its due day.
df["TIMEDIFF"].min()

-2884.0

In [16]:
# Largest AMTDIFF, i.e. the biggest payment above the prescribed amount.
df["AMTDIFF"].max()

2630908.935

In [17]:
# Aggregate the per-installment differences per current loan (SK_ID_CURR).
# The grouping is built once and reused for all three statistics.
per_client = df[["SK_ID_CURR", "TIMEDIFF", "AMTDIFF"]].groupby("SK_ID_CURR")
mean = per_client.mean()
maximum = per_client.max()
minimum = per_client.min()

In [18]:
# Extremes of the per-client aggregates. Sign convention, from how the
# columns were derived: TIMEDIFF > 0 = paid before the due day, < 0 = paid
# late; AMTDIFF > 0 = paid more than prescribed, < 0 = paid less.
# The figures in the comments are taken from the recorded run.
print(maximum.min()) # 42 days late, about 20.5k underpaid
print(maximum.max()) # about 3.2k days early, about 2.6M overpaid
print(minimum.min()) # about 2.9k days late, about 2.4M underpaid
print(minimum.max()) # 156 days early, about 50.9k overpaid

TIMEDIFF      -42.00
AMTDIFF    -20512.35
dtype: float64
TIMEDIFF       3189.000
AMTDIFF     2630908.935
dtype: float64
TIMEDIFF      -2884.000
AMTDIFF    -2424726.405
dtype: float64
TIMEDIFF      156.00
AMTDIFF     50861.16
dtype: float64


In [19]:
# Prefix the two aggregate columns of each frame with the statistic's name so
# that the frames can be joined side by side without name clashes.
for prefix, frame in (("MEAN", mean), ("MAX", maximum), ("MIN", minimum)):
    frame.columns = [f"{prefix}_TIMEDIFF", f"{prefix}_AMTDIFF"]

In [20]:
# Distinct current-loan ids that occur in the installments table, in order of
# first appearance.
SK_ID_CURR = ip["SK_ID_CURR"].unique()

In [21]:
# TARGET label per current loan, indexed by SK_ID_CURR.
# NOTE(review): `target` is not used by the join cell further down, where its
# join is commented out.
target = app_train.loc[:, ["SK_ID_CURR", "TARGET"]].set_index("SK_ID_CURR")

In [22]:
# Start the output frame with one row per distinct SK_ID_CURR.
# Note: this rebinds `df`, which previously held the per-installment diffs.
df = pd.DataFrame(SK_ID_CURR, columns=["SK_ID_CURR"])

In [23]:
# Attach the per-client aggregates in the column order MIN, MEAN, MAX.
df = (
    df
    .join(minimum, on="SK_ID_CURR")
    .join(mean, on="SK_ID_CURR")
    .join(maximum, on="SK_ID_CURR")
)
# NOTE(review): the TARGET join is disabled, so TARGET is not part of `df`:
#df = df.join(target, on="SK_ID_CURR")

In [24]:
# Use the client id as the index so it is written as the first CSV column.
df = df.set_index("SK_ID_CURR")

In [25]:
# Row count: one row per distinct SK_ID_CURR found in the installments table.
len(df)

339587

In [26]:
# Preview the aggregated per-client features.
# NOTE(review): the recorded output shows a TARGET column, but the join that
# would add it is commented out - the output looks stale; re-run to confirm.
df.head()

Unnamed: 0_level_0,MIN_TIMEDIFF,MIN_AMTDIFF,MEAN_TIMEDIFF,MEAN_AMTDIFF,MAX_TIMEDIFF,MAX_AMTDIFF,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
161674,0.0,0.0,10.90099,0.0,65.0,0.0,0.0
151639,-9.0,-26067.465,3.056962,-787.313165,38.0,0.0,0.0
193053,-23.0,0.0,-5.333333,0.0,7.0,0.0,0.0
199697,-74.0,-21174.3,-0.666667,-5491.033333,28.0,0.0,0.0
167756,-17.0,-2389.68,7.8,-622.389,38.0,0.0,0.0


In [27]:
# Persist the aggregated installment features for later preprocessing steps.
output_dir = DATASET_DIR / "Datenaufbereitung"
# to_csv does not create missing directories, so make sure the target
# folder exists before writing.
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / "ip_mets.csv")