In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from pathlib import Path

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

from sklearn.linear_model import LogisticRegression

In [2]:
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

if path1.is_dir():
    DATASET_DIR = path1
else:
    DATASET_DIR = path2

In [3]:
app_train = pd.read_csv(DATASET_DIR / "application_train.csv")
bureau = pd.read_csv(DATASET_DIR / "bureau.csv")
bureau_balance = pd.read_csv(DATASET_DIR / "bureau_balance.csv")
des = pd.read_csv(DATASET_DIR / "HomeCredit_columns_description.csv", encoding="latin", index_col=0)

In [4]:
keys = ["SK_ID_CURR", "SK_ID_BUREAU"]

In [5]:
n_heads = [element for element in bureau.columns if bureau[element].dtype.name == "object"]
n_heads = [element for element in n_heads if element not in keys]

In [6]:
bureau = pd.merge(bureau, app_train[["SK_ID_CURR","TARGET"]] ,on="SK_ID_CURR")

In [7]:
bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,TARGET
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,0
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,0
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,,0
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,,0
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,,0


In [8]:
SK_ID_CURR = bureau["SK_ID_CURR"].unique()

In [9]:
df = bureau[["SK_ID_CURR", "TARGET"] + n_heads]

In [10]:
df = df.set_index("SK_ID_CURR")

In [11]:
df.head()

Unnamed: 0_level_0,TARGET,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
215354,0,Closed,currency 1,Consumer credit
215354,0,Active,currency 1,Credit card
215354,0,Active,currency 1,Consumer credit
215354,0,Active,currency 1,Credit card
215354,0,Active,currency 1,Consumer credit


In [12]:
skip = ["TARGET", "SK_ID_CURR"]

In [13]:
result = {
          "header":[],
          "rate":[],
          "des":[]
         }
for key in df.keys():
    if key in skip:
        continue
    rate = df[key].isnull().sum() / len(df[key]) * 100
    if rate > 60:
        result["header"].append(key)
        result["rate"].append(rate)
        result["des"].append(des[des["Row"] == key]["Description"])

result = pd.DataFrame(result)
result

Unnamed: 0,header,rate,des


In [14]:
list(result.header)

[]

In [15]:
n_heads = [head for head in n_heads if head not in list(result.header)]
df.head()

Unnamed: 0_level_0,TARGET,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
215354,0,Closed,currency 1,Consumer credit
215354,0,Active,currency 1,Credit card
215354,0,Active,currency 1,Consumer credit
215354,0,Active,currency 1,Credit card
215354,0,Active,currency 1,Consumer credit


In [16]:
c = df[n_heads].corr(method='spearman') * 100

In [17]:
families = []
for i, row in c.iterrows():
    r = row[row > 70]
    if len(r) > 1 and set(r.index) not in families:
        print(r)
        print("\n")
        
        families.append(set(r.index))

In [18]:
result = {
          "family":[],
          "head":[],
          "r2":[],
          "na":[],
          "rate":[]
         }

for i, family in enumerate(families):
    headers = list(family)
    
    result["family"].append("")
    result["head"].append("")
    result["r2"].append("")
    result["na"].append("")
    result["rate"].append("")
    
    for head in headers:
        d = df[["TARGET"] + [head]]
        na = d[head].isna().sum() / len(d) * 100
        d = d.dropna()
        x = d[[head]]
        y = d[["TARGET"]]
        model = LogisticRegression().fit(x, y.values.ravel())
        r2 = round(model.score(x,y),5)
        
        result["family"].append(i)
        result["head"].append(head)
        result["r2"].append(round(r2,5))
        result["na"].append(round(na,2))
        result["rate"].append(r2/na)
    
        

In [19]:
result = pd.DataFrame(result)
result

Unnamed: 0,family,head,r2,na,rate


In [20]:
result = {
    "head":[],
    "des":[]
}

for head in n_heads:
    result["head"].append(head)
    result["des"].append(des[des["Row"] == head]["Description"])
    
result = pd.DataFrame(result)
result

Unnamed: 0,head,des
0,CREDIT_ACTIVE,"127 Status of the Credit Bureau (CB) reported credits Name: Description, dtype: object"
1,CREDIT_CURRENCY,"128 Recoded currency of the Credit Bureau credit Name: Description, dtype: object"
2,CREDIT_TYPE,"139 Type of Credit Bureau credit (Car, cash,...) Name: Description, dtype: object"


In [21]:
payback = df[df["TARGET"] == 0]
default = df[df["TARGET"] == 1]

In [22]:
rates = []
heads = []
for head in n_heads:
    if head in skip:
        continue

    pays = payback[head].value_counts()
    defs = default[head].value_counts()

    pays = pays/pays.sum()*100
    defs = defs/defs.sum()*100
    diff = abs(pays-defs)
    d1 = max(diff.sort_values(ascending=False))
    try:
        diff = abs(pays[[i for i in pays.index if i not in defs.index]])
        d2 = max(diff.sort_values(ascending=False))
    except ValueError:
        d2 = 0
    try:
        diff = abs(defs[[i for i in defs.index if i not in pays.index]])
        d3 = max(diff.sort_values(ascending=False))
    except ValueError:
        d3 = 0
    rates.append(max([d1,d2,d3]))
    heads.append(head)
    
    result = {
          "Cats":heads,
          "Rate":rates
         }
result = pd.DataFrame(result).sort_values("Rate", ascending=False)
result

Unnamed: 0,Cats,Rate
0,CREDIT_ACTIVE,7.68673
2,CREDIT_TYPE,3.360393
1,CREDIT_CURRENCY,0.032945


In [23]:
n_heads = result[result["Rate"] >= 5].Cats.values

In [24]:
df = df[n_heads]

In [25]:
df["CREDIT_ACTIVE"].value_counts()

Closed      917733
Active      541919
Sold          5653
Bad debt        20
Name: CREDIT_ACTIVE, dtype: int64

In [26]:
df.head()

Unnamed: 0_level_0,CREDIT_ACTIVE
SK_ID_CURR,Unnamed: 1_level_1
215354,Closed
215354,Active
215354,Active
215354,Active
215354,Active


In [27]:
# transformiert kategorische variablen in integer
for head in n_heads:
    df[head], cats = pd.factorize(df[head])

In [28]:
df = df[n_heads]

In [29]:
df["CREDIT_ACTIVE"].value_counts()

0    917733
1    541919
2      5653
3        20
Name: CREDIT_ACTIVE, dtype: int64

In [30]:
df.head()

Unnamed: 0_level_0,CREDIT_ACTIVE
SK_ID_CURR,Unnamed: 1_level_1
215354,0
215354,1
215354,1
215354,1
215354,1


In [31]:
a = df[df["CREDIT_ACTIVE"] == 0]
b = df[df["CREDIT_ACTIVE"] == 1]
c = df[df["CREDIT_ACTIVE"] == 2]
d = df[df["CREDIT_ACTIVE"] == 3]

closed = a[["CREDIT_ACTIVE"]].groupby(by=["SK_ID_CURR"]).count()
closed.columns = ["Closed"]
active = b[["CREDIT_ACTIVE"]].groupby(by=["SK_ID_CURR"]).count()
active.columns = ["Active"]
sold = c[["CREDIT_ACTIVE"]].groupby(by=["SK_ID_CURR"]).count()
sold.columns = ["Sold"]
bad = d[["CREDIT_ACTIVE"]].groupby(by=["SK_ID_CURR"]).count()
bad.columns = ["Bad"]

In [32]:
closed.head()

Unnamed: 0_level_0,Closed
SK_ID_CURR,Unnamed: 1_level_1
100002,6
100003,3
100004,2
100007,1
100008,2


In [33]:
active.head()

Unnamed: 0_level_0,Active
SK_ID_CURR,Unnamed: 1_level_1
100002,2
100003,1
100008,1
100009,4
100010,1


In [34]:
df = pd.DataFrame(index=SK_ID_CURR)
df.index.name = "SK_ID_CURR"
df.head()

215354
162297
402440
238881
222183


In [35]:
df = df.join(closed)
df = df.join(active)
df = df.join(sold)
df = df.join(bad)
df = df.fillna(0)

In [36]:
df.head()

Unnamed: 0_level_0,Closed,Active,Sold,Bad
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
215354,5.0,6.0,0.0,0.0
162297,3.0,3.0,0.0,0.0
402440,0.0,1.0,0.0,0.0
238881,5.0,3.0,0.0,0.0
222183,3.0,5.0,0.0,0.0


In [37]:
len(SK_ID_CURR)

263491

In [38]:
df[df["Sold"] == 0]["Sold"].count() / len(df) * 100

98.01663054905102

98 % aller "sold" sind 0 -> kaum Informationsgehalt

In [39]:
df[df["Bad"] == 0]["Bad"].count() / len(df) * 100

99.9924096079183

99 % aller "Bad" sind 0 -> kaum Informationsgehalt

In [40]:
df[df["Closed"] == 0]["Closed"].count() / len(df) * 100

12.647870325741675

In [41]:
df[df["Active"] == 0]["Active"].count() / len(df) * 100

17.58731797290989

In [42]:
df = df[["Closed", "Active"]]

In [43]:
df.head()

Unnamed: 0_level_0,Closed,Active
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
215354,5.0,6.0
162297,3.0,3.0
402440,0.0,1.0
238881,5.0,3.0
222183,3.0,5.0


In [44]:
df.to_csv(DATASET_DIR / "Datenaufbereitung" / "bureau_cats.csv")