In [1]:
from pathlib import Path
from scipy import stats

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Lift pandas' display limits (columns, rows, cell width) so frames are shown untruncated.
for option_name in ("max_columns", "max_rows", "max_colwidth"):
    pd.set_option(f"display.{option_name}", None)

from sklearn.linear_model import LogisticRegression

from IPython.display import display, Markdown

In [2]:
# Two candidate locations of the dataset folder (machine-specific absolute paths).
path1 = Path(r"A:\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")
path2 = Path(r"C:\Users\rober\Documents\Workspace\Python\Masterarbeit\Kaggle Home Credit Datensatz")

# Prefer the first location when it exists; otherwise fall back to the second
# one (the fallback itself is not checked for existence).
DATASET_DIR = path1 if path1.is_dir() else path2

In [3]:
# Training data as written to the "3. Ausreißerbereinigung" folder; SK_ID_CURR is
# still a regular column here and becomes the index in a later cell.
app_train = pd.read_csv(DATASET_DIR.joinpath("3. Ausreißerbereinigung", "app_train.csv"))
# Test data from the dataset root, indexed by SK_ID_CURR right away.
app_test = pd.read_csv(DATASET_DIR.joinpath("application_test.csv"), index_col="SK_ID_CURR")

In [4]:
# Index the training frame by SK_ID_CURR, matching how app_test was loaded.
app_train = app_train.set_index(keys="SK_ID_CURR")

In [5]:
# Remove the TARGET column so that only feature columns remain in the training frame.
app_train = app_train.drop(columns=["TARGET"])

In [6]:
# First group of columns selected from application_test.csv (order matters:
# the columns are later relabelled by position).
cats_head = [
    "CODE_GENDER",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "FLAG_EMP_PHONE",
    "OCCUPATION_TYPE",
    "REGION_RATING_CLIENT",
    "REG_CITY_NOT_WORK_CITY",
    "ORGANIZATION_TYPE",
    "FLAG_DOCUMENT_3",
    "CNT_CHILDREN",
    # social-circle counts
    "OBS_60_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
    # credit-bureau request counts
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]

In [7]:
# Second group of columns selected from application_test.csv (order matters:
# the columns are later relabelled by position).
mets_head = [
    # amounts
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "REGION_POPULATION_RELATIVE",
    # day offsets
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "DAYS_REGISTRATION",
    "DAYS_ID_PUBLISH",
    # external scores
    "EXT_SOURCE_1",
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
    # building information
    "BASEMENTAREA_AVG",
    "YEARS_BEGINEXPLUATATION_AVG",
    "LANDAREA_AVG",
    "NONLIVINGAREA_AVG",
    "TOTALAREA_MODE",
    # derived ratio, computed in the notebook before the columns are selected
    "CREDIT/INCOME",
]

In [8]:
# Derived feature: credit amount divided by total income (element-wise).
app_test["CREDIT/INCOME"] = app_test["AMT_CREDIT"].div(app_test["AMT_INCOME_TOTAL"])

In [9]:
# Keep only the selected columns, in the order cats_head followed by mets_head
# (the order matters because the columns are relabelled by position later on).
# The explicit .copy() makes the result an independent frame: it is modified
# column by column in the next cell, which would otherwise raise pandas'
# SettingWithCopyWarning on versions without Copy-on-Write.
app_test = app_test[cats_head + mets_head].copy()

In [10]:
# Convert categorical (object-dtype) variables into integer codes.
# pd.factorize numbers the values in order of first appearance within app_test
# and encodes missing values as -1.
# NOTE(review): the codes are derived from app_test alone; presumably the training
# frame was encoded in an earlier notebook — confirm both use the same value-to-code
# mapping before the frames are combined.
HEADS = [name for name, dtype in app_test.dtypes.items() if dtype == "object"]

for column_name in HEADS:
    codes, _uniques = pd.factorize(app_test[column_name])
    app_test[column_name] = codes

In [11]:
# Fill NA: missing values are imputed further below, after train and test have been combined.

In [12]:
# Give the test frame the training frame's column labels so both frames can be
# stacked. The relabelling is purely positional, so a different column order
# would silently attach the wrong label to a feature. Guard against that first:
# every training label has to end with the test column name at the same position
# (this also holds when the cell is re-run and the labels are already identical).
if len(app_test.columns) != len(app_train.columns):
    raise ValueError(
        f"Column count mismatch: app_test has {len(app_test.columns)} columns, "
        f"app_train has {len(app_train.columns)}."
    )

misaligned = [
    (train_label, test_label)
    for train_label, test_label in zip(app_train.columns, app_test.columns)
    if not str(train_label).endswith(str(test_label))
]
if misaligned:
    raise ValueError(f"Column order of app_test does not match app_train: {misaligned}")

app_test.columns = app_train.columns

In [13]:
# Stack training and test rows into one frame so the imputer sees both.
# verify_integrity=True raises a ValueError if any SK_ID_CURR occurs more than once;
# the test rows are later picked back out by index label, which needs unique ids.
application = pd.concat([app_train, app_test], verify_integrity=True)

In [14]:
# Order the combined rows by their SK_ID_CURR index label, ascending.
application = application.sort_index(axis=0, ascending=True)

In [15]:
# Preview the first rows of the combined (train + test) frame; missing values are still present here.
application.head()

Unnamed: 0_level_0,A_CODE_GENDER,A_NAME_INCOME_TYPE,A_NAME_EDUCATION_TYPE,A_FLAG_EMP_PHONE,A_OCCUPATION_TYPE,A_REGION_RATING_CLIENT,A_REG_CITY_NOT_WORK_CITY,A_ORGANIZATION_TYPE,A_FLAG_DOCUMENT_3,A_CNT_CHILDREN,A_OBS_60_CNT_SOCIAL_CIRCLE,A_DEF_60_CNT_SOCIAL_CIRCLE,A_AMT_REQ_CREDIT_BUREAU_HOUR,A_AMT_REQ_CREDIT_BUREAU_DAY,A_AMT_REQ_CREDIT_BUREAU_WEEK,A_AMT_REQ_CREDIT_BUREAU_MON,A_AMT_REQ_CREDIT_BUREAU_QRT,A_AMT_REQ_CREDIT_BUREAU_YEAR,A_AMT_INCOME_TOTAL,A_AMT_CREDIT,A_AMT_ANNUITY,A_REGION_POPULATION_RELATIVE,A_DAYS_BIRTH,A_DAYS_EMPLOYED,A_DAYS_REGISTRATION,A_DAYS_ID_PUBLISH,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_EXT_SOURCE_3,A_BASEMENTAREA_AVG,A_YEARS_BEGINEXPLUATATION_AVG,A_LANDAREA_AVG,A_NONLIVINGAREA_AVG,A_TOTALAREA_MODE,A_CREDIT/INCOME
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
100001,0,0,0,1,-1,2,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,135000.0,568800.0,20560.5,0.01885,-19241,-2329.0,-5170.0,-812.0,0.752614,0.789654,0.15952,0.059,0.9732,,,0.0392,4.213333
100002,0,0,0,1,0,2,0,0,1,0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,202500.0,406597.5,24700.5,0.018801,-9461,-637.0,-3648.0,-2120.0,0.083037,0.262949,0.139376,0.0369,0.9722,0.0369,0.0,0.0149,2.007889
100003,1,1,1,1,1,1,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,270000.0,1293502.5,35698.5,0.003541,-16765,-1188.0,-1186.0,-291.0,0.311267,0.622246,,0.0529,0.9851,0.013,0.0098,0.0714,4.79075
100004,0,0,0,1,0,2,0,2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67500.0,135000.0,6750.0,0.010032,-19046,-225.0,-4260.0,-2531.0,,0.555912,0.729567,,,,,,2.0
100005,1,0,1,1,0,2,0,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,99000.0,222768.0,17370.0,0.035792,-18064,-4469.0,-9118.0,-1623.0,0.56499,0.291656,0.432962,,,,,,2.250182


In [16]:
# Row and column count of the combined frame.
application.shape

(356255, 35)

In [17]:
from datetime import datetime

In [18]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [19]:
# Fill every missing value of the combined train + test frame with scikit-learn's
# IterativeImputer (at most 500 imputation rounds, fixed seed for reproducibility).
# This cell is slow: the recorded run took roughly twelve minutes.
# The imputed values are continuous, also in columns that otherwise hold whole numbers.
# NOTE(review): the imputer is fitted on training and test rows together — confirm
# that this is acceptable for the downstream evaluation.
started_at = datetime.now()
print("Imputation started: ", started_at.strftime("%H:%M"))

imp = IterativeImputer(max_iter=500, random_state=0)
# fit_transform returns a plain array; restore the original labels and index.
df = pd.DataFrame(
    imp.fit_transform(application),
    columns=application.columns,
    index=application.index,
)

finished_at = datetime.now()
print("Imputation finished:", finished_at.strftime("%H:%M"), "- elapsed:", finished_at - started_at)

21 : 49
22 : 1


In [20]:
# Pull the imputed test rows back out of the combined frame by their index labels.
app_test = df.loc[app_test.index, :]

In [21]:
# Preview the first imputed test rows.
app_test.head()

Unnamed: 0_level_0,A_CODE_GENDER,A_NAME_INCOME_TYPE,A_NAME_EDUCATION_TYPE,A_FLAG_EMP_PHONE,A_OCCUPATION_TYPE,A_REGION_RATING_CLIENT,A_REG_CITY_NOT_WORK_CITY,A_ORGANIZATION_TYPE,A_FLAG_DOCUMENT_3,A_CNT_CHILDREN,A_OBS_60_CNT_SOCIAL_CIRCLE,A_DEF_60_CNT_SOCIAL_CIRCLE,A_AMT_REQ_CREDIT_BUREAU_HOUR,A_AMT_REQ_CREDIT_BUREAU_DAY,A_AMT_REQ_CREDIT_BUREAU_WEEK,A_AMT_REQ_CREDIT_BUREAU_MON,A_AMT_REQ_CREDIT_BUREAU_QRT,A_AMT_REQ_CREDIT_BUREAU_YEAR,A_AMT_INCOME_TOTAL,A_AMT_CREDIT,A_AMT_ANNUITY,A_REGION_POPULATION_RELATIVE,A_DAYS_BIRTH,A_DAYS_EMPLOYED,A_DAYS_REGISTRATION,A_DAYS_ID_PUBLISH,A_EXT_SOURCE_1,A_EXT_SOURCE_2,A_EXT_SOURCE_3,A_BASEMENTAREA_AVG,A_YEARS_BEGINEXPLUATATION_AVG,A_LANDAREA_AVG,A_NONLIVINGAREA_AVG,A_TOTALAREA_MODE,A_CREDIT/INCOME
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
100001,0.0,0.0,0.0,1.0,-1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,135000.0,568800.0,20560.5,0.01885,-19241.0,-2329.0,-5170.0,-812.0,0.752614,0.789654,0.15952,0.059,0.9732,0.020985,0.018354,0.0392,4.213333
100005,1.0,0.0,1.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,99000.0,222768.0,17370.0,0.035792,-18064.0,-4469.0,-9118.0,-1623.0,0.56499,0.291656,0.432962,0.083623,0.977818,0.057886,0.019178,0.095098,2.250182
100013,1.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,202500.0,663264.0,69777.0,0.019101,-20038.0,-4458.0,-2175.0,-3503.0,0.716539,0.699787,0.610991,0.087005,0.982704,0.06754,0.030749,0.114417,3.275378
100028,0.0,0.0,1.0,1.0,2.0,2.0,0.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,315000.0,1575000.0,49018.5,0.026392,-13976.0,-1866.0,-2000.0,-4208.0,0.525734,0.509677,0.612704,0.1974,0.997,0.2042,0.08,0.37,5.0
100038,1.0,0.0,1.0,1.0,-1.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,0.006616,0.007371,0.033795,0.185093,0.334465,2.068755,180000.0,625500.0,32067.0,0.010032,-13040.0,-2191.0,-4000.0,-4262.0,0.202145,0.425687,0.41858,0.050552,0.979896,0.076152,0.011934,0.03982,3.475


In [22]:
# Write the imputed test set to the "4. FillNA" folder. The folder is created
# first if it is missing, so the long imputation run above is not lost to a
# failed write at the very last step.
output_dir = DATASET_DIR / "4. FillNA"
output_dir.mkdir(exist_ok=True)
app_test.to_csv(output_dir / "app_test.csv")