In [175]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [176]:
import os
import gc # 메모리 챙길 때 씀
import time
import numpy as np
import pandas as pd
from contextlib import contextmanager
import multiprocessing as mp
from functools import partial
from scipy.stats import kurtosis, iqr, skew
from lightgbm import LGBMClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings

In [177]:
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [178]:
# Show at most 60 rows when a frame is displayed.
pd.set_option('display.max_rows', 60)

In [179]:
# Show at most 100 columns when a frame is displayed.
pd.set_option('display.max_columns', 100)

데이터가 커서 전체를 읽는 데 시간이 오래 걸립니다. 따라서 디버깅할 때는 앞의 30,000행만 읽어 사용합니다.

In [180]:
# Debug switch: when True, the next cell limits every CSV read to a fixed number of rows.
debug = True

In [181]:
# Row limit for every CSV read: 30,000 rows while debugging, None (read everything) otherwise.
num_rows = 30000 if debug else None

# NOTE: this cell originally called get_train_test(DATA_DIRECTORY, num_rows=num_rows). Neither
# get_train_test nor DATA_DIRECTORY is defined at this point, so the call raised NameError on a
# fresh kernel. The cells below build `df` step by step instead, so the call is kept only as a
# reference to the function being taken apart.
# df = get_train_test(DATA_DIRECTORY, num_rows=num_rows)

In [182]:
# GENERAL CONFIGURATIONS
# NOTE(review): NUM_THRADS and SUBMISSION_SUFIX look like misspellings of NUM_THREADS and
# SUBMISSION_SUFFIX; they are left unchanged because other cells may reference these names.
NUM_THRADS = 4
# Root of the read-only input data.
DATA_DIRECTORY = "../input/"
SUBMISSION_SUFIX = "_model2_04"

In [183]:
# Presumably stands in for the `path` argument of the function being walked through — confirm.
path = DATA_DIRECTORY
# No-op self-assignment: num_rows keeps the value set in the debug cell above.
num_rows = num_rows

In [184]:
# Locations of the application train/test CSV files inside the competition dataset folder.
train_file_path = '../input/home-credit-default-risk/application_train.csv'
test_file_path = '../input/home-credit-default-risk/application_test.csv'

In [185]:
# Load both application files with the same row limit (None reads every row).
train, test = (
    pd.read_csv(csv_path, nrows=num_rows)
    for csv_path in (train_file_path, test_file_path)
)

In [186]:
# Stack train and test into one frame so the same preprocessing is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same stacked frame
# (original row labels are kept, exactly as append did).
df = pd.concat([train, test])

In [187]:
# Preview the first rows only instead of rendering the whole frame.
train.head()

In [188]:
# Preview the first rows of the combined frame instead of rendering all of it.
df.head()

In [189]:
# The separate frames are no longer needed once they are combined into df; drop the names.
del train, test

더 이상 사용하지 않는 메모리를 회수합니다(가비지 컬렉션).

In [190]:
# Actually run the garbage collector; the bare name `gc.collect` only referenced the function.
gc.collect()

## preprocessing

In [191]:
# Drop rows whose CODE_GENDER is 'XNA' (per the original note, only four applicants have it).
df = df.loc[df['CODE_GENDER'].ne('XNA')]

In [192]:
# Keep only rows with AMT_INCOME_TOTAL below 20,000,000.
# Original note: the test maximum is about 4 million while train has a value above 100 million
# (exact figure to be confirmed against the data).
df = df.loc[df['AMT_INCOME_TOTAL'].lt(20000000)]

In [193]:
# Per the original note, many rows carry 365243 in DAYS_EMPLOYED as a meaningless placeholder,
# so it is treated as missing.
# The result is assigned back instead of calling replace(..., inplace=True) on the selected
# column: that chained in-place form is not guaranteed to modify df and has no effect under
# pandas Copy-on-Write.
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

In [194]:
# Treat a DAYS_LAST_PHONE_CHANGE of 0 as missing.
# Assigned back rather than replaced in place through a chained selection, which is not
# guaranteed to modify df (and does nothing under pandas Copy-on-Write).
df['DAYS_LAST_PHONE_CHANGE'] = df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan)

In [195]:
# Names of every column containing 'FLAG_DOC' (per the original note these are yes/no document flags).
docs = df.columns[df.columns.str.contains('FLAG_DOC', regex=False)].tolist()

In [196]:
# Row-wise summaries over the document flag columns: their sum and their kurtosis.
doc_flags = df[docs]
df['DOCUMENT_COUNT'] = doc_flags.sum(axis=1)
df['NEW_DOC_KURT'] = doc_flags.kurt(axis=1)

In [197]:
# Inspect the per-row sum of the document flags (the same values stored in DOCUMENT_COUNT).
df[docs].sum(axis=1)

In [198]:
# Histogram of the per-row kurtosis of the document flags.
# Kurtosis describes how heavy the tails of a distribution are.
df[docs].kurtosis(axis=1).hist() # kurtosis describes the tails of the distribution

In [199]:
def get_age_label(days_birth):
    """Map a DAYS_BIRTH value to an integer age-group label.

    The input is negated and divided by 365 to get an age in years, which is
    then compared against the upper bounds 27, 40, 50, 65 and 99. The label is
    the 1-based position of the first bound the age is strictly below; 0 is
    returned when the age is below none of them (including NaN input).
    """
    age_years = -days_birth / 365
    upper_bounds = (27, 40, 50, 65, 99)
    for label, upper in enumerate(upper_bounds, start=1):
        if age_years < upper:
            return label
    return 0

In [200]:
# DAYS_BIRTH is stored as a negative day count; get_age_label negates it, divides by 365 to get
# an age in years, and returns the age-group label for that age.
# The function is passed to apply directly; wrapping it in a lambda added nothing.
df['AGE_RANGE'] = df['DAYS_BIRTH'].apply(get_age_label)

In [201]:
# Per the original note, the EXT_SOURCE columns rank high in feature importance, so they are
# multiplied together to create an interaction feature.
df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'].mul(df['EXT_SOURCE_2']).mul(df['EXT_SOURCE_3'])

# Original note: features that cannot be explained are hard to use in production work, but on
# Kaggle they tend to be used anyway because the goal is a higher score.

In [202]:
# Weighted sum of the three EXT_SOURCE columns with weights 2, 1 and 3.
# Original note: competition participants found these weights to work well.
df['EXT_SOURCES_WEIGHTED'] = (
    df['EXT_SOURCE_1'] * 2
    + df['EXT_SOURCE_2'] * 1
    + df['EXT_SOURCE_3'] * 3
)

In [203]:
# Silence the "All-NaN slice/axis encountered" warning raised by NaN-aware NumPy reductions.
# The standard-library warnings module is used directly: the np.warnings alias was removed in
# NumPy 1.24.
warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

In [204]:
# Row-wise summary statistics over the three EXT_SOURCE columns, one new column per function.
# getattr looks the NumPy function up by name; it replaces eval(), which would execute any
# text it is given.
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
    feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
    df[feature_name] = getattr(np, function_name)(df[ext_source_cols], axis=1)

In [205]:
# Credit ratios
# Credit amount relative to the annuity amount.
df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'].div(df['AMT_ANNUITY'])
# Credit amount relative to the price of the goods.
df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'].div(df['AMT_GOODS_PRICE'])

In [206]:
# Income ratios: every new column is numerator / denominator.
income_ratio_specs = (
    ('ANNUITY_TO_INCOME_RATIO', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    ('CREDIT_TO_INCOME_RATIO', 'AMT_CREDIT', 'AMT_INCOME_TOTAL'),
    ('INCOME_TO_EMPLOYED_RATIO', 'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED'),  # income relative to time employed
    ('INCOME_TO_BIRTH_RATIO', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH'),
)
for ratio_name, numerator, denominator in income_ratio_specs:
    df[ratio_name] = df[numerator] / df[denominator]

In [207]:
# Time ratios: every new column maps to its (numerator, denominator) pair.
time_ratio_specs = {
    'EMPLOYED_TO_BIRTH_RATIO': ('DAYS_EMPLOYED', 'DAYS_BIRTH'),   # time employed relative to days lived
    'ID_TO_BIRTH_RATIO': ('DAYS_ID_PUBLISH', 'DAYS_BIRTH'),
    'CAR_TO_BIRTH_RATIO': ('OWN_CAR_AGE', 'DAYS_BIRTH'),
    'CAR_TO_EMPLOYED_RATIO': ('OWN_CAR_AGE', 'DAYS_EMPLOYED'),
    'PHONE_TO_BIRTH_RATIO': ('DAYS_LAST_PHONE_CHANGE', 'DAYS_BIRTH'),
}
for ratio_name, (numerator, denominator) in time_ratio_specs.items():
    df[ratio_name] = df[numerator] / df[denominator]

Groupby: Statistics for applications in the same group

In [208]:
def _merge_group_stat(df, group_cols, counted, agg_name, func):
    """Aggregate `counted` per `group_cols` with `func` and left-merge the result onto `df`.

    Shared implementation behind do_mean / do_median / do_std / do_sum, which
    previously repeated this body four times.
    """
    gp = (df[group_cols + [counted]]
          .groupby(group_cols)[counted]
          .agg(func)
          .reset_index()
          .rename(columns={counted: agg_name}))
    # Left merge keeps every row of df and attaches its group's statistic as `agg_name`.
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    gc.collect()
    return df


def do_mean(df, group_cols, counted, agg_name):
    """Return `df` with a new column `agg_name`: the mean of `counted` within each `group_cols` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'mean')


def do_median(df, group_cols, counted, agg_name):
    """Return `df` with a new column `agg_name`: the median of `counted` within each `group_cols` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'median')


def do_std(df, group_cols, counted, agg_name):
    """Return `df` with a new column `agg_name`: the standard deviation of `counted` within each `group_cols` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'std')


def do_sum(df, group_cols, counted, agg_name):
    """Return `df` with a new column `agg_name`: the sum of `counted` within each `group_cols` group."""
    return _merge_group_stat(df, group_cols, counted, agg_name, 'sum')

In [209]:
# Columns whose combined values define the applicant groups for the group statistics below.
# NOTE(review): a later cell defines a function that is also named `group`; once that cell has
# run, this list is shadowed and must be re-created before the grouping cell is re-run.
group = ['ORGANIZATION_TYPE', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_RANGE', 'CODE_GENDER']

In [210]:
# Attach per-group statistics (groups are defined by the `group` column list) to every row.
df = do_median(df, group, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_MEDIAN')
df = do_std(df, group, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_STD')
df = do_mean(df, group, 'AMT_INCOME_TOTAL', 'GROUP_INCOME_MEAN')
df = do_std(df, group, 'AMT_INCOME_TOTAL', 'GROUP_INCOME_STD')
df = do_mean(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_MEAN')
df = do_std(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_STD')
# Column name fixed from the misspelled 'CROUP_CREDIT_MEAN' so it matches the other GROUP_* features.
df = do_mean(df, group, 'AMT_CREDIT', 'GROUP_CREDIT_MEAN')
df = do_mean(df, group, 'AMT_ANNUITY', 'GROUP_ANNUITY_MEAN')
df = do_std(df, group, 'AMT_ANNUITY', 'GROUP_ANNUITY_STD')

Encode categorical features as integer codes (label encoding with pandas.factorize)

In [211]:
# Walk-through of the column detection a label encoder performs when no column list is given.
# `categorical_columns` starts as None so the cell runs on a fresh kernel, and the loop
# variable `col` is used as the key: the quoted 'col' looked up a literal column named "col"
# and raised KeyError.
categorical_columns = None

if not categorical_columns:
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']

categorical_columns

In [212]:
def label_encoder(df, categorical_columns=None):
    """Encode categorical columns as integer codes with pandas.factorize.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; the selected columns are overwritten in place.
    categorical_columns : list of str, optional
        Columns to encode. When omitted or empty, every object- or
        string-typed column of `df` is encoded. (The argument used to be
        ignored and replaced by a hard-coded list of column names.)

    Returns
    -------
    tuple
        ``(df, categorical_columns)``: the same frame and the list of columns
        that were encoded.

    Notes
    -----
    pandas.factorize numbers values in order of first appearance and maps
    missing values to -1.
    """
    if not categorical_columns:
        # Detect text columns from the frame itself so the function works on any frame.
        categorical_columns = [
            col for col in df.columns
            if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
        ]

    for col in categorical_columns:
        # factorize returns (codes, uniques); only the codes are kept.
        df[col], _ = pd.factorize(df[col])
    return df, categorical_columns

In [213]:
# Integer-encode the categorical columns of df and keep the list of encoded column names.
df, le_encoded_cols = label_encoder(df, None)

In [214]:
def drop_application_columns(df):
    """Drop a fixed list of application columns from `df` in place and return it.

    Per the original note, the list was chosen from permutation feature
    importance. Columns that are not present are skipped, so calling the
    function again on an already-trimmed frame (for example when a notebook
    cell is re-run) no longer raises KeyError.
    """
    drop_list = [
        'CNT_CHILDREN', 'CNT_FAM_MEMBERS', 'HOUR_APPR_PROCESS_START',
        'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_CONT_MOBILE', 'FLAG_EMAIL', 'FLAG_PHONE',
        'FLAG_OWN_REALTY', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
        'REG_CITY_NOT_WORK_CITY', 'OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE',
        'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_YEAR',
        'COMMONAREA_MODE', 'NONLIVINGAREA_MODE', 'ELEVATORS_MODE', 'NONLIVINGAREA_AVG',
        'FLOORSMIN_MEDI', 'LANDAREA_MODE', 'NONLIVINGAREA_MEDI', 'LIVINGAPARTMENTS_MODE',
        'FLOORSMIN_AVG', 'LANDAREA_AVG', 'FLOORSMIN_MODE', 'LANDAREA_MEDI',
        'COMMONAREA_MEDI', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'BASEMENTAREA_AVG',
        'BASEMENTAREA_MODE', 'NONLIVINGAPARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
        'LIVINGAPARTMENTS_AVG', 'ELEVATORS_AVG', 'YEARS_BUILD_MEDI', 'ENTRANCES_MODE',
        'NONLIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'LIVINGAPARTMENTS_MEDI',
        'YEARS_BUILD_MODE', 'YEARS_BEGINEXPLUATATION_AVG', 'ELEVATORS_MEDI', 'LIVINGAREA_MEDI',
        'YEARS_BEGINEXPLUATATION_MODE', 'NONLIVINGAPARTMENTS_AVG', 'HOUSETYPE_MODE',
        'FONDKAPREMONT_MODE', 'EMERGENCYSTATE_MODE'
    ]

    # Drop most flag document columns (FLAG_DOCUMENT_3 and FLAG_DOCUMENT_8 are not in this list).
    for doc_num in [2, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]:
        drop_list.append('FLAG_DOCUMENT_{}'.format(doc_num))

    # Only drop what is actually there, so the call is safe to repeat.
    present = [col for col in drop_list if col in df.columns]
    df.drop(columns=present, inplace=True)
    return df

In [215]:
# Remove the application columns listed in drop_application_columns.
df = drop_application_columns(df)

bureau는 당사가 아닌 다른 외부 대출 회사의 거래 내역을 가져온 데이터입니다.

In [216]:
# Load bureau.csv, limited to num_rows rows when a limit is set.
# NOTE(review): `breau_path` looks like a misspelling of `bureau_path`; left unchanged here.
breau_path = '../input/home-credit-default-risk/bureau.csv'
bureau = pd.read_csv(breau_path, nrows=num_rows)

In [217]:
# Credit duration and credit/account end date difference
# DAYS_CREDIT is the credit's start as a day offset from the application (negative for past
# credits, per the competition's column description), and DAYS_CREDIT_ENDDATE is its end in
# the same units, so the duration is end minus start. The original added the two offsets.
bureau['CREDIT_DURATION'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_CREDIT']
# The original line read the second column from an undefined name `breau`, raising NameError.
bureau['ENDDATE_DIF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']

# Credit to debt ratio and difference
bureau['DEBT_PERCENTAGE'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_CREDIT_SUM_DEBT']
bureau['DEBT_CREDIT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
bureau['CREDIT_TO_ANNUITY_RATIO'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_ANNUITY']

In [218]:
# One hot encoder
def one_hot_encoder(df, categorical_columns=None, nan_as_category=True):
    """One-hot encode columns of `df` with pandas.get_dummies.

    When `categorical_columns` is omitted or empty, every column whose dtype
    is ``object`` is encoded. `nan_as_category` is forwarded as ``dummy_na``,
    which controls whether missing values get their own indicator column.

    Returns ``(encoded_frame, new_columns)``, where `new_columns` lists the
    columns of the encoded frame that were not in the input frame.
    """
    columns_before = list(df.columns)
    if not categorical_columns:
        categorical_columns = [
            name for name, dtype in df.dtypes.items() if dtype == 'object'
        ]
    encoded = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, new_columns

In [219]:
# One-hot encode the object-typed columns of bureau; missing values get no indicator column.
bureau, categorical_cols = one_hot_encoder(bureau, nan_as_category = False)

In [221]:
# Load bureau_balance.csv, limited to num_rows rows when a limit is set.
bureau_balance_path = '../input/home-credit-default-risk/bureau_balance.csv'
bb = pd.read_csv(bureau_balance_path, nrows=num_rows)

In [222]:
# One-hot encode the object-typed columns of bb; this overwrites the categorical_cols list
# produced for bureau above.
bb, categorical_cols = one_hot_encoder(bb, nan_as_category = False)

In [237]:
def group(df_to_agg, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    """Aggregate `df_to_agg` per `aggregate_by` key and flatten the resulting column names.

    `aggregations` is passed straight to ``groupby(...).agg``. Each resulting
    column key is renamed to ``<prefix><first element>_<SECOND ELEMENT UPPERCASED>``,
    and the grouping key is returned as a regular column.
    """
    aggregated = df_to_agg.groupby(aggregate_by).agg(aggregations)
    flat_names = []
    for key in aggregated.columns.tolist():
        flat_names.append('{}{}_{}'.format(prefix, key[0], key[1].upper()))
    aggregated.columns = pd.Index(flat_names)
    return aggregated.reset_index()


def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by= 'SK_ID_CURR'):
    """Aggregate `df_to_agg` with `group` and left-merge the result onto `df_to_merge`.

    The merge key is `aggregate_by`; every row of `df_to_merge` is kept.
    """
    aggregated = group(df_to_agg, prefix, aggregations, aggregate_by=aggregate_by)
    merged = df_to_merge.merge(aggregated, on=aggregate_by, how='left')
    return merged

In [242]:
def get_bureau_balance(path, num_rows= None):
    """Load bureau_balance.csv and aggregate it to one row per SK_ID_BUREAU.

    Parameters
    ----------
    path : str
        Input root directory; the file is read from
        ``<path>/home-credit-default-risk/bureau_balance.csv``. (The argument
        used to be ignored in favour of a notebook-level variable.)
    num_rows : int, optional
        Maximum number of rows to read; None reads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per SK_ID_BUREAU holding the mean of every one-hot column plus
        MONTHS_BALANCE_MIN / _MAX / _MEAN / _SIZE.
    """
    # Build the location from `path` so the function does not depend on notebook state.
    csv_path = os.path.join(path, 'home-credit-default-risk', 'bureau_balance.csv')
    bb = pd.read_csv(csv_path, nrows= num_rows)
    bb, categorical_cols = one_hot_encoder(bb, nan_as_category= False)
    # Mean of each one-hot indicator per bureau record, i.e. the share of rows in each category.
    bb_processed = bb.groupby('SK_ID_BUREAU')[categorical_cols].mean().reset_index()
    # Min, max, mean and row count of MONTHS_BALANCE per bureau record.
    agg = {'MONTHS_BALANCE': ['min', 'max', 'mean', 'size']}
    bb_processed = group_and_merge(bb, bb_processed, '', agg, 'SK_ID_BUREAU')
    del bb; gc.collect()
    return bb_processed

In [243]:
 bureau = bureau.merge(get_bureau_balance(path, num_rows), how='left', on='SK_ID_BUREAU')

In [246]:
# Start the combined status column at zero; the next cell adds STATUS_1 .. STATUS_5 into it.
bureau['STATUS_12345'] = 0

In [248]:
# Combine STATUS_1 .. STATUS_5 into one column.
# The column is rebuilt from its parts on every run instead of accumulating with `+=`, so
# re-running this cell no longer double-counts. sum() starts from 0, so the values (including
# NaN propagation) match a single pass of the original loop.
bureau['STATUS_12345'] = sum(bureau['STATUS_{}'.format(i)] for i in range(1, 6))

Aggregate by number of months in balance and merge with bureau (loan length agg)

In [250]:
# Bureau columns that are averaged per MONTHS_BALANCE_SIZE value in the next cell.
features = ['AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM',
        'AMT_CREDIT_SUM_DEBT', 'DEBT_PERCENTAGE', 'DEBT_CREDIT_DIFF', 'STATUS_0', 'STATUS_12345']

In [251]:
# Mean of each selected feature for every distinct MONTHS_BALANCE_SIZE value.
agg_length = bureau.groupby('MONTHS_BALANCE_SIZE')[features].agg('mean')

In [253]:
# Prefix the averaged columns with 'LL_' so they do not clash with the bureau columns on merge.
agg_length = agg_length.rename(columns={feat: 'LL_' + feat for feat in features})

In [254]:
# Attach the loan-length averages to every bureau row (left merge on MONTHS_BALANCE_SIZE).
bureau = pd.merge(bureau, agg_length, how='left', on='MONTHS_BALANCE_SIZE')