In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#dataのカテゴリカル特徴量を明示
def convert_cat(data):
    categorical_feats = [
        f for f in data.columns if data[f].dtype == 'object'
    ]

    for f_ in categorical_feats:
        data[f_], indexer = pd.factorize(data[f_])

In [3]:
DIR = '/Users/sh-tatsuno/.kaggle/competitions/home-credit-default-risk/'
train = pd.read_csv(DIR+'application_train.csv.zip')
test = pd.read_csv(DIR+'application_test.csv.zip')

In [4]:
test['TARGET'] = np.nan
train['TEST'] = 0
test['TEST'] = 1
all_data = pd.concat([train,test],sort=True)

In [5]:
drop_cols = ['FLAG_DOCUMENT_2',
                    'FLAG_DOCUMENT_10',
                    'FLAG_DOCUMENT_12',
                    'FLAG_DOCUMENT_13',
                    'FLAG_DOCUMENT_14',
                    'FLAG_DOCUMENT_15',
                    'FLAG_DOCUMENT_16',
                    'FLAG_DOCUMENT_17',
                    'FLAG_DOCUMENT_19',
                    'FLAG_DOCUMENT_20',
                    'FLAG_DOCUMENT_21',
                    'FLAG_MOBIL'
                    ]
all_data = all_data.drop(drop_cols,axis=1)

In [6]:
docs = [_f for _f in all_data.columns if 'FLAG_DOC' in _f]
live = [_f for _f in all_data.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]

all_data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

inc_by_org = all_data[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

all_data['NEW_CREDIT_TO_ANNUITY_RATIO'] = all_data['AMT_CREDIT'] / all_data['AMT_ANNUITY']
all_data['NEW_CREDIT_TO_GOODS_RATIO'] = all_data['AMT_CREDIT'] / all_data['AMT_GOODS_PRICE']
all_data['NEW_DOC_IND_KURT'] = all_data[docs].kurtosis(axis=1)
all_data['NEW_LIVE_IND_SUM'] = all_data[live].sum(axis=1)
all_data['NEW_INC_PER_CHLD'] = all_data['AMT_INCOME_TOTAL'] / (1 + all_data['CNT_CHILDREN'])
all_data['NEW_INC_BY_ORG'] = all_data['ORGANIZATION_TYPE'].map(inc_by_org)
all_data['NEW_EMPLOY_TO_BIRTH_RATIO'] = all_data['DAYS_EMPLOYED'] / all_data['DAYS_BIRTH']
all_data['NEW_ANNUITY_TO_INCOME_RATIO'] = all_data['AMT_ANNUITY'] / (1 + all_data['AMT_INCOME_TOTAL'])
all_data['NEW_SOURCES_PROD'] = all_data['EXT_SOURCE_1'] * all_data['EXT_SOURCE_2'] * all_data['EXT_SOURCE_3']
all_data['NEW_EXT_SOURCES_MEAN'] = all_data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
all_data['NEW_SCORES_STD'] = all_data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
all_data['NEW_SCORES_STD'] = all_data['NEW_SCORES_STD'].fillna(all_data['NEW_SCORES_STD'].mean())
all_data['NEW_CAR_TO_BIRTH_RATIO'] = all_data['OWN_CAR_AGE'] / all_data['DAYS_BIRTH']
all_data['NEW_CAR_TO_EMPLOY_RATIO'] = all_data['OWN_CAR_AGE'] / all_data['DAYS_EMPLOYED']
all_data['NEW_PHONE_TO_BIRTH_RATIO'] = all_data['DAYS_LAST_PHONE_CHANGE'] / all_data['DAYS_BIRTH']
all_data['NEW_PHONE_TO_BIRTH_RATIO'] = all_data['DAYS_LAST_PHONE_CHANGE'] / all_data['DAYS_EMPLOYED']
all_data['NEW_CREDIT_TO_INCOME_RATIO'] = all_data['AMT_CREDIT'] / all_data['AMT_INCOME_TOTAL']

In [7]:
convert_cat(all_data)

In [8]:
all_data.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,...,NEW_INC_BY_ORG,NEW_EMPLOY_TO_BIRTH_RATIO,NEW_ANNUITY_TO_INCOME_RATIO,NEW_SOURCES_PROD,NEW_EXT_SOURCES_MEAN,NEW_SCORES_STD,NEW_CAR_TO_BIRTH_RATIO,NEW_CAR_TO_EMPLOY_RATIO,NEW_PHONE_TO_BIRTH_RATIO,NEW_CREDIT_TO_INCOME_RATIO
0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,...,157500.0,0.067329,0.121977,0.003043,0.161787,0.092026,,,1.78022,2.007889
1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,135000.0,0.070862,0.132216,,0.466757,0.219895,,,0.69697,4.79075
2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,135000.0,0.011814,0.099999,,0.642739,0.122792,-0.001365,-0.115556,3.622222,2.0
3,29686.5,312682.5,297000.0,135000.0,,,,,,,...,157500.0,0.159905,0.219898,,0.650442,0.151008,,,0.203027,2.316167
4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,157500.0,0.152418,0.179961,,0.322738,0.151008,,,0.364055,4.222222


In [9]:
all_data.to_pickle('edit/fixed_data/all_data.pkl')

In [18]:
test['TARGET'] = np.nan
train['TEST'] = 0
test['TEST'] = 1
all_data = pd.concat([train,test],sort=True)

drop_cols = ['FLAG_DOCUMENT_2',
                    'FLAG_DOCUMENT_10',
                    'FLAG_DOCUMENT_12',
                    'FLAG_DOCUMENT_13',
                    'FLAG_DOCUMENT_14',
                    'FLAG_DOCUMENT_15',
                    'FLAG_DOCUMENT_16',
                    'FLAG_DOCUMENT_17',
                    'FLAG_DOCUMENT_19',
                    'FLAG_DOCUMENT_20',
                    'FLAG_DOCUMENT_21',
                    'FLAG_MOBIL'
                    ]
all_data = all_data.drop(drop_cols,axis=1)

docs = [_f for _f in all_data.columns if 'FLAG_DOC' in _f]
live = [_f for _f in all_data.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]

all_data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

inc_by_org = all_data[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']

all_data['NEW_CREDIT_TO_ANNUITY_RATIO'] = all_data['AMT_CREDIT'] / all_data['AMT_ANNUITY']
all_data['NEW_CREDIT_TO_GOODS_RATIO'] = all_data['AMT_CREDIT'] / all_data['AMT_GOODS_PRICE']
all_data['NEW_DOC_IND_KURT'] = all_data[docs].kurtosis(axis=1)
all_data['NEW_LIVE_IND_SUM'] = all_data[live].sum(axis=1)
all_data['NEW_INC_PER_CHLD'] = all_data['AMT_INCOME_TOTAL'] / (1 + all_data['CNT_CHILDREN'])
all_data['NEW_INC_BY_ORG'] = all_data['ORGANIZATION_TYPE'].map(inc_by_org)
all_data['NEW_EMPLOY_TO_BIRTH_RATIO'] = all_data['DAYS_EMPLOYED'] / all_data['DAYS_BIRTH']
all_data['NEW_ANNUITY_TO_INCOME_RATIO'] = all_data['AMT_ANNUITY'] / (1 + all_data['AMT_INCOME_TOTAL'])
all_data['NEW_SOURCES_PROD'] = all_data['EXT_SOURCE_1'] + all_data['EXT_SOURCE_2']
all_data['NEW_CAR_TO_BIRTH_RATIO'] = all_data['OWN_CAR_AGE'] / all_data['DAYS_BIRTH']
all_data['NEW_CAR_TO_EMPLOY_RATIO'] = all_data['OWN_CAR_AGE'] / all_data['DAYS_EMPLOYED']
all_data['NEW_PHONE_TO_BIRTH_RATIO'] = all_data['DAYS_LAST_PHONE_CHANGE'] / all_data['DAYS_BIRTH']
all_data['NEW_PHONE_TO_EMPLOYED_RATIO'] = all_data['DAYS_LAST_PHONE_CHANGE'] / all_data['DAYS_EMPLOYED']
all_data['NEW_CREDIT_TO_INCOME_RATIO'] = all_data['AMT_CREDIT'] / all_data['AMT_INCOME_TOTAL']

In [19]:
convert_cat(all_data)

In [20]:
all_data.to_pickle('edit/tmp_data/all_dataX2.pkl')