In [1]:
import pandas as pd
import numpy as np

import os
import random

from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib

import warnings
warnings.filterwarnings(action='ignore')

# Seed both random number generators used by this notebook so reruns are repeatable.
random.seed(123)
np.random.seed(123)

# Display up to 200 columns when a wide DataFrame is rendered.
pd.set_option('display.max_columns', 200)

# Project root is the parent of the notebook's working directory (symlinks
# resolved). It is derived from the working directory directly instead of
# splitting realpath(__name__) on '/', which treated the module name as a file
# path and produced an empty root on platforms whose separator is not '/'.
root_dir = os.path.dirname(os.path.realpath(os.getcwd()))
input_dir = os.path.join(root_dir, 'input_data')
persist_dir = os.path.join(root_dir, 'persist')

def save_check_dir(dir_path):
    """Create ``dir_path`` (including parents) if it is missing and report what happened.

    Parameters
    ----------
    dir_path : str
        Directory to ensure exists.

    Prints one line when the directory is created; when it already exists,
    prints the full path followed by an "already exists" line.
    """
    # normpath strips a trailing separator first, so '/a/persist/' is still
    # reported as 'persist' rather than as an empty name.
    dir_name = os.path.basename(os.path.normpath(dir_path))

    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        print('>> {} << directory created.'.format(dir_name))
    else:
        print(dir_path)
        print('>> {} << directory already exists.'.format(dir_name))

# Make sure the output directory for persisted artefacts is present.
save_check_dir(dir_path=persist_dir)

/Users/300029144/Documents/kaggle/home_credit_default/home-credit-default/persist
>> persist << directory already exists.


In [4]:
# Read the previous-application table from the input directory.
prev_application_path = ''.join([input_dir, '/previous_application.csv'])
previous_application = pd.read_csv(filepath_or_buffer=prev_application_path)

In [5]:
# Preview the first five rows of the loaded table.
previous_application.head(n=5)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


# Data Cleaning

In [8]:
# The value 365243 in the DAYS_* columns is treated as missing and replaced by NaN.
# The replacement is assigned back explicitly: calling
# `df[col].replace(..., inplace=True)` operates on a column selection and, under
# pandas Copy-on-Write, leaves the parent frame unchanged (the accompanying
# warning is hidden by the global warnings filter in this notebook).
DAYS_MISSING_SENTINEL = 365243
days_columns = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
previous_application[days_columns] = (
    previous_application[days_columns].replace(DAYS_MISSING_SENTINEL, np.nan)
)

# HandCrafted Features

In [20]:
def _merge_group_feature(features, grouped_series, feature_name):
    """Left-join one per-SK_ID_CURR aggregate onto ``features`` as column ``feature_name``."""
    renamed = grouped_series.rename(feature_name).reset_index()
    return features.merge(renamed, on=['SK_ID_CURR'], how='left')


def handcrafted_feats(prev_applications, number_of_applications=(1, 2, 3, 4, 5)):
    """Build one row of hand-crafted features per ``SK_ID_CURR``.

    Parameters
    ----------
    prev_applications : pandas.DataFrame
        Must contain the columns ``SK_ID_CURR``, ``SK_ID_PREV``,
        ``DAYS_DECISION``, ``NAME_CONTRACT_STATUS``, ``NAME_CONTRACT_TYPE``,
        ``CNT_PAYMENT`` and ``DAYS_FIRST_DRAWING``. The frame is not modified.
    number_of_applications : iterable of int, optional
        For every ``n`` in this iterable, means over the last ``n`` rows of each
        applicant are added. Defaults to ``(1, 2, 3, 4, 5)``.

    Returns
    -------
    pandas.DataFrame
        ``SK_ID_CURR`` (in order of first appearance in the input) followed by:
        approved / refused / revolving indicators of the applicant's last row,
        the number of distinct ``SK_ID_PREV`` values, the fraction of refused
        rows, and for each ``n`` the mean ``CNT_PAYMENT``, ``DAYS_DECISION``
        and ``DAYS_FIRST_DRAWING`` over the last ``n`` rows.

    "Last" refers to row order after sorting by ``SK_ID_CURR`` and then
    ascending ``DAYS_DECISION``, i.e. the rows with the largest
    ``DAYS_DECISION`` per applicant.
    """
    features = pd.DataFrame({'SK_ID_CURR': prev_applications['SK_ID_CURR'].unique()})

    prev_app_sorted = prev_applications.sort_values(['SK_ID_CURR', 'DAYS_DECISION'])
    # Add the indicator columns BEFORE grouping so the groupby object never has
    # to pick up columns that were attached to the frame after it was created.
    prev_app_sorted = prev_app_sorted.assign(
        previous_application_prev_was_approved=(
            prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Approved').astype('int'),
        previous_application_prev_was_refused=(
            prev_app_sorted['NAME_CONTRACT_STATUS'] == 'Refused').astype('int'),
        prev_applications_prev_was_revolving_loan=(
            prev_app_sorted['NAME_CONTRACT_TYPE'] == 'Revolving loans').astype('int'),
    )
    grouped = prev_app_sorted.groupby(by='SK_ID_CURR')

    # (output column name, per-applicant series), in output column order.
    per_applicant = [
        ('previous_application_prev_was_approved',
         grouped['previous_application_prev_was_approved'].last()),
        ('previous_application_prev_was_refused',
         grouped['previous_application_prev_was_refused'].last()),
        ('previous_application_number_of_prev_application',
         grouped['SK_ID_PREV'].nunique()),
        ('previous_application_fraction_of_refused_applications',
         grouped['previous_application_prev_was_refused'].mean()),
        ('prev_applications_prev_was_revolving_loan',
         grouped['prev_applications_prev_was_revolving_loan'].last()),
    ]
    for feature_name, series in per_applicant:
        features = _merge_group_feature(features, series, feature_name)

    # (source column, output name template) for the "last n rows" means.
    tail_templates = [
        ('CNT_PAYMENT', 'previous_application_term_of_last_{}_credits_mean'),
        ('DAYS_DECISION', 'previous_application_days_decision_about_last_{}_credits_mean'),
        ('DAYS_FIRST_DRAWING', 'previous_application_days_first_drawing_last_{}_credits_mean'),
    ]
    for number in number_of_applications:
        tail_groupby = grouped.tail(number).groupby(by='SK_ID_CURR')
        for column, template in tail_templates:
            features = _merge_group_feature(
                features, tail_groupby[column].mean(), template.format(number))

    return features
    
# Compute the per-applicant hand-crafted features from the cleaned table.
features = handcrafted_feats(prev_applications=previous_application)

In [21]:
# Recipes: a list of (group-by columns, [(column, aggregation), ...]) entries.
# The inner pairs are ordered aggregation-major (all columns for 'mean', then
# all columns for 'min', ...), which fixes the order of the resulting columns.
PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [
    (select, agg)
    for agg in ['mean', 'min', 'max', 'sum', 'var']
    for select in ['AMT_ANNUITY',
                   'AMT_APPLICATION',
                   'AMT_CREDIT',
                   'AMT_DOWN_PAYMENT',
                   'AMT_GOODS_PRICE',
                   'CNT_PAYMENT',
                   'DAYS_DECISION',
                   'HOUR_APPR_PROCESS_START',
                   'RATE_DOWN_PAYMENT'
                   ]
]
PREVIOUS_APPLICATION_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], PREVIOUS_APPLICATION_AGGREGATION_RECIPIES)]


groupby_aggregate_names = []
for groupby_cols, specs in tqdm(PREVIOUS_APPLICATION_AGGREGATION_RECIPIES):
    group_object = previous_application.groupby(groupby_cols)

    # Collect every aggregate for this grouping first, then join once, instead
    # of merging into `features` once per (column, aggregation) pair.
    aggregated_columns = []
    for select, agg in tqdm(specs):
        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
        aggregated_columns.append(group_object[select].agg(agg).rename(groupby_aggregate_name))
        groupby_aggregate_names.append(groupby_aggregate_name)

    # Re-running this cell must not produce `_x` / `_y` duplicates: drop any
    # aggregate columns left in `features` by a previous execution.
    new_names = [column.name for column in aggregated_columns]
    features = features.drop(columns=[name for name in new_names if name in features.columns])

    features = features.merge(pd.concat(aggregated_columns, axis=1).reset_index(),
                              on=groupby_cols,
                              how='left')
    

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=45), HTML(value='')))




In [24]:
# Write the assembled feature table to the persist directory; the return value
# of joblib.dump is left as the cell's last expression so it is displayed.
prev_application_presist_path = os.path.join(persist_dir, 'prev_application.pkl')
joblib.dump(value=features, filename=prev_application_presist_path)

['/Users/300029144/Documents/kaggle/home_credit_default/home-credit-default/persist/prev_application.pkl']