In [None]:
import numpy as np
import pandas as pd

import joblib
import gc

## Load test data, pipeline, and model

In [None]:
# Read the application-level test set and key it by applicant id (SK_ID_CURR)
test = pd.read_csv(
    '../input/home-credit-default-risk/application_test.csv'
).set_index(['SK_ID_CURR'])
test.shape

In [None]:
#load saved pipeline and model
# NOTE: joblib.load unpickles arbitrary Python objects, so both artifacts must
# come from a trusted source.
# NOTE(review): presumably they also need the library versions they were saved
# with -- confirm against the training environment.
# `preprocessor` is applied further down through .transform(test)
preprocessor = joblib.load('../input/wk6-default/wk6default_preprocessor.joblib')

# `LGBM_model` is used further down through .predict_proba(X_test)
LGBM_model = joblib.load('../input/wk6-default/wk6_LGBM_default_model.joblib')

## Preprocessing: Cleaning and Merging Datasets

#### Merge Bureau and Bureau_Balance with Test Data

In [None]:
#load bureau_balance and bureau into memory
bureau_bal = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')

# left join: every bureau credit is kept, repeated once per matching balance row
bb = pd.merge(bureau, bureau_bal, on = 'SK_ID_BUREAU', how = 'left')

# the raw inputs are not used again, so release them before the heavy group-bys
del bureau
del bureau_bal
gc.collect()

#feature engineering
bb['REMAIN_CRED'] = bb['AMT_CREDIT_SUM'] - bb['AMT_CREDIT_SUM_DEBT'] - bb['AMT_CREDIT_SUM_LIMIT']
bb['AC_RATIO'] = bb['AMT_ANNUITY'] / bb['AMT_CREDIT_SUM']

#add prefix to bureau columns (the join key SK_ID_CURR keeps its name)
bb.columns = ['BU_'+column if column != ('SK_ID_CURR')
                       else column for column in bb.columns]

#group categorical features in bureau:
# one-hot encode, then average per applicant (= share of rows in each category)
bur_cat = pd.get_dummies(bb.select_dtypes('object'))
bur_cat['SK_ID_CURR'] = bb['SK_ID_CURR']
bur_cat = bur_cat.groupby(by = ['SK_ID_CURR']).agg(['mean'])

#group numerical features
# Select the numeric columns explicitly: 'mean' is undefined for object columns
# and recent pandas raises instead of silently dropping them from the result.
bur_num_cols = [column for column in bb.columns
                if column != 'SK_ID_CURR' and pd.api.types.is_numeric_dtype(bb[column])]
bur_num = (bb.groupby(by = ['SK_ID_CURR'])[bur_num_cols]
             .agg(['max', 'mean', 'sum'])
             .astype('float32'))

# merge cat and num columns (both are indexed by SK_ID_CURR)
bureau_rev = bur_cat.merge(bur_num, on = ['SK_ID_CURR'], how = 'left')

#merge bureau_rev and test
# NOTE(review): bureau_rev has two-level (feature, statistic) column labels while
# `test` has single-level ones; newer pandas refuses such a merge -- confirm the
# pandas version this notebook is pinned to.
test = test.merge(bureau_rev, on = ['SK_ID_CURR'], how = 'left')

#remove unneeded datasets from memory (including the large merged frame)
del bur_cat
del bur_num
del bureau_rev
del bb

gc.collect()

#### Merge Credit_Card_Balance with Test Dataset

In [None]:
#load data into memory
cc_bal = pd.read_csv('../input/home-credit-default-risk/credit_card_balance.csv')

#feature engineering
# ('AMT_RECIVABLE' is spelled exactly as the column is named in the file)
cc_bal['DRAW_RATIO'] = cc_bal['AMT_DRAWINGS_CURRENT'] / cc_bal['CNT_DRAWINGS_CURRENT']
cc_bal['RECEIVE_RATIO'] = cc_bal['AMT_RECIVABLE'] / cc_bal['AMT_RECEIVABLE_PRINCIPAL']
cc_bal['RECEIVE_PER'] = cc_bal['AMT_RECIVABLE'] / cc_bal['AMT_TOTAL_RECEIVABLE']


#create prefix for columns (the join key SK_ID_CURR keeps its name)
cc_bal.columns = ['CC_'+ column if column !='SK_ID_CURR'
                  else column for column in cc_bal.columns]

#group categorical features by SK_ID_CURR:
# one-hot encode, then average per applicant
cc_cat = pd.get_dummies(cc_bal.select_dtypes('object'))
cc_cat['SK_ID_CURR'] = cc_bal['SK_ID_CURR']
cc_cat = cc_cat.groupby(by = ['SK_ID_CURR']).mean()

#group numerical features in credit card balance by SK_ID_CURR
# Select the numeric columns explicitly: 'mean' is undefined for object columns
# and recent pandas raises instead of silently dropping them from the result.
cc_num_cols = [column for column in cc_bal.columns
               if column != 'SK_ID_CURR' and pd.api.types.is_numeric_dtype(cc_bal[column])]
cc_num = (cc_bal.groupby(by = ['SK_ID_CURR'])[cc_num_cols]
                .agg(['max', 'mean', 'sum'])
                .astype('float32'))

# left joins keep every test applicant; applicants without card history get NaN
test = test.merge(cc_cat, on = ['SK_ID_CURR'], how = 'left')
test = test.merge(cc_num, on = ['SK_ID_CURR'], how = 'left')

del cc_bal
del cc_cat
del cc_num
gc.collect()

#### Merge Installments_Payments with Test Dataset

In [None]:
#load installments_payments into memory
install = pd.read_csv('../input/home-credit-default-risk/installments_payments.csv')

#feature engineering
install['PAY_PERCENT'] = install['AMT_INSTALMENT'] / install['AMT_PAYMENT']
install['PAY_DIFF'] = install['AMT_INSTALMENT'] - install['AMT_PAYMENT']

# DPD: days the payment came after the instalment date, floored at 0.
# .where keeps values that are > 0 and writes 0 everywhere else (NaN included),
# which matches the previous element-wise lambda without a Python call per row.
install['DPD'] = install['DAYS_ENTRY_PAYMENT'] - install['DAYS_INSTALMENT']
install['DPD'] = install['DPD'].where(install['DPD'] > 0, 0)

# DBD: days the payment came before the instalment date, floored at 0
install['DBD'] = install['DAYS_INSTALMENT'] - install['DAYS_ENTRY_PAYMENT']
install['DBD'] = install['DBD'].where(install['DBD'] > 0, 0)

#create prefix (the join key SK_ID_CURR keeps its name)
install.columns = ['IP_'+ column if column !='SK_ID_CURR'
                   else column for column in install.columns]


#group numeric features (no cat features in install)
inst_num = install.groupby(by = ['SK_ID_CURR']).agg(['max', 'mean']).astype('float32')

#merge aggregated installment features into test
test = test.merge(inst_num, on = 'SK_ID_CURR', how='left')

del install
del inst_num
gc.collect()

#### Merge POS_CASH_balance with Test Dataset

In [None]:
#load POS_CASH into memory
pos = pd.read_csv('../input/home-credit-default-risk/POS_CASH_balance.csv')

#create prefix (the join key SK_ID_CURR keeps its name)
pos.columns = ['PC_'+ column if column !='SK_ID_CURR'
                   else column for column in pos.columns]

#group numeric features
# Only numeric columns are aggregated; they are selected explicitly so that any
# non-numeric column cannot make the 'mean' aggregation fail on recent pandas.
pos_num_cols = [column for column in pos.columns
                if column != 'SK_ID_CURR' and pd.api.types.is_numeric_dtype(pos[column])]
pos_num = (pos.groupby(by = ['SK_ID_CURR'])[pos_num_cols]
              .agg(['max', 'mean', 'sum'])
              .astype('float32'))

# left join keeps every test applicant
test = test.merge(pos_num, on = ['SK_ID_CURR'], how = 'left')

del pos
del pos_num
gc.collect()


#### Merge Previous_Application with Test Dataset

In [None]:
#load data
prev = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')

#feature engineering
# Replace the value 365243 with NaN in the DAYS_* columns.
# The result is assigned back: calling replace(inplace=True) on a column
# selection does not write through to `prev` under pandas Copy-on-Write.
prev_day_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                 'DAYS_LAST_DUE', 'DAYS_TERMINATION']
prev[prev_day_cols] = prev[prev_day_cols].replace(365243, np.nan)

# the +1 in the denominators avoids division by zero
prev['AppCred_RATIO'] = prev['AMT_APPLICATION'] / (prev['AMT_CREDIT'] + 1)
prev['AppGoods_RATIO'] = prev['AMT_APPLICATION'] / (prev['AMT_GOODS_PRICE'] + 1)
prev['AnnCred_RATIO'] = prev['AMT_ANNUITY'] / (prev['AMT_CREDIT'] + 1)
prev['CredGoods_RATIO'] = prev['AMT_CREDIT'] / (prev['AMT_GOODS_PRICE'] + 1)

#calculate APR and add it as a feature
def _newton_rate(nper, pmt, pv, fv, guess = 0.1, tol = 1e-6, maxiter = 100):
    """Per-period interest rate by Newton iteration (payments due at period end).

    Fallback for NumPy versions that no longer provide ``np.rate``.
    NOTE(review): written to mirror the iteration of the removed function;
    check parity with ``numpy_financial.rate`` if exact equality matters.

    Returns the rate once two successive iterates differ by less than ``tol``,
    or NaN if that does not happen within ``maxiter`` iterations.
    """
    nper, pmt, pv, fv = map(np.asarray, (nper, pmt, pv, fv))
    when = np.asarray(0)  # 0 = payments at the end of each period
    rn = guess
    close = False
    iterator = 0
    while (iterator < maxiter) and not close:
        t1 = (rn + 1) ** nper
        t2 = (rn + 1) ** (nper - 1)
        g = fv + t1 * pv + pmt * (t1 - 1) * (rn * when + 1) / rn
        gp = (nper * t2 * pv - pmt * (t1 - 1) * (rn * when + 1) / (rn ** 2)
              + nper * pmt * t2 * (rn * when + 1) / rn
              + pmt * (t1 - 1) * when / rn)
        rnp1 = rn - g / gp
        close = np.all(abs(rnp1 - rn) < tol)
        iterator += 1
        rn = rnp1
    return rn if close else np.nan + rn

# np.rate was removed from NumPy in 1.20; use it when present so results stay
# identical on older installs, otherwise fall back to the local implementation.
_rate = getattr(np, 'rate', None)
if _rate is None:
    _rate = _newton_rate

def calc_rate(row):
    """Return the per-period rate implied by one previous application row.

    Reads CNT_PAYMENT (number of periods), AMT_ANNUITY (payment, passed with a
    negative sign) and AMT_CREDIT (present value); future value is 0.
    """
    return _rate(row['CNT_PAYMENT'], -row['AMT_ANNUITY'], row['AMT_CREDIT'], 0, guess = 0.05, maxiter = 10)

# row-wise on purpose: each row converges (or becomes NaN) independently
prev['CALC_RATE'] = prev.apply(calc_rate, axis=1)


#Remove unnecessary features
p_dels = ['RATE_INTEREST_PRIMARY','RATE_INTEREST_PRIVILEGED']
prev = prev.drop(columns = p_dels)

#create prefix (the join key SK_ID_CURR keeps its name)
prev.columns = ['PR_'+ column if column != 'SK_ID_CURR'
                else column for column in prev.columns]

#group categorical features in previous_application:
# one-hot encode, then average per applicant
prev_cat = pd.get_dummies(prev.select_dtypes('object'))
prev_cat['SK_ID_CURR'] = prev['SK_ID_CURR']
prev_cat = prev_cat.groupby(by = ['SK_ID_CURR']).agg(['mean'])

#group numeric features
# Select the numeric columns explicitly: 'mean' is undefined for object columns
# and recent pandas raises instead of silently dropping them from the result.
prev_num_cols = [column for column in prev.columns
                 if column != 'SK_ID_CURR' and pd.api.types.is_numeric_dtype(prev[column])]
prev_num = (prev.groupby(by = ['SK_ID_CURR'])[prev_num_cols]
                .agg(['max', 'mean', 'sum'])
                .astype('float32'))

#combine previous_application categorical and numeric features
prev_rev = prev_num.merge(prev_cat, on = ['SK_ID_CURR'], how = 'left')

#merge revised previous_application features into test dataset
test = test.merge(prev_rev, on = ['SK_ID_CURR'], how = 'left')

# free everything built in this cell, including the raw frame
del prev
del prev_rev
del prev_cat
del prev_num
gc.collect()

In [None]:
#replace 365243 in days employed with nan
# (assigned back rather than inplace=True on a column selection, which does not
# write through to `test` under pandas Copy-on-Write)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace(365243, np.nan)

#convert age to years
test['AGE'] = test['DAYS_BIRTH'] / - 365

#create avg of each row of EXT_SOURCE values
# The three columns are selected by name; a positional slice only hits them for
# one specific column layout and silently averages other columns otherwise.
# NOTE(review): confirm the training notebook derives AVG_EXT from these same
# three columns, otherwise the model sees a differently defined feature here.
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
ext_values = test[ext_cols]
# sum of the available scores / number of available scores (NaN if none exist)
test['AVG_EXT'] = ext_values.sum(axis=1) / (3 - ext_values.isnull().sum(axis=1))

# fill each missing score with the row's average of the scores that are present
for ext_col in ext_cols:
    test[ext_col] = test[ext_col].fillna(test['AVG_EXT'])

In [None]:
# Derived ratio / interaction features on the application table, added in a
# single assign() call (columns are appended in the order listed).
test = test.assign(
    # days employed divided by age
    EmpAge_RATIO = test['DAYS_EMPLOYED'] / test['AGE'],
    # credit amount relative to income
    CredInc_RATIO = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL'],
    # annuity relative to income
    AnnInc_RATIO = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL'],
    # annuity relative to credit amount (+1 avoids division by zero)
    AnnCred_RATIO = test['AMT_ANNUITY'] / (test['AMT_CREDIT'] + 1),
    # credit amount relative to price of goods (+1 avoids division by zero)
    CredGoods_RATIO = test['AMT_CREDIT'] / (test['AMT_GOODS_PRICE'] + 1),
    # average external score scaled by income and by goods price
    AVG_EXT_INCOME = test['AMT_INCOME_TOTAL'] * test['AVG_EXT'],
    AVG_EXT_GOODS = test['AMT_GOODS_PRICE'] * test['AVG_EXT'],
)

In [None]:
# Columns removed from the application table before it is passed to the
# preprocessing pipeline.
dels = ['APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
        'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
        'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
        'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
        'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
        'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
        'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
        'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
        'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2',
        'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
        'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
        'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
        'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
        'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
        'FLAG_DOCUMENT_21', 'DAYS_BIRTH', 'LIVINGAPARTMENTS_AVG',
        'LIVINGAREA_AVG', 'CNT_FAM_MEMBERS',  'OBS_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', 'AVG_EXT']


# Pass the labels directly; indexing `test[dels]` first only built a throwaway
# copy of these columns to obtain the same names. Still raises KeyError if a
# listed column is missing.
test = test.drop(columns = dels)
gc.collect()

In [None]:
# Turn +/-infinity into NaN across the whole frame
test = test.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [None]:
# Apply the loaded preprocessing pipeline to the engineered test features;
# only transform() is called here, the pipeline is not refitted.
# NOTE(review): presumably `test` must expose the same columns the pipeline was
# fitted on -- confirm against the training notebook.
X_test = preprocessor.transform(test)
print(X_test.shape)

## Make Predictions

In [None]:
# Class-probability predictions from the loaded model; column 1 of this array
# is what gets written to the submission's TARGET column further down.
test_pred = LGBM_model.predict_proba(X_test)
print(test_pred.shape)
# Peek at the first five rows of predicted probabilities
print(test_pred[:5])

## Submission

In [None]:
# Load the sample submission; its TARGET column is overwritten with the model's
# predictions further down.
submission = pd.read_csv('../input/home-credit-default-risk/sample_submission.csv')
submission.head(10)  

In [None]:
# (rows, columns) of the submission frame
submission.shape

In [None]:
# Replace the default values with our predictions (probability in column 1).
# Predictions are matched to submission rows by SK_ID_CURR instead of by
# position, so a differing row order cannot silently mis-assign scores.
# SK_ID_CURR is read from the index of `test` if it is an index level there,
# otherwise from the column of that name.
test_ids = test.index if 'SK_ID_CURR' in test.index.names else test['SK_ID_CURR']
if len(test_ids) != len(test_pred):
    raise ValueError('number of predictions does not match number of test rows')

pred_by_id = pd.Series(test_pred[:, 1], index = np.asarray(test_ids))
submission['TARGET'] = pred_by_id.reindex(submission['SK_ID_CURR']).to_numpy()

# an id in the submission file with no prediction would otherwise go out as NaN
if submission['TARGET'].isna().any():
    raise ValueError('some SK_ID_CURR values in the submission have no prediction')
submission.head(10)

In [None]:
# Write the submission file: header row included, row index left out
submission.to_csv(
    'default_submission_wk06.csv',
    header=True,
    index=False,
)