In [None]:
import numpy as np
import pandas as pd

import joblib
import gc

## Load test data, pipeline, and model

In [None]:
# Read the application-level test set and key it by applicant id (SK_ID_CURR).
test = pd.read_csv(
    '../input/home-credit-default-risk/application_test.csv'
).set_index('SK_ID_CURR')
test.shape

In [None]:
# Restore the previously saved classifier and preprocessing pipeline.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these files
# must come from a trusted source.
LGBM_model2 = joblib.load('../input/week5-default/LGBM_default_model3-2.joblib')

preprocessor = joblib.load('../input/week5-default/wk5default_preprocessor2.joblib')

## Preprocessing: Cleaning and Merging Datasets

#### Merge Bureau and Bureau_Balance with Test Data

In [None]:
# Credit-bureau loan records and their monthly status history.
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau_bal = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')

# Count how often each STATUS value occurs per bureau loan:
# one row per SK_ID_BUREAU, one BB_-prefixed column per status value.
bb_status = pd.crosstab(index=bureau_bal['SK_ID_BUREAU'],
                        columns=bureau_bal['STATUS'])
bb_status.columns = ['BB_' + status for status in bb_status.columns]

# Attach the status counts to the loans, then discard the loan-level key.
# merge() defaults to an inner join, so bureau loans without any balance
# history are not carried forward.
bureau = (
    bureau
    .merge(bb_status, on='SK_ID_BUREAU')
    .drop(columns='SK_ID_BUREAU')
)

In [None]:
# Prefix every bureau column except the SK_ID_CURR join key so its origin
# stays identifiable after merging into the application table.
bureau.columns = ['BU_' + column if column != 'SK_ID_CURR'
                  else column for column in bureau.columns]

# Per-applicant mean of the numeric bureau features.
# numeric_only=True excludes the object-dtype columns explicitly: pandas >= 2.0
# raises TypeError on them, whereas older versions dropped them silently.
bur_num = bureau.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()

# One-hot encode the object-dtype bureau columns; the per-applicant mean of a
# dummy column is the share of that applicant's loans in that category.
bur_cat = pd.get_dummies(bureau.select_dtypes('object'))
bur_cat['SK_ID_CURR'] = bureau['SK_ID_CURR']
bur_cat = bur_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()

# Number of past bureau loans per applicant.
# NOTE(review): the original followed this with
#     bur_count.rename(columns={'BU_CREDIT_ACTIVE': 'COUNT_LOANS'})
# whose result was discarded, so the count has always reached the preprocessor
# under the name 'BU_CREDIT_ACTIVE'. That name is kept on purpose: renaming it
# here changes the feature schema handed to the saved preprocessor. Apply the
# rename only together with the training notebook and a refit.
bur_count = bureau.groupby(by=['SK_ID_CURR'])['BU_CREDIT_ACTIVE'].count().reset_index()

# Left-merge the numeric, categorical and count aggregates into the test data
# so applicants without bureau history are kept (with NaN features).
test = test.merge(bur_num, on=['SK_ID_CURR'], how='left')
test = test.merge(bur_cat, on=['SK_ID_CURR'], how='left')
test = test.merge(bur_count, on=['SK_ID_CURR'], how='left')

# Release the large intermediates before loading the next table.
del bureau, bureau_bal, bur_cat, bur_num, bur_count
gc.collect()

test.shape

#### Merge Installments_Payments with Test Dataset

In [None]:
# Instalment repayment history.
install = pd.read_csv('../input/home-credit-default-risk/installments_payments.csv')

# Tag every column except the SK_ID_CURR join key with an IP_ prefix.
install = install.rename(
    columns=lambda name: name if name == 'SK_ID_CURR' else 'IP_' + name
)

# Average every feature per applicant (the author notes this table has no
# categorical columns, so no dummy encoding is done here).
inst_num = install.groupby('SK_ID_CURR').mean().reset_index()

# Left-merge the aggregates into the test data.
test = test.merge(inst_num, how='left', on='SK_ID_CURR')

# Free the raw table and its aggregate.
del install, inst_num
gc.collect()

test.shape

#### Merge Previous_Application and POS_CASH with Test Dataset

In [None]:
# Previous applications and the POS/cash balance snapshots.
prev = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')
pos = pd.read_csv('../input/home-credit-default-risk/POS_CASH_balance.csv')

# Prefix every POS column except the SK_ID_PREV join key.
pos.columns = ['PO_' + column if column != 'SK_ID_PREV' else column for column in pos.columns]

# Mean of the numeric POS features per previous application.
# numeric_only=True excludes any object-dtype columns explicitly: pandas >= 2.0
# raises TypeError on them, whereas older versions dropped them silently.
pos_num = pos.groupby(by=['SK_ID_PREV']).mean(numeric_only=True).reset_index()

# One-hot encode the object-dtype POS columns and average the dummies per
# previous application (i.e. the share of snapshots in each category).
pos_cat = pd.get_dummies(pos.select_dtypes('object'))
pos_cat['SK_ID_PREV'] = pos['SK_ID_PREV']
pos_cat = pos_cat.groupby(by=['SK_ID_PREV']).mean().reset_index()

# Left-merge both POS aggregates onto the previous applications.
prev = prev.merge(pos_num, on='SK_ID_PREV', how='left')
prev = prev.merge(pos_cat, on='SK_ID_PREV', how='left')

# Free the POS intermediates; prev is still needed by the next cell.
del pos, pos_num, pos_cat
gc.collect()

In [None]:
# Prefix every previous-application column except the SK_ID_CURR join key.
prev.columns = ['PR_' + column if column != 'SK_ID_CURR'
                else column for column in prev.columns]

# Per-applicant (SK_ID_CURR) mean of the numeric previous-application features.
# numeric_only=True excludes the object-dtype columns explicitly: pandas >= 2.0
# raises TypeError on them, whereas older versions dropped them silently.
prev_num = prev.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()

# One-hot encode the object-dtype columns and average the dummies per applicant.
prev_cat = pd.get_dummies(prev.select_dtypes('object'))
prev_cat['SK_ID_CURR'] = prev['SK_ID_CURR']
prev_cat = prev_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()

# Left-merge the numeric and categorical aggregates into the test data.
test = test.merge(prev_num, on='SK_ID_CURR', how='left')
test = test.merge(prev_cat, on='SK_ID_CURR', how='left')

# Release the intermediates.
del prev_cat, prev_num, prev
gc.collect()

test.shape

#### Merge Credit_Card_Balance with Test Dataset

In [None]:
# Credit-card balance snapshots.
cc_bal = pd.read_csv('../input/home-credit-default-risk/credit_card_balance.csv')

# Prefix every column except the SK_ID_CURR join key.
cc_bal.columns = ['CC_' + column if column != 'SK_ID_CURR'
                  else column for column in cc_bal.columns]

# Per-applicant mean of the numeric credit-card features.
# numeric_only=True excludes any object-dtype columns explicitly: pandas >= 2.0
# raises TypeError on them, whereas older versions dropped them silently.
cc_num = cc_bal.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()

# NOTE(review): the original built one-hot dummies of the object columns into
# cc_cat and then immediately overwrote cc_cat with a groupby of *cc_bal*
# (not of the dummies). The dummies therefore never reached `test`; cc_cat
# holds the same numeric means as cc_num, and the two merges below add every
# CC_ column twice (pandas disambiguates them with _x / _y suffixes).
# That output is reproduced here, minus the discarded dummy computation,
# because it is the column layout this notebook hands to the saved
# preprocessor. The intended code is presumably
#     cc_cat = pd.get_dummies(cc_bal.select_dtypes('object'))
#     cc_cat['SK_ID_CURR'] = cc_bal['SK_ID_CURR']
#     cc_cat = cc_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()
# but switching to it changes the feature set, so do it together with the
# training notebook and a refit of the preprocessor/model.
cc_cat = cc_bal.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()

# Left-merge both aggregates into the test data.
test = test.merge(cc_cat, on='SK_ID_CURR', how='left')
test = test.merge(cc_num, on='SK_ID_CURR', how='left')

# Release the intermediates.
del cc_bal, cc_cat, cc_num
gc.collect()

test.shape

In [None]:
# Average of the (at most) three EXT_SOURCE_x scores available on each row.
# The columns are selected by name: the original positional slice
# iloc[:, 40:43] only addresses them if the column layout happens to line up
# (it shifts with where SK_ID_CURR sits after the merges above).
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
ext_scores = test[ext_cols]
# sum() skips NaN, so dividing by the non-null count gives the mean of the
# present scores; rows with no score at all yield NaN (0 / 0).
test['AVG_EXT'] = ext_scores.sum(axis=1) / ext_scores.notna().sum(axis=1)

# Fill each missing score with the row average. Assigning the result back
# (instead of Series.fillna(..., inplace=True) on a column selection) keeps
# this working under pandas Copy-on-Write, where the in-place form would only
# modify a temporary.
for ext_col in ext_cols:
    test[ext_col] = test[ext_col].fillna(test['AVG_EXT'])

# NOTE(review): the original called test.drop(['AVG_EXT'], axis=1) here but
# discarded the result, so AVG_EXT has always stayed in the frame passed to
# the saved preprocessor. It is left in place on purpose; actually dropping it
# changes the feature schema and should be done together with a refit.


# Treat the flag / rating columns as categorical rather than numeric.
cat_cols = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
            'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
            'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
            'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
            'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY']

test[cat_cols] = test[cat_cols].astype('category')


# 365243 in DAYS_EMPLOYED is replaced with NaN (assigned back for the same
# Copy-on-Write reason as above).
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace(365243, np.nan)

# Age in years: DAYS_BIRTH divided by -365.
test['AGE'] = test['DAYS_BIRTH'] / - 365

# Days employed relative to age (in years).
test['EMP_RATIO'] = test['DAYS_EMPLOYED'] / test['AGE']

# Credit / income ratio.
test['CI_RATIO'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']

# Annuity / income ratio.
test['AI_RATIO'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']

# Credit / annuity ratio.
test['CA_RATIO'] = test['AMT_CREDIT'] / test['AMT_ANNUITY']

# Credit / goods-price ratio.
test['CG_RATIO'] = test['AMT_CREDIT'] / test['AMT_GOODS_PRICE']

In [None]:
# Application columns that are removed before the frame is handed to the
# preprocessing pipeline.
dels = ['APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
        'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
        'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
        'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
        'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
        'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
        'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
        'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
        'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2',
        'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
        'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
        'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
        'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
        'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
        'FLAG_DOCUMENT_21', 'DAYS_BIRTH', 'LIVINGAPARTMENTS_AVG',
        'LIVINGAREA_AVG', 'CNT_FAM_MEMBERS',  'OBS_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', 'ORGANIZATION_TYPE']


# Drop by label list. The original passed the sub-frame test[dels] as the
# labels argument, which only worked because iterating a DataFrame yields its
# column names, and it materialised a copy of those columns first.
# A missing name still raises KeyError.
test = test.drop(columns=dels)
gc.collect()


In [None]:
# Print the per-column summary of the engineered frame. max_cols is the column
# count above which info() switches to its truncated output, so 340 keeps the
# full listing for frames up to that width.
test.info(max_cols = 340)

In [None]:
# Run the engineered test frame through the loaded preprocessing pipeline and
# print the shape of the transformed matrix as a sanity check.
# NOTE(review): the pipeline was fitted elsewhere; the columns built above must
# match what it was fitted on — confirm against the training notebook.
X_test = preprocessor.transform(test)
print(X_test.shape)

## Make Predictions

In [None]:
# Score the transformed test set with the loaded model, then print the shape
# of the result and its first five rows.
# NOTE(review): presumably one probability column per class (scikit-learn
# predict_proba convention) — confirm for the loaded model type.
test_pred = LGBM_model2.predict_proba(X_test)
print(test_pred.shape)
print(test_pred[:5])

## Submission

In [None]:
# Load the sample submission file to use as the template for the output, and
# preview its first ten rows.
submission = pd.read_csv('../input/home-credit-default-risk/sample_submission.csv')
submission.head(10)  

In [None]:
# Row/column count of the template, for comparison with the prediction shape printed earlier.
submission.shape

In [None]:
# Replace the template's default TARGET values with the second column of the
# prediction array (column index 1).
# Bracket assignment always writes the DataFrame column; attribute-style
# assignment (submission.TARGET = ...) would silently set a plain Python
# attribute if the column were ever absent.
# NOTE(review): values are assigned by position, which assumes
# sample_submission.csv lists applicants in the same order as the rows of
# `test` — TODO confirm.
submission['TARGET'] = test_pred[:, 1]
submission.head(10)

In [None]:
# Write the submission file to the working directory, with a header row and without the DataFrame index.
submission.to_csv('default_submission_wk05.csv', index=False, header = True)