In [None]:
import numpy as np
import pandas as pd

import joblib

In [None]:
# Read the application test set and key it by applicant id (SK_ID_CURR).
test = (
    pd.read_csv('../input/home-credit-default-risk/application_test.csv')
    .set_index(['SK_ID_CURR'])
)
test.shape

### Data Prep

In [None]:
# Numeric-coded flag / rating columns that are cast to the pandas
# 'category' dtype.
cat_cols = [
    # contact flags
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE',
    'FLAG_PHONE', 'FLAG_EMAIL',
    # region ratings
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    # region / city mismatch flags
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION',
    'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
    'LIVE_CITY_NOT_WORK_CITY',
]

# Convert one column at a time.
for col in cat_cols:
    test[col] = test[col].astype('category')

In [None]:
# Column dtypes and non-null counts (up to 150 columns listed).
test.info(max_cols=150)

In [None]:
# Derived features and missing-value handling.
#
# Results are assigned back to the frame instead of calling
# `.replace(..., inplace=True)` / `.fillna(..., inplace=True)` on a selected
# column: chained in-place calls emit warnings on recent pandas and do not
# update the parent frame when Copy-on-Write is enabled.

# Credit-to-annuity ratio.
test['CRED_ANNUITY'] = test['AMT_CREDIT'] / test['AMT_ANNUITY']

# Replace the value 365243 in DAYS_EMPLOYED with NaN.
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace(365243, np.nan)

# Age in years (DAYS_BIRTH divided by -365).
test['AGE'] = test['DAYS_BIRTH'] / -365

# Row-wise average of the (at most) three EXT_SOURCE_x scores.
# The columns are selected by name: a positional slice such as
# iloc[:, 40:43] depends on the exact column layout of the frame and picks
# different columns when that layout shifts (e.g. a file without TARGET).
# mean(axis=1) skips NaN and yields NaN when all three scores are missing,
# which matches sum / (3 - number_of_nulls).
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
test['AVG_EXT'] = test[ext_cols].mean(axis=1)

# Fill each missing score with the row average computed above.
for ext_col in ext_cols:
    test[ext_col] = test[ext_col].fillna(test['AVG_EXT'])

In [None]:
# Columns removed before preprocessing: the *_MODE / *_MEDI building
# columns, the FLAG_DOCUMENT_* flags, and a handful of other columns
# (including DAYS_BIRTH, for which AGE was derived in an earlier cell).
dels = ['DAYS_BIRTH', 'LIVINGAPARTMENTS_AVG', 'LIVINGAREA_AVG', 'CNT_FAM_MEMBERS',  'OBS_30_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', 
        'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 
        'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 
        'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 
        'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 
        'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI', 
        'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 
        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
        'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 
        'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']

# Drop by label list. Passing `test[dels]` (a DataFrame) as the labels only
# works because iterating a frame yields its column names, and it builds a
# throwaway copy of all those columns first.
test1 = test.drop(columns=dels)
test1.shape

In [None]:
# Dtypes and non-null counts of the reduced frame (up to 65 columns listed).
test1.info(max_cols=65)

### Load model

In [None]:
# Restore the saved preprocessing pipeline and the saved LightGBM model,
# in that order, from their joblib files.
preprocessor, LGBM_model2 = (
    joblib.load(artifact_path)
    for artifact_path in (
        '../input/wk-2-default-v3/wk2default_preprocessor.joblib',
        '../input/wk-2-default-v3/LGBM_default_model (1).joblib',
    )
)

### Preprocess

In [None]:
# Run the reduced test frame through the loaded preprocessing pipeline,
# then report the shape of the transformed matrix.
X_test = preprocessor.transform(test1)
print(
    X_test.shape
)

### Make Predictions

In [None]:
# Class probabilities for every test row.
test_pred = LGBM_model2.predict_proba(X_test)

# Show the output shape followed by the first five rows.
for preview in (test_pred.shape, test_pred[:5]):
    print(preview)

### Submission

In [None]:
# Load the sample submission file as the template for our output.
# We need the probability of default (column [1] from test_pred).
submission = pd.read_csv(
    '../input/home-credit-default-risk/sample_submission.csv'
)
submission.head(n=10)

In [None]:
# (rows, columns) of the submission template.
submission.shape

In [None]:
# Replace the template's default TARGET values with our predictions
# (column 1 of the predict_proba output).
#
# Predictions are matched to submission rows on SK_ID_CURR instead of by row
# position, so the result stays correct even if sample_submission.csv is not
# in the same row order as application_test.csv. `test1` keeps SK_ID_CURR as
# its index and `test_pred` has one row per row of `test1`.
pred_by_id = pd.Series(test_pred[:, 1], index=test1.index)

# Fail loudly rather than writing NaN targets for ids we did not score.
unscored = ~submission['SK_ID_CURR'].isin(pred_by_id.index)
if unscored.any():
    raise ValueError(
        f"{int(unscored.sum())} SK_ID_CURR value(s) in the submission template "
        "have no prediction"
    )

submission['TARGET'] = pred_by_id.reindex(submission['SK_ID_CURR']).to_numpy()
submission.head(10)

In [None]:
# Write the submission file: header row included, index column omitted.
submission.to_csv(
    'default_submission_wk02-3.csv',
    header=True,
    index=False,
)