# Model Submission Notebook - Home Credit Default Risk

### initialization - load packages and data

In [None]:
import pandas as pd
import joblib
import numpy as np
import gc

# Root folder of the competition data. The former "../input/../input/..." form
# contained a redundant round trip through "../input"; this is the same
# location, spelled the way the sample-submission path in this notebook spells it.
MainDir = "../input/home-credit-default-risk"

# Applications to score; every feature engineered below is added to this frame.
test = pd.read_csv(f'{MainDir}/application_test.csv')

### Load Model

In [None]:
# Restore the fitted preprocessing object and the fitted model from disk
# (preprocessor first, then model).
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts
# from a trusted source.
preprocessor, model = (
    joblib.load('../input/defaultdata08/default_preprocessor_08.joblib'),
    joblib.load('../input/defaultdata08/default_model_08.joblib'),
)
print(type(model))   # quick check of what kind of estimator was restored

### Preprocessing

In [None]:
# --- Bureau features ---------------------------------------------------------
# Aggregate bureau.csv (enriched with bureau_balance status counts) to one row
# per SK_ID_CURR and left-join the aggregates onto the test applications.
bureau = pd.read_csv(f'{MainDir}/bureau.csv')
print(bureau.shape, "- shape of bureau table")

# Count how often each STATUS value occurs per SK_ID_BUREAU
# (raw counts rather than row proportions).
bureau_balance = pd.read_csv(f'{MainDir}/bureau_balance.csv')
bb_status = pd.crosstab(bureau_balance.SK_ID_BUREAU, bureau_balance.STATUS)
bb_status.columns = ['BB_' + column for column in bb_status.columns]   # BB_ prefix identifies the source table

# merge() defaults to an inner join, so bureau rows without any
# bureau_balance history are dropped here.
bureau = bureau.merge(bb_status, left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
bureau = bureau.drop(['SK_ID_BUREAU'], axis=1)                         # key is no longer needed
print(bureau.shape, "- shape of bureau table after merge")

# BU_ prefix on everything except the join key, so the columns can be recognised later.
bureau.columns = ['BU_' + column if column != 'SK_ID_CURR' else column for column in bureau.columns]

# Numeric features: per-customer means. numeric_only=True is explicit because
# bureau still holds object columns; pandas >= 2.0 raises on them instead of
# silently skipping them.
bureau_num = bureau.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()
print(bureau_num.shape, "- shape of numeric features (incl index)")

# Categorical features: one-hot encode the object columns, then take the
# per-customer mean of each indicator (i.e. the share of loans in each category).
bureau_cat = pd.get_dummies(bureau.select_dtypes('object'))   # select_dtypes drops SK_ID_CURR ...
bureau_cat['SK_ID_CURR'] = bureau['SK_ID_CURR']               # ... so put it back before grouping
bureau_cat = bureau_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()
print(bureau_cat.shape, "- shape of categorical features (incl index)")

# Number of (non-null) bureau records per customer.
# NOTE(review): this code previously called rename(columns={'BU_CREDIT_ACTIVE':
# 'COUNT_of_BUREAU'}) but discarded the result, so the count has always been
# merged under the name 'BU_CREDIT_ACTIVE'. The name is kept unchanged because
# the fitted preprocessor was presumably trained on that column name - confirm
# against the training notebook before renaming.
bureau_count = bureau.groupby(by=['SK_ID_CURR'])['BU_CREDIT_ACTIVE'].count().reset_index()

# Left-join the aggregates onto the test applications. Customers with no bureau
# history get NaNs, which are left for the imputer.
test = test.merge(bureau_num, on='SK_ID_CURR', how='left')     # numeric features
test = test.merge(bureau_cat, on='SK_ID_CURR', how='left')     # categorical features
test = test.merge(bureau_count, on='SK_ID_CURR', how='left')   # count feature
print(test.shape, "- shape of training data after merges")

# Release the intermediate frames. Binding the names to a variable called
# `list` and deleting that variable freed nothing and shadowed the builtin.
# bureau_cat is still referenced further down in this notebook, so it stays alive.
del bureau, bureau_num, bureau_count, bureau_balance, bb_status
gc.collect()

# ---------

# --- previous_application + POS_CASH_balance ---------------------------------
# POS_CASH_balance, installments_payments and credit_card_balance are each
# aggregated per SK_ID_PREV and attached to previous_application.
previous = pd.read_csv(f'{MainDir}/previous_application.csv')
print(previous.shape, "- shape of previous_application")
pos = pd.read_csv(f'{MainDir}/POS_CASH_balance.csv')

# PO_ prefix on everything except the join key, so the columns can be recognised later.
pos.columns = ['PO_' + column if column != 'SK_ID_PREV' else column for column in pos.columns]

# Numeric features: mean per previous loan. numeric_only=True keeps object
# columns out of the mean (pandas >= 2.0 raises on them otherwise).
pos_num = pos.groupby(by=['SK_ID_PREV']).mean(numeric_only=True).reset_index()
print(pos_num.shape, "- shape of numeric features (incl index)")

# Categorical features: one-hot encode the object columns, then average the
# indicators per previous loan.
pos_cat = pd.get_dummies(pos.select_dtypes('object'))   # select_dtypes drops SK_ID_PREV ...
pos_cat['SK_ID_PREV'] = pos['SK_ID_PREV']               # ... so put it back before grouping
pos_cat = pos_cat.groupby(by=['SK_ID_PREV']).mean().reset_index()
print(pos_cat.shape, "- shape of categorical features (incl index)")

# Left-join both aggregates onto previous_application.
previous = previous.merge(pos_num, on='SK_ID_PREV', how='left')   # numeric features
previous = previous.merge(pos_cat, on='SK_ID_PREV', how='left')   # categorical features
print(previous.shape, "- shape of previous data after merges")

# Release the intermediates (deleting a variable named `list` freed nothing).
del pos, pos_num, pos_cat
gc.collect()

# --- installments_payments ----------------------------------------------------
inst = pd.read_csv(f'{MainDir}/installments_payments.csv')

# IP_ prefix on everything except the join key, so the columns can be recognised later.
inst.columns = ['IP_' + column if column != 'SK_ID_PREV' else column for column in inst.columns]

# Mean of every column per previous loan. The original author notes that this
# table only has numeric features, so no categorical pass is done.
inst_num = inst.groupby(by=['SK_ID_PREV']).mean().reset_index()
print(inst_num.shape, "- shape of numeric features (incl index)")

# Left-join inst_num onto the previous_application data.
previous = previous.merge(inst_num, on='SK_ID_PREV', how='left')
print(previous.shape, "- shape of previous data after merges")

# Release the intermediates (deleting a variable named `list` freed nothing).
del inst, inst_num
gc.collect()

# --- credit_card_balance ------------------------------------------------------
ccb = pd.read_csv(f'{MainDir}/credit_card_balance.csv')

# CC_ prefix on everything except the join key, so the columns can be recognised later.
ccb.columns = ['CC_' + column if column != 'SK_ID_PREV' else column for column in ccb.columns]

# Numeric features: mean per previous loan. numeric_only=True keeps object
# columns out of the mean (pandas >= 2.0 raises on them otherwise).
ccb_num = ccb.groupby(by=['SK_ID_PREV']).mean(numeric_only=True).reset_index()
print(ccb_num.shape, "- shape of numeric features (incl index)")

# Categorical features: one-hot encode the object columns, then average the
# indicators per previous loan.
ccb_cat = pd.get_dummies(ccb.select_dtypes('object'))   # select_dtypes drops SK_ID_PREV ...
ccb_cat['SK_ID_PREV'] = ccb['SK_ID_PREV']               # ... so put it back before grouping
ccb_cat = ccb_cat.groupby(by=['SK_ID_PREV']).mean().reset_index()
print(ccb_cat.shape, "- shape of categorical features (incl index)")

# Left-join both aggregates onto the previous_application data.
previous = previous.merge(ccb_num, on='SK_ID_PREV', how='left')   # numeric features
previous = previous.merge(ccb_cat, on='SK_ID_PREV', how='left')   # categorical features
print(previous.shape, "- shape of previous data after merges")

# Release the intermediates (deleting a variable named `list` freed nothing).
del ccb, ccb_num, ccb_cat
gc.collect()

# final step: aggregate previous to one row per SK_ID_CURR and merge into the main table

# PR_ prefix on everything except the join key, so the columns can be recognised later.
previous.columns = ['PR_' + column if column != 'SK_ID_CURR' else column for column in previous.columns]

# Replace the 365243 placeholder with NaN in these day-offset columns.
# Assigning the result back (instead of inplace=True on a column selection)
# also takes effect when pandas copy-on-write is enabled.
placeholder_cols = ['PR_DAYS_LAST_DUE', 'PR_DAYS_TERMINATION', 'PR_DAYS_FIRST_DRAWING']
previous[placeholder_cols] = previous[placeholder_cols].replace(365243, np.nan)

# Numeric features: per-customer means. numeric_only=True keeps the object
# columns out of the mean (pandas >= 2.0 raises on them otherwise).
previous_num = previous.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()
print(previous_num.shape, "- shape of numeric features (incl index)")

# NOTE(review): the "categorical features" merged here are built from
# bureau_cat, not from previous. This looks like a copy/paste slip: the
# dummies that used to be computed from previous were immediately overwritten
# by this line, so they never reached the model, and that dead computation has
# been removed. Because test already carries the bureau_cat columns, the merge
# below yields duplicated bureau columns with _x/_y suffixes. The behavior is
# kept on purpose: the fitted preprocessor was presumably trained on exactly
# these columns, so switching to real previous_application dummies needs a
# retrain - confirm against the training notebook.
previous_cat = bureau_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()
print(previous_cat.shape, "- shape of categorical features (incl index)")

# Left-join the aggregates onto the test applications. Customers without
# previous applications get NaNs, which are left for the imputer.
test = test.merge(previous_num, on='SK_ID_CURR', how='left')   # numeric features
test = test.merge(previous_cat, on='SK_ID_CURR', how='left')   # categorical features
print(test.shape, "- shape of training data after merges")

# Release the intermediates (deleting a variable named `list` freed nothing);
# bureau_cat has no further use below either.
del previous, previous_num, previous_cat, bureau_cat
gc.collect()

# -------------

# --- Application-level features ------------------------------------------------

# DAYS_EMPLOYED uses 365243 as a placeholder value; turn it into NaN and let the
# imputer deal with it. The result is assigned back rather than using
# inplace=True on a column selection, which is ineffective under pandas
# copy-on-write.
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace(365243, np.nan)

# ratio features
test['CI_ratio'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']    # credit-to-income ratio
test['AI_ratio'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']   # annuity-to-income ratio
test['AC_ratio'] = test['AMT_CREDIT'] / test['AMT_ANNUITY']         # credit-to-annuity ratio (rough proxy for the loan term)
test['CG_ratio'] = test['AMT_CREDIT'] / test['AMT_GOODS_PRICE']     # credit-to-goods-price ratio - how much was financed?

# log features
test['log_INCOME'] = np.log(test['AMT_INCOME_TOTAL'])    # log of income
test['log_ANNUITY'] = np.log(test['AMT_ANNUITY'])        # log of annuity
test['log_CREDIT'] = np.log(test['AMT_CREDIT'])          # log of credit
test['log_GOODS'] = np.log(test['AMT_GOODS_PRICE'])      # log of goods price

# The three external scores are selected BY NAME. The former positional slice
# iloc[:, 41:44] only hits these columns for one specific column layout and
# silently picks up neighbouring columns when the layout differs (for example
# when the frame has no target column, as is presumably the case for test).
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
ext_missing = test[ext_cols].isnull().sum(axis=1)        # number of missing scores per row

# flag features
test['MissingBureau'] = ext_missing.astype("category")                  # number of external scores that are missing
test['FLAG_CG_ratio'] = test['AMT_CREDIT'] > test['AMT_GOODS_PRICE']    # borrowed more than the price of the item
test['DAYS_ID_4200'] = test['DAYS_ID_PUBLISH'] < -4200                  # ID published more than 4200 days (~11.5 years) earlier

# EXT_SOURCE_x variables are very important - don't leave their missing values
# to the imputer. Missing scores are filled row-wise with the average of the
# scores that are present; rows with no score at all get 0.2 for now.
test['AVG_EXT'] = test[ext_cols].sum(axis=1) / (len(ext_cols) - ext_missing)   # sum() skips NaNs
test['AVG_EXT'] = test['AVG_EXT'].replace(np.nan, 0.2)                         # 0/0 when all scores are missing

for ext_col in ext_cols:
    test[ext_col] = test[ext_col].fillna(test['AVG_EXT'])

# NOTE(review): this code used to call test.drop(['AVG_EXT'], axis=1) and
# test.drop(['ORGANIZATION_TYPE'], axis=1) here but discarded the results, so
# both columns have always been passed on to the preprocessor. Those no-op
# calls are removed and the columns are deliberately kept: the fitted
# preprocessor presumably expects them, and dropping a column it was fitted on
# would make transform() fail. Confirm against the training notebook before
# really dropping either column.

# Variables with poor feature significance (< 0.0001) - recorded for reference, NOT dropped:
#train.drop(['REG_REGION_NOT_LIVE_REGION','AMT_REQ_CREDIT_BUREAU_WEEK','HOUSETYPE_MODE','OCCUPATION_TYPE','FLAG_MOBIL','FLAG_CONT_MOBILE',
#           'NAME_TYPE_SUITE', 'FLAG_DOCUMENT_4','ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_16',
#           'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11','FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'AMT_REQ_CREDIT_BUREAU_DAY',
#           'AMT_REQ_CREDIT_BUREAU_HOUR', 'FLAG_DOCUMENT_21','FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_17','FLAG_DOCUMENT_2'],
#           axis=1, inplace=True)

# Ratio features built from the merged aggregates. Undefined results
# (NaN from missing inputs, +/-inf from division by zero) are set to 0.
aggregate_ratios = [
    ('OD_ratio', 'BU_AMT_CREDIT_SUM_OVERDUE', 'BU_AMT_CREDIT_SUM_DEBT'),   # proportion of debt that is overdue
    ('Credit_ratio', 'BU_AMT_CREDIT_SUM', 'BU_AMT_CREDIT_SUM_LIMIT'),      # credit sum relative to the credit limit
    ('Debt_ratio', 'BU_AMT_CREDIT_SUM_DEBT', 'BU_AMT_CREDIT_SUM'),         # debt percentage
    ('PR_term', 'PR_IP_AMT_PAYMENT', 'PR_IP_AMT_INSTALMENT'),              # payment-to-instalment ratio
]
for ratio_name, numerator, denominator in aggregate_ratios:
    ratio = test[numerator] / test[denominator]
    test[ratio_name] = ratio.replace([np.nan, np.inf, -np.inf], 0)


# Run the engineered frame through the fitted preprocessor.
X_test = preprocessor.transform(test)
print(X_test.shape)

### Test Predictions

In [None]:
# Score the preprocessed test matrix; predict_proba returns one column per class.
test_pred = model.predict_proba(X_test)

# Show the overall shape first, then the first five rows as a sanity check.
for preview in (test_pred.shape, test_pred[:5]):
    print(preview)

### Submission

In [None]:
# Start from the sample submission so the file layout is the expected one.
submission = pd.read_csv('../input/home-credit-default-risk/sample_submission.csv')

# We need the probability of default: column [1] of test_pred.
# NOTE(review): assumes index 1 is the "default" class - confirm via model.classes_.
# Bracket assignment is used on purpose: attribute assignment would silently
# set a plain Python attribute (not a column) if TARGET were ever absent.
# Rows are matched by position, so the sample file must list applications in
# the same order as the test frame.
submission['TARGET'] = test_pred[:, 1]   # replace the placeholder values with our predictions

submission.to_csv('default_submission_08.csv', index=False, header=True)