In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb 

from sklearn.metrics import accuracy_score

import joblib   # save and load ML models
import gc       # garbage collection



In [None]:
# Read the main application table and use the loan id as the row index.
train = (
    pd.read_csv('../input/home-credit-default-risk/application_train.csv')
    .set_index(['SK_ID_CURR'])
)
train.shape

In [None]:
# Columns that should be handled as categorical features rather than numbers.
cat_cols = [
    'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
    'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
    'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
    'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
    'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
    'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
]

# Cast each listed column to the pandas 'category' dtype.
train = train.astype({column: 'category' for column in cat_cols})

In [None]:
# Print the per-column dtype / non-null summary; max_cols is raised so the
# column listing is not truncated for this wide table.
train.info(max_cols = 125)

## Combining additional tables and performing feature engineering to improve model performance

#### Merge Bureau and Bureau_Balance with Training Data

In [None]:
# Read both credit-bureau tables into memory.
bureau_bal = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')

# Attach the balance rows to their bureau record. A left join keeps bureau
# records that have no balance rows.
bb = bureau.merge(bureau_bal, how='left', on='SK_ID_BUREAU')

In [None]:
#feature engineering
bb['REMAIN_CRED'] = bb['AMT_CREDIT_SUM'] - bb['AMT_CREDIT_SUM_DEBT'] - bb['AMT_CREDIT_SUM_LIMIT']
bb['AC_RATIO'] = bb['AMT_ANNUITY'] / bb['AMT_CREDIT_SUM']

#add prefix to bureau columns (the SK_ID_CURR join key keeps its name)
bb.columns = ['BU_' + column if column != 'SK_ID_CURR'
              else column for column in bb.columns]

#group categorical features in bureau: one-hot encode the text columns, then
#average per client so each value is the share of rows with that level
bur_cat = pd.get_dummies(bb.select_dtypes('object'))
bur_cat['SK_ID_CURR'] = bb['SK_ID_CURR']
bur_cat = bur_cat.groupby(by = ['SK_ID_CURR']).agg(['mean'])

#group numerical features
#only numeric columns are aggregated: mean/sum are undefined for text columns
#and current pandas raises instead of silently skipping them
bur_num = (bb.select_dtypes('number')
             .groupby(by = ['SK_ID_CURR'])
             .agg(['max', 'mean', 'sum'])
             .astype('float32'))

# merge cat and num columns
bureau_rev = bur_cat.merge(bur_num, on = ['SK_ID_CURR'], how = 'left')

#agg() with a list of functions yields two-level (column, statistic) labels;
#flatten them to 'COLUMN_stat' so the frame can be merged into the
#single-level train frame
bureau_rev.columns = ['_'.join(label) for label in bureau_rev.columns]

#merge bureau_rev and train
train = train.merge(bureau_rev, on = ['SK_ID_CURR'], how = 'left')

#remove unneeded datasets from memory, including the large joined table and
#the aggregated frame that has already been merged into train
del bur_cat, bur_num, bureau_rev
del bureau, bureau_bal, bb

gc.collect()

#### Merge Credit_Card_Balance with Training Dataset

In [None]:
#load data into memory
cc_bal = pd.read_csv('../input/home-credit-default-risk/credit_card_balance.csv')

#feature engineering
#zero denominators produce +/-inf here; those are turned into NaN later in the
#notebook, before preprocessing
cc_bal['DRAW_RATIO'] = cc_bal['AMT_DRAWINGS_CURRENT'] / cc_bal['CNT_DRAWINGS_CURRENT']
cc_bal['RECEIVE_RATIO'] = cc_bal['AMT_RECIVABLE'] / cc_bal['AMT_RECEIVABLE_PRINCIPAL']
cc_bal['RECEIVE_PER'] = cc_bal['AMT_RECIVABLE'] / cc_bal['AMT_TOTAL_RECEIVABLE']


#create prefix for columns (the SK_ID_CURR join key keeps its name)
cc_bal.columns = ['CC_' + column if column != 'SK_ID_CURR'
                  else column for column in cc_bal.columns]

#group categorical features by SK_ID_CURR: per-client share of each level
cc_cat = pd.get_dummies(cc_bal.select_dtypes('object'))
cc_cat['SK_ID_CURR'] = cc_bal['SK_ID_CURR']
cc_cat = cc_cat.groupby(by = ['SK_ID_CURR']).mean()

#group numerical features in credit card balance by SK_ID_CURR
#text columns are excluded because mean/sum are undefined for them and
#current pandas raises instead of silently skipping them
cc_num = (cc_bal.select_dtypes('number')
                .groupby(by = ['SK_ID_CURR'])
                .agg(['max', 'mean', 'sum'])
                .astype('float32'))

#flatten the two-level (column, statistic) labels to 'COLUMN_stat' so the
#frame can be merged into the single-level train frame
cc_num.columns = ['_'.join(label) for label in cc_num.columns]

train = train.merge(cc_cat, on = ['SK_ID_CURR'], how = 'left')
train = train.merge(cc_num, on = ['SK_ID_CURR'], how = 'left')

del cc_bal, cc_cat, cc_num
gc.collect()


#### Merge Installments with Training Dataset

In [None]:
#load installments_payments into memory
install = pd.read_csv('../input/home-credit-default-risk/installments_payments.csv')

#feature engineering
install['PAY_PERCENT'] = install['AMT_INSTALMENT'] / install['AMT_PAYMENT']
install['PAY_DIFF'] = install['AMT_INSTALMENT'] - install['AMT_PAYMENT']

#DPD: positive part of (payment day - due day), otherwise 0
#vectorised where() replaces a per-row Python lambda; rows where the
#difference is missing still become 0, exactly as before
days_late = install['DAYS_ENTRY_PAYMENT'] - install['DAYS_INSTALMENT']
install['DPD'] = days_late.where(days_late > 0, 0)

#DBD: positive part of (due day - payment day), otherwise 0
days_early = install['DAYS_INSTALMENT'] - install['DAYS_ENTRY_PAYMENT']
install['DBD'] = days_early.where(days_early > 0, 0)

#create prefix (the SK_ID_CURR join key keeps its name)
install.columns = ['IP_' + column if column != 'SK_ID_CURR'
                   else column for column in install.columns]


#group numeric features (no cat features in install)
inst_num = install.groupby(by = ['SK_ID_CURR']).agg(['max', 'mean']).astype('float32')

#flatten the two-level (column, statistic) labels to 'COLUMN_stat' so the
#frame can be merged into the single-level train frame
inst_num.columns = ['_'.join(label) for label in inst_num.columns]

#merge the aggregated installment features into train
train = train.merge(inst_num, on = 'SK_ID_CURR', how='left')

del install, days_late, days_early
del inst_num
gc.collect()

#### Merge POS_CASH into Training data

In [None]:
#load POS_CASH into memory
pos = pd.read_csv('../input/home-credit-default-risk/POS_CASH_balance.csv')

#create prefix (the SK_ID_CURR join key keeps its name)
pos.columns = ['PC_' + column if column != 'SK_ID_CURR'
               else column for column in pos.columns]

#group numeric features of POS_CASH by SK_ID_CURR
#the aggregation is restricted to numeric columns so that any text column in
#the table cannot break mean/sum; such columns are not used as features here
pos_num = (pos.select_dtypes('number')
              .groupby(by = ['SK_ID_CURR'])
              .agg(['max', 'mean', 'sum'])
              .astype('float32'))

#flatten the two-level (column, statistic) labels to 'COLUMN_stat' so the
#frame can be merged into the single-level train frame
pos_num.columns = ['_'.join(label) for label in pos_num.columns]

train = train.merge(pos_num, on = ['SK_ID_CURR'], how = 'left')

del pos
del pos_num
gc.collect()


#### Merge Previous_Application with Training Dataset

In [None]:
#load data
prev = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')

#feature engineering
#the value 365243 in the day-offset columns is replaced with NaN
#the result is assigned back to the frame: calling replace(inplace=True) on a
#selected column is a chained operation that does not reliably modify `prev`
prev_day_cols = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
                 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
                 'DAYS_TERMINATION']
prev[prev_day_cols] = prev[prev_day_cols].replace(365243, np.nan)

#ratios between the amounts of a previous application; +1 in the denominator
#avoids division by zero
prev['AppCred_RATIO'] = prev['AMT_APPLICATION'] / (prev['AMT_CREDIT'] + 1)
prev['AppGoods_RATIO'] = prev['AMT_APPLICATION'] / (prev['AMT_GOODS_PRICE'] + 1)
prev['AnnCred_RATIO'] = prev['AMT_ANNUITY'] / (prev['AMT_CREDIT'] + 1)
prev['CredGoods_RATIO'] = prev['AMT_CREDIT'] / (prev['AMT_GOODS_PRICE'] + 1)


#calculate APR and add it as a feature
def calc_rate(row):
    """Estimate the per-period interest rate implied by a loan's terms.

    Solves ``credit * (1 + r)**n - annuity * ((1 + r)**n - 1) / r = 0`` for
    ``r`` with Newton's method, where ``n`` is ``CNT_PAYMENT``, ``annuity`` is
    ``AMT_ANNUITY`` and ``credit`` is ``AMT_CREDIT`` (future value 0, payments
    at the end of each period). The iteration starts at 0.05, runs at most 10
    steps and stops once two successive estimates differ by less than 1e-6.
    This is the same scheme as the former ``np.rate`` function, which is no
    longer part of NumPy.

    Parameters
    ----------
    row : mapping, pandas.Series or pandas.DataFrame
        Must provide 'CNT_PAYMENT', 'AMT_ANNUITY' and 'AMT_CREDIT'. A single
        row (scalars) or a whole frame (columns) are both accepted.

    Returns
    -------
    float or numpy.ndarray
        A float for scalar inputs, otherwise an array with one rate per row.
        Entries that did not converge (including rows with missing inputs)
        are NaN.
    """
    nper = np.asarray(row['CNT_PAYMENT'], dtype=float)
    pmt = -np.asarray(row['AMT_ANNUITY'], dtype=float)   # payments are outflows
    pv = np.asarray(row['AMT_CREDIT'], dtype=float)

    rate = np.full(np.broadcast(nper, pmt, pv).shape, 0.05)
    converged = np.zeros(rate.shape, dtype=bool)

    # Invalid rows (NaN inputs, zero rates, ...) simply end up as NaN, so the
    # floating-point warnings they trigger carry no extra information.
    with np.errstate(all='ignore'):
        for _ in range(10):
            growth = (rate + 1.0) ** nper
            growth_prev = (rate + 1.0) ** (nper - 1.0)
            annuity_term = pmt * (growth - 1.0) / rate

            g = growth * pv + annuity_term
            g_prime = (nper * growth_prev * pv
                       - annuity_term / rate
                       + nper * pmt * growth_prev / rate)

            candidate = rate - g / g_prime
            close = np.abs(candidate - rate) < 1e-6

            # Entries that already converged keep their value; the others
            # take the Newton step (and become converged if it was small).
            rate = np.where(converged, rate, candidate)
            converged = converged | close
            if converged.all():
                break

    result = np.where(converged, rate, np.nan)
    return result if result.ndim else float(result)

prev['CALC_RATE'] = prev.apply(calc_rate, axis=1)


#Remove unnecessary features
p_dels = ['RATE_INTEREST_PRIMARY','RATE_INTEREST_PRIVILEGED']
#pass the labels themselves; building prev[p_dels] only to read its column
#names back is unnecessary
prev = prev.drop(columns = p_dels)

#create prefix (the SK_ID_CURR join key keeps its name)
prev.columns = ['PR_' + column if column != 'SK_ID_CURR'
                else column for column in prev.columns]

#group categorical features in previous_application: per-client share of
#each level
prev_cat = pd.get_dummies(prev.select_dtypes('object'))
prev_cat['SK_ID_CURR'] = prev['SK_ID_CURR']
prev_cat = prev_cat.groupby(by = ['SK_ID_CURR']).agg(['mean'])

#group numeric features
#text columns are excluded because mean/sum are undefined for them and
#current pandas raises instead of silently skipping them
prev_num = (prev.select_dtypes('number')
                .groupby(by = ['SK_ID_CURR'])
                .agg(['max', 'mean', 'sum'])
                .astype('float32'))

#combine previous_application categorical and numeric features
prev_rev = prev_num.merge(prev_cat, on = ['SK_ID_CURR'], how = 'left')

#flatten the two-level (column, statistic) labels to 'COLUMN_stat' so the
#frame can be merged into the single-level train frame
prev_rev.columns = ['_'.join(label) for label in prev_rev.columns]

#merge revised previous_application features into training dataset
train = train.merge(prev_rev, on = ['SK_ID_CURR'], how = 'left')

#free the aggregates and the raw table, which is no longer needed
del prev_rev, prev_cat, prev_num
del prev
gc.collect()

In [None]:
#replace 365243 in days employed with nan
#assign the result back: replace(inplace=True) on a selected column is a
#chained operation that does not reliably modify `train`
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace(365243, np.nan)

#set max income to 2.5 million
#copy() makes the filtered frame independent so the column assignments below
#do not trigger SettingWithCopyWarning
train = train[train['AMT_INCOME_TOTAL'] < 2500000].copy()

#convert age to years
train['AGE'] = train['DAYS_BIRTH'] / - 365

#create avg of each row of EXT_SOURCE values
#the columns are selected by name: a positional slice depends on the exact
#column layout and shifts once SK_ID_CURR has been moved into the index
#mean(axis=1) skips missing values, i.e. sum / (number of non-null sources),
#and is NaN when all three are missing
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
train['AVG_EXT'] = train[ext_cols].mean(axis=1)

#fill each missing source with the row average of the available ones
for ext_col in ext_cols:
    train[ext_col] = train[ext_col].fillna(train['AVG_EXT'])



#### Added features to improve predictive power

* The percentage of days employed - Does length of employment predict ability to keep paying off a loan?
* Available credit to income ratio - Does the amount of credit available as a percentage of income predict ability to pay off a loan?
* Annuity to income ratio - Does the size of the loan annuity (the regular repayment) relative to income predict ability to pay off a loan?
* Annuity as a percentage of available credit -  Does annuity compared to credit availability predict ability to pay off a loan?
* Cost of goods to credit ratio - Does how much was financed vs how much was paid for goods predict ability to pay off the loan?

In [None]:
# DAYS_EMPLOYED relative to AGE
train['EmpAge_RATIO'] = train['DAYS_EMPLOYED'].div(train['AGE'])

# credit amount relative to total income
train['CredInc_RATIO'] = train['AMT_CREDIT'].div(train['AMT_INCOME_TOTAL'])

# annuity relative to total income
train['AnnInc_RATIO'] = train['AMT_ANNUITY'].div(train['AMT_INCOME_TOTAL'])

# annuity relative to credit amount (+1 keeps the denominator non-zero)
train['AnnCred_RATIO'] = train['AMT_ANNUITY'].div(train['AMT_CREDIT'].add(1))

# credit amount relative to goods price (+1 keeps the denominator non-zero)
train['CredGoods_RATIO'] = train['AMT_CREDIT'].div(train['AMT_GOODS_PRICE'].add(1))


# income and goods price weighted by the averaged external score
train['AVG_EXT_INCOME'] = train['AMT_INCOME_TOTAL'].mul(train['AVG_EXT'])
train['AVG_EXT_GOODS'] = train['AMT_GOODS_PRICE'].mul(train['AVG_EXT'])


In [None]:
# Columns removed before modelling.
dels = ['APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 
        'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE', 
        'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 
        'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 
        'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 
        'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
        'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 
        'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 
        'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE', 
        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 
        'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
        'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 
        'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 
        'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 
        'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 
        'FLAG_DOCUMENT_21', 'DAYS_BIRTH', 'LIVINGAPARTMENTS_AVG', 
        'LIVINGAREA_AVG', 'CNT_FAM_MEMBERS',  'OBS_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', 'AVG_EXT']


# Drop by label. Passing train[dels] would first build a 61-column copy of
# the frame only to have its column names read back as the labels.
train = train.drop(columns = dels)
gc.collect()


## Build pipelines

In [None]:
# Everything except the label column.
train_noTARGET = train.drop(columns='TARGET')

# Numeric features come from the label-free frame so TARGET is never a feature;
# categorical features are the object- and category-typed columns.
num_feat = list(train_noTARGET.select_dtypes(include=np.number).columns)
cat_feat = list(train.select_dtypes(include=['object', 'category']).columns)

features = [*num_feat, *cat_feat]

print(features)

In [None]:
# Treat infinite values (e.g. from ratios with a zero denominator) as missing.
train = train.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [None]:
# Numeric features: fill gaps with the column median, then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


# Categorical features: fill gaps with the literal 'missing', then one-hot
# encode; levels not seen during fit are ignored at transform time.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route the numeric and categorical columns through their own pipeline and
# concatenate the results.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_feat),
    ('cat', cat_pipe, cat_feat),
])

In [None]:
# Learn the imputation values, scaling statistics and category levels from
# the selected feature columns of train.
preprocessor.fit(train.loc[:, features])

In [None]:
# Reproducible 10% sample of the rows, drawn without replacement.
train30k = train.sample(frac=0.1, random_state=1)

In [None]:
# Features: the sampled rows passed through the already-fitted preprocessor.
X_train = preprocessor.transform(train30k[features])

# Labels: TARGET of the same sampled rows, as a NumPy array.
y_train = train30k['TARGET'].to_numpy()

In [None]:
# Report the dimensions of the design matrix and of the label vector.
for label, array in (('Shape of features: ', X_train), ('Shape of target: ', y_train)):
    print(label, array.shape)

In [None]:
%%time

# Base LightGBM classifier; the grid below varies max_depth and learning_rate.
# verbose=-1 already silences LightGBM's logging, so the separate `silent`
# argument (not accepted by recent LightGBM releases) is not passed.
LGBM_clf = lgb.LGBMClassifier(boosting_type = 'gbdt', objective = 'binary',
                              n_estimators = 5000, num_leaves = 35,
                              subsample = 0.87, colsample_bytree = 0.94,
                              verbose = -1)

# range(1, 3, 8) means start=1, stop=3, step=8 and therefore contains only 1,
# so max_depth was never actually searched. The three candidate depths are
# listed explicitly instead. Note that this triples the number of fits.
LGBM_parameters = {
    'max_depth': [1, 3, 8],
    'learning_rate': [0.01, 0.02, 0.03],
    'metric' : ['auc']
}


# 10-fold cross-validated search scored by ROC AUC; with the default
# refit=True the best configuration is retrained on all of X_train.
LGBM_grid = GridSearchCV(LGBM_clf, LGBM_parameters, cv=10, n_jobs=10,
                         verbose=True, scoring= 'roc_auc')
LGBM_grid.fit(X_train, y_train)


LGBM_model = LGBM_grid.best_estimator_

print('Best Parameters:', LGBM_grid.best_params_)
print('Best CV Score:  ', LGBM_grid.best_score_)
# score() of a classifier is plain accuracy on the data it is given
print('Training Acc:   ', LGBM_model.score(X_train, y_train))

In [None]:
#save the pipeline and best model to a file
joblib.dump(preprocessor, 'wk6default_preprocessor.joblib')

# The grid search has already retrained its best configuration on all of
# X_train (refit=True is the GridSearchCV default). Persist that estimator
# directly instead of training a second model from hand-copied
# hyperparameters, which costs another full fit and can drift from
# LGBM_grid.best_params_.
LGBM_model = LGBM_grid.best_estimator_
joblib.dump(LGBM_model, 'wk6_LGBM_default_model.joblib')