In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier
import lightgbm as lgb 

from sklearn.metrics import accuracy_score

import joblib   # save and load ML models
import gc       # garbage collection



In [None]:
# Read the main application table and index it by the applicant id.
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
train = train.set_index('SK_ID_CURR')
train.shape

### Merge additional tables and perform feature engineering to improve model performance

#### Merge Bureau and Bureau_Balance with Training Data

In [None]:
# Load the bureau table and the per-loan balance history table.
bureau = pd.read_csv('../input/home-credit-default-risk/bureau.csv')
bureau_bal = pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv')

# Count how often each STATUS value occurs for every bureau loan:
# one row per SK_ID_BUREAU, one column per STATUS value.
bb_status = pd.crosstab(bureau_bal['SK_ID_BUREAU'], bureau_bal['STATUS'])

# add prefix to bureau balance columns
bb_status.columns = ['BB_' + column for column in bb_status.columns]

# Attach the status counts to the bureau loans.
# how='left' keeps bureau loans that have no rows in bureau_balance (their
# BB_* columns become NaN). The default inner join would silently discard
# them, which is inconsistent with the left joins used for every other table
# and would undercount the per-applicant loan count derived from this frame.
bureau = bureau.merge(bb_status, on='SK_ID_BUREAU', how='left')

# The loan-level key is no longer needed once the balance data is attached.
bureau = bureau.drop(['SK_ID_BUREAU'], axis=1)

In [None]:
# add prefix to bureau columns (the join key SK_ID_CURR keeps its name)
bureau.columns = ['BU_' + column if column != 'SK_ID_CURR'
                  else column for column in bureau.columns]

# Numeric features: per-applicant mean of every numeric bureau column.
# numeric_only=True skips the object columns explicitly; recent pandas
# versions raise a TypeError instead of dropping them silently.
bur_num = bureau.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()

# Categorical features: one-hot encode the object columns, then average the
# indicators per applicant (i.e. the share of loans in each category).
bur_cat = pd.get_dummies(bureau.select_dtypes('object'))
bur_cat['SK_ID_CURR'] = bureau['SK_ID_CURR']
bur_cat = bur_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()

# Number of past loans per applicant.
bur_count = bureau.groupby(by=['SK_ID_CURR'])['BU_CREDIT_ACTIVE'].count().reset_index()
# rename() returns a new frame; the result must be assigned, otherwise the
# count column would keep the misleading name BU_CREDIT_ACTIVE.
bur_count = bur_count.rename(columns={'BU_CREDIT_ACTIVE': 'COUNT_LOANS'})

# merge num, cat and count into train data
train = train.merge(bur_num, on=['SK_ID_CURR'], how='left')
train = train.merge(bur_cat, on=['SK_ID_CURR'], how='left')
train = train.merge(bur_count, on=['SK_ID_CURR'], how='left')

# Free the large intermediates before loading the next table.
del bureau
del bureau_bal
del bur_cat
del bur_num
del bur_count
gc.collect()

train.shape

#### Merge Installments_Payments with Training Dataset

In [None]:
# Read the instalment payment history.
install = pd.read_csv('../input/home-credit-default-risk/installments_payments.csv')

# Prefix every column except the applicant key with IP_.
install.columns = [name if name == 'SK_ID_CURR' else 'IP_' + name
                   for name in install.columns]

# Average every column per applicant (the original notes this table has no
# categorical columns, so a plain mean covers everything).
inst_num = install.groupby('SK_ID_CURR', as_index=False).mean()

# Attach the aggregated instalment features to the training data.
train = train.merge(inst_num, how='left', on='SK_ID_CURR')

# Release the intermediates.
del install, inst_num
gc.collect()

train.shape

#### Merge Previous_Application and POS_CASH with Training Dataset

In [None]:
# load previous_application and pos_cash_balance data into memory
prev = pd.read_csv('../input/home-credit-default-risk/previous_application.csv')
pos = pd.read_csv('../input/home-credit-default-risk/POS_CASH_balance.csv')

# add prefix to pos_cash_balance (the join key SK_ID_PREV keeps its name)
pos.columns = ['PO_' + column if column != 'SK_ID_PREV' else column for column in pos.columns]

# Numeric features: mean of each numeric POS column per previous application.
# numeric_only=True skips object columns explicitly; recent pandas versions
# raise a TypeError instead of dropping them silently.
pos_num = pos.groupby(by=['SK_ID_PREV']).mean(numeric_only=True).reset_index()

# Categorical features: one-hot encode the object columns, then average the
# indicators per previous application.
pos_cat = pd.get_dummies(pos.select_dtypes('object'))
pos_cat['SK_ID_PREV'] = pos['SK_ID_PREV']
pos_cat = pos_cat.groupby(by=['SK_ID_PREV']).mean().reset_index()

# merge pos_num and pos_cat with prev
prev = prev.merge(pos_num, on='SK_ID_PREV', how='left')
prev = prev.merge(pos_cat, on='SK_ID_PREV', how='left')

# Free the POS intermediates; prev is aggregated in the next cell.
del pos
del pos_num
del pos_cat
gc.collect()

In [None]:
# Prefix every column except the applicant key with PR_.
prev.columns = ['PR_' + column if column != 'SK_ID_CURR'
                else column for column in prev.columns]

# Numeric features of prev, averaged per applicant (SK_ID_CURR).
# numeric_only=True skips object columns explicitly; recent pandas versions
# raise a TypeError instead of dropping them silently.
prev_num = prev.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()

# Categorical features: one-hot encode the object columns, then average the
# indicators per applicant.
prev_cat = pd.get_dummies(prev.select_dtypes('object'))
prev_cat['SK_ID_CURR'] = prev['SK_ID_CURR']
prev_cat = prev_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()

# Attach the numeric and categorical previous-application features to train.
train = train.merge(prev_num, on='SK_ID_CURR', how='left')
train = train.merge(prev_cat, on='SK_ID_CURR', how='left')

# Free the intermediates.
del prev_cat
del prev_num
del prev
gc.collect()

train.shape

#### Merge Credit_Card_Balance with Training Dataset

In [None]:
# load credit_card_balance into memory
cc_bal = pd.read_csv('../input/home-credit-default-risk/credit_card_balance.csv')

# Prefix every column except the applicant key with CC_.
cc_bal.columns = ['CC_' + column if column != 'SK_ID_CURR'
                  else column for column in cc_bal.columns]

# Numeric features: per-applicant mean of every numeric column.
# numeric_only=True skips object columns explicitly; recent pandas versions
# raise a TypeError instead of dropping them silently.
cc_num = cc_bal.groupby(by=['SK_ID_CURR']).mean(numeric_only=True).reset_index()

# Categorical features: one-hot encode the object columns, then average the
# indicators per applicant.
cc_cat = pd.get_dummies(cc_bal.select_dtypes('object'))
cc_cat['SK_ID_CURR'] = cc_bal['SK_ID_CURR']
# Aggregate the dummy frame itself. Grouping cc_bal here would discard the
# dummies and duplicate the numeric features under _x/_y suffixes after the
# two merges below.
cc_cat = cc_cat.groupby(by=['SK_ID_CURR']).mean().reset_index()

# merge cc_cat and cc_num into train
train = train.merge(cc_cat, on='SK_ID_CURR', how='left')
train = train.merge(cc_num, on='SK_ID_CURR', how='left')

# Free the intermediates.
del cc_bal
del cc_cat
del cc_num
gc.collect()

train.shape

#### Training Data Feature Engineering and Additional Data Cleaning

#### Added features to improve predictive power

* The percentage of days employed - Does length of employment predict ability to keep paying off a loan?
* Available credit to income ratio - Does the amount of credit available as a percentage of income predict ability to pay off a loan?
* Annuity to income ratio - Does the loan annuity as a share of income predict ability to pay off a loan?
* Credit to annuity ratio - Does the amount of credit compared to the annuity predict ability to pay off a loan?
* Credit to cost of goods ratio - Does how much was financed vs how much was paid for goods predict ability to pay off the loan?

In [None]:
# Print the column summary of the merged frame; max_cols is raised to 400 so
# that wide frames still get the per-column listing instead of a truncated summary.
train.info(max_cols = 400)

In [None]:
# Fill missing EXT_SOURCE_x scores with the average of the scores that are
# present in the same row. The columns are selected by name rather than by
# position so the step does not depend on the column order after the merges.
# mean(axis=1) skips NaN, so it equals sum / (3 - number of missing values);
# rows where all three scores are missing stay NaN.
ext_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
avg_ext = train[ext_cols].mean(axis=1)
for ext_col in ext_cols:
    # Assign the result back: in-place fillna on a column selection is not
    # guaranteed to modify the parent frame.
    train[ext_col] = train[ext_col].fillna(avg_ext)
# The average lives in a local Series only, so no helper column is left behind
# in train (the original never assigned the result of its drop call).

#convert categorical features to category dtype
cat_cols = ['FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
            'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
            'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
            'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
            'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY']

train[cat_cols] = train[cat_cols].astype('category')

#replace 365243 in days employed with nan
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace(365243, np.nan)

#convert age to years
train['AGE'] = train['DAYS_BIRTH'] / -365

#days employed per year of age (DAYS_EMPLOYED / DAYS_BIRTH rescaled by -365)
train['EMP_RATIO'] = train['DAYS_EMPLOYED'] / train['AGE']

#create credit/income ratio
train['CI_RATIO'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']

#create annuity to income ratio
train['AI_RATIO'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']

#create credit/annuity ratio
train['CA_RATIO'] = train['AMT_CREDIT'] / train['AMT_ANNUITY']

#create credit/cost of goods ratio feature
train['CG_RATIO'] = train['AMT_CREDIT'] / train['AMT_GOODS_PRICE']



In [None]:
# Columns removed from the training frame before modelling.
dels = ['APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
        'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE', 'ENTRANCES_MODE',
        'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE',
        'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE',
        'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
        'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI',
        'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI',
        'LIVINGAREA_MEDI', 'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI',
        'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
        'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2',
        'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
        'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
        'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
        'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
        'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
        'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
        'FLAG_DOCUMENT_21', 'DAYS_BIRTH', 'LIVINGAPARTMENTS_AVG',
        'LIVINGAREA_AVG', 'CNT_FAM_MEMBERS',  'OBS_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE', 'ELEVATORS_AVG', 'ORGANIZATION_TYPE']

# Drop by label. Passing the list directly avoids building a throw-away
# DataFrame of the columns just to read their names back; as before, a
# KeyError is raised if any listed column is missing.
train = train.drop(columns=dels)
gc.collect()


In [None]:
# Summarise the frame after feature engineering and column pruning.
train.info()

## Build pipelines

In [None]:
# Everything except the label column.
train_noTARGET = train.loc[:, train.columns != 'TARGET']

# Numeric model inputs. The earlier merges on SK_ID_CURR can turn the
# applicant id from the index back into an ordinary column; it is a row
# identifier, not a predictor, so it is kept out of the feature list.
num_feat = [column
            for column in train_noTARGET.select_dtypes(include=np.number).columns
            if column != 'SK_ID_CURR']

# Categorical model inputs (string and category dtypes), taken from the same
# label-free frame as the numeric features.
cat_feat = train_noTARGET.select_dtypes(('object', 'category')).columns.tolist()

features = num_feat + cat_feat

print(features)

In [None]:
#create a Pipeline for processing the num_feat
# Numeric branch: fill gaps with the column median, then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: label gaps as 'missing', then one-hot encode.
# Categories unseen during fitting are ignored at transform time.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
#create a ColumnTransformer that combines the two pipelines
# Route the numeric columns through num_pipe and the categorical columns
# through cat_pipe, concatenating the two outputs.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_feat),
    ('cat', cat_pipe, cat_feat),
])

In [None]:
# Fit the imputers, scaler and encoder on the full training frame
# (the model itself is trained on a sample taken in the next cell).
preprocessor.fit(train[features])

In [None]:
# Draw a reproducible 10% sample (without replacement, fixed seed) to keep the
# grid searches tractable. NOTE(review): the name suggests roughly 30k rows;
# the actual size depends on len(train).
train30k = train.sample(frac=0.10, replace=False, random_state=1)

In [None]:
# Feature matrix: run the sampled rows through the already-fitted preprocessor.
X_train = preprocessor.transform(train30k[features])
# Label vector for the same sampled rows.
y_train = train30k['TARGET'].values

In [None]:
# Report the dimensions of the training matrix and the label vector.
for label, array in (('Shape of features: ', X_train),
                     ('Shape of target: ', y_train)):
    print(label, array.shape)

In [None]:
%%time

# Gradient-boosted trees with a logistic objective for the binary TARGET.
XGB_clf = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

# Candidate hyper-parameters.
# These must be explicit lists: range(1, 2, 3) is range(start, stop, step)
# and yields only [1], and range(25, 100, 200) yields only [25], so the
# original grid never searched depth or tree count at all.
XGB_parameters = {
    'max_depth': [1, 2, 3],
    'n_estimators': [25, 100, 200],
    'learning_rate': [0.05, 0.01, 1]
}

# 10-fold cross-validated grid search, scored by ROC AUC.
XGB_grid = GridSearchCV(XGB_clf, XGB_parameters, cv=10, n_jobs=10, verbose=True, scoring= 'roc_auc')
XGB_grid.fit(X_train, y_train)

# Best configuration, refitted on the whole training sample by GridSearchCV.
XGB_model = XGB_grid.best_estimator_

print('Best Parameters:', XGB_grid.best_params_)
print('Best CV Score:  ', XGB_grid.best_score_)
# score() is the classifier's default metric (accuracy), not ROC AUC.
print('Training Acc:   ', XGB_model.score(X_train, y_train))

In [None]:
%%time

# LightGBM gradient-boosted trees with class weighting for the imbalanced TARGET.
LGBM_clf = lgb.LGBMClassifier(boosting_type='gbdt',n_estimators= 5000,
                              class_weight='balanced', subsample=0.85, colsample_bytree= 0.75, n_jobs=-1)

# Candidate hyper-parameters.
# max_depth must be an explicit list: range(1, 2, 3) is
# range(start, stop, step) and yields only [1].
# NOTE(review): 'metric' presumably only affects evaluation output, not the
# fitted trees, when no eval_set is given — confirm before keeping it in the
# grid, since it doubles the number of fits.
LGBM_parameters = {
    'max_depth': [1, 2, 3],
    'learning_rate': [0.05, 0.03, 0.01],
    'metric' : ['auc', 'binary_logloss']
}

# 10-fold cross-validated grid search, scored by ROC AUC.
LGBM_grid = GridSearchCV(LGBM_clf, LGBM_parameters, cv=10, n_jobs=10, verbose=True, scoring= 'roc_auc')
LGBM_grid.fit(X_train, y_train)

# Best configuration, refitted on the whole training sample by GridSearchCV.
LGBM_model = LGBM_grid.best_estimator_

print('Best Parameters:', LGBM_grid.best_params_)
print('Best CV Score:  ', LGBM_grid.best_score_)
# score() is the classifier's default metric (accuracy), not ROC AUC.
print('Training Acc:   ', LGBM_model.score(X_train, y_train))

In [None]:
# Persist the fitted preprocessing pipeline.
joblib.dump(preprocessor, 'wk5default_preprocessor2.joblib')

# Train a LightGBM classifier with fixed hyper-parameters on the sampled
# training matrix (this replaces the grid-search estimator held in
# LGBM_model), then persist it as well.
LGBM_model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=5000,
    class_weight='balanced',
    subsample=0.8,
    colsample_bytree=0.7,
    n_jobs=-1,
    learning_rate=0.03,
    max_depth=1,
).fit(X_train, y_train)
joblib.dump(LGBM_model, 'LGBM_default_model3-2.joblib')