In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

import warnings
warnings.filterwarnings('ignore')

In [3]:
NFOLDS = 3
SEED = 0
NROWS = None

## data preprocessing

In [4]:
from pathlib import Path
data_dir = Path('C:/Users/sinjy/jupyter_notebook/datasets/kaggle_datasets/Home-Credit-Default-Risk_dataset')

data = pd.read_csv(data_dir / 'application_train.csv')
test = pd.read_csv(data_dir / 'application_test.csv')
prev = pd.read_csv(data_dir / 'previous_application.csv')

In [5]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])

In [6]:
gc.enable()
y_train = data['TARGET']
del data['TARGET']

In [7]:
prev.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [8]:
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]
for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])
    
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [9]:
x_train = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
x_test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [10]:
excluded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in excluded_feats]

x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

## model

In [11]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict_proba(x)[:, 1]
    
class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict_proba(x)[:, 1]

class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:, 1]
    
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)
        
    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
        
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [12]:
def get_oof(clf):
    oof_train = np.zeros((ntrain, ))
    oof_test = np.zeros((ntest, ))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]
        
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [13]:
et_params = {
    'n_jobs': -1,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2
}

rf_params = {
    'n_jobs': -1,
    'n_estimators': 200,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds':200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 3,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'num_leaves': 123,
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'max_depth': 15,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_split_gain': 0.01,
    'min_child_weight': 2
}

In [14]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf=CatBoostClassifier, seed=SEED, params=catboost_params)
lg = LightGBMWrapper(clf=LGBMClassifier, seed=SEED, params=lightgbm_params)

xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oof_test = get_oof(cb)
lg_oof_train, lg_oof_test = get_oof(lg)

print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("CB-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))
print("LG-CV: {}".format(sqrt(mean_squared_error(y_train, lg_oof_train))))

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


0:	total: 231ms	remaining: 45.9s
1:	total: 303ms	remaining: 3

190:	total: 6.05s	remaining: 285ms
191:	total: 6.08s	remaining: 253ms
192:	total: 6.11s	remaining: 222ms
193:	total: 6.14s	remaining: 190ms
194:	total: 6.17s	remaining: 158ms
195:	total: 6.19s	remaining: 126ms
196:	total: 6.23s	remaining: 94.9ms
197:	total: 6.26s	remaining: 63.2ms
198:	total: 6.29s	remaining: 31.6ms
199:	total: 6.32s	remaining: 0us
0:	total: 38.3ms	remaining: 7.63s
1:	total: 70.2ms	remaining: 6.95s
2:	total: 100ms	remaining: 6.58s
3:	total: 130ms	remaining: 6.36s
4:	total: 164ms	remaining: 6.38s
5:	total: 199ms	remaining: 6.44s
6:	total: 230ms	remaining: 6.34s
7:	total: 262ms	remaining: 6.3s
8:	total: 297ms	remaining: 6.31s
9:	total: 324ms	remaining: 6.15s
10:	total: 352ms	remaining: 6.05s
11:	total: 385ms	remaining: 6.03s
12:	total: 421ms	remaining: 6.05s
13:	total: 459ms	remaining: 6.09s
14:	total: 487ms	remaining: 6s
15:	total: 517ms	remaining: 5.95s
16:	total: 545ms	remaining: 5.87s
17:	total: 578ms	remaining: 5.84s
18:	total: 609ms	remaining: 5.8s
19:	total: 639ms

36:	total: 1.15s	remaining: 5.05s
37:	total: 1.17s	remaining: 5.01s
38:	total: 1.21s	remaining: 4.99s
39:	total: 1.24s	remaining: 4.94s
40:	total: 1.27s	remaining: 4.93s
41:	total: 1.3s	remaining: 4.89s
42:	total: 1.33s	remaining: 4.85s
43:	total: 1.36s	remaining: 4.83s
44:	total: 1.39s	remaining: 4.79s
45:	total: 1.42s	remaining: 4.75s
46:	total: 1.45s	remaining: 4.72s
47:	total: 1.48s	remaining: 4.67s
48:	total: 1.5s	remaining: 4.64s
49:	total: 1.53s	remaining: 4.6s
50:	total: 1.56s	remaining: 4.57s
51:	total: 1.59s	remaining: 4.53s
52:	total: 1.62s	remaining: 4.5s
53:	total: 1.65s	remaining: 4.46s
54:	total: 1.68s	remaining: 4.43s
55:	total: 1.71s	remaining: 4.4s
56:	total: 1.74s	remaining: 4.37s
57:	total: 1.77s	remaining: 4.33s
58:	total: 1.8s	remaining: 4.31s
59:	total: 1.83s	remaining: 4.28s
60:	total: 1.86s	remaining: 4.25s
61:	total: 1.9s	remaining: 4.22s
62:	total: 1.92s	remaining: 4.18s
63:	total: 1.95s	remaining: 4.15s
64:	total: 1.98s	remaining: 4.12s
65:	total: 2.02s	rema

In [17]:
predict_dir = Path('C:/Users/sinjy/jupyter_notebook/datasets/kaggle_predict')

x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train, lg_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test, lg_oof_test), axis=1)

print("{}, {}".format(x_train.shape, x_test.shape))

logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

test['TARGET'] = logistic_regression.predict_proba(x_test)[:, 1]
test[['SK_ID_CURR', 'TARGET']].to_csv(predict_dir / 'HomeCredit3.csv', index=False, float_format='%.8f')

(307511, 5), (48744, 5)


## test score: 0.75815