***최초 작성일 : 22.09.21***<br>

***최종 작성일 : 22.09.21***

# Stacking Test-Sklearn, XGBoost, CatBoost, LightGBM

##### "이유한님의 캐글 코리아 캐글 스터디 커널 커리큘럼"에 따라 필사한 내용입니다.

- 필사 노트북의 원 출처 : https://www.kaggle.com/code/eliotbarr/stacking-test-sklearn-xgboost-catboost-lightgbm/script

-------

## 1. Import package

In [1]:
# Stacking Strater based on Allstate Faron's Script
# - https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867
# Preprocessing from ogrellier
# - https://www.kaggle.com/ogrellier/good-fun-with-ligthgbm

import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
# from sklearn.cross_validation import KFold # cross_validation은 없어진 하위 모듈
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [2]:
NFOLDS = 3
SEED = 0
NROWS = None

data = pd.read_csv("./dataset/home-credit-default-risk/application_train.csv")
test = pd.read_csv("./dataset/home-credit-default-risk/application_test.csv")
prev = pd.read_csv("./dataset/home-credit-default-risk/previous_application.csv")

In [3]:
categorical_feats = [
    f for f in data.columns if data[f].dtype == 'object'
]

for f_ in categorical_feats:
    data[f_], indexer = pd.factorize(data[f_])
    test[f_] = indexer.get_indexer(test[f_])
    
gc.enable()

y_train = data['TARGET']
del data['TARGET']

In [4]:
###################################################################
# Plese don't do this at home  
# Averaging factorized categorical features defeats my own resoning
###################################################################
prev_cat_features = [
    f_ for f_ in prev.columns if prev[f_].dtype == 'object'
]

for f_ in prev_cat_features:
    prev[f_], _ = pd.factorize(prev[f_])

avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

In [5]:
x_train = data.merge(right = avg_prev.reset_index(), how = 'left', on = 'SK_ID_CURR')
x_test = test.merge(right = avg_prev.reset_index(), how = 'left', on = 'SK_ID_CURR')

x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

ntrain = x_train.shape[0]
ntest = x_test.shape[0]

exclouded_feats = ['SK_ID_CURR']
features = [f_ for f_ in x_train.columns if f_ not in exclouded_feats]

x_train = x_train[features]
x_test = x_test[features]

kf = KFold(n_splits = NFOLDS, shuffle = True, random_state = SEED)

In [13]:
class SklearnWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['random_state'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:, 1]
    
class CatboostWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['random_seed'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:, 1]
    
class LightGBMWrapper(object):
    def __init__(self, clf, seed = 0, params = None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict_proba(x)[:, 1]
    
class XgbWrapper(object):
    def __init__(self, seed = 0, params = None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nround', 250)
        
    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label = y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
        
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))
    
def get_oof(clf):
    oof_train = np.zeros((ntrain, ))
    oof_test = np.zeros((ntest, ))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]
        
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        
    oof_test[:] = oof_test_skf.mean(axis = 0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [15]:
et_params = {
    'n_jobs' : 16,
    'n_estimators' : 200,
    'max_features' : 0.5,
    'max_depth' : 12,
    'min_samples_leaf' : 2
}

rf_params = {
    'n_jobs' : 16,
    'n_estimators' : 200,
    'max_features' : 0.5,
    'max_depth' : 12,
    'min_samples_leaf' : 2
}

xgb_params = {
    'seed' : 0,
    'colsample_bytree' : 0.7,
    'silent' : 1,
    'subsample' : 0.7,
    'learning_rate' : 0.075,
    'objective' : 'binary:logistic',
    'max_depth' : 4,
    'num_parallel_tree' : 1,
    'min_child_weight' : 1,
    'nround' : 200
}

catboost_params = {
    'iterations' : 200,
    'learning_rate' : 0.5,
    'depth' : 3,
    'l2_leaf_reg' : 40,
    'bootstrap_type' : 'Bernoulli',
    'subsample' : 0.7,
    'scale_pos_weight' : 5,
    'eval_metric' : 'AUC',
    'od_type' : 'Iter',
    'allow_writing_files' : False
}

lightgbm_params = {
    'n_estimators' : 200,
    'learning_rate' : 0.1,
    'num_leaves' : 123,
    'colsample_bytree' : 0.8,
    'subsample' : 0.9,
    'max_depth' : 15,
    'reg_alpha' : 0.1,
    'reg_lambda' : 0.1,
    'min_split_gain' : 0.01,
    'min_child_weght' : 2
}

In [18]:
xg = XgbWrapper(seed = SEED, params = xgb_params)
et = SklearnWrapper(clf = ExtraTreesClassifier, seed = SEED, params = et_params)
rf = SklearnWrapper(clf = RandomForestClassifier, seed = SEED, params = rf_params)
cb = CatboostWrapper(clf = CatBoostClassifier, seed = SEED, params = catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
cb_oof_train, cb_oor_test = get_oof(cb)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


0:	total: 16.9ms	remaining: 3.36s
1:	total: 32.9ms	remaining:

0:	total: 17.8ms	remaining: 3.54s
1:	total: 48.6ms	remaining: 4.81s
2:	total: 65ms	remaining: 4.27s
3:	total: 80.1ms	remaining: 3.93s
4:	total: 95.9ms	remaining: 3.74s
5:	total: 112ms	remaining: 3.62s
6:	total: 128ms	remaining: 3.53s
7:	total: 143ms	remaining: 3.43s
8:	total: 157ms	remaining: 3.33s
9:	total: 171ms	remaining: 3.25s
10:	total: 190ms	remaining: 3.26s
11:	total: 204ms	remaining: 3.19s
12:	total: 219ms	remaining: 3.15s
13:	total: 237ms	remaining: 3.15s
14:	total: 253ms	remaining: 3.11s
15:	total: 270ms	remaining: 3.1s
16:	total: 286ms	remaining: 3.08s
17:	total: 301ms	remaining: 3.04s
18:	total: 315ms	remaining: 3s
19:	total: 330ms	remaining: 2.97s
20:	total: 344ms	remaining: 2.93s
21:	total: 361ms	remaining: 2.92s
22:	total: 377ms	remaining: 2.9s
23:	total: 391ms	remaining: 2.87s
24:	total: 406ms	remaining: 2.84s
25:	total: 421ms	remaining: 2.81s
26:	total: 435ms	remaining: 2.79s
27:	total: 450ms	remaining: 2.77s
28:	total: 468ms	remaining: 2.76s
29:	total: 487ms	remaining

50:	total: 851ms	remaining: 2.48s
51:	total: 869ms	remaining: 2.47s
52:	total: 883ms	remaining: 2.45s
53:	total: 900ms	remaining: 2.43s
54:	total: 918ms	remaining: 2.42s
55:	total: 935ms	remaining: 2.4s
56:	total: 950ms	remaining: 2.38s
57:	total: 964ms	remaining: 2.36s
58:	total: 979ms	remaining: 2.34s
59:	total: 999ms	remaining: 2.33s
60:	total: 1.01s	remaining: 2.31s
61:	total: 1.03s	remaining: 2.3s
62:	total: 1.05s	remaining: 2.28s
63:	total: 1.07s	remaining: 2.27s
64:	total: 1.08s	remaining: 2.25s
65:	total: 1.1s	remaining: 2.23s
66:	total: 1.11s	remaining: 2.21s
67:	total: 1.13s	remaining: 2.19s
68:	total: 1.15s	remaining: 2.17s
69:	total: 1.16s	remaining: 2.16s
70:	total: 1.18s	remaining: 2.14s
71:	total: 1.19s	remaining: 2.12s
72:	total: 1.21s	remaining: 2.1s
73:	total: 1.22s	remaining: 2.08s
74:	total: 1.24s	remaining: 2.06s
75:	total: 1.25s	remaining: 2.05s
76:	total: 1.27s	remaining: 2.03s
77:	total: 1.29s	remaining: 2.01s
78:	total: 1.3s	remaining: 2s
79:	total: 1.32s	remai

In [21]:
print("XG-CV : {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV : {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV : {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("CB-CV : {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis = 1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_ooc_test), axis = 1)

print("{}, {}".format(x_train.shape, x_test.shape))

XG-CV : 0.2594898942250431
ET-CV : 0.26296502347137407
RF-CV : 0.2627260187688126
CB-CV : 0.3306000196333401


NameError: name 'cb_oov_test' is not defined

In [None]:
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)

test["TARGET"] = logistic_regression.predict_proba(x_test)[:, 1]

# test[['SK_ID_CURR', 'TARGET']].to_csv('first_submission.csv', index = False, float_format = '%.8f')

#### 1. 공부한 내용<br>
 → 

#### 2. 어려웠던 부분
 → 
 
#### 3. 느낀점 및 좋던 부분
 → 
 
#### 4. 라이브러리 사용법
 → 
 
#### 5. 해당 커널의 특징(다른 커널과의 차별점)
 → 
 
#### 6. 추후 공부 및 정리할 내용
 → 