In [2]:
import datetime
from copy import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import ipywidgets
import pickle
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, cv, Pool
import optuna
from optuna.integration import lightgbm as lgb
import tensorflow as tf

In [27]:
train_df = pd.read_csv('dataset/train_16_0246.csv')
test_df = pd.read_csv('dataset/test_16_0246.csv')

In [28]:
ID = test_df['id']
x = train_df.drop(columns=['y', 'id'])
y = train_df['y']
test_df = test_df.drop(columns=['id', 'y'])

In [4]:
rf_model = pickle.load(open('models/RF_22_2248.pkl', 'rb'))
xgb_model = pickle.load(open('models/XG_17_1144.pkl', 'rb'))
lgb_model = pickle.load(open('models/LGBM_op24_0418.pkl', 'rb'))

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=18)

In [10]:
rf_best_params = {
    'n_estimators': 1200,
    'criterion': 'entropy',
    'max_depth': 10,
    'min_samples_split': 22,
    'random_state': 18,
    'verbose': 0
}

In [11]:
xgb_best_params = {
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'eval_metric': 'auc',
        'eta': 0.01,
        'n_estimators': 10000,
        'early_stopping_rounds': 100,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.9,
        'colsample_bytree': 0.85,
        'gamma': 0.0,
        'alpha': 0.0,
        'seed': 18
}


In [12]:
lgb_best_params = {
        'n_estimations': 10000,
        'learning_rate': 0.001,
        'metric': 'auc',
        'seed': 111,
        'verbose': -1,
        'lambda_l1': 0.11736816437265646,
        'lambda_l2': 0.04906024417734872,
        'num_leaves': 140,
        'feature_fraction': 0.4,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'min_child_samples': 20
}

In [14]:
%%time

estimators = [
    ('rf', RandomForestClassifier(**rf_best_params)),
    ('xgb', XGBClassifier(**xgb_best_params)),
    ('lgb', LGBMClassifier(**lgb_best_params))
]

sclf = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression(max_iter=1000, random_state=18))

sclf.fit(x_train, y_train)

CPU times: user 19min 12s, sys: 7.58 s, total: 19min 19s
Wall time: 4min 14s


StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(criterion='entropy',
                                                       max_depth=10,
                                                       min_samples_split=22,
                                                       n_estimators=1200,
                                                       random_state=18)),
                               ('xgb',
                                XGBClassifier(alpha=0.0, base_score=None,
                                              booster='gbtree',
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=0.85,
                                              early_stopping_rounds=100,
                                              eta=0.01, eval_metric='auc',
                                              gamma=0.0, gp

In [20]:
sclf_p = sclf.predict(x_test)

acc_sclf = accuracy_score(sclf_p, y_test)
auc_sclf = roc_auc_score(sclf_p, y_test)

print('acc:{}, auc:{}'.format(acc_sclf, auc_sclf))

acc:0.9302583025830258, auc:0.8095053912498169


In [29]:
pred_sclf = sclf.predict_proba(test_df)
sclf_p = pd.DataFrame(pred_sclf)
sclf_p.head(-10)

Unnamed: 0,0,1
0,0.031051,0.968949
1,0.940884,0.059116
2,0.962093,0.037907
3,0.971538,0.028462
4,0.943304,0.056696
...,...,...
18035,0.944752,0.055248
18036,0.944058,0.055942
18037,0.971607,0.028393
18038,0.964811,0.035189


In [30]:
now = datetime.datetime.now()

submission = pd.DataFrame({
    'ID':ID,
    'pred':sclf_p[1]
})

file_name = 'submit/submit_stacking_' + now.strftime('%d_%H%M') + '.csv'

submission.to_csv(file_name, index=False, header=False)