スタッキング

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
import lightgbm as lgb

# Set before any model is trained.
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime being
# loaded alongside LightGBM on macOS -- confirm it is still required.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Training table; its `target` column is used as the label vector for the
# second-level model.
origin_train = pd.read_csv(filepath_or_buffer="../input/train.csv")

In [24]:
# Train-side and test-side predictions written by each first-level model.

# Logistic regression: the files carry an extra "Unnamed: 0" column
# (presumably a saved index), which is dropped on load.
lg_tra = pd.read_csv("./Logistic_train_predict.csv").drop(["Unnamed: 0"], axis=1)
lg_tes = pd.read_csv("./Logistic_test_predict.csv").drop(["Unnamed: 0"], axis=1)

# LightGBM and naive Bayes: these four frames are referenced when the stacking
# matrices are assembled, so they have to be loaded here. With the loads
# commented out the notebook only ran on a kernel that still held the variables
# from an earlier execution and failed with NameError after a restart.
lgbm_tra = pd.read_csv("./LGBM_train_submmit.csv")
lgbm_tes = pd.read_csv("./LGBM_test_submmit.csv")
niv_tra = pd.read_csv("./naive_bayes_train_submission.csv")
niv_tes = pd.read_csv("./naive_bayes_test_submission.csv")

# LinearSVC: train-side file and the matching test submission.
lgsvm_tra = pd.read_csv("./train_5x-LinearSVC-01-v1-oof_0859008_2019-03-29-16-30.csv")
lgsvm_tes = pd.read_csv("./submission_5x-LinearSVC-01-v1_0859008_2019-03-29-16-30.csv")

In [32]:
# Assemble the second-level (stacking) feature matrices: one column per
# first-level model, in the order given by `columns_list`.
columns_list = ["lgbm", "niv", "lgsvm"]
train_list = [lgbm_tra, niv_tra, lgsvm_tra["prediction"]]
test_list = [lgbm_tes, niv_tes, lgsvm_tes]

# Concatenate each list once, side by side, instead of growing a frame inside a
# loop from an empty DataFrame: this avoids re-copying the accumulated data on
# every iteration and keeps the empty seed frame out of the index join.
# Every "ID_code" column contributed by the inputs is removed in one drop.
# NOTE(review): assumes exactly three columns remain after the drop; the
# `.columns` assignment raises ValueError otherwise.
train_data = pd.concat(train_list, axis=1).drop(["ID_code"], axis=1)
train_data.columns = columns_list

test_data = pd.concat(test_list, axis=1).drop(["ID_code"], axis=1)
test_data.columns = columns_list

In [33]:
# Hyper-parameters handed to lgb.train for the second-level model.
# NOTE(review): feature_fraction=0.05 is applied to only three stacked
# features, and num_leaves=13 exceeds what max_depth=2 allows -- confirm both
# are intended.
param = dict(
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=2,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)
# param = {
#     'bagging_freq': 8, #handling overfitting
#     'bagging_fraction': 0.4, #handling overfitting - adding some noise
#     'boost_from_average':'false',
#     'boost': 'gbdt',
#     'feature_fraction': 0.4, #handling overfitting
#     'learning_rate': 0.01, #the changes between one auc and a better one gets really small thus a small learning rate performs better
#     'max_depth': 2,  #smaller trees less overfitting
#     'metric':'auc',
#     'num_threads': 8,
#     'tree_learner': 'serial',
#     'objective': 'binary', 
#     'verbosity': 0
#     }

# Short aliases consumed by the cross-validation loop.
train, test = train_data, test_data
features = train.columns            # names of the stacked prediction columns
target = origin_train["target"]     # labels come from the original training table

In [34]:
# 10-fold stratified cross-validation of the second-level LightGBM model.
#
# random_state is deliberately not passed: it has no effect while shuffle=False,
# and scikit-learn >= 0.24 raises ValueError when both are given. The splits are
# therefore exactly the ones the original call produced on older versions.
folds = StratifiedKFold(n_splits=10, shuffle=False)
oof = np.zeros(len(train))           # out-of-fold predictions, one slot per training row
predictions = np.zeros(len(test))    # test predictions, averaged across the folds
fold_importances = []                # one importance frame per fold, concatenated once below
num_round = 30000                    # upper bound on boosting rounds; early stopping ends sooner

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("Fold {}".format(fold_))

    # The stacked features are used as-is (no augmentation).
    trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx])

    # NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed from
    # lgb.train in LightGBM 4.0; on that version they must be replaced by the
    # equivalent callbacks. Left unchanged to match the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)

    # Score the held-out rows with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importances.append(
        pd.DataFrame({
            "Feature": features,
            "importance": clf.feature_importance(),
            "fold": fold_ + 1,
        })
    )

    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# Single concatenation instead of growing the frame inside the loop.
feature_importance = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.915471	valid_1's auc: 0.913606
[2000]	training's auc: 0.922082	valid_1's auc: 0.92042
[3000]	training's auc: 0.923982	valid_1's auc: 0.922628
[4000]	training's auc: 0.92447	valid_1's auc: 0.923132
[5000]	training's auc: 0.924689	valid_1's auc: 0.923184
[6000]	training's auc: 0.924805	valid_1's auc: 0.923134
[7000]	training's auc: 0.924891	valid_1's auc: 0.923042
[8000]	training's auc: 0.924976	valid_1's auc: 0.92298
Early stopping, best iteration is:
[5175]	training's auc: 0.924711	valid_1's auc: 0.923209
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.915521	valid_1's auc: 0.913475
[2000]	training's auc: 0.922062	valid_1's auc: 0.920115
[3000]	training's auc: 0.92403	valid_1's auc: 0.922056
[4000]	training's auc: 0.924529	valid_1's auc: 0.92243
[5000]	training's auc: 0.924746	valid_1's auc: 0.922385
[6000]	training's auc: 0.924858	valid_1's au

In [23]:
# Write the averaged test predictions as a submission file whose name embeds
# the out-of-fold AUC rounded to four decimals.
score = round(roc_auc_score(target, oof), 4)
sub = pd.DataFrame({
    "ID_code": lg_tes["ID_code"],   # row identifiers taken from the logistic test file
    "target": predictions,
})
sub.to_csv(f"../output/submission_ensemble_stacking_{score}.csv", index=False)