In [2]:
import os
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss, roc_auc_score, roc_curve
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.fixes import logsumexp

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
TRAIN_DATA_PATH = './train.csv'
TEST_DATA_PATH = './test.csv'

origin_train = pd.read_csv(TRAIN_DATA_PATH)
origin_test = pd.read_csv(TEST_DATA_PATH)
features = origin_train.columns[2:]
target = pd.DataFrame(origin_train, columns=['target'])
df_train = pd.DataFrame(origin_train, columns=features)
df_test = pd.DataFrame(origin_test, columns=features)

In [4]:
# kernel引用
params = {
    'boosting':'gbdt', 
    'bagging_freq':5,
    'bagging_fraction':0.5,
    'num_leaves':2,
    'reg_lambda':100.0,
    'learning_rate':0.01, 
    'max_bin':1023,
    'seed':3366
}

In [5]:
def optimal_rounds(X, verbose=False):
    """
    各機能に対してnfoldで指定した数分のCVでLGBMを早期に停止しながら実行し、各機能に最適なツリー数を決定します。
    """
    rounds = []
    for i in range(len(df_train.columns)):
        if verbose:
            print("Feature ", i)
        cv_res = lgb.cv(params, 
           lgb.Dataset(df_train[['var_'+str(i)]], target.iloc[:, 0]),
           nfold=3, # 速度向上のために小さな値にしておく
           num_boost_round=100000,
           metrics='binary_logloss',
           verbose_eval=100 if verbose else None,
           early_stopping_rounds=100
          )
        rounds.append(len(cv_res['binary_logloss-mean']))
    return rounds

opt_rounds = optimal_rounds(df_train, verbose=True)

Feature  0
[100]	cv_agg's binary_logloss: 0.324643 + 0.000112501
[200]	cv_agg's binary_logloss: 0.324307 + 0.000144592
[300]	cv_agg's binary_logloss: 0.324229 + 0.000148846
[400]	cv_agg's binary_logloss: 0.324217 + 0.000153991
Feature  1
[100]	cv_agg's binary_logloss: 0.324862 + 2.69714e-05
[200]	cv_agg's binary_logloss: 0.324486 + 2.77672e-05
[300]	cv_agg's binary_logloss: 0.324365 + 2.0729e-05
[400]	cv_agg's binary_logloss: 0.324333 + 1.02923e-05
[500]	cv_agg's binary_logloss: 0.324326 + 4.91504e-06
[600]	cv_agg's binary_logloss: 0.32433 + 4.53892e-06
Feature  2
[100]	cv_agg's binary_logloss: 0.324606 + 0.000195444
[200]	cv_agg's binary_logloss: 0.324209 + 0.000286782
[300]	cv_agg's binary_logloss: 0.324101 + 0.000346028
[400]	cv_agg's binary_logloss: 0.324083 + 0.00037177
[500]	cv_agg's binary_logloss: 0.324085 + 0.00038213
[600]	cv_agg's binary_logloss: 0.324082 + 0.000382774
Feature  3
[100]	cv_agg's binary_logloss: 0.326073 + 3.99854e-05
[200]	cv_agg's binary_logloss: 0.326047 + 

[300]	cv_agg's binary_logloss: 0.324601 + 7.48758e-05
[400]	cv_agg's binary_logloss: 0.324566 + 8.15704e-05
[500]	cv_agg's binary_logloss: 0.324556 + 8.63678e-05
[600]	cv_agg's binary_logloss: 0.324559 + 8.59729e-05
Feature  34
[100]	cv_agg's binary_logloss: 0.324929 + 0.000147007
[200]	cv_agg's binary_logloss: 0.324571 + 0.000222958
[300]	cv_agg's binary_logloss: 0.324445 + 0.000288139
[400]	cv_agg's binary_logloss: 0.324409 + 0.000326085
[500]	cv_agg's binary_logloss: 0.324402 + 0.000348806
Feature  35
[100]	cv_agg's binary_logloss: 0.325524 + 4.93463e-05
[200]	cv_agg's binary_logloss: 0.325355 + 5.67514e-05
[300]	cv_agg's binary_logloss: 0.325317 + 6.05311e-05
[400]	cv_agg's binary_logloss: 0.325312 + 6.29939e-05
[500]	cv_agg's binary_logloss: 0.325315 + 6.38107e-05
Feature  36
[100]	cv_agg's binary_logloss: 0.325487 + 3.73368e-05
[200]	cv_agg's binary_logloss: 0.325303 + 7.49784e-05
[300]	cv_agg's binary_logloss: 0.325257 + 8.96711e-05
[400]	cv_agg's binary_logloss: 0.325248 + 0.00

[200]	cv_agg's binary_logloss: 0.326067 + 3.15197e-05
Feature  73
[100]	cv_agg's binary_logloss: 0.326148 + 2.03833e-05
Feature  74
[100]	cv_agg's binary_logloss: 0.325995 + 3.62701e-05
[200]	cv_agg's binary_logloss: 0.325963 + 5.58662e-05
[300]	cv_agg's binary_logloss: 0.325957 + 7.34272e-05
[400]	cv_agg's binary_logloss: 0.325949 + 7.67247e-05
[500]	cv_agg's binary_logloss: 0.325955 + 7.86765e-05
Feature  75
[100]	cv_agg's binary_logloss: 0.325353 + 0.000119429
[200]	cv_agg's binary_logloss: 0.325148 + 0.000159836
[300]	cv_agg's binary_logloss: 0.325089 + 0.000175418
[400]	cv_agg's binary_logloss: 0.325079 + 0.0001753
[500]	cv_agg's binary_logloss: 0.325078 + 0.000179119
Feature  76
[100]	cv_agg's binary_logloss: 0.324378 + 0.000212346
[200]	cv_agg's binary_logloss: 0.323929 + 0.000323336
[300]	cv_agg's binary_logloss: 0.323792 + 0.000378254
[400]	cv_agg's binary_logloss: 0.323759 + 0.000412834
[500]	cv_agg's binary_logloss: 0.32376 + 0.000433051
Feature  77
[100]	cv_agg's binary_log

[400]	cv_agg's binary_logloss: 0.325113 + 9.54684e-05
[500]	cv_agg's binary_logloss: 0.32511 + 0.000103063
Feature  108
[100]	cv_agg's binary_logloss: 0.324951 + 0.000146132
[200]	cv_agg's binary_logloss: 0.324635 + 0.000215419
[300]	cv_agg's binary_logloss: 0.32455 + 0.000269065
[400]	cv_agg's binary_logloss: 0.324525 + 0.000288067
[500]	cv_agg's binary_logloss: 0.324499 + 0.000295374
[600]	cv_agg's binary_logloss: 0.324479 + 0.00029644
[700]	cv_agg's binary_logloss: 0.324458 + 0.000298991
[800]	cv_agg's binary_logloss: 0.324442 + 0.000304442
[900]	cv_agg's binary_logloss: 0.324426 + 0.000301638
[1000]	cv_agg's binary_logloss: 0.32441 + 0.000305259
[1100]	cv_agg's binary_logloss: 0.324396 + 0.000303168
[1200]	cv_agg's binary_logloss: 0.324379 + 0.000311744
[1300]	cv_agg's binary_logloss: 0.324367 + 0.000308984
[1400]	cv_agg's binary_logloss: 0.324354 + 0.000313344
[1500]	cv_agg's binary_logloss: 0.324338 + 0.000321845
[1600]	cv_agg's binary_logloss: 0.324339 + 0.000322183
[1700]	cv_ag

[300]	cv_agg's binary_logloss: 0.325577 + 4.84727e-05
[400]	cv_agg's binary_logloss: 0.325577 + 5.44976e-05
[500]	cv_agg's binary_logloss: 0.325577 + 5.67448e-05
Feature  136
[100]	cv_agg's binary_logloss: 0.326167 + 6.60263e-06
Feature  137
[100]	cv_agg's binary_logloss: 0.325774 + 3.06495e-05
[200]	cv_agg's binary_logloss: 0.325657 + 4.26428e-05
[300]	cv_agg's binary_logloss: 0.325625 + 5.30656e-05
[400]	cv_agg's binary_logloss: 0.325615 + 5.89196e-05
Feature  138
[100]	cv_agg's binary_logloss: 0.326021 + 2.21142e-05
[200]	cv_agg's binary_logloss: 0.325988 + 2.19091e-05
[300]	cv_agg's binary_logloss: 0.325985 + 2.77773e-05
[400]	cv_agg's binary_logloss: 0.32599 + 2.71157e-05
Feature  139
[100]	cv_agg's binary_logloss: 0.323215 + 0.00015098
[200]	cv_agg's binary_logloss: 0.322556 + 0.000209618
[300]	cv_agg's binary_logloss: 0.322392 + 0.000250519
[400]	cv_agg's binary_logloss: 0.322352 + 0.00026105
[500]	cv_agg's binary_logloss: 0.322346 + 0.000275327
[600]	cv_agg's binary_logloss: 0.

[500]	cv_agg's binary_logloss: 0.324688 + 0.000395098
Feature  170
[100]	cv_agg's binary_logloss: 0.324923 + 4.21611e-05
[200]	cv_agg's binary_logloss: 0.324641 + 4.2644e-05
[300]	cv_agg's binary_logloss: 0.324569 + 4.83786e-05
[400]	cv_agg's binary_logloss: 0.324542 + 5.10777e-05
[500]	cv_agg's binary_logloss: 0.324542 + 5.67373e-05
Feature  171
[100]	cv_agg's binary_logloss: 0.326014 + 2.54703e-05
[200]	cv_agg's binary_logloss: 0.326002 + 2.94004e-05
[300]	cv_agg's binary_logloss: 0.326004 + 3.23597e-05
Feature  172
[100]	cv_agg's binary_logloss: 0.325447 + 1.79091e-05
[200]	cv_agg's binary_logloss: 0.325273 + 3.23007e-05
[300]	cv_agg's binary_logloss: 0.325232 + 3.66531e-05
[400]	cv_agg's binary_logloss: 0.325226 + 3.43463e-05
Feature  173
[100]	cv_agg's binary_logloss: 0.325305 + 1.9028e-05
[200]	cv_agg's binary_logloss: 0.325086 + 2.78414e-05
[300]	cv_agg's binary_logloss: 0.325014 + 4.19165e-05
[400]	cv_agg's binary_logloss: 0.324985 + 5.6023e-05
[500]	cv_agg's binary_logloss: 0.

In [6]:
num_ones = np.sum(target.iloc[:, 0] == 1)
num_zeros = np.sum(target.iloc[:, 0] == 0)

class LGBNaiveBayes:
    def fit(self,df_train, target, opt_rounds):
        self.clfs = []
        for i in range(len(df_train.columns)):
            if i%20 == 0:
                print("Fitting var_"+ str(i)+"...")
            params['n_estimators'] = opt_rounds[i]

            lgb_clf = lgb.LGBMClassifier(**params)
            lgb_clf.fit(df_train[['var_'+str(i)]], target)

            self.clfs.append(lgb_clf)
            
    def predict_proba(self,df_train):
        log_sum = np.zeros((df_train.shape[0],2))
        for i in range(len(df_train.columns)):

            log_sum += np.log(self.clfs[i].predict_proba(df_train[['var_'+str(i)]]))
            log_sum += np.array([np.log(num_ones) - np.log(200000), np.log(num_zeros) - np.log(200000)])
            
        log_sum -= np.array([np.log(num_ones) - np.log(200000), np.log(num_zeros) - np.log(200000)])
        
        log_prob_x = logsumexp(log_sum, axis=1)
        return np.exp(log_sum - np.atleast_2d(log_prob_x).T)
clf = LGBNaiveBayes()

In [7]:
clf = LGBNaiveBayes()

def cross_validate(nfolds):
    sss = StratifiedShuffleSplit(nfolds)
    aucs = []
    for train, test in sss.split(origin_train[features], origin_train['target']):
        clf.fit(origin_train.loc[train][features], origin_train.loc[train]['target'], opt_rounds)
        y_true = origin_train.loc[test]['target']
        y_pred = clf.predict_proba(origin_train.loc[test][features])[:,1]
        test_auc = roc_auc_score(y_true, y_pred)
        aucs.append(test_auc)
        print("Test AUC:", test_auc)
    print("Mean test AUC: ", np.mean(aucs))
    
cross_validate(5)


Fitting var_0...
Fitting var_20...
Fitting var_40...
Fitting var_60...
Fitting var_80...
Fitting var_100...
Fitting var_120...
Fitting var_140...
Fitting var_160...
Fitting var_180...
Test AUC: 0.9024156316803973
Fitting var_0...
Fitting var_20...
Fitting var_40...
Fitting var_60...
Fitting var_80...
Fitting var_100...
Fitting var_120...
Fitting var_140...
Fitting var_160...
Fitting var_180...
Test AUC: 0.9030722706644653
Fitting var_0...
Fitting var_20...
Fitting var_40...
Fitting var_60...
Fitting var_80...
Fitting var_100...
Fitting var_120...
Fitting var_140...
Fitting var_160...
Fitting var_180...
Test AUC: 0.9012663198736722
Fitting var_0...
Fitting var_20...
Fitting var_40...
Fitting var_60...
Fitting var_80...
Fitting var_100...
Fitting var_120...
Fitting var_140...
Fitting var_160...
Fitting var_180...
Test AUC: 0.901244140608796
Fitting var_0...
Fitting var_20...
Fitting var_40...
Fitting var_60...
Fitting var_80...
Fitting var_100...
Fitting var_120...
Fitting var_140...
Fit

In [8]:
clf.fit(origin_train[features], origin_train['target'], opt_rounds)
pred = clf.predict_proba(origin_test.iloc[:][features])
sub_df = pd.DataFrame({"ID_code":origin_test["ID_code"].values})
sub_df["target"] = pred[:,1]
sub_df.to_csv("naive_bayes_submission.csv", index=False)

Fitting var_0...
Fitting var_20...
Fitting var_40...
Fitting var_60...
Fitting var_80...
Fitting var_100...
Fitting var_120...
Fitting var_140...
Fitting var_160...
Fitting var_180...
