In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are available in the Kaggle input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import numpy as np
# Load the training data and split off the label and the row identifier,
# leaving only feature columns in `train`.
train = pd.read_csv('../input/train.csv')
train_label = train.pop('target')
train_id = train.pop('ID_code')

# Load the test data the same way; it carries an identifier column that is
# removed here so that only the remaining columns are left in `test`.
test = pd.read_csv('../input/test.csv')
test_id = test.pop('ID_code')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features to zero mean / unit variance.
# The scaler is fitted on the training data ONLY and the same fitted
# statistics are then applied to the test data. Re-fitting on the test set
# (fit_transform) would scale train and test with different means/stds, so
# the same raw value would map to different model inputs in each set.
sc = StandardScaler()
X_train_std = sc.fit_transform(train)
X_test_std = sc.transform(test)

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_validate, cross_val_predict, cross_val_score
import xgboost as xgb
import lightgbm as lgbm
from sklearn.metrics import roc_auc_score

In [None]:
# Configuration for the LightGBM model.
# Upper bound on boosting rounds passed to lgbm.train().
num_boost_round = 10000

params = dict(
    objective="binary",
    metric="binary_logloss",
    boosting_type="gbdt",
    learning_rate=0.1,
    num_leaves=45,
    max_bin=256,
    feature_fraction=0.6,
    verbosity=0,
    # NOTE(review): per the LightGBM parameter docs, drop_rate / max_drop are
    # DART-only settings; with boosting_type="gbdt" they look unused - confirm.
    drop_rate=0.1,
    is_unbalance=False,
    max_drop=50,
    min_child_samples=10,
    min_child_weight=150,
    min_split_gain=0,
    # NOTE(review): subsample (bagging_fraction) only takes effect when a
    # non-zero bagging_freq is also set; none is set here - confirm intent.
    subsample=0.9,
)

In [None]:
# Stratified 5-fold internal cross-validation.
NFOLDS = 5
kfold = StratifiedKFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=1112,
)

# Accumulators shared by the training loop:
#   x_score        - one cross-validation AUC per seed
#   final_cv_train - out-of-fold training predictions summed over seeds
#   final_cv_pred  - test predictions summed over seeds
n_train_rows = len(train_label)
n_test_rows = len(test_id)

x_score = list()
final_cv_train = np.zeros(n_train_rows)
final_cv_pred = np.zeros(n_test_rows)

In [None]:
# Train with 16 different seeds and use the average as the final prediction.
# More seeds reduce the variance that comes from the random components.
# NOTE: the later cell that averages final_cv_pred divides by a literal 16.0;
# keep the two in sync if this value is changed.
N_SEEDS = 16

# Start from clean accumulators so that re-running this cell does not stack a
# second batch of predictions on top of the previous run (that would skew the
# running "current score" and the final average).
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

for s in range(N_SEEDS):
    # Per-seed buffers: out-of-fold predictions for the training rows and the
    # sum of the per-fold predictions for the test rows.
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    best_trees = []
    fold_scores = []

    for train_fold, validate in kfold.split(X_train_std, train_label):
        X_train, X_validate = X_train_std[train_fold, :], X_train_std[validate, :]
        # split() yields positional indices, so select the labels with .iloc
        # instead of relying on the Series having a default integer index.
        label_train = train_label.iloc[train_fold]
        label_validate = train_label.iloc[validate]

        dtrain = lgbm.Dataset(X_train, label_train)
        dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)

        # Fit on the training fold and use the validation fold to find the
        # best number of trees (early stopping).
        # NOTE(review): the verbose_eval / early_stopping_rounds keywords were
        # removed from lgbm.train() in LightGBM 4.0 in favour of callbacks;
        # kept as-is to match the library version this notebook targets.
        bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, verbose_eval=100, early_stopping_rounds=100)
        best_trees.append(bst.best_iteration)

        # Add this fold's test prediction to cv_pred and store the
        # out-of-fold prediction, both at the best iteration.
        cv_pred += bst.predict(X_test_std, num_iteration=bst.best_iteration)
        cv_train[validate] += bst.predict(X_validate, num_iteration=bst.best_iteration)

        # Report the AUC on the validation fold.
        score = roc_auc_score(label_validate, cv_train[validate])
        print(score)
        fold_scores.append(score)

    # Average the test prediction over the folds, then add this seed's
    # results to the cross-seed totals.
    cv_pred /= NFOLDS
    final_cv_train += cv_train
    final_cv_pred += cv_pred

    # Report the cross-validation score for this seed and the score of the
    # running average over all seeds trained so far.
    seed_score = roc_auc_score(train_label, cv_train)
    print("cv score:")
    print(seed_score)
    print("current score:", roc_auc_score(train_label, final_cv_train / (s + 1.)), s+1)
    print(fold_scores)
    print(best_trees, np.mean(best_trees))

    x_score.append(seed_score)

print(x_score)

In [None]:
# final_cv_pred holds test predictions summed over the seeds; divide by the
# number of seeds to bring them back into the 0~1 range, then save the result.
final_cv_pred_16 = final_cv_pred / 16.0

submission_avg = pd.DataFrame({
    'ID_code': test_id,
    'target': final_cv_pred_16,
})
submission_avg.to_csv('lgbm_pred_avg.csv', index=False)

In [None]:
# Inspect the test predictions summed over all seeds (not yet divided by the
# number of seeds).
final_cv_pred

In [None]:
# Inspect the seed-averaged test predictions (final_cv_pred / 16.0).
final_cv_pred_16

In [None]:
# Convert the averaged probabilities into hard 0/1 labels using a threshold
# of .5 and save them.
# The comparison is vectorised over the whole array, so it works for any
# number of test rows instead of assuming there are exactly 200000 of them.
# As before, final_cv_pred_16 is overwritten in place: after this cell it
# holds 0.0/1.0 labels rather than probabilities.
final_cv_pred_16[:] = np.where(final_cv_pred_16 >= .5, 1.0, 0.0)
pd.DataFrame({'ID_code':test_id, 'target': final_cv_pred_16}).to_csv('lgbm_pred_bin.csv', index=False)