In [1]:
import pickle

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# 데이터 로드

In [2]:
import os

# The presence of ./sample_data is treated as the signal for a Colab runtime;
# /content/drive existing means Google Drive has already been mounted.
colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')

if colab and not mount:
    # Drive is not mounted yet: mount it so the Drive-based paths below resolve.
    from google.colab import drive
    drive.mount('/content/drive')

# base_path: where this notebook reads its models/inputs and writes submissions.
# data_path: where the shared competition data lives.
base_path, data_path = (
    (
        '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/',
        '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/data/',
    )
    if colab
    else ('', '../data/')
)

In [3]:
# Submission template, indexed by the 'id' column; its 'defects' column is
# overwritten with predictions before the frame is written back out.
submission_df = pd.read_csv(f'{data_path}sample_submission.csv', index_col='id')

# Training and test frames, also indexed by 'id'.
# NOTE(review): these are read from base_path while the sample submission comes
# from data_path - confirm base_path holds the intended copies of train/test.
train = pd.read_csv(f'{base_path}train.csv', index_col='id')
test = pd.read_csv(f'{base_path}test.csv', index_col='id')

##### 변수 설정

In [4]:
# Number of stratified folds used by the cross-validation helper.
K = 15

# Target column and feature matrix taken from the training frame.
y = train['defects']
X = train.drop('defects', axis=1)

# The test frame is used as the feature matrix as-is (same object, no copy).
X_test = test

In [5]:
# Random-forest member of the ensemble, built from fixed hyperparameters.
rf = RandomForestClassifier(
    random_state=61,
    max_depth=9,
    max_features=0.68758604791617,
    min_samples_leaf=141,
)
# Alternative kept for reference: load a previously saved forest instead.
# rf = pickle.load(open(base_path + "rf_best_1.pickle", "rb"))
rf  # display the configured estimator

RandomForestClassifier(max_depth=9, max_features=0.68758604791617,
                       min_samples_leaf=141, random_state=61)

In [6]:
# Earlier hand-written configuration, kept for reference only. The estimator
# actually used is whatever the pickle below contains; its repr, displayed when
# this cell runs, is the source of truth for the hyperparameters.
# et = ExtraTreesClassifier(
#     n_estimators=700,
#     min_samples_leaf=2,
#     max_depth=16,
#     max_features=0.793614074795712,
#     min_samples_split=48,
#     random_state=61,
# )
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by this project.
# The context manager closes the file handle as soon as the model is loaded.
with open(base_path + "et_best_1.pickle", "rb") as et_file:
    et = pickle.load(et_file)
et  # display the loaded estimator

ExtraTreesClassifier(max_depth=16, max_features=0.7994174320161516,
                     min_samples_split=26, n_estimators=700, random_state=61)

In [7]:
# Earlier hand-written configuration, kept for reference only. The estimator
# actually used is whatever the pickle below contains; its repr, displayed when
# this cell runs, is the source of truth for the hyperparameters.
# lgbm = LGBMClassifier(
#     num_leaves=4,
#     learning_rate=0.16467547044594108,
#     n_estimators=230,
#     random_state=61,
#     force_col_wise=True,
# )
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by this project.
# The context manager closes the file handle as soon as the model is loaded.
with open(base_path + "lgbm_best_1.pickle", "rb") as lgbm_file:
    lgbm = pickle.load(lgbm_file)
lgbm  # display the loaded estimator

LGBMClassifier(colsample_bytree=0.5878524120000573, force_row_wise=True,
               learning_rate=0.0891461451200261, min_child_samples=90,
               n_estimators=648, n_jobs=-1, random_state=61)

In [8]:
# Soft-voting ensemble: the three classifiers' predicted probabilities are
# combined with weights 0.3 / 0.3 / 0.4 for rf / et / lgbm respectively.
ensemble = VotingClassifier(
    estimators=[('rf', rf), ('et', et), ('lgbm', lgbm)],
    weights=[0.3, 0.3, 0.4],
    voting='soft',
)

In [9]:
def oof_predict(model):
    """Cross-validate ``model`` and return fold-averaged test-set probabilities.

    The notebook-level training data (``X``, ``y``) is split into ``K``
    stratified, shuffled folds (seed 61). For every fold a fresh, unfitted
    clone of ``model`` is trained on the training part, scored with ROC AUC on
    the held-out part, and used to predict ``X_test``. The mean AUC over all
    folds is printed.

    Despite the name, the returned array does not contain out-of-fold
    predictions for the training rows; it contains predictions for ``X_test``
    averaged over the per-fold models.

    Parameters
    ----------
    model : estimator
        A scikit-learn compatible classifier exposing ``fit`` and
        ``predict_proba``. It is cloned for each fold, so the object passed in
        is never refitted: an estimator already fitted on the full training
        set stays fitted on it after this call.

    Returns
    -------
    numpy.ndarray
        For each row of ``X_test``, column 1 of ``predict_proba`` averaged
        across the ``K`` fold models.
    """
    folds = StratifiedKFold(n_splits=K, shuffle=True, random_state=61)
    scores = []
    predict_probas = []
    for train_idx, val_idx in folds.split(X, y):
        # Train a copy so the caller's estimator is not overwritten by a
        # model that has only seen (K-1)/K of the training data.
        fold_model = clone(model)
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])

        # Validation score for this fold.
        y_proba = fold_model.predict_proba(X.iloc[val_idx])[:, 1]
        y_true = y.iloc[val_idx]
        scores.append(roc_auc_score(y_true, y_proba))

        # Test-set prediction from this fold's model; averaged after the loop.
        predict_probas.append(fold_model.predict_proba(X_test)[:, 1])
    print(f"# AUC {np.mean(scores):.5f}")
    return np.mean(predict_probas, axis=0)

In [10]:
# Fit the ensemble on the full training set and keep column 1 of its predicted
# probabilities for the test rows.
ensemble.fit(X, y)
y_proba = ensemble.predict_proba(X_test)[:, 1]
# Second prediction set: test probabilities averaged over the cross-validation
# folds (the mean fold AUC is printed by the helper). The full-data predictions
# above are taken first, so they do not depend on whether oof_predict refits
# the estimator it is given.
y_proba_oof = oof_predict(ensemble)

[LightGBM] [Info] Number of positive: 23064, number of negative: 78699
[LightGBM] [Info] Total Bins 3585
[LightGBM] [Info] Number of data points in the train set: 101763, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226644 -> initscore=-1.227357
[LightGBM] [Info] Start training from score -1.227357
[LightGBM] [Info] Number of positive: 21526, number of negative: 73452
[LightGBM] [Info] Total Bins 3574
[LightGBM] [Info] Number of data points in the train set: 94978, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226642 -> initscore=-1.227371
[LightGBM] [Info] Start training from score -1.227371
[LightGBM] [Info] Number of positive: 21526, number of negative: 73452
[LightGBM] [Info] Total Bins 3572
[LightGBM] [Info] Number of data points in the train set: 94978, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226642 -> initscore=-1.227371
[LightGBM] [Info] Start training from score -1.227371
[Li

In [11]:
# Write one submission file per prediction set, reusing the same template
# frame. The 'defects' column is overwritten on each pass, so after the loop
# the frame holds the fold-averaged predictions (the last entry).
for filename, proba in (
    ('submission_ensemble.csv', y_proba),
    ('submission_ensemble_oof.csv', y_proba_oof),
):
    submission_df['defects'] = proba
    submission_df.to_csv(base_path + filename)
submission_df

Unnamed: 0_level_0,defects
id,Unnamed: 1_level_1
101763,0.241656
101764,0.182790
101765,0.643669
101766,0.461809
101767,0.142048
...,...
169600,0.257936
169601,0.114541
169602,0.169719
169603,0.099490
