In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier

# 데이터 로드

In [2]:
import os

colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')
if colab:
    if not mount:
        from google.colab import drive
        drive.mount('/content/drive')
    base_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/'
    data_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/data/'
else:
    base_path = ''
    data_path = '../data/'

In [3]:
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

train = pd.read_csv(base_path + 'train.csv', index_col='id')
test = pd.read_csv(base_path + 'test.csv', index_col='id')

##### 변수 설정

In [4]:
X = train.drop(columns=['defects'])
y = train['defects']
X_test = test

In [5]:
rf = RandomForestClassifier(
    min_samples_leaf=78,
    max_depth=9,
    max_features=0.6222106369274514,
    random_state=61,
)

In [6]:
et = ExtraTreesClassifier(
    n_estimators=700,
    min_samples_leaf=2,
    max_depth=16,
    max_features=0.793614074795712,
    min_samples_split=48,
    random_state=61,
)

In [7]:
lgbm = LGBMClassifier(
    num_leaves=4,
    learning_rate=0.16467547044594108,
    n_estimators=230,
    random_state=61
)

In [8]:
ensemble = VotingClassifier(
    [('rf', rf), ('et',et), ('lgbm', lgbm)],
    voting='soft',
    weights = [0.3, 0.3, 0.4],
)

In [9]:
def oof_predict(model):
    folds = StratifiedKFold(shuffle=True, random_state=61)
    scores = []
    predict_probas = []
    for train_idx, val_idx in folds.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        y_true = y.iloc[val_idx]
        scores.append(roc_auc_score(y_true, y_proba))
        
        predict_probas.append(model.predict_proba(X_test)[:, 1])
    print(f"# AUC {np.mean(scores):.5f}")
    return np.mean(predict_probas, axis=0)

In [10]:
ensemble.fit(X, y)
y_proba = ensemble.predict_proba(X_test)[:, 1]
y_proba_oof = oof_predict(ensemble)

[LightGBM] [Info] Number of positive: 23064, number of negative: 78699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3585
[LightGBM] [Info] Number of data points in the train set: 101763, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226644 -> initscore=-1.227357
[LightGBM] [Info] Start training from score -1.227357
[LightGBM] [Info] Number of positive: 18451, number of negative: 62959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3547
[LightGBM] [Info] Number of data points in the train set: 81410, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.226643 -> initscore=-1.227365
[

In [11]:
submission_df['defects'] = y_proba
submission_df.to_csv(base_path + 'submission_ensemble.csv')
submission_df['defects'] = y_proba_oof
submission_df.to_csv(base_path + 'submission_ensemble_oof.csv')
submission_df

Unnamed: 0_level_0,defects
id,Unnamed: 1_level_1
101763,0.246789
101764,0.194710
101765,0.655302
101766,0.470759
101767,0.138548
...,...
169600,0.280177
169601,0.112983
169602,0.163022
169603,0.100564
