In [1]:
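# Collect the out-of-fold (OOF) predictions of each algorithm and use Ridge to derive blending weights.
# Build a voting model and produce the final predictions based on those weights.
# NOTE(review): the cells visible below use equal or fixed rank-based voting weights; no Ridge fit appears in them.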

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

In [4]:
# Root directory for every file this notebook reads or writes. Paths are built
# by plain string concatenation, so the trailing slash is required.
data_path='../../data/'

In [5]:
# Previously saved out-of-fold predictions, one CSV per source model.
# NOTE(review): none of these frames is referenced again in the cells shown
# here — confirm whether they are still needed.
ambrosm_oof_df = pd.read_csv(f'{data_path}oof/ambrosm.csv')
lightgbm_oof_df = pd.read_csv(f'{data_path}oof/lightgbm.csv')
xgboost_oof_df = pd.read_csv(f'{data_path}oof/xgboost.csv')

In [6]:
# Load the 'f1' variants of the train/test tables plus the submission template,
# all indexed by the 'id' column. (An earlier version read the plain
# train.csv / test.csv files instead.)
train_df = pd.read_csv(f'{data_path}train_f1.csv', index_col='id')
test_df = pd.read_csv(f'{data_path}test_f1.csv', index_col='id')
submission_df = pd.read_csv(f'{data_path}sample_submission.csv', index_col='id')

In [7]:
# Target column first, then every remaining column as the feature matrix.
y = train_df['defects']
X = train_df.drop(columns=['defects'])

In [8]:
# Candidate base learners keyed by a short name. Insertion order matters: it is
# the order in which the models are evaluated further down. Every estimator
# that takes a seed is pinned to random_state=61.
models = dict(
    # Gradient-boosted trees (XGBoost).
    xgboost=XGBClassifier(
        n_estimators=1345,
        learning_rate=0.010119804013091233,
        max_depth=5,
        colsample_bynode=0.5195981912942003,
        reg_lambda=2.0596502472632006,
        random_state=61,
    ),
    # Gradient-boosted trees (LightGBM).
    lightgbm=LGBMClassifier(
        n_estimators=659,
        learning_rate=0.008200284931836449,
        max_depth=22,
        num_leaves=128,
        min_child_samples=46,
        colsample_bytree=0.5924502637788397,
        random_state=61,
    ),
    # Random forest with large leaves; all features are considered per split.
    rf=RandomForestClassifier(
        min_samples_leaf=220,
        max_features=1.0,
        random_state=61,
    ),
    # Histogram gradient boosting with library defaults apart from the seed.
    hgb=HistGradientBoostingClassifier(random_state=61),
    # log1p -> Nystroem kernel approximation -> scaling -> logistic regression.
    logistic_nystroem=make_pipeline(
        FunctionTransformer(np.log1p),
        Nystroem(n_components=400, random_state=61),
        StandardScaler(),
        LogisticRegression(C=0.0032, dual=False, max_iter=1500, random_state=61),
    ),
    # log1p -> extremely randomized trees.
    extra=make_pipeline(
        FunctionTransformer(np.log1p),
        ExtraTreesClassifier(
            n_estimators=100,
            min_samples_leaf=110,
            max_features=1.0,
            random_state=61,
        ),
    ),
    # log1p -> degree-2 polynomial features -> scaling -> class-balanced
    # logistic regression.
    poly=make_pipeline(
        FunctionTransformer(np.log1p),
        PolynomialFeatures(2, include_bias=False),
        StandardScaler(),
        LogisticRegression(
            C=0.32,
            class_weight='balanced',
            dual=False,
            solver='newton-cholesky',
            max_iter=1500,
            random_state=61,
        ),
    ),
)

In [9]:
from sklearn.model_selection import StratifiedKFold

# Make KFold OOF prediction
# K is the number of stratified folds; oof_preds reads it at call time.
K=5
def oof_preds(best_model, model_name=None):
    """Cross-validate ``best_model`` and collect OOF and test-set probabilities.

    Relies on the notebook-level names ``X``, ``y``, ``test_df`` and ``K``.
    The same estimator object is refit on every fold, so it is left fitted on
    the last fold's training rows when the function returns.

    Parameters
    ----------
    best_model : estimator exposing ``fit`` and ``predict_proba``.
    model_name : optional label; when truthy a header line is printed.

    Returns
    -------
    final_preds : list with one array of positive-class test probabilities
        per fold.
    oof : array of length ``len(X)`` holding each row's positive-class
        probability from the fold in which it was held out.
    avg_loss : mean of the per-fold ROC AUC values (printed as "Loss").
    """
    if model_name:
        print(f"{model_name}'s oof prediction")

    splitter = StratifiedKFold(n_splits=K, random_state=61, shuffle=True)
    oof = np.full(len(X), np.nan)
    final_preds, losses = [], []

    for fold_idx, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print(f"========== Fold {fold_idx+1} ==========")
        best_model.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])

        # Column 1 of predict_proba is the positive class.
        holdout_scores = best_model.predict_proba(X.iloc[holdout_rows, :])[:, 1]
        oof[holdout_rows] = holdout_scores

        # Each fold's model also scores the full test set; the caller averages.
        final_preds.append(best_model.predict_proba(test_df)[:, 1])
        losses.append(roc_auc_score(y.iloc[holdout_rows], holdout_scores))

    avg_loss = np.mean(losses)
    print(f"Loss : {avg_loss:.4f}")
    return final_preds, oof, avg_loss

In [10]:
# Score every candidate on its own. Only the mean fold AUC (index 2 of the
# tuple returned by oof_preds) is kept, paired with the model's name.
model_scores = [
    (name, oof_preds(estimator, name)[2])
    for name, estimator in models.items()
]

xgboost's oof prediction
Loss : 0.7923
lightgbm's oof prediction
Loss : 0.7911
rf's oof prediction
Loss : 0.7911
hgb's oof prediction
Loss : 0.7914
logistic_nystroem's oof prediction
Loss : 0.7911
extra's oof prediction
Loss : 0.7914
poly's oof prediction
Loss : 0.7896


In [11]:
# Rank the candidates best-first by their mean fold AUC (stable for ties).
model_scores = sorted(model_scores, key=lambda entry: entry[1], reverse=True)
model_scores

[('xgboost', 0.7922969635583282),
 ('hgb', 0.7914157169781412),
 ('extra', 0.7914027736506613),
 ('rf', 0.7911496027838701),
 ('lightgbm', 0.7911350332082578),
 ('logistic_nystroem', 0.7911317840722252),
 ('poly', 0.7896454270506789)]

In [12]:
from sklearn.ensemble import VotingClassifier
def run_k_fold(models, model_scores, num=6, weight='auto'):
    """Grow a soft-voting ensemble over the ranked models and keep the best size.

    Ensembles made of the first 2, 3, ..., ``num`` entries of ``model_scores``
    are each evaluated with ``oof_preds``; the size with the highest score wins.

    Parameters
    ----------
    models : mapping from model name to estimator.
    model_scores : sequence of entries whose first item is a model name; the
        caller is expected to pass it sorted best-first.
    num : largest ensemble size to try. Values above ``len(model_scores)`` are
        clamped to it.
    weight : ``'balance'`` gives every member weight 1. ``'auto'`` (and any
        other value) gives descending rank weights ``[6, 5, 4, ...]``, extended
        to start at ``num`` when more than six models are requested.

    Returns
    -------
    (best_model_num, best_score, best_preds) : the winning ensemble size, its
        score, and its test predictions averaged over folds.

    Raises
    ------
    ValueError : if fewer than two ranked models are available.
    """
    start = 2
    # Never index past the end of the ranking.
    num = min(num, len(model_scores))
    if num < start:
        raise ValueError(
            f'run_k_fold needs at least {start} ranked models, got {num}'
        )

    if weight == 'balance':
        weights = [1] * num
    else:
        # The original fixed list [6, 5, 4, 3, 2, 1] was one short for a
        # seven-model ensemble, which VotingClassifier rejects. Keep that exact
        # list for num <= 6 and extend it only when more members are needed.
        weights = list(range(max(num, 6), 0, -1))

    ranked_names = [entry[0] for entry in model_scores[:num]]

    best_score = 0
    best_model_num = 0
    best_preds = None
    for model_num in range(start, num + 1):
        chosen_model_names = ranked_names[:model_num]
        chosen_models = [(name, models[name]) for name in chosen_model_names]
        print(f'chosen models : {chosen_model_names}')
        voter_model = VotingClassifier(chosen_models, weights=weights[:model_num], voting='soft')
        preds, oof, oof_score = oof_preds(voter_model)
        # The first candidate is always accepted; later ones must strictly improve.
        if best_preds is None or best_score < oof_score:
            best_preds = np.mean(preds, axis=0)  # average the per-fold test predictions
            best_score = oof_score
            best_model_num = model_num
    print(f'best model 개수 : {best_model_num}, best oof score : {best_score}')
    return best_model_num, best_score, best_preds

In [13]:
# Try equal-weight soft-voting ensembles of the top 2..N ranked models and keep
# only the fold-averaged test predictions of the best one. Indexing the result
# instead of unpacking into `_` avoids rebinding IPython's `_` last-output
# variable, and num follows the ranking's length rather than a hard-coded 7.
best_preds = run_k_fold(models, model_scores, num=len(model_scores), weight='balance')[2]

chosen models : ['xgboost', 'hgb']
Loss : 0.7922
chosen models : ['xgboost', 'hgb', 'extra']
Loss : 0.7924
chosen models : ['xgboost', 'hgb', 'extra', 'rf']
Loss : 0.7923
chosen models : ['xgboost', 'hgb', 'extra', 'rf', 'lightgbm']
Loss : 0.7924
chosen models : ['xgboost', 'hgb', 'extra', 'rf', 'lightgbm', 'logistic_nystroem']
Loss : 0.7927
chosen models : ['xgboost', 'hgb', 'extra', 'rf', 'lightgbm', 'logistic_nystroem', 'poly']
Loss : 0.7926
best model 개수 : 6, best oof score : 0.7926516101929101


In [14]:
# Put the ensemble's probabilities into the 'defects' column. The values are
# assigned positionally.
# NOTE(review): this assumes test_df and the sample submission list ids in the
# same order — confirm.
submission_df = submission_df.assign(defects=best_preds)

In [15]:
# Write the submission (the 'id' index is included) under <data_path>/submission/.
submission_df.to_csv(f'{data_path}submission/ensemble_selected_balance.csv')