In [16]:
# 각 알고리즘 별 oof를 수집하여 Ridge를 이용하여 weight를 추출
# Voting model을 구성하여 위에서 구한 weight기반하여 결과 추출

In [17]:
import pandas as pd
import numpy as np

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

In [19]:
data_path='../../data/'

In [20]:
ambrosm_oof_df = pd.read_csv(data_path+'oof/ambrosm.csv')
lightgbm_oof_df = pd.read_csv(data_path+'oof/lightgbm.csv')
xgboost_oof_df = pd.read_csv(data_path+'oof/xgboost.csv')

In [21]:
train_df = pd.read_csv(data_path+'train.csv', index_col='id')
test_df = pd.read_csv(data_path+'test.csv', index_col='id')
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

In [22]:
X = train_df.drop(columns='defects')
y = train_df['defects']

In [23]:
all_oof_df = pd.concat([ambrosm_oof_df, xgboost_oof_df, lightgbm_oof_df], axis=1)
all_oof_df

Unnamed: 0,linear_best_oof,logistic_best_oof,logistic_nystroem_best_oof,extra_best_oof,rf_best_oof,knn_best_oof,hgb_best_oof,xgboost_oof,lightgbm_oof
0,0.113020,0.292676,0.263026,0.087506,0.082296,0.077995,0.084565,0.071362,0.074083
1,0.091172,0.217814,0.191553,0.065491,0.060955,0.059274,0.057399,0.053843,0.055853
2,0.077846,0.177005,0.161153,0.046353,0.038370,0.057778,0.073632,0.045884,0.049316
3,0.083293,0.187400,0.159725,0.086816,0.075404,0.066361,0.082444,0.085829,0.080914
4,0.091548,0.231135,0.313546,0.125164,0.136323,0.116845,0.097831,0.112108,0.104143
...,...,...,...,...,...,...,...,...,...
101758,0.082243,0.180461,0.160814,0.056628,0.046545,0.076260,0.053516,0.078027,0.052296
101759,0.151209,0.397482,0.467674,0.184923,0.187295,0.196183,0.183232,0.166571,0.193487
101760,0.113829,0.317497,0.338628,0.155416,0.148849,0.127197,0.125833,0.143557,0.126170
101761,0.078646,0.178350,0.164692,0.053035,0.048034,0.055675,0.050798,0.062254,0.046861


In [24]:
from sklearn.linear_model import RidgeClassifierCV

ridge_model = RidgeClassifierCV(alphas=np.logspace(-5, 1, 10), fit_intercept=False, scoring='roc_auc', cv=5)
ridge_model.fit(all_oof_df, y)
weights = ridge_model.coef_[0]
print(f'best alpha : {ridge_model.alpha_}')
pd.Series(weights, index=all_oof_df.columns)

best alpha : 1e-05


linear_best_oof              -1.338477
logistic_best_oof            -1.932318
logistic_nystroem_best_oof   -2.560529
extra_best_oof                2.199417
rf_best_oof                   0.459820
knn_best_oof                  1.566002
hgb_best_oof                  0.894229
xgboost_oof                   1.590566
lightgbm_oof                  1.191794
dtype: float64

In [25]:
models = [
    ('linear',make_pipeline(
                FunctionTransformer(np.log1p),
                PolynomialFeatures(2, include_bias=False),
                StandardScaler(),
                CalibratedClassifierCV(LinearSVC(dual=False, C=0.78858))
            )),
    ('logistic',make_pipeline(
                FunctionTransformer(np.log1p),
                PolynomialFeatures(2, include_bias=False),
                StandardScaler(),
                LogisticRegression(
                    dual=False,
                    C=0.32,
                    class_weight='balanced',
                    max_iter=1500,
                    random_state=61,
                    solver='newton-cholesky'
                )
            )),
    ('logistic_nystroem',make_pipeline(
                FunctionTransformer(np.log1p),
                Nystroem(n_components=400, random_state=61),
                StandardScaler(),
                LogisticRegression(dual=False, C=0.0032, max_iter=1500, random_state=61)
            )),
    ('extra',make_pipeline(
                FunctionTransformer(np.log1p),
                ExtraTreesClassifier(
                    n_estimators=100,
                    min_samples_leaf=110,
                    max_features=1.0,
                    random_state=61
                ),
            )),
    ('rf',RandomForestClassifier(min_samples_leaf=220, max_features=1.0, random_state=61)),
    ('knn',make_pipeline(
                FunctionTransformer(np.log1p),
                StandardScaler(),
                KNeighborsClassifier(
                    n_neighbors=360,
                    weights='distance'
                )
            )),
    ('hgb',HistGradientBoostingClassifier(random_state=61)),
    ('xgboost',XGBClassifier(
            max_depth=5,
            colsample_bynode=0.6028784042231218,
            reg_lambda=1.9107959408881667,
            n_estimators=38,
            learning_rate=0.19089614460186538,
            random_state=61,
            eval_metric=roc_auc_score,
        )
    ),
    ('lightgbm',LGBMClassifier(
            max_depth=24,
            num_leaves=32,
            min_child_samples=74,
            colsample_bytree=0.7080352104708246,
            n_estimators=63,
            learning_rate=0.09089244698723227,
            random_state=61
        )
    )
]

In [35]:
from sklearn.model_selection import StratifiedKFold

# Make KFold OOF prediction
K=5
def oof_preds(best_model):

    # make KFold
    folds = StratifiedKFold(n_splits=K, random_state=61, shuffle=True)
    final_preds = []
    losses = []
    oof = np.full(len(X), np.nan)
    # fitting with best_model
    for i, (train_idx, val_idx) in enumerate(folds.split(X, y)):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        print(f"========== Fold {i+1} ==========")
        best_model.fit(X_train, y_train)
        preds = best_model.predict_proba(X_val)[:, 1]
        oof[val_idx] = preds
        test_preds = best_model.predict_proba(test_df)[:, 1]
        final_preds.append(test_preds)
        loss = roc_auc_score(y_val, preds)

        losses.append(loss)

    avg_loss = np.mean(losses)
    print(f"Loss : {avg_loss:.4f}")
    return final_preds, oof, avg_loss

In [41]:
model_scores = [(model_name, oof_preds(model)[2]) for model_name, model in models]
model_scores.sort(key=lambda m: m[1], reverse=True)

Loss : 0.7879
Loss : 0.7897
Loss : 0.7912
Loss : 0.7910
Loss : 0.7913
Loss : 0.7875
Loss : 0.7913
Loss : 0.7916
Loss : 0.7920


In [44]:
model_scores

[('lightgbm', 0.7920163420770895),
 ('xgboost', 0.7916494425928904),
 ('rf', 0.791317471130813),
 ('hgb', 0.791295894674041),
 ('logistic_nystroem', 0.7911558772672646),
 ('extra', 0.7910446684288237),
 ('logistic', 0.7896575091156404),
 ('linear', 0.7879088097006365),
 ('knn', 0.7875235235646615)]

In [50]:
from sklearn.ensemble import VotingClassifier
selected_models = [
    ('lightgbm',LGBMClassifier(
            max_depth=24,
            num_leaves=32,
            min_child_samples=74,
            colsample_bytree=0.7080352104708246,
            n_estimators=63,
            learning_rate=0.09089244698723227,
            random_state=61
        )
    ),
    ('xgboost',XGBClassifier(
            max_depth=5,
            colsample_bynode=0.6028784042231218,
            reg_lambda=1.9107959408881667,
            n_estimators=38,
            learning_rate=0.19089614460186538,
            random_state=61,
            eval_metric=roc_auc_score,
        )
    ),
    ('rf',RandomForestClassifier(min_samples_leaf=220, max_features=1.0, random_state=61)),
    ('hgb', HistGradientBoostingClassifier(random_state=61)),
    ('logistic_nystroem',make_pipeline(
        FunctionTransformer(np.log1p),
        Nystroem(n_components=400, random_state=61),
        StandardScaler(),
        LogisticRegression(dual=False, C=0.0032, max_iter=1500, random_state=61)
    )),
    ('extra', ExtraTreesClassifier(
            n_estimators=100,
            min_samples_leaf=110,
            max_features=1.0,
            random_state=61
        ),
    ),
]

best_preds = None
best_score = 0
best_model_num = 0
for model_num in range(2, 7):
    weights = [6,5,4,3,2,1]
    voter_model = VotingClassifier(selected_models[:model_num], weights = weights[:model_num], voting = 'soft')
    preds, oof, oof_score = oof_preds(voter_model)
    preds = np.mean(preds, axis=0)
    if model_num==2 or best_score<oof_score:
        best_preds = preds
        best_score = oof_score
        best_model_num = model_num
print(f'best model 개수 : {best_model_num}, best oof score : {best_score}')

Loss : 0.7922
Loss : 0.7924
Loss : 0.7924
Loss : 0.7926
Loss : 0.7926
best model 개술 : 5, best oof score : 0.7925872440398897


In [51]:
submission_df['defects'] = best_preds

In [52]:
submission_df.to_csv(data_path+'submission/ensemble_selected_grid.csv')