In [11]:
import pandas as pd
import numpy as np

In [12]:
data_path='../data/'
ambrosm_oof_df = pd.read_csv(data_path+'oof/ambrosm.csv')
lightgbm_oof_df = pd.read_csv(data_path+'oof/lightgbm.csv')
xgboost_oof_df = pd.read_csv(data_path+'oof/xgboost.csv')
mlp_oof_df = pd.read_csv(data_path+'oof/mlp.csv')

In [13]:
train_df = pd.read_csv(data_path+'train.csv', index_col='id')
test_df = pd.read_csv(data_path+'test.csv', index_col='id')
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

In [14]:
((pd.read_csv(data_path+'submission_ambrosm.csv', index_col='id')
+pd.read_csv(data_path+'submission_xgboost_kfold.csv', index_col='id')
+pd.read_csv(data_path+'submission_lightgbm_kfold.csv', index_col='id')) / 3.0).to_csv(data_path+'submission_a_x_l.csv')

In [15]:
X = train_df.drop(columns='defects')
y = train_df['defects']

In [16]:
all_oof_df = pd.concat([ambrosm_oof_df, xgboost_oof_df, lightgbm_oof_df, mlp_oof_df], axis=1)
all_oof_df.columns

Index(['linear_best_oof', 'logistic_best_oof', 'logistic_nystroem_best_oof',
       'extra_best_oof', 'rf_best_oof', 'knn_best_oof', 'hgb_best_oof',
       'xgboost_oof', 'lightgbm_oof', 'mlp_oof'],
      dtype='object')

In [17]:
from sklearn.linear_model import RidgeClassifier, Lasso

weights = Lasso(random_state = 61).fit(all_oof_df, train_df.defects).coef_[0]
# weights.index = all_oof_df.columns
pd.Series(weights, index=all_oof_df.columns)

linear_best_oof               0.0
logistic_best_oof             0.0
logistic_nystroem_best_oof    0.0
extra_best_oof                0.0
rf_best_oof                   0.0
knn_best_oof                  0.0
hgb_best_oof                  0.0
xgboost_oof                   0.0
lightgbm_oof                  0.0
mlp_oof                       0.0
dtype: float64

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

In [19]:
models = [
    ('linear',make_pipeline(
                FunctionTransformer(np.log1p),
                PolynomialFeatures(2, include_bias=False),
                StandardScaler(),
                CalibratedClassifierCV(LinearSVC(dual=False, C=0.78858))
            )),
    ('logistic',make_pipeline(
                FunctionTransformer(np.log1p),
                PolynomialFeatures(2, include_bias=False),
                StandardScaler(),
                LogisticRegression(
                    dual=False,
                    C=0.32,
                    class_weight='balanced',
                    max_iter=1500,
                    random_state=61,
                    solver='newton-cholesky'
                )
            )),
    ('logistic_nystroem',make_pipeline(
                FunctionTransformer(np.log1p),
                Nystroem(n_components=400, random_state=61),
                StandardScaler(),
                LogisticRegression(dual=False, C=0.0032, max_iter=1500, random_state=61)
            )),
    ('extra',make_pipeline(
                FunctionTransformer(np.log1p),
                ExtraTreesClassifier(
                    n_estimators=100,
                    min_samples_leaf=110,
                    max_features=1.0,
                    random_state=61
                ),
            )),
    ('rf',RandomForestClassifier(min_samples_leaf=220, max_features=1.0, random_state=61)),
    ('knn',make_pipeline(
                FunctionTransformer(np.log1p),
                StandardScaler(),
                KNeighborsClassifier(
                    n_neighbors=360,
                    weights='distance'
                )
            )),
    ('hgb',HistGradientBoostingClassifier(random_state=61)),
    ('xgboost',XGBClassifier(
            max_depth=5,
            colsample_bynode=0.5893033001541113,
            reg_lambda=2.51229884910896,
            n_estimators=77,
            learning_rate=0.11576587720138976,
            random_state=61,
            eval_metric=roc_auc_score,
        )
    ),
    ('lightgbm',LGBMClassifier(
            max_depth=20,
            num_leaves=128,
            min_child_samples=63,
            colsample_bytree=0.6674419461546907,
            n_estimators=966,
            learning_rate=0.008527944132064239,
            random_state=61
        )
    ),
    ('mlp', make_pipeline(
            FunctionTransformer(np.log1p),
            StandardScaler(),
            MLPClassifier(
                learning_rate='constant',
                alpha =0.014831922236360796,
                hidden_layer_sizes = (15,30,15,),
                activation='relu',
                solver='adam',
                # **best_params,
                max_iter = 1000,
                random_state=61
            )
        )
    )
]

In [9]:
from sklearn.model_selection import StratifiedKFold


# Make KFold OOF prediction
K=5
def oof_preds(best_model):

    # make KFold
    folds = StratifiedKFold(n_splits=K, random_state=61, shuffle=True)
    final_preds = []
    losses = []
    oof = np.full(len(X), np.nan)
    # fitting with best_model
    for i, (train_idx, val_idx) in enumerate(folds.split(X, y)):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        print(f"========== Fold {i+1} ==========")
        best_model.fit(X_train, y_train)
        preds = best_model.predict_proba(X_val)[:, 1]
        oof[val_idx] = preds
        test_preds = best_model.predict_proba(test_df)[:, 1]
        final_preds.append(test_preds)
        loss = roc_auc_score(y_val, preds)

        losses.append(loss)

    avg_loss = np.mean(losses)
    print(f"Loss : {avg_loss:.4f}")
    return final_preds, oof

In [23]:
from sklearn.ensemble import VotingClassifier

voter_model = VotingClassifier(models, weights = weights, voting = 'soft')
voter_model

In [None]:
preds, oof = oof_preds(voter_model)
preds = np.mean(preds, axis=0)
preds
# mlp 없는 모델
# 0.7923
# mlp 있는 모델 - not log
# 0.7924
# mlp 있는 모델 - log
#

In [10]:
submission_df['defects'] = preds

In [11]:
submission_df.to_csv(data_path+'submission_final.csv')