In [None]:
import sys

# Add the bundled iterative-stratification sources to the import path
# (presumably what provides the `iterstrat` package imported further down —
# confirm against the attached dataset).
sys.path.append("../input/iterative-stratification/iterative-stratification-master")

In [None]:
import pickle

import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.optimize import minimize_scalar
from sklearn.metrics import roc_auc_score

In [None]:
# import numpy as np


# def objective_for_high(exponent, Y_true, Y_pred, label_smoothing=0.0):
#     high = 1.0 - 10.0 ** exponent
#     Y_pred = Y_pred.copy()
#     Y_pred = np.clip(Y_pred, 0.0, high)

#     return score(Y_true, Y_pred, label_smoothing=label_smoothing)

In [None]:
# import numpy as np


# def objective_for_low(exponent, Y_true, Y_pred, label_smoothing=0.0):
#     low = 10.0 ** exponent
#     Y_pred = Y_pred.copy()
#     Y_pred = np.clip(Y_pred, low, 1.0)

#     return score(Y_true, Y_pred, label_smoothing=label_smoothing)

In [None]:
def objective_for_threshold(exponent, Y_true, Y_pred, label_smoothing=0.0):
    """Score predictions after snapping near-0 / near-1 values to exactly 0 / 1.

    The cut-off is ``10 ** exponent``: predictions below it become 0.0 and,
    afterwards, predictions above ``1 - cutoff`` become 1.0. The input
    ``Y_pred`` is not modified.

    Returns the value of ``score`` for the snapped predictions.
    """
    cutoff = 10.0 ** exponent

    # Two sequential passes, in the same order as the in-place masking they
    # replace: low tail first, then high tail.
    snapped = np.where(Y_pred < cutoff, 0.0, Y_pred)
    snapped = np.where(snapped > 1.0 - cutoff, 1.0, snapped)

    return score(Y_true, snapped, label_smoothing=label_smoothing)

In [None]:
import numpy as np


def objective_for_weights(weights, Y_true, Y_preds, label_smoothing=0.0):
    """Score the weighted blend of several prediction arrays.

    ``weights`` is contracted with the first axis of ``Y_preds`` (one entry
    per model), and the blended predictions are passed to ``score``.
    """
    blended = np.tensordot(weights, Y_preds, axes=([0], [0]))
    return score(Y_true, blended, label_smoothing=label_smoothing)

In [None]:
import numpy as np


def score(Y, Y_pred, eps=1e-15, label_smoothing=0.0):
    """Mean binary cross-entropy over all (flattened) elements.

    Parameters
    ----------
    Y : array-like
        Ground-truth targets; flattened before use.
    Y_pred : array-like
        Predicted probabilities; flattened and clipped to ``[eps, 1 - eps]``
        so the logarithms stay finite.
    eps : float
        Clipping margin applied to the predictions.
    label_smoothing : float
        When positive, targets are shrunk towards 0.5 by this factor.

    Returns
    -------
    float
        Negative mean log-likelihood.
    """
    targets = np.asarray(Y).ravel()

    if label_smoothing > 0.0:
        targets = targets * (1.0 - label_smoothing) + 0.5 * label_smoothing

    probs = np.clip(np.asarray(Y_pred).ravel(), eps, 1.0 - eps)

    log_likelihood = targets * np.log(probs) + (1.0 - targets) * np.log(1.0 - probs)

    return -np.mean(log_likelihood)

In [None]:
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelStratifiedGroupKFold(_BaseKFold):
    """K-fold splitter combining multilabel stratification with partial grouping.

    Groups whose row count is exactly one of ``_REGULAR_GROUP_SIZES`` are kept
    together: each such group is assigned to a fold as a unit, stratified on
    the per-group mean of ``y``. Rows belonging to any other group are
    stratified individually on their own ``y`` rows.

    Both passes use ``MultilabelStratifiedKFold`` configured with this
    splitter's ``n_splits``, ``shuffle`` and ``random_state``.

    ``y`` and ``groups`` must be pandas objects (the implementation relies on
    ``groupby``, ``value_counts``, ``isin``, ``map`` and ``.loc``), with
    ``groups`` sharing the index of ``y``.
    """

    # Row counts that mark a group as "regular".
    # NOTE(review): presumably the sizes of complete groups in the source
    # data — confirm before reusing this splitter on another dataset.
    _REGULAR_GROUP_SIZES = (6, 12, 18)

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each fold, the positional indices of its test rows.

        Raises
        ------
        ValueError
            If ``y`` or ``groups`` is not provided.
        """
        if y is None or groups is None:
            raise ValueError("Both 'y' and 'groups' must be provided.")

        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        value_counts = groups.value_counts()
        is_regular = value_counts.isin(self._REGULAR_GROUP_SIZES)
        regular_groups = value_counts.loc[is_regular].index.sort_values()
        irregular_groups = value_counts.loc[~is_regular].index.sort_values()

        # Pass 1: assign whole regular groups to folds, stratifying on the
        # mean target vector of each group.
        group_to_fold = {}
        group_targets = y.groupby(groups).mean().loc[regular_groups]

        if len(group_targets) > 0:  # nothing to split when no group is regular
            for fold, (_, test) in enumerate(cv.split(group_targets, group_targets)):
                group_to_fold.update(
                    {group: fold for group in group_targets.index[test]}
                )

        # Pass 2: assign the rows of all remaining groups one by one.
        sample_to_fold = {}
        sample_targets = y.loc[groups.isin(irregular_groups)]

        if len(sample_targets) > 0:  # nothing to split when every group is regular
            for fold, (_, test) in enumerate(cv.split(sample_targets, sample_targets)):
                sample_to_fold.update(
                    {sample: fold for sample in sample_targets.index[test]}
                )

        # Rows of regular groups get their group's fold; the rest (NaN after
        # the group lookup) are filled from the per-sample assignment.
        folds = groups.map(group_to_fold)
        is_na = folds.isna()

        if is_na.any():
            folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            yield np.where(folds == i)[0]

In [None]:
import numpy as np


class ObjectiveWithEarlyStopping(object):
    """Callable optimisation objective with validation-based early stopping.

    Each call evaluates ``self._objective`` on the training data (the value
    returned to the optimiser) and on the validation data (only monitored).
    The parameters giving the lowest validation score so far are kept in
    ``_best_params``. After ``patience`` consecutive calls without
    improvement a ``RuntimeError`` is raised to abort the optimiser.

    Subclasses must provide ``_objective``, a callable (or a property
    returning one) with the signature
    ``(params, y_true, y_preds, label_smoothing=...)``.

    Attributes set by this class:
        _nit: number of calls made so far.
        _wait: consecutive calls without a validation improvement.
        _best_score: lowest validation score seen (``np.inf`` initially).
        _best_params: parameters that produced ``_best_score``; ``None``
            until the first improvement.
    """

    def __init__(
        self, y_true, y_preds, y_true_valid, y_preds_valid, label_smoothing=0.0, patience=30
    ):
        self.y_true = np.asarray(y_true)
        self.y_preds = np.asarray(y_preds)
        self.y_true_valid = np.asarray(y_true_valid)
        self.y_preds_valid = np.asarray(y_preds_valid)
        self.label_smoothing = label_smoothing
        self.patience = patience

        self._nit = 0
        self._wait = 0
        self._best_score = np.inf
        # Initialise the attribute that __call__ writes and callers read, so
        # it exists even when no call ever improves the validation score.
        self._best_params = None

    def _objective(self, params, y_true, y_preds, label_smoothing=0.0):
        """Scoring function; must be overridden by subclasses."""
        raise NotImplementedError(
            f"{type(self).__name__} must define '_objective'"
        )

    def __call__(self, params):
        """Return the training score for ``params`` and update the stopping state.

        Raises
        ------
        RuntimeError
            When the validation score has not improved for ``patience``
            consecutive calls.
        """
        train_score = self._objective(
            params,
            self.y_true,
            self.y_preds,
            label_smoothing=self.label_smoothing,
        )
        valid_score = self._objective(
            params,
            self.y_true_valid,
            self.y_preds_valid,
            label_smoothing=self.label_smoothing,
        )

        self._nit += 1

        if valid_score < self._best_score:
            self._wait = 0
            self._best_score = valid_score
            # Keep a private copy: an optimiser may reuse (and overwrite) the
            # array it passes in, which would silently corrupt the best params.
            if isinstance(params, np.ndarray):
                self._best_params = params.copy()
            else:
                self._best_params = params
        else:
            self._wait += 1

        if self._wait >= self.patience:
            raise RuntimeError(f"Epoch {self._nit}: early stopping")

        return train_score


# class ObjectiveForHighWithEarlyStopping(ObjectiveWithEarlyStopping):
#     @property
#     def _objective(self):
#          return objective_for_high


# class ObjectiveForLowWithEarlyStopping(ObjectiveWithEarlyStopping):
#     @property
#     def _objective(self):
#          return objective_for_low


class ObjectiveForThresholdWithEarlyStopping(ObjectiveWithEarlyStopping):
    """Early-stopping objective that scores with ``objective_for_threshold``."""

    @property
    def _objective(self):
        """Scoring function applied to both the training and validation data."""
        return objective_for_threshold


class ObjectiveForWeightsWithEarlyStopping(ObjectiveWithEarlyStopping):
    """Early-stopping objective that scores with ``objective_for_weights``."""

    @property
    def _objective(self):
        """Scoring function applied to both the training and validation data."""
        return objective_for_weights

In [None]:
index_col = "sig_id"

# Scored training targets, one row per `sig_id`.
Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)

# Group label for every `sig_id`, as a Series.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# 2.0; squeezing the column axis of the parsed frame is the supported
# equivalent and also works on older pandas versions.
groups = pd.read_csv(
    "../input/lish-moa/train_drug.csv", index_col=index_col
).squeeze("columns")

columns = Y.columns

In [None]:
# Number of training rows and number of target columns.
train_size, n_classes = len(Y.index), len(Y.columns)

In [None]:
Y_preds = []

# Pickled out-of-fold prediction files to blend; commented-out entries are
# candidates that are currently excluded.
paths = [
    # anonamename
    # "../input/mlp-for-ensemble/kaggle_upload/Y_pred_2l.pkl",
    # "../input/mlp-for-ensemble/kaggle_upload/Y_pred_3l_v2.pkl",
    # "../input/mlp-for-ensemble/kaggle_upload/Y_pred_4l.pkl",
    # "../input/mlp-for-ensemble/kaggle_upload/Y_pred_5l.pkl",
    "../input/mlp-for-ensemble/kaggle_upload/Y_pred_rs.pkl",
    "../input/mlp-for-ensemble/kaggle_upload/Y_pred_StackedTabNet.pkl",
    # "../input/moa-grownet/Y_pred.pkl",
    # "../input/moa-lgbmclassifier-classifierchain/Y_pred.pkl",
    # "../input/moa-lightgbm/Y_pred.pkl",
    # "../input/moa-rapids-svm-seed-y-pred/Y_pred.pkl",
    # ari hiro
    "../input/transformer-fit/Y_pred.pkl",
    # hirune924
    "../input/pytorch-mlp-tabnet-many-fe-predict/mlp_oof_avg.pkl",
    "../input/pytorch-mlp-tabnet-many-fe-predict/tabnet_oof_avg.pkl",
    "../input/pytorch-tabnet-pretraining-step3-many-fe-predict/tabnet_oof_avg.pkl",
    # Kon
    # "../input/lstmclassifier-fit/Y_pred.pkl",
    # "../input/mlpclassifier-fit/Y_pred.pkl",
    "../input/resnetclassifier-fit/Y_pred.pkl",
    # "../input/tabnetclassifier-fit/Y_pred.pkl",
    # "../input/transformerclassifier-fit/Y_pred.pkl",
    # ynishi
    # "../input/21-tabnet-fit/Y_pred.pkl",
]
n_models = len(paths)

# Per-model metrics, one row per prediction file.
result = pd.DataFrame(index=paths)

for path in paths:
    # NOTE(security): unpickling executes arbitrary code embedded in the file;
    # only load prediction files from trusted sources.
    with open(path, "rb") as f:
        Y_pred = np.asarray(pickle.load(f))

    # Every model must cover exactly the same rows and targets as Y,
    # otherwise the stacked array and the per-column blending are meaningless.
    if Y_pred.shape != Y.shape:
        raise ValueError(
            f"{path}: predictions have shape {Y_pred.shape}, expected {Y.shape}"
        )

    Y_preds.append(Y_pred)

    result.loc[path, "oof_logloss"] = score(Y, Y_pred)
    result.loc[path, "oof_roc_auc_score"] = roc_auc_score(Y, Y_pred, average="micro")

# Stack into a single (n_models, train_size, n_classes) array.
Y_preds = np.asarray(Y_preds)

In [None]:
# Pairwise similarity of the models: for each pair (i > j) the mean of the
# column-wise correlations between their predictions. The diagonal and the
# upper triangle are left at zero.
pred_frames = [pd.DataFrame(pred) for pred in Y_preds]
corr_values = np.zeros((n_models, n_models))

for i in range(1, n_models):
    for j in range(i):
        corr_values[i, j] = pred_frames[i].corrwith(pred_frames[j]).mean()

corr = pd.DataFrame(corr_values, columns=paths, index=paths)

corr.style.background_gradient(cmap="Blues", subset=paths, vmax=1.0, vmin=0.0)

In [None]:
# Hyperparameters of the cross-validated weight search.
# n_seeds: number of CV repetitions; repetition i uses random_state=i.
n_seeds = 1
# n_splits: number of folds per repetition.
n_splits = 5
# shuffle: forwarded to the splitter's `shuffle` argument.
shuffle = True

In [None]:
%%time
# Out-of-fold blended predictions, accumulated over seeds.
Y_pred = np.zeros((train_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, dtype="float", index=Y.index)

# Per-target blend weights and iteration counts, averaged over seeds and folds.
weights = np.zeros((n_classes, n_models))
n_iters = np.zeros(n_classes)

# Start from equal weights; each weight stays in [0, 1] and they sum to 1.
x0 = np.ones(n_models) / n_models
bounds = [(0.0, 1.0) for _ in range(n_models)]
constraints = {
    "type": "eq",
    "fun": lambda x: np.sum(x) - 1.0,
    "jac": lambda x: np.ones_like(x),
}
# ftol=0 with a very large maxiter: termination is left to the objective's
# early stopping rather than to the optimiser's own tolerance.
options = {"ftol": 0.0, "maxiter": 1_000_000}

for i in range(n_seeds):
    cv = MultilabelStratifiedGroupKFold(
        n_splits=n_splits, random_state=i, shuffle=shuffle
    )

    for train, valid in cv.split(Y, Y, groups):
        # One independent weight search per target column.
        for k in range(len(columns)):
            objective = ObjectiveForWeightsWithEarlyStopping(
                Y.iloc[train, [k]],
                Y_preds[:, train, [k]],
                Y.iloc[valid, [k]],
                Y_preds[:, valid, [k]],
                label_smoothing=1e-06,
                patience=30,
            )

            try:
                minimize(
                    objective,
                    x0,
                    bounds=bounds,
                    constraints=constraints,
                    method="SLSQP",
                    options=options,
                )
            except RuntimeError as exc:
                # The objective aborts the optimiser by raising RuntimeError
                # once patience runs out; any other RuntimeError is a genuine
                # failure and must not be hidden.
                if "early stopping" not in str(exc):
                    raise

            # The optimiser's own result is ignored on purpose: the weights
            # with the best *validation* score are what gets used.
            best_params = getattr(objective, "_best_params", None)

            if best_params is None:
                # No evaluation improved on the initial (infinite) validation
                # score, e.g. because every score was NaN: fall back to the
                # equal-weight starting point instead of crashing.
                best_params = x0

            weights[k] += best_params / n_seeds / n_splits
            n_iters[k] += objective._nit / n_seeds / n_splits

            Y_pred.iloc[valid, k] += np.tensordot(
                best_params, Y_preds[:, valid, [k]], axes=(0, 0)
            ) / n_seeds

with open("weights.pkl", "wb") as f:
    pickle.dump(weights, f)

In [None]:
# Average weight each model received across all target columns.
result["weights_mean"] = weights.mean(axis=0)

result.sort_values(by="oof_logloss").style.background_gradient(cmap="Blues")

In [None]:
# Per-target view: one row per target column with the weight of every model,
# the number of positive labels and the mean number of objective evaluations.
result = pd.DataFrame(weights, columns=paths, index=Y.columns).assign(
    n_pos=Y.sum(),
    n_iter=n_iters,
)

result.style.background_gradient(cmap="Blues", subset=paths, vmax=1.0, vmin=0.0)

In [None]:
# Log loss of the blended out-of-fold predictions over all target columns.
score(Y[columns], Y_pred[columns])

In [None]:
# Micro-averaged ROC AUC of the blended out-of-fold predictions.
roc_auc_score(Y[columns], Y_pred[columns], average="micro")

In [None]:
# %%time
# Y_pred_postprocessed = np.zeros((train_size, n_classes))
# Y_pred_postprocessed = pd.DataFrame(
#     Y_pred_postprocessed, columns=Y.columns, dtype="float", index=Y.index
# )

# thresholds = np.empty(n_classes)
# n_iters = np.zeros(n_classes)

# for i in range(n_seeds):
#     cv = MultilabelStratifiedGroupKFold(
#         n_splits=n_splits, random_state=i, shuffle=shuffle
#     )

#     for j, (train, valid) in enumerate(cv.split(Y, Y, groups)):
#         for k, column in enumerate(columns):
#             objective = ObjectiveForThresholdWithEarlyStopping(
#                 Y.iloc[train, [k]],
#                 Y_pred.iloc[train, [k]],
#                 Y.iloc[valid, [k]],
#                 Y_pred.iloc[valid, [k]],
#                 # label_smoothing=1e-06,
#                 patience=30,
#             )

#             try:
#                 res = minimize_scalar(
#                     objective,
#                     bounds=[-6.0, -4.0],
#                     method="bounded",
#                     options={"maxiter": 1_000_000, "xatol": 0.0},
#                 )
#             except RuntimeError:
#                 pass

#             thresholds[k] += objective._best_params / n_seeds / n_splits
#             n_iters[k] += objective._nit / n_seeds / n_splits

#             tmp = Y_pred.iloc[:, k].copy()
#             tmp[tmp < 10 ** objective._best_params] = 0.0
#             tmp[tmp > 1.0 - 10 ** objective._best_params] = 1.0

#             Y_pred_postprocessed.iloc[valid, k] += tmp.iloc[valid] / n_splits

# thresholds = 10 ** thresholds

# with open("thresholds.pkl", "wb") as f:
#     pickle.dump(thresholds, f)

In [None]:
# score(Y[columns], Y_pred_postprocessed[columns])

In [None]:
# roc_auc_score(Y[columns], Y_pred_postprocessed[columns], average="micro")

In [None]:
# b 0.015139477616298346: 5 models
# + 0.015155485177579045: remove anonamename's resnet
# + 0.015175202091833243: remove hirune924's mlp
# + 0.015164515328761272: remove hirune924's tabnet
# + 0.015267901074102700: remove Kon's resnet
# + 0.015182427500416380: remove hirune924's custom tabnet
# - 0.015137371244008616: add Kon's tabnet
# - 0.015125755262557580: add anonamename's tabnet
# - 0.015123081606109805: add ari hiro's transformer
# + 0.015126446377100101: add anonamename's 2l-mlp
# + 0.015127751381848550: add anonamename's lightgbm
# + 0.015127386890777982: add Kon's transformer
# + 0.015144190451307262: add anonamename's grownet
# + 0.015129582758532654: add anonamename's 3l-mlp
# + 0.015128664309679032: add Kon's lstm
# + 0.015125454798629432: add anonamename's 4l-mlp
# + 0.015125978015766315: add anonamename's 5l-mlp
# + 0.015127642264192595: add Kon's mlp