In [None]:
import sys

# Add the bundled AdaBelief directory to the import path; `adabelief_tf` is
# imported further down (presumably it is provided by this directory).
sys.path.append("../input/adabeliefoptimizer/pypi_packages/adabelief_tf0.1.0")

In [None]:
import sys

# Add the iterative-stratification checkout to the import path; `iterstrat` is
# imported further down (presumably it is provided by this directory).
sys.path.append("../input/iterative-stratification/iterative-stratification-master")

In [None]:
import sys

# Add the TabNet dataset directory to the import path; `tabnet` is imported
# further down (presumably it is provided by this directory).
sys.path.append("../input/tabnet")

In [None]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from adabelief_tf import AdaBeliefOptimizer

In [None]:
import tensorflow


def build_callbacks(
    model_path, factor=0.1, mode="auto", monitor="val_loss", patience=0, verbose=0
):
    """Create the Keras callbacks used for a single training run.

    Parameters
    ----------
    model_path
        File path given to ``ModelCheckpoint``; only the best model according
        to ``monitor`` is written (``save_best_only=True``).
    factor
        ``factor`` argument of ``ReduceLROnPlateau``.
    mode, monitor, verbose
        Passed unchanged to all three callbacks.
    patience
        ``patience`` argument of ``EarlyStopping`` only; ``ReduceLROnPlateau``
        is created without one and therefore keeps its library default.

    Returns
    -------
    list
        ``[EarlyStopping, ModelCheckpoint, ReduceLROnPlateau]``, in that order.

    NOTE(review): ``ModelCheckpoint`` is created without ``save_weights_only``.
    Confirm that saving a full model works for the model type and file format
    the caller uses.
    """
    keras_callbacks = tf.keras.callbacks
    # Settings that every callback receives.
    shared = {"mode": mode, "monitor": monitor, "verbose": verbose}

    return [
        keras_callbacks.EarlyStopping(patience=patience, **shared),
        keras_callbacks.ModelCheckpoint(model_path, save_best_only=True, **shared),
        keras_callbacks.ReduceLROnPlateau(factor=factor, **shared),
    ]

In [None]:
def compute_row_statistics(X, prefix=""):
    """Return per-row summary statistics of ``X`` as a new DataFrame.

    For each of ``mean``, ``std``, ``kurtosis`` and ``skew`` the statistic is
    computed across the columns of every row (``X.agg(name, axis=1)``) and
    stored in a column named ``prefix + name``. ``min`` and ``max`` are not
    computed.

    Parameters
    ----------
    X
        DataFrame whose rows are summarised.
    prefix
        String placed in front of every output column name.

    Returns
    -------
    pandas.DataFrame
        One column per statistic, in the order listed above.
    """
    statistic_names = ("mean", "std", "kurtosis", "skew")
    row_stats = pd.DataFrame()

    for name in statistic_names:
        column = "{}{}".format(prefix, name)
        row_stats[column] = X.agg(name, axis=1)

    return row_stats

In [None]:
import numpy as np


def score(Y, Y_pred, eps=1e-15, label_smoothing=0.0):
    """Mean binary cross-entropy between targets and predicted probabilities.

    Both inputs are converted to arrays and flattened, so the mean is taken
    over every element.

    Parameters
    ----------
    Y
        Target values.
    Y_pred
        Predicted probabilities; clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken.
    eps
        Clipping bound for the predictions.
    label_smoothing
        When greater than zero the targets are replaced by
        ``Y * (1 - label_smoothing) + 0.5 * label_smoothing``.

    Returns
    -------
    float
        The negated mean of ``Y*log(p) + (1-Y)*log(1-p)``.
    """
    targets = np.ravel(np.asarray(Y))
    if label_smoothing > 0.0:
        targets = targets * (1.0 - label_smoothing) + 0.5 * label_smoothing

    probs = np.clip(np.ravel(np.asarray(Y_pred)), eps, 1.0 - eps)

    log_likelihood = targets * np.log(probs) + (1.0 - targets) * np.log(1.0 - probs)
    return -np.mean(log_likelihood)

In [None]:
import os
import random as rn

import tensorflow as tf
import numpy as np


def set_seed(seed=0):
    """Seed the random number generators and install a single-threaded session.

    Sets ``PYTHONHASHSEED`` in the environment, seeds Python's ``random``,
    NumPy and TensorFlow with ``seed``, then registers a TF1-compat session on
    the default graph whose inter-op and intra-op parallelism are both limited
    to one thread.

    Parameters
    ----------
    seed
        Seed value used for every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Seed each library's generator with the same value.
    rn.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    v1 = tf.compat.v1
    default_graph = v1.get_default_graph()
    single_threaded = v1.ConfigProto(
        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1
    )

    # Make the single-threaded session the one the Keras backend uses.
    v1.keras.backend.set_session(
        v1.Session(graph=default_graph, config=single_threaded)
    )

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Clip every column to quantile bounds learned in ``fit``.

    Parameters
    ----------
    copy
        If true, ``transform`` works on a copy of its input; otherwise the
        input object itself is clipped in place and returned.
    high
        Quantile used as the upper bound.
    low
        Quantile used as the lower bound.
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy = copy
        self.high = high
        self.low = low

    def fit(self, X, y=None):
        """Store the ``high`` / ``low`` quantiles of ``X`` and return ``self``."""
        self.data_max_ = X.quantile(q=self.high)
        self.data_min_ = X.quantile(q=self.low)
        return self

    def transform(self, X):
        """Clip ``X`` to the fitted bounds and return the clipped object."""
        target = X.copy() if self.copy else X
        # In-place clip: with copy=False this mutates the caller's object.
        target.clip(
            lower=self.data_min_, upper=self.data_max_, axis=1, inplace=True
        )
        return target

In [None]:
# https://arxiv.org/abs/1905.04899

import numpy as np
import tensorflow as tf


class Cutmix(tf.keras.utils.Sequence):
    """Batch generator that mixes each batch with randomly drawn rows.

    For every batch a mixing ratio is drawn from ``Beta(alpha, alpha)`` and a
    0/1 mask over the features is sampled, with probability equal to that
    ratio of keeping the original feature. The same mask is used for every row
    of the batch. Masked-out features are taken from partner rows sampled from
    the whole data set. Targets, when given, are blended with the drawn ratio.

    Parameters
    ----------
    X
        Feature matrix; converted with ``np.asarray``.
    y
        Optional targets; converted with ``np.asarray`` when not ``None``.
    batch_size
        Number of rows per batch (the last batch may be shorter).
    alpha
        Both shape parameters of the Beta distribution.
    """

    def __init__(self, X, y=None, batch_size=32, alpha=1.0):
        self.X = np.asarray(X)
        self.y = None if y is None else np.asarray(y)
        self.batch_size = batch_size
        self.alpha = alpha

    def __getitem__(self, i):
        """Return the ``i``-th mixed batch as ``(X_batch, y_batch)``.

        ``y_batch`` is ``None`` when the generator was built without targets.
        """
        rows = slice(i * self.batch_size, (i + 1) * self.batch_size)
        X_batch = self.X[rows]

        n_samples, n_features = self.X.shape
        # One partner row per batch row, drawn from the full data set.
        partners = np.random.choice(n_samples, X_batch.shape[0])

        ratio = np.random.beta(self.alpha, self.alpha)
        keep = np.random.choice([0.0, 1.0], size=n_features, p=[1.0 - ratio, ratio])
        X_mixed = keep * X_batch + (1.0 - keep) * self.X[partners]

        if self.y is None:
            return X_mixed, None

        # Targets are blended with the drawn ratio, not the realised mask.
        y_mixed = ratio * self.y[rows] + (1.0 - ratio) * self.y[partners]

        return X_mixed, y_mixed

    def __len__(self):
        """Number of batches needed to cover all rows once."""
        return int(np.ceil(self.X.shape[0] / self.batch_size))

In [None]:
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelStratifiedGroupKFold(_BaseKFold):
    """K-fold splitter combining multilabel stratification with grouping.

    Groups that occur exactly 6, 12 or 18 times are assigned to folds as a
    whole, stratified on the per-group mean of the targets. Rows of every
    other group are assigned to folds individually, stratified on their own
    targets.

    ``y`` and ``groups`` are used through their pandas API (``groupby``,
    ``loc``, ``value_counts``, ``map``).
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield the positional test indices of each fold in turn."""
        splitter = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        group_sizes = groups.value_counts()
        is_regular = group_sizes.isin([6, 12, 18])
        regular_groups = group_sizes.loc[is_regular].index.sort_values()
        irregular_groups = group_sizes.loc[~is_regular].index.sort_values()

        # Regular groups: stratify on the mean target vector of each group so
        # that all rows of a group end up in the same fold.
        group_targets = y.groupby(groups).mean().loc[regular_groups]
        group_splits = splitter.split(group_targets, group_targets)
        group_to_fold = {}
        for fold, (_, test) in enumerate(group_splits):
            for group in group_targets.index[test]:
                group_to_fold[group] = fold

        # Irregular groups: stratify their rows one by one.
        sample_targets = y.loc[groups.isin(irregular_groups)]
        sample_splits = splitter.split(sample_targets, sample_targets)
        sample_to_fold = {}
        for fold, (_, test) in enumerate(sample_splits):
            for sample in sample_targets.index[test]:
                sample_to_fold[sample] = fold

        # Rows of regular groups get their group's fold; the remaining rows
        # (NaN after the map) are filled from the per-sample assignment.
        folds = groups.map(group_to_fold)
        unassigned = folds.isna()
        folds[unassigned] = folds[unassigned].index.map(sample_to_fold).values

        for fold in range(self.n_splits):
            yield np.where(folds == fold)[0]

In [None]:
# import tensorflow as tf
# from tabnet import StackedTabNet


# class StackedTabNetClassifier(tf.keras.Model):
#     def __init__(
#         self,
#         num_classes,
#         batch_momentum=0.98,
#         epsilon=1e-05,
#         feature_columns=None,
#         feature_dim=64,
#         norm_type="group",
#         num_decision_steps=5,
#         num_features=None,
#         num_groups=2,
#         num_layers=1,
#         output_dim=64,
#         relaxation_factor=1.5,
#         sparsity_coefficient=1e-05,
#         virtual_batch_size=None,
#         **kwargs
#     ):
#         super().__init__(**kwargs)

#         self.stacked_tabnet = StackedTabNet(
#             feature_columns,
#             batch_momentum=batch_momentum,
#             epsilon=epsilon,
#             feature_dim=feature_dim,
#             norm_type=norm_type,
#             num_decision_steps=num_decision_steps,
#             num_features=num_features,
#             num_groups=num_groups,
#             num_layers=num_layers,
#             output_dim=output_dim,
#             relaxation_factor=relaxation_factor,
#             sparsity_coefficient=sparsity_coefficient,
#             virtual_batch_size=virtual_batch_size,
#         )

#         self.classifier = tf.keras.layers.Dense(
#             num_classes, activation="sigmoid", use_bias=False
#         )

#     def call(self, inputs, training=None):
#         x = self.stacked_tabnet(inputs, training=training)

#         return self.classifier(x)

In [None]:
import tensorflow as tf
from tabnet import TabNet


class TabNetClassifier(tf.keras.Model):
    """A ``TabNet`` encoder followed by a bias-free sigmoid ``Dense`` layer.

    Parameters
    ----------
    num_classes
        Number of units of the output layer.
    batch_momentum, epsilon, feature_columns, feature_dim, norm_type,
    num_decision_steps, num_features, num_groups, output_dim,
    relaxation_factor, sparsity_coefficient, virtual_batch_size
        Passed unchanged to ``TabNet``.
    **kwargs
        Passed to both ``tf.keras.Model.__init__`` and ``TabNet``.
    """

    def __init__(
        self,
        num_classes,
        batch_momentum=0.98,
        epsilon=1e-05,
        feature_columns=None,
        feature_dim=64,
        norm_type="group",
        num_decision_steps=5,
        num_features=None,
        num_groups=1,
        output_dim=64,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-05,
        virtual_batch_size=None,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Collect the encoder settings in one place before building it.
        tabnet_options = dict(
            batch_momentum=batch_momentum,
            epsilon=epsilon,
            feature_dim=feature_dim,
            norm_type=norm_type,
            num_decision_steps=num_decision_steps,
            num_features=num_features,
            num_groups=num_groups,
            output_dim=output_dim,
            relaxation_factor=relaxation_factor,
            sparsity_coefficient=sparsity_coefficient,
            virtual_batch_size=virtual_batch_size,
        )
        self.tabnet = TabNet(feature_columns, **tabnet_options, **kwargs)

        dense = tf.keras.layers.Dense
        self.classifier = dense(num_classes, activation="sigmoid", use_bias=False)

    def call(self, inputs, training=None):
        """Run the encoder, then the sigmoid output layer."""
        encoded = self.tabnet(inputs, training=training)
        return self.classifier(encoded)

In [None]:
# Columns read as categoricals, and the column used as index in every file.
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

train_features = pd.read_csv(
    "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
)
# Keep only the numeric columns as model inputs.
X = train_features.select_dtypes("number")
Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the column axis after reading yields the same Series and works on
# both old and new pandas versions.
groups = pd.read_csv("../input/lish-moa/train_drug.csv", index_col=index_col).squeeze(
    "columns"
)

# Target columns that are trained on and scored.
columns = Y.columns

In [None]:
# Boolean masks over X's columns, selecting the names that start with each prefix.
c_prefix = "c-"
g_prefix = "g-"
c_columns = X.columns.str.startswith(c_prefix)
g_columns = X.columns.str.startswith(g_prefix)
# Row statistics are computed on the values as loaded, before clipping below.
X_stats_c = compute_row_statistics(X.loc[:, c_columns], prefix=c_prefix)
X_stats_g = compute_row_statistics(X.loc[:, g_columns], prefix=g_prefix)

# Presumably a fitted ClippedFeatures instance saved by another notebook — TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open("../input/preprocessor-fit/clipped_features.pkl", "rb") as f:
    clipped_features = pickle.load(f)

X = clipped_features.transform(X)

# X is rebound here: re-running this cell without reloading X appends the
# statistic columns a second time.
X = pd.concat([X, X_stats_c, X_stats_g], axis=1)

In [None]:
# Dimensions used to size the prediction buffer and the model.
train_size, n_features = X.shape
# Index the shape rather than unpacking into `_`: a top-level assignment to `_`
# shadows IPython's "last output" variable for the rest of the session.
n_classes = Y.shape[1]

In [None]:
# hyperparameters
alpha = 4.0  # Beta(alpha, alpha) parameter given to the Cutmix generator
batch_size = 8  # batch size of the Cutmix training generator
factor = 0.5  # given to build_callbacks (ReduceLROnPlateau factor)
label_smoothing = 1e-03  # label smoothing of the BinaryCrossentropy training loss
lr = 0.001  # learning rate given to AdaBeliefOptimizer
n_seeds = 5  # number of seeded repetitions of the cross-validation
n_splits = 5  # number of folds per repetition
patience = 30  # given to build_callbacks (EarlyStopping patience)
shuffle = True  # given to MultilabelStratifiedGroupKFold
# Keyword arguments for TabNetClassifier, in addition to num_classes / num_features.
params = {
    "batch_momentum": 0.95,
    "feature_dim": 512,
    "norm_type": "batch",
    "num_decision_steps": 1,
}
# Extra keyword arguments for model.fit.
fit_params = {"epochs": 1_000, "verbose": 0}

# Save the model hyperparameters to the working directory.
with open("params.pkl", "wb") as f:
    pickle.dump(params, f)

In [None]:
%%time
# Out-of-fold prediction buffer with the same rows and columns as Y.
Y_pred = np.zeros((train_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, index=Y.index)

for i in range(n_seeds):
    # Re-seed everything so that repetition i is driven by seed i.
    set_seed(seed=i)

    cv = MultilabelStratifiedGroupKFold(
        n_splits=n_splits, random_state=i, shuffle=shuffle
    )

    for j, (train, valid) in enumerate(cv.split(X, Y[columns], groups)):
        # One checkpoint file per (seed, fold) pair.
        model_path = f"model_seed_{i}_fold_{j}.h5"

        model = TabNetClassifier(
            num_classes=n_classes, num_features=n_features, **params
        )
        loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing)
        optimizer = AdaBeliefOptimizer(learning_rate=lr)

        model.compile(loss=loss, optimizer=optimizer)

        # Training batches go through Cutmix; validation data is passed as is.
        generator = Cutmix(
            X.iloc[train], Y.iloc[train], alpha=alpha, batch_size=batch_size
        )
        callbacks = build_callbacks(model_path, factor=factor, patience=patience)
        history = model.fit(
            generator,
            callbacks=callbacks,
            validation_data=(X.iloc[valid], Y.iloc[valid]),
            **fit_params,
        )

        # Restore the checkpoint written by the ModelCheckpoint callback before
        # predicting on the held-out rows.
        # NOTE(review): the checkpoint callback is built without
        # save_weights_only and the path ends in .h5 — confirm that this saves
        # correctly for a subclassed model in the TensorFlow version in use.
        model.load_weights(model_path)

        # Each seed contributes 1 / n_seeds of the final out-of-fold prediction.
        Y_pred.iloc[valid] += model.predict(X.iloc[valid]) / n_seeds

# Overwrite the predictions with zero for rows whose cp_type is "ctl_vehicle".
Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

with open("Y_pred.pkl", "wb") as f:
    pickle.dump(Y_pred[columns], f)

In [None]:
# Cross-entropy of the out-of-fold predictions over all target columns
# (displayed as the cell output).
score(Y[columns], Y_pred[columns])

In [None]:
# b 0.016500749844765163: vanilla
# - 0.016497676039512712: lr=0.001
# + 0.016648792505992370: lr=0.01
# + 0.016514646176541175: lr=0.0003
# + 0.016576238532578630: label_smoothing=1e-04
# - 0.016459734021787725: label_smoothing=1e-03
# + 0.016714446685325107: label_smoothing=3e-03
# + 0.016595114418360840: batch_size=64
# - 0.016416163897717113: batch_size=16
# - 0.016376464826077570: batch_size=8
# - 0.016264005364634404: norm_type="batch"
# + 0.016288912869945646: batch_size=16
# + 0.016583924007269300: batch_size=4
# - 0.016212486949183844: feature_dim=256
# - 0.016182858549420427: feature_dim=512
# + 0.016206945018426098: feature_dim=1024
# + 0.016326808537440018: output_dim=128
# + 0.016206255579305613: output_dim=32
# + 0.021095031830998465: num_decision_steps=2
# + 0.016182858549420427: relaxation_factor=1.0
# + 0.016182858549420427: relaxation_factor=0.5
# - 0.016146143023995250: cutmix(alpha=1.0)
# + 0.016154592789026014: fixed_cutmix(alpha=1.0)
# - 0.016133952316455563: batch_momentum=0.95
# + 0.016154628705148800: batch_momentum=0.9
# + 0.016151426430633047: batch_momentum=0.9, remove rowstatistics
# - 0.016101229987361220: remove rowstatistics
# + 0.016101229987361220: sparsity_coefficient=1e-04
# + 0.016186363339304583: batch_size=128
# + 0.016233864670764358: cutmix(alpha=0.5)
# - 0.016064267354577305: cutmix(alpha=2.0)
# - 0.015954724221242774: cutmix(alpha=4.0)
# + 0.015986972892065680: cutmix(alpha=8.0)
# b 0.015992065599939043: remove additional targets
# + 0.017293994271024533: remove additional targets, feature_dim=8, output_dim=4
# + 0.016365092229987350: remove additional targets, feature_dim=16, output_dim=8
# + 0.016064325148839542: remove additional targets, feature_dim=32, output_dim=16
# + 0.016007137125710537: use_bias=True
# + 0.016130240781653944: apply weight normalization to classifier
# + 0.015992407110539293: rowstatistics -> clipped_features -> concat