In [None]:
import sys

# Add the directory that holds the `adabelief_tf` package to the import search
# path so the `from adabelief_tf import AdaBeliefOptimizer` further down resolves.
sys.path.append("../input/adabeliefoptimizer/pypi_packages/adabelief_tf0.1.0")

In [None]:
import sys

# Add the iterative-stratification checkout to the import search path so the
# `iterstrat` package imported further down resolves.
sys.path.append("../input/iterative-stratification/iterative-stratification-master")

In [None]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
import tensorflow


def build_callbacks(
    model_path, factor=0.1, mode="auto", monitor="val_loss", patience=0, verbose=0
):
    """Create the Keras callbacks used while fitting a model.

    Parameters
    ----------
    model_path : str
        Destination passed to ``ModelCheckpoint``; only the best model
        (according to ``monitor``) is kept.
    factor : float
        Learning-rate reduction factor for ``ReduceLROnPlateau``.
    mode, monitor, verbose
        Forwarded unchanged to all three callbacks.
    patience : int
        Patience for ``EarlyStopping`` only; ``ReduceLROnPlateau`` is created
        without a ``patience`` argument and keeps its own default.

    Returns
    -------
    list
        ``[EarlyStopping, ModelCheckpoint, ReduceLROnPlateau]`` in that order.
    """
    # Settings that every callback shares.
    shared = {"mode": mode, "monitor": monitor, "verbose": verbose}
    keras_callbacks = tf.keras.callbacks

    return [
        keras_callbacks.EarlyStopping(patience=patience, **shared),
        keras_callbacks.ModelCheckpoint(model_path, save_best_only=True, **shared),
        keras_callbacks.ReduceLROnPlateau(factor=factor, **shared),
    ]

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from adabelief_tf import AdaBeliefOptimizer


def build_mlp_classifier(
    input_dim,
    output_dim,
    activation="relu",
    bias_initializer="zeros",
    kernel_initializer="glorot_uniform",
    label_smoothing=0.0,
    momentum=0.99,
    n_layers=3,
    n_units="auto",
    pretrained_model_path=None,
    optimizer_params=None,
    rate=0.0,
    skip=False,
):
    """Build and compile a weight-normalised MLP with sigmoid outputs.

    The network consists of ``n_layers - 2`` hidden blocks
    (WeightNormalization(Dense) -> BatchNormalization -> Activation -> Dropout)
    followed by a weight-normalised Dense output layer and a sigmoid.

    Parameters
    ----------
    input_dim, output_dim
        Input shape handed to ``Input(shape=...)`` and number of output units.
    activation, kernel_initializer, momentum, rate
        Settings of the hidden blocks (activation name, Dense kernel
        initializer, batch-norm momentum, dropout rate).
    bias_initializer
        Bias initializer of the output Dense layer.
    label_smoothing
        Passed to ``BinaryCrossentropy``.
    n_units
        Width of each hidden Dense layer; ``"auto"`` uses
        ``0.5 * (input_dim + output_dim)``.
    pretrained_model_path
        When given, weights are loaded from this path by layer name before
        compiling.
    optimizer_params
        Keyword arguments for ``AdaBeliefOptimizer`` (``None`` means none).
    skip
        When true, the output of every odd-numbered block (1, 3, 5, ...) is
        remembered, and from block 3 on the most recently remembered output is
        added back just before the activation of the next odd-numbered block.

    Returns
    -------
    A compiled ``tf.keras`` model.
    """
    if n_units == "auto":
        n_units = 0.5 * (input_dim + output_dim)

    if optimizer_params is None:
        optimizer_params = {}

    inputs = tf.keras.layers.Input(shape=input_dim, name="I")
    hidden = inputs

    # Blocks are numbered from 1 so the number doubles as the layer-name suffix.
    for block_no in range(1, n_layers - 1):
        dense = tf.keras.layers.Dense(n_units, kernel_initializer=kernel_initializer)
        hidden = tfa.layers.WeightNormalization(dense, name=f"WN{block_no}")(hidden)
        hidden = tf.keras.layers.BatchNormalization(
            momentum=momentum, name=f"BN{block_no}"
        )(hidden)

        uses_shortcut = skip and block_no % 2 == 1

        # The first block has nothing stored yet, so it only records a shortcut.
        if uses_shortcut and block_no > 1:
            hidden = hidden + shortcut

        hidden = tf.keras.layers.Activation(activation, name=f"A{block_no}")(hidden)
        hidden = tf.keras.layers.Dropout(rate, name=f"D{block_no}")(hidden)

        if uses_shortcut:
            shortcut = hidden

    head = tf.keras.layers.Dense(output_dim, bias_initializer=bias_initializer)
    logits = tfa.layers.WeightNormalization(head)(hidden)
    outputs = tf.keras.layers.Activation("sigmoid")(logits)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    if pretrained_model_path is not None:
        model.load_weights(pretrained_model_path, by_name=True)

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        optimizer=AdaBeliefOptimizer(**optimizer_params),
    )

    return model

In [None]:
def compute_row_statistics(X, prefix=""):
    """Return per-row summary statistics of ``X`` as a new DataFrame.

    One column is produced for each of mean, std, kurtosis and skew (in that
    order), computed across the columns of every row and named
    ``f"{prefix}{statistic}"``. The result shares the row index of ``X``.
    """
    # "min" and "max" are deliberately not part of the feature set.
    stat_names = ("mean", "std", "kurtosis", "skew")

    return pd.DataFrame(
        {f"{prefix}{stat}": X.agg(stat, axis=1) for stat in stat_names}
    )

In [None]:
import numpy as np


def score(Y, Y_pred, eps=1e-15, label_smoothing=0.0):
    """Mean binary log loss over all flattened elements of ``Y`` / ``Y_pred``.

    Parameters
    ----------
    Y : array-like
        Targets; flattened before use.
    Y_pred : array-like
        Predicted probabilities; flattened and clipped to ``[eps, 1 - eps]``.
    eps : float
        Clipping bound that keeps the logarithms finite.
    label_smoothing : float
        When positive, targets are replaced by
        ``Y * (1 - label_smoothing) + 0.5 * label_smoothing``.

    Returns
    -------
    float
        The averaged negative log-likelihood.
    """
    targets = np.asarray(Y).ravel()

    if label_smoothing > 0.0:
        targets = targets * (1.0 - label_smoothing) + 0.5 * label_smoothing

    probs = np.clip(np.asarray(Y_pred).ravel(), eps, 1.0 - eps)
    log_likelihood = targets * np.log(probs) + (1.0 - targets) * np.log(1.0 - probs)

    return -np.mean(log_likelihood)

In [None]:
import os
import random as rn

import tensorflow as tf
import numpy as np


def set_seed(seed=0):
    """Seed the Python, NumPy and TensorFlow RNGs and install a TF1-compat session.

    Besides seeding, this sets ``PYTHONHASHSEED`` in the environment and
    registers a ``tf.compat.v1`` session on the default graph that is limited
    to one inter-op and one intra-op thread.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seed_fn in (rn.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

    single_thread_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1
    )
    session = tf.compat.v1.Session(
        graph=tf.compat.v1.get_default_graph(), config=single_thread_config
    )

    tf.compat.v1.keras.backend.set_session(session)

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Transformer that clips every column to quantile bounds learned in ``fit``.

    Parameters
    ----------
    copy : bool
        When true, ``transform`` works on a copy; otherwise the input is
        clipped in place.
    high, low : float
        Quantiles used as the upper and lower clipping bounds.

    The input is expected to offer ``quantile`` and ``clip`` (as a pandas
    DataFrame does).
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy = copy
        self.high = high
        self.low = low

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from ``X`` and return ``self``."""
        upper_bound = X.quantile(q=self.high)
        lower_bound = X.quantile(q=self.low)

        self.data_max_ = upper_bound
        self.data_min_ = lower_bound

        return self

    def transform(self, X):
        """Clip ``X`` column-wise to the fitted bounds and return it."""
        clipped = X.copy() if self.copy else X
        clipped.clip(
            lower=self.data_min_, upper=self.data_max_, axis=1, inplace=True
        )

        return clipped

In [None]:
# https://arxiv.org/abs/1905.04899

import numpy as np
import tensorflow as tf


class Cutmix(tf.keras.utils.Sequence):
    """Batch generator that mixes each batch with randomly drawn rows.

    For every batch a mixing ratio is drawn from ``Beta(alpha, alpha)``. A
    single random 0/1 feature mask (shared by all rows of the batch) decides,
    per feature, whether the value comes from the batch row or from a randomly
    chosen partner row. Targets, when present, are blended linearly with the
    drawn ratio.
    """

    def __init__(self, X, y=None, batch_size=32, alpha=1.0):
        self.X = np.asarray(X)
        self.y = None if y is None else np.asarray(y)
        self.batch_size = batch_size
        self.alpha = alpha

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return int(np.ceil(self.X.shape[0] / self.batch_size))

    def __getitem__(self, i):
        """Return the mixed ``(X_batch, y_batch)`` pair for batch ``i``."""
        rows = slice(i * self.batch_size, (i + 1) * self.batch_size)
        X_own = self.X[rows]

        n_samples, n_features = self.X.shape

        # RNG draws happen in a fixed order: partners, ratio, feature mask.
        partners = np.random.choice(n_samples, X_own.shape[0])
        lam = np.random.beta(self.alpha, self.alpha)
        keep = np.random.choice([0.0, 1.0], size=n_features, p=[1.0 - lam, lam])

        X_mixed = keep * X_own + (1.0 - keep) * self.X[partners]

        if self.y is None:
            return X_mixed, None

        y_mixed = lam * self.y[rows] + (1.0 - lam) * self.y[partners]

        return X_mixed, y_mixed

In [None]:
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelStratifiedGroupKFold(_BaseKFold):
    """Multilabel-stratified K-fold that keeps "regular" groups together.

    Groups whose size is exactly 6, 12 or 18 are treated as regular: they are
    assigned to folds as whole groups, stratified on the per-group mean of
    ``y``. Samples of all other groups are assigned to folds individually,
    stratified on their own ``y`` rows.

    ``groups`` is expected to be a pandas Series and ``y`` a pandas object
    sharing its index (``value_counts``, ``map``, ``groupby`` and ``.loc`` are
    used on them).
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each fold, the positional indices of its test samples."""
        splitter = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        group_sizes = groups.value_counts()
        has_regular_size = group_sizes.isin([6, 12, 18])
        regular_groups = group_sizes.loc[has_regular_size].index.sort_values()
        irregular_groups = group_sizes.loc[~has_regular_size].index.sort_values()

        # Regular groups: one stratification row per group (mean target vector).
        group_targets = y.groupby(groups).mean().loc[regular_groups]
        fold_of_group = {}

        for fold, (_, test) in enumerate(splitter.split(group_targets, group_targets)):
            for group in group_targets.index[test]:
                fold_of_group[group] = fold

        # Irregular groups: every sample is stratified on its own.
        sample_targets = y.loc[groups.isin(irregular_groups)]
        fold_of_sample = {}

        for fold, (_, test) in enumerate(
            splitter.split(sample_targets, sample_targets)
        ):
            for sample in sample_targets.index[test]:
                fold_of_sample[sample] = fold

        folds = groups.map(fold_of_group)
        # Samples left without a fold belong to irregular groups.
        unassigned = folds.isna()
        folds[unassigned] = folds[unassigned].index.map(fold_of_sample).values

        for fold in range(self.n_splits):
            yield np.flatnonzero((folds == fold).values)

In [None]:
# Load the training features, the scored targets and the per-sample drug ids,
# all indexed by `sig_id`.
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

train_features = pd.read_csv(
    "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
)
# Only the numeric columns are used as model inputs.
X = train_features.select_dtypes("number")
Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column afterwards gives the same Series and
# also works on older pandas versions.
groups = pd.read_csv(
    "../input/lish-moa/train_drug.csv", index_col=index_col
).squeeze("columns")

columns = Y.columns

In [None]:
# Assemble the model inputs: clipped numeric features plus per-row statistics
# of the "c-" and "g-" column families.
# NOTE: this cell rebinds `X`; re-running it without re-running the loading
# cell would compute statistics on the already augmented frame (the added
# statistic columns also start with "c-" / "g-").
c_prefix = "c-"
g_prefix = "g-"
# Boolean masks over X's columns selecting each feature family.
c_columns = X.columns.str.startswith(c_prefix)
g_columns = X.columns.str.startswith(g_prefix)
# Row statistics are computed on the values as loaded, before clipping.
X_stats_c = compute_row_statistics(X.loc[:, c_columns], prefix=c_prefix)
X_stats_g = compute_row_statistics(X.loc[:, g_columns], prefix=g_prefix)

# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# artifacts from a trusted source. Unpickling presumably also requires the
# ClippedFeatures class to be defined in this session — confirm.
with open("../input/preprocessor-fit/clipped_features.pkl", "rb") as f:
    clipped_features = pickle.load(f)

X = clipped_features.transform(X)

# Append the per-row statistics as additional feature columns.
X = pd.concat([X, X_stats_c, X_stats_g], axis=1)

In [None]:
# Problem dimensions: number of training rows, input features and target columns.
train_size, n_features = X.shape
_, n_classes = Y.shape

In [None]:
# hyperparameters
alpha = 4.0  # Beta(alpha, alpha) parameter of the Cutmix generator
batch_size = 32  # batch size of the Cutmix generator
factor = 0.5  # learning-rate reduction factor (ReduceLROnPlateau via build_callbacks)
n_seeds = 5  # number of seeds; predictions are averaged over them
n_splits = 5  # number of cross-validation folds per seed
patience = 30  # early-stopping patience (EarlyStopping via build_callbacks)
shuffle = True  # whether the cross-validation splitter shuffles
# Keyword arguments forwarded to build_mlp_classifier.
params = {
    "activation": "elu",
    "kernel_initializer": "he_normal",
    "label_smoothing": 5e-04,
    "n_layers": 7,
    "n_units": 256,
    "optimizer_params": {"beta_1": 0.85, "lr": 0.03},
    "rate": 0.3,
    "skip": True,
}
# Keyword arguments forwarded to model.fit; training length is effectively
# bounded by early stopping rather than by the epoch count.
fit_params = {"epochs": 1_000, "verbose": 0}

In [None]:
%%time
# Train one model per (seed, fold), accumulate seed-averaged out-of-fold
# predictions in Y_pred, then pickle them.

# Constant initializer for the output-layer bias, derived from the target means.
# NOTE(review): -log(mean) is positive whenever mean < 1; matching the label
# prior under a sigmoid would normally use log(p / (1 - p)) — confirm the sign
# is intended.
bias_initializer = -Y.mean(axis=0).apply(np.log).values
bias_initializer = tf.keras.initializers.Constant(bias_initializer)

# Accumulator for the out-of-fold predictions, aligned with Y.
Y_pred = np.zeros((train_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, index=Y.index)

for i in range(n_seeds):
    # The seed index drives both the global RNGs and the fold assignment.
    set_seed(seed=i)

    cv = MultilabelStratifiedGroupKFold(
        n_splits=n_splits, random_state=i, shuffle=shuffle
    )

    for j, (train, valid) in enumerate(cv.split(X, Y[columns], groups)):
        model_path = f"model_seed_{i}_fold_{j}.h5"

        # A fresh model is built for every fold.
        model = build_mlp_classifier(
            n_features,
            n_classes,
            bias_initializer=bias_initializer,
            **params,
        )

        # Only the training rows are augmented; validation uses the raw rows.
        generator = Cutmix(
            X.iloc[train], Y.iloc[train], alpha=alpha, batch_size=batch_size
        )
        callbacks = build_callbacks(model_path, factor=factor, patience=patience)
        history = model.fit(
            generator,
            callbacks=callbacks,
            validation_data=(X.iloc[valid], Y.iloc[valid]),
            **fit_params,
        )

        # Restore the best checkpoint written by ModelCheckpoint before predicting.
        model.load_weights(model_path)

        # Each seed contributes 1 / n_seeds of the final out-of-fold prediction.
        Y_pred.iloc[valid] += model.predict(X.iloc[valid]) / n_seeds

# Force the predictions of control-vehicle rows to zero.
Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

with open("Y_pred.pkl", "wb") as f:
    pickle.dump(Y_pred[columns], f)

In [None]:
# Mean binary log loss of the out-of-fold predictions over all target columns.
score(Y[columns], Y_pred[columns])

In [None]:
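# Experiment log, one line per run: "<marker> <score>: <change>".
# Markers (inferred from the numbers): b = baseline, - = score dropped below
# the best so far (lower is better), + = score did not improve on the best.
# b 0.015994246952747954: vanilla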
# + 0.016008211277595052: batch_momentum=0.98
# - 0.015810368099750786: cutmix(alpha=1.0)
# - 0.015788791741863668: remove first dropout layer, cutmix(alpha=1.0)
# + 0.016917430610109030: test-time augmentation (cutmix + gaussian noise)
# - 0.015782911285632180: remove rowstatistics
# + 0.016236778363784050: add quantiletransformer
# - 0.015780689018469560: stddev=0.4
# + 0.015798678302832290: stddev=0.35
# + 0.015848971743793760: cutmix(alpha=0.5)
# + 0.015795163474415612: cutmix(alpha=2.0)
# - 0.015747588913558980: rate=0.3
# + 0.015759213663024532: rate=0.35
# + 0.015802250529421566: remove ctl_vehicle from train & valid
# + 0.015824030991552742: remove ctl_vehicle from train & valid, stddev=0.4*std
# + 0.016150020012321364: stddev=0.0, remove cutmix
# + 0.015784536333436300: stddev=0.0
# + 0.015789385925020200: stddev=0.0, add cp_type & cp_dose
# + 0.015764240674257066: stddev=0.0, cutmix(alpha=2.0)
# - 0.015732600356490125: stddev=0.0, cutmix(alpha=4.0)
# + 0.015793021822842125: stddev=0.0, cutmix(alpha=8.0)
# + 0.015751460390131804: stddev=0.1*std, stddev["cptime"] = 0.0, cutmix(alpha=4.0)
# + 0.015761022293541458: stddev=0.1*std, cutmix(alpha=4.0)
# + 0.015758034499447827: stddev=0.2*std, cutmix(alpha=4.0)
# + 0.015867841830930873: add quantiletransformer(n_quantiles=100)
# + 0.015797839642251572: add absolute features
# - 0.015708352883603460: remove additional targets
# + 0.015832151243559488: add c-square features
# b 0.015708352883603467: remove additional targets
# + 0.015819876183296120: remove clipped features, add quantiletransformer(n_quantiles=100), fit(train+test)
# + 0.015807305230095343: quantiletransformer(n_quantiles=100) -> clipped features, fit(train+test)
# + 0.015764452580356664: add additional targets
# - 0.015705591305375046: rowstatistics -> clipped_features -> concat
# + 0.015716916509029560: add only highly correlated additional targets
# + 0.015708715240629875: beta_1=0.75
# + 0.015710850169124463: beta_1=0.8
# - 0.015679506881320745: beta_1=0.85