In [None]:
import sys

# Add the bundled AdaBelief package directory to the import search path;
# presumably this is what lets `adabelief_tf` be imported further down.
sys.path.append("../input/adabeliefoptimizer/pypi_packages/adabelief_tf0.1.0")

In [None]:
import sys

# Add the bundled iterative-stratification sources to the import search path;
# presumably this is what lets `iterstrat` be imported further down.
sys.path.append("../input/iterative-stratification/iterative-stratification-master")

In [None]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
import tensorflow


def build_callbacks(
    model_path, factor=0.1, mode="auto", monitor="val_loss", patience=0, verbose=0
):
    """Create the Keras callbacks used while fitting a model.

    Parameters
    ----------
    model_path : str
        Destination passed to ``ModelCheckpoint``; only the best model
        according to ``monitor`` is kept (``save_best_only=True``).
    factor : float
        Learning-rate reduction factor given to ``ReduceLROnPlateau``.
    mode, monitor, verbose
        Forwarded unchanged to all three callbacks.
    patience : int
        Forwarded to ``EarlyStopping`` only; ``ReduceLROnPlateau`` keeps its
        own default patience.

    Returns
    -------
    list
        ``[EarlyStopping, ModelCheckpoint, ReduceLROnPlateau]`` in that order.
    """
    # Options that every callback receives identically.
    shared = {"mode": mode, "monitor": monitor, "verbose": verbose}
    keras_callbacks = tf.keras.callbacks

    return [
        keras_callbacks.EarlyStopping(patience=patience, **shared),
        keras_callbacks.ModelCheckpoint(model_path, save_best_only=True, **shared),
        keras_callbacks.ReduceLROnPlateau(factor=factor, **shared),
    ]

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from adabelief_tf import AdaBeliefOptimizer


def build_transformer_classifier(
    input_dim,
    output_dim,
    bias_initializer="zeros",
    d_model=128,
    dff=256,
    label_smoothing=0.0,
    n_heads=8,
    n_layers=5,
    optimizer_params=None,
    pretrained_model_path=None,
    rate=0.0,
):
    """Build and compile a transformer-encoder multilabel classifier.

    The input vector is reshaped to a sequence of length one, passed through a
    ``TransformerEncoder`` with ``n_layers - 2`` encoder layers, and the single
    sequence position is fed to a weight-normalized dense layer followed by a
    sigmoid.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Number of output units (one sigmoid per label).
    bias_initializer
        Bias initializer of the output dense layer.
    d_model, dff, n_heads, rate
        Forwarded to ``TransformerEncoder``.
    label_smoothing : float
        Label smoothing of the binary cross-entropy loss.
    n_layers : int
        The encoder stack is built with ``n_layers - 2`` layers.
    optimizer_params : dict or None
        Keyword arguments for ``AdaBeliefOptimizer``; ``None`` means none.
    pretrained_model_path : str or None
        If given, weights are loaded from this path by layer name before
        compiling.

    Returns
    -------
    tf.keras.models.Model
        The compiled model.
    """
    if optimizer_params is None:
        optimizer_params = {}

    inputs = tf.keras.layers.Input(shape=input_dim)

    # Present each sample to the encoder as a one-step sequence.
    sequence = tf.keras.layers.Reshape((1, input_dim))(inputs)
    encoder = TransformerEncoder(n_layers - 2, d_model, n_heads, dff, rate=rate)
    encoded = encoder(sequence)[:, 0, :]

    head = tfa.layers.WeightNormalization(
        tf.keras.layers.Dense(output_dim, bias_initializer=bias_initializer)
    )
    logits = head(encoded)
    outputs = tf.keras.layers.Activation("sigmoid")(logits)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    # Weights are restored before compilation, matched by layer name.
    if pretrained_model_path is not None:
        model.load_weights(pretrained_model_path, by_name=True)

    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        optimizer=AdaBeliefOptimizer(**optimizer_params),
    )

    return model

In [None]:
def compute_row_statistics(X, prefix=""):
    """Compute per-row summary statistics of a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; each statistic is aggregated across columns (``axis=1``).
    prefix : str
        String prepended to every output column name.

    Returns
    -------
    pandas.DataFrame
        One column per statistic, named ``f"{prefix}{name}"``, in the order
        mean, std, kurtosis, skew.
    """
    # "min" and "max" are deliberately left out of the statistics.
    stat_names = ("mean", "std", "kurtosis", "skew")

    return pd.DataFrame(
        {f"{prefix}{name}": X.agg(name, axis=1) for name in stat_names}
    )

In [None]:
import numpy as np


def score(Y, Y_pred, eps=1e-15, label_smoothing=0.0):
    """Mean binary cross-entropy over all flattened entries.

    Parameters
    ----------
    Y : array-like
        Targets; flattened before use.
    Y_pred : array-like
        Predicted probabilities; flattened and clipped to
        ``[eps, 1 - eps]`` before taking logarithms.
    eps : float
        Clipping bound for the predictions.
    label_smoothing : float
        When positive, targets are shrunk towards 0.5 by this amount.

    Returns
    -------
    float
        The negative mean log-likelihood.
    """
    targets = np.ravel(np.asarray(Y))

    if label_smoothing > 0.0:
        targets = targets * (1.0 - label_smoothing) + 0.5 * label_smoothing

    # Clipping keeps both logarithms finite.
    probs = np.clip(np.ravel(np.asarray(Y_pred)), eps, 1.0 - eps)
    log_likelihood = targets * np.log(probs) + (1.0 - targets) * np.log(1.0 - probs)

    return -np.mean(log_likelihood)

In [None]:
import os
import random as rn

import tensorflow as tf
import numpy as np


def set_seed(seed=0):
    """Seed the random number generators and install a single-threaded session.

    Sets ``PYTHONHASHSEED`` in the environment, seeds Python's ``random``,
    NumPy and TensorFlow with ``seed``, then registers a TF1-compat session on
    the default graph configured with one inter-op and one intra-op thread.

    Parameters
    ----------
    seed : int
        Seed value applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for the stdlib, NumPy and TensorFlow generators, in that order.
    for seeder in (rn.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

    default_graph = tf.compat.v1.get_default_graph()
    single_thread_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1
    )

    tf.compat.v1.keras.backend.set_session(
        tf.compat.v1.Session(graph=default_graph, config=single_thread_config)
    )

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    """Clip every column to quantile bounds learned during ``fit``.

    Parameters
    ----------
    copy : bool
        If true, ``transform`` works on a copy; otherwise the input frame is
        clipped in place.
    high : float
        Quantile used as the per-column upper bound.
    low : float
        Quantile used as the per-column lower bound.
    """

    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy = copy
        self.high = high
        self.low = low

    def fit(self, X, y=None):
        """Store the per-column ``high`` and ``low`` quantiles of ``X``."""
        self.data_max_ = X.quantile(q=self.high)
        self.data_min_ = X.quantile(q=self.low)

        return self

    def transform(self, X):
        """Return ``X`` with each column clipped to the fitted bounds."""
        clipped = X.copy() if self.copy else X

        # Bounds are aligned to the columns (axis=1).
        clipped.clip(
            lower=self.data_min_, upper=self.data_max_, axis=1, inplace=True
        )

        return clipped

In [None]:
# https://arxiv.org/abs/1905.04899

import numpy as np
import tensorflow as tf


class Cutmix(tf.keras.utils.Sequence):
    """Batch generator that mixes each batch with randomly drawn samples.

    For every batch a mixing ratio is drawn from ``Beta(alpha, alpha)`` and one
    random binary feature mask (shared by all rows of the batch) decides, per
    feature, whether the value comes from the batch row or from a randomly
    chosen partner row. Targets are blended linearly with the same ratio.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.asarray``.
    y : array-like or None
        Targets; converted with ``np.asarray`` when given.
    batch_size : int
        Number of rows per batch.
    alpha : float
        Both shape parameters of the Beta distribution.
    """

    def __init__(self, X, y=None, batch_size=32, alpha=1.0):
        self.X = np.asarray(X)
        self.y = None if y is None else np.asarray(y)
        self.batch_size = batch_size
        self.alpha = alpha

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        return int(np.ceil(self.X.shape[0] / self.batch_size))

    def __getitem__(self, i):
        """Return the ``i``-th mixed batch as ``(X_batch, y_batch)``.

        ``y_batch`` is ``None`` when the generator was built without targets.
        """
        rows = slice(i * self.batch_size, (i + 1) * self.batch_size)
        X_batch = self.X[rows]

        n_samples, n_features = self.X.shape
        # Partner rows are drawn with replacement from the whole data set.
        partners = np.random.choice(n_samples, X_batch.shape[0])

        lam = np.random.beta(self.alpha, self.alpha)
        # 1.0 keeps the original feature, 0.0 takes it from the partner row.
        keep = np.random.choice([0.0, 1.0], size=n_features, p=[1.0 - lam, lam])
        X_mixed = keep * X_batch + (1.0 - keep) * self.X[partners]

        if self.y is None:
            return X_mixed, None

        y_mixed = lam * self.y[rows] + (1.0 - lam) * self.y[partners]

        return X_mixed, y_mixed

In [None]:
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelStratifiedGroupKFold(_BaseKFold):
    """K-fold splitter combining multilabel stratification with grouping.

    Groups that contain exactly 6, 12 or 18 samples are assigned to folds as
    whole groups, stratified on the per-group mean of ``y``. Samples of every
    other group are stratified individually. ``y`` and ``groups`` are used
    through the pandas API (``groupby``, ``value_counts``, ``loc``, ``map``).
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield the positional test indices of each fold in turn."""
        stratifier = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        group_sizes = groups.value_counts()
        has_regular_size = group_sizes.isin([6, 12, 18])
        regular_groups = group_sizes.loc[has_regular_size].index.sort_values()
        irregular_groups = group_sizes.loc[~has_regular_size].index.sort_values()

        # Regular groups: one row per group, stratified as a unit.
        group_targets = y.groupby(groups).mean().loc[regular_groups]
        group_to_fold = {}

        for fold, (_, test) in enumerate(stratifier.split(group_targets, group_targets)):
            for group in group_targets.index[test]:
                group_to_fold[group] = fold

        # Irregular groups: every sample is stratified on its own.
        sample_targets = y.loc[groups.isin(irregular_groups)]
        sample_to_fold = {}

        for fold, (_, test) in enumerate(stratifier.split(sample_targets, sample_targets)):
            for sample in sample_targets.index[test]:
                sample_to_fold[sample] = fold

        folds = groups.map(group_to_fold)
        # Samples without a group-level fold fall back to their own assignment.
        unassigned = folds.isna()
        folds[unassigned] = folds[unassigned].index.map(sample_to_fold).values

        for fold in range(self.n_splits):
            yield np.where(folds == fold)[0]

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa


def gelu(x):
    """Tanh-based GELU activation: ``x * 0.5 * (1 + tanh(c * (x + 0.044715 x^3)))``
    with ``c = sqrt(2 / pi)``."""
    scale = np.sqrt(2 / np.pi)
    inner = scale * (x + 0.044715 * tf.pow(x, 3))
    gate = 0.5 * (1.0 + tf.tanh(inner))

    return x * gate


def point_wise_feed_forward_network(d_model, dff):
    """Two-layer feed-forward block: Dense(dff, gelu) followed by Dense(d_model)."""
    expand = tf.keras.layers.Dense(dff, activation=gelu)  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)

    return tf.keras.Sequential([expand, project])


def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q k^T / sqrt(d_k)) v.

    Parameters
    ----------
    q, k, v
        Query, key and value tensors.
    mask
        Optional tensor; when not ``None``, ``mask * -1e09`` is added to the
        scaled logits before the softmax.

    Returns
    -------
    tuple
        ``(output, attention_weights)``.
    """
    # Raw similarity scores, (..., seq_len_q, seq_len_k).
    scores = tf.matmul(q, k, transpose_b=True)

    # Scale by the square root of the key depth.
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = scores / tf.math.sqrt(key_depth)

    # Masked positions get a large negative logit.
    if mask is not None:
        logits += mask * -1e09

    # Normalize over the key axis so each query's weights sum to 1.
    weights = tf.nn.softmax(logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    attended = tf.matmul(weights, v)  # (..., seq_len_q, depth_v)

    return attended, weights


class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Parameters
    ----------
    d_model : int
        Width of the query/key/value projections and of the output.
    num_heads : int
        Number of attention heads; must divide ``d_model`` evenly.

    Raises
    ------
    ValueError
        If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()

        # Without this check ``depth`` would be silently truncated and the
        # head reshape below would regroup values incorrectly.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.num_heads = num_heads
        self.d_model = d_model

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape the last axis into (num_heads, depth) and move heads forward.

        The result has shape (batch_size, num_heads, seq_len, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))

        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Apply attention; note the argument order is (v, k, q, mask).

        Returns ``(output, attention_weights)``.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask
        )

        scaled_attention = tf.transpose(
            scaled_attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len_q, num_heads, depth)

        # Merge the heads back into a single d_model-wide axis.
        concat_attention = tf.reshape(
            scaled_attention, (batch_size, -1, self.d_model)
        )  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights


class EncoderLayer(tf.keras.layers.Layer):
    """One transformer encoder layer: self-attention then a feed-forward block.

    Each sub-block is followed by dropout, a residual connection and layer
    normalization.

    Parameters
    ----------
    d_model : int
        Model width, passed to the attention and feed-forward sub-blocks.
    num_heads : int
        Number of attention heads.
    dff : int
        Hidden width of the feed-forward block.
    rate : float
        Dropout rate used after both sub-blocks.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-06)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-06)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Run the layer on ``x``; ``training`` toggles dropout, ``mask`` is
        forwarded to the attention block."""
        # Self-attention: the same tensor serves as value, key and query.
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(
            out1 + ffn_output
        )  # (batch_size, input_seq_len, d_model)

        return out2


class TransformerEncoder(tf.keras.layers.Layer):
    """Stack of encoder layers behind a weight-normalized dense embedding.

    Parameters
    ----------
    num_layers : int
        Number of ``EncoderLayer`` instances in the stack.
    d_model : int
        Width of the embedding and of every encoder layer.
    num_heads : int
        Number of attention heads per encoder layer.
    dff : int
        Hidden width of each feed-forward block.
    rate : float
        Dropout rate applied after the embedding and inside each layer.
    **kwargs
        Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        self.embedding = tfa.layers.WeightNormalization(
            tf.keras.layers.Dense(self.d_model)
        )

        self.enc_layers = [
            EncoderLayer(self.d_model, self.num_heads, self.dff, self.rate)
            for _ in range(self.num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(self.rate)

    def get_config(self):
        """Return the layer config extended with the constructor arguments."""
        config = super().get_config().copy()

        config.update(
            {
                "num_layers": self.num_layers,
                "d_model": self.d_model,
                "num_heads": self.num_heads,
                "dff": self.dff,
                "rate": self.rate,
            }
        )

        return config

    def call(self, x, training, mask=None):
        """Embed ``x``, apply dropout, then run every encoder layer in order.

        ``training`` toggles dropout; ``mask`` is forwarded to each encoder
        layer. Returns a tensor of shape (batch_size, input_seq_len, d_model).
        """
        x = self.embedding(x)
        x = self.dropout(x, training=training)

        for enc_layer in self.enc_layers:
            x = enc_layer(x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

In [None]:
# Read the two "cp_*" columns as categoricals and index every table by sample id.
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

train_features = pd.read_csv(
    "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
)
# Only the numeric columns are used as model inputs.
X = train_features.select_dtypes("number")
Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the parsed single-column frame afterwards yields the same Series.
groups = pd.read_csv(
    "../input/lish-moa/train_drug.csv", index_col=index_col
).squeeze("columns")

columns = Y.columns

In [None]:
# Row statistics are computed separately for the "c-" and "g-" column families,
# from the values as they are BEFORE clipping.
c_prefix = "c-"
g_prefix = "g-"
c_columns = X.columns.str.startswith(c_prefix)
g_columns = X.columns.str.startswith(g_prefix)
X_stats_c = compute_row_statistics(X.loc[:, c_columns], prefix=c_prefix)
X_stats_g = compute_row_statistics(X.loc[:, g_columns], prefix=g_prefix)

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# this artifact from a trusted source. Presumably it is a fitted
# ClippedFeatures instance — confirm against the notebook that produced it.
with open("../input/preprocessor-fit/clipped_features.pkl", "rb") as f:
    clipped_features = pickle.load(f)

X = clipped_features.transform(X)

# Append the row statistics to the clipped features.
# NOTE: X is rebound here, so re-running this cell without re-running the
# loading cell would compute statistics over the already-augmented frame.
X = pd.concat([X, X_stats_c, X_stats_g], axis=1)

In [None]:
# Number of training rows / input features, and number of target columns.
train_size, n_features = X.shape
_, n_classes = Y.shape

In [None]:
# hyperparameters
alpha = 1.0  # Beta(alpha, alpha) parameter of the Cutmix generator
batch_size = 4096  # rows per Cutmix batch
factor = 0.5  # learning-rate reduction factor (ReduceLROnPlateau)
n_seeds = 5  # number of seeds whose out-of-fold predictions are averaged
n_splits = 5  # number of cross-validation folds
patience = 30  # early-stopping patience
shuffle = True  # shuffle flag of the cross-validation splitter
# Keyword arguments of build_transformer_classifier; note that the builder
# creates n_layers - 2 encoder layers.
params = {
    "d_model": 64,
    "dff": 256,
    "label_smoothing": 5e-04,
    "n_heads": 16,
    "n_layers": 5,
    "optimizer_params": {"beta_1": 0.75, "lr": 0.03},
    "rate": 0.3,
}
# Extra keyword arguments of model.fit.
fit_params = {"epochs": 1_000, "verbose": 0}

# Persist the model hyperparameters next to the trained weights.
with open("params.pkl", "wb") as f:
    pickle.dump(params, f)

In [None]:
%%time
# Output-layer bias is initialized from the per-target positive rate.
# NOTE(review): this evaluates to -log(mean), which is positive for mean < 1;
# the bias that makes a sigmoid output the prior p would be log(p / (1 - p)).
# Confirm the sign is intended before changing it — the logged scores were
# obtained with this initializer.
bias_initializer = -Y.mean(axis=0).apply(np.log).values
bias_initializer = tf.keras.initializers.Constant(bias_initializer)

# Accumulator for out-of-fold predictions, averaged over seeds.
Y_pred = np.zeros((train_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, index=Y.index)

for i in range(n_seeds):
    set_seed(seed=i)

    cv = MultilabelStratifiedGroupKFold(
        n_splits=n_splits, random_state=i, shuffle=shuffle
    )

    for j, (train, valid) in enumerate(cv.split(X, Y[columns], groups)):
        model_path = f"model_seed_{i}_fold_{j}.h5"

        model = build_transformer_classifier(
            n_features,
            n_classes,
            bias_initializer=bias_initializer,
            **params,
        )

        # Training batches are augmented with Cutmix; validation data is not.
        generator = Cutmix(
            X.iloc[train], Y.iloc[train], alpha=alpha, batch_size=batch_size
        )
        callbacks = build_callbacks(model_path, factor=factor, patience=patience)
        history = model.fit(
            generator,
            callbacks=callbacks,
            validation_data=(X.iloc[valid], Y.iloc[valid]),
            **fit_params,
        )

        # Restore the best checkpoint written by ModelCheckpoint.
        model.load_weights(model_path)

        # Each sample is predicted once per seed, so dividing by n_seeds
        # yields the seed-averaged out-of-fold prediction.
        Y_pred.iloc[valid] += model.predict(X.iloc[valid]) / n_seeds

# Predictions for control-vehicle rows are forced to zero.
Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

with open("Y_pred.pkl", "wb") as f:
    pickle.dump(Y_pred[columns], f)

In [None]:
# Log loss of the seed-averaged out-of-fold predictions.
score(Y[columns], Y_pred[columns])

In [None]:
# b 0.017613745637092890: vanilla
# + 0.020231773279606454: remove cutmix
# - 0.016933955369047540: batch_size=256
# - 0.016561983004116220: batch_size=512
# - 0.016317078077301540: batch_size=1024
# - 0.016197360264813016: batch_size=2048
# - 0.016143128480625435: batch_size=4096
# + 0.016290473858185200: batch_size=8192
# + 0.016169000906222634: lr=0.01
# + 0.017920056100902890: lr=0.1
# - 0.016141199647868334: cutmix(alpha=2.0)
# - 0.016122704124989867: cutmix(alpha=1.0)
# + 0.016145613363032430: cutmix(alpha=0.5)
# + 0.016154231642736275: label_smoothing=1e-04
# + 0.016151331836921672: label_smoothing=1e-03
# - 0.016051994765583076: rate=0.35
# - 0.016007689076553790: rate=0.3
# + 0.016050162884695393: rate=0.25
# - 0.015961052459868758: d_model=64
# + 0.016032246860115885: d_model=32
# + 0.016004713614897498: dff=128
# + 0.015986869746250267: dff=512
# + 0.016011309773467880: n_layers=6
# + 0.016051247327988002: n_layers=4
# - 0.015956038639902700: n_heads=16
# + 0.020065607207388930: remove weight normalization
# b 0.015961052459868758: n_heads=16
# - 0.015938192644412540: apply weight normalization to an embedding layer
# + 0.016249224488928680: activation="elu"
# + 0.015938192644412540: n_heads=32
# + 0.015961652661258500: apply weight normalization to MultiHeadAttention
# - 0.015932305370398952: add rowstatistics
# - 0.015919330570302686: rowstatistics -> clipped_features -> concat
# + 0.016104782998983973: add nonscored targets as additional targets
# + 0.015925919237608416: add only highly correlated additional targets
# + 0.015986798159799236: rowstatistics(max, min)
# + 0.015961425133573657: polynomial features after clipped features
# + 0.015944889832617482: polynomial features before clipped features
# - 0.015918330358751717: beta_1=0.8
# - 0.015897521669603114: beta_1=0.75
# + 0.015919002373462714: beta_1=0.7
# + 0.019501834012630444: beta_2=0.95
# + 0.017315020687719890: beta_2=0.99