In [None]:
import sys

# Add the attached iterative-stratification source tree to the import path;
# presumably this is what makes the later `iterstrat` import resolvable.
sys.path.append("../input/iterative-stratification/iterative-stratification-master")

In [None]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Runtime switches read by the TensorFlow setup cell below.
# When True, a "mixed_float16" Keras policy is installed.
MIXED_PRECISION = False
# Passed to tf.config.optimizer.set_jit to toggle XLA JIT compilation.
XLA_ACCELERATE = True

In [None]:
# Configure TensorFlow once, before any model is built.
if MIXED_PRECISION:
    mixed_precision = tf.keras.mixed_precision

    if hasattr(mixed_precision, "set_global_policy"):
        # Stable API (TF >= 2.4); the experimental namespace used below was
        # deprecated and later removed.
        mixed_precision.set_global_policy("mixed_float16")
    else:
        # Older TensorFlow releases only ship the experimental API.
        policy = mixed_precision.experimental.Policy("mixed_float16")

        mixed_precision.experimental.set_policy(policy)

# Enable or disable XLA JIT compilation according to the flag above.
tf.config.optimizer.set_jit(XLA_ACCELERATE)

In [None]:
import tensorflow


def build_callbacks(
    checkpoint_path,
    factor=0.1,
    mode="auto",
    monitor="val_loss",
    patience_for_es=0,
    patience_for_rlop=10,
    save_best_only=False,
    verbose=0,
):
    """Assemble the list of Keras callbacks passed to ``model.fit``.

    Each callback is optional and is switched off by passing ``None`` for the
    argument that controls it.

    Parameters
    ----------
    checkpoint_path : str or None
        Target path for ``ModelCheckpoint``; ``None`` disables checkpointing.
    factor : float
        ``factor`` forwarded to ``ReduceLROnPlateau``.
    mode, monitor, verbose
        Forwarded unchanged to every callback that is created.
    patience_for_es : int or None
        ``patience`` for ``EarlyStopping``; ``None`` disables early stopping.
    patience_for_rlop : int or None
        ``patience`` for ``ReduceLROnPlateau``; ``None`` disables it.
    save_best_only : bool
        Forwarded to ``ModelCheckpoint``.

    Returns
    -------
    list
        The enabled callbacks, ordered checkpoint, early stopping,
        learning-rate reduction.
    """
    keras_callbacks = tf.keras.callbacks
    # Settings shared by all three callbacks.
    shared = {"mode": mode, "monitor": monitor, "verbose": verbose}
    selected = []

    if checkpoint_path is not None:
        selected.append(
            keras_callbacks.ModelCheckpoint(
                checkpoint_path, save_best_only=save_best_only, **shared
            )
        )

    if patience_for_es is not None:
        selected.append(
            keras_callbacks.EarlyStopping(patience=patience_for_es, **shared)
        )

    if patience_for_rlop is not None:
        selected.append(
            keras_callbacks.ReduceLROnPlateau(
                factor=factor, patience=patience_for_rlop, **shared
            )
        )

    return selected

In [None]:
import tensorflow as tf


def build_model(
    input_dim,
    output_dim,
    activation="relu",
    kernel_initializer="glorot_uniform",
    label_smoothing=0.0,
    momentum=0.99,
    n_layers=3,
    n_units="auto",
    optimizer_params=None,
    rate=0.0,
):
    """Build and compile a fully connected multi-label classifier.

    The network is ``n_layers - 2`` hidden blocks (Dense -> BatchNormalization
    -> Activation -> Dropout) followed by a Dense layer of width
    ``output_dim`` and a sigmoid. It is compiled with binary cross-entropy and
    Adam.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Number of output labels.
    activation : str
        Activation used in every hidden block.
    kernel_initializer : str
        Initializer for the hidden Dense kernels.
    label_smoothing : float
        Forwarded to ``BinaryCrossentropy``.
    momentum : float
        Forwarded to ``BatchNormalization``.
    n_layers : int
        Layer count including the input and output layers, so the number of
        hidden blocks is ``n_layers - 2``.
    n_units : int or "auto"
        Width of each hidden Dense layer; ``"auto"`` uses the midpoint between
        ``input_dim`` and ``output_dim``.
    optimizer_params : dict or None
        Keyword arguments for ``Adam``; ``None`` means library defaults.
    rate : float
        Dropout rate of every hidden block.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    if n_units == "auto":
        n_units = 0.5 * (input_dim + output_dim)

    # Dense expects an integer width; truncate explicitly instead of relying
    # on the layer to coerce a float such as 540.5.
    n_units = int(n_units)

    inputs = tf.keras.layers.Input(shape=input_dim)

    x = inputs

    for _ in range(n_layers - 2):
        x = tf.keras.layers.Dense(n_units, kernel_initializer=kernel_initializer)(x)
        x = tf.keras.layers.BatchNormalization(momentum=momentum)(x)
        x = tf.keras.layers.Activation(activation)(x)
        x = tf.keras.layers.Dropout(rate)(x)

    x = tf.keras.layers.Dense(output_dim)(x)

    # Keep the output probabilities in float32 even under a mixed_float16
    # policy, so the loss is not computed on half-precision sigmoid outputs.
    outputs = tf.keras.layers.Activation("sigmoid", dtype="float32")(x)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    if optimizer_params is None:
        optimizer_params = {}

    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing)
    optimizer = tf.keras.optimizers.Adam(**optimizer_params)

    model.compile(loss=loss, optimizer=optimizer)

    return model

In [None]:
import numpy as np


def score(Y, Y_pred, eps=1e-15, label_smoothing=0.0):
    """Return the mean binary cross-entropy over all flattened entries.

    Parameters
    ----------
    Y : array-like
        Ground-truth labels; flattened before use.
    Y_pred : array-like
        Predicted probabilities; flattened and clipped to
        ``[eps, 1 - eps]`` so the logarithms stay finite.
    eps : float
        Clipping bound for the predictions.
    label_smoothing : float
        When positive, targets are pulled toward 0.5 by this amount before
        the loss is evaluated.

    Returns
    -------
    float
        Negative mean log-likelihood.
    """
    targets = np.asarray(Y).ravel()

    if label_smoothing > 0.0:
        targets = targets * (1.0 - label_smoothing) + 0.5 * label_smoothing

    probs = np.clip(np.asarray(Y_pred).ravel(), eps, 1.0 - eps)

    log_likelihood = targets * np.log(probs) + (1.0 - targets) * np.log(1.0 - probs)

    return -np.mean(log_likelihood)

In [None]:
import os
import random as rn

import tensorflow as tf
import numpy as np


def set_seed(seed=0):
    """Seed the Python, NumPy and TensorFlow RNGs and install a 1-thread session.

    Parameters
    ----------
    seed : int
        Value written to ``PYTHONHASHSEED`` and passed to ``random.seed``,
        ``numpy.random.seed`` and ``tf.random.set_seed``.

    Notes
    -----
    A new TF1-compat session on the default graph, limited to one inter-op and
    one intra-op thread, is created and registered as the Keras backend
    session on every call.
    """
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # affects processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)

    for seeder in (rn.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

    tf_v1 = tf.compat.v1
    single_threaded_session = tf_v1.Session(
        graph=tf_v1.get_default_graph(),
        config=tf_v1.ConfigProto(
            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1
        ),
    )

    tf_v1.keras.backend.set_session(single_threaded_session)

In [None]:
# https://www.kaggle.com/c/lish-moa/discussion/195195

import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelStratifiedGroupKFold(_BaseKFold):
    """Multilabel-stratified K-fold that keeps small groups within one fold.

    Groups with at most ``max_group_size`` samples are assigned to a fold as a
    whole: the per-group mean of ``y`` is stratified with
    ``MultilabelStratifiedKFold``. Samples of larger groups are stratified
    individually, so such a group may be spread over several folds.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    random_state : int, RandomState instance or None
        Forwarded to the inner ``MultilabelStratifiedKFold``.
    shuffle : bool
        Forwarded to the inner ``MultilabelStratifiedKFold``.
    max_group_size : int
        Largest group size that is still kept together in a single fold.

    Notes
    -----
    ``y`` must be a pandas DataFrame and ``groups`` a pandas Series; the
    implementation relies on ``groupby``, ``value_counts`` and label lookups.
    """

    def __init__(self, n_splits=5, random_state=None, shuffle=False, max_group_size=18):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

        self.max_group_size = max_group_size

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Yield, for each fold, the positional indices of its test samples."""
        if y is None or groups is None:
            raise ValueError("Both 'y' and 'groups' must be provided.")

        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        value_counts = groups.value_counts()
        is_regular = value_counts <= self.max_group_size
        regular_groups = value_counts.loc[is_regular].index.sort_values()
        irregular_groups = value_counts.loc[~is_regular].index.sort_values()

        # Small groups: one row per group (mean target vector), so each group
        # lands in exactly one fold.
        group_to_fold = {}
        group_targets = y.groupby(groups).mean().loc[regular_groups]

        # The inner splitter cannot handle an empty frame; skip it instead.
        if len(group_targets) > 0:
            for fold, (_, test) in enumerate(cv.split(group_targets, group_targets)):
                group_to_fold.update(
                    {group: fold for group in group_targets.index[test]}
                )

        # Large groups: stratify their samples individually.
        sample_to_fold = {}
        sample_targets = y.loc[groups.isin(irregular_groups)]

        if len(sample_targets) > 0:
            for fold, (_, test) in enumerate(cv.split(sample_targets, sample_targets)):
                sample_to_fold.update(
                    {sample: fold for sample in sample_targets.index[test]}
                )

        # Samples of large groups have no group-level fold yet (NaN); fill
        # them from the per-sample assignment.
        folds = groups.map(group_to_fold)
        is_na = folds.isna()
        folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            yield np.where(folds == i)[0]

In [None]:
# Load the competition data, indexed by sample id.
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

train_features = pd.read_csv(
    "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
)
test_features = pd.read_csv(
    "../input/lish-moa/test_features.csv", dtype=dtype, index_col=index_col
)
# Only the numeric columns are used as model inputs.
X = train_features.select_dtypes("number")
X_test = test_features.select_dtypes("number")
Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column yields the same Series.
groups = pd.read_csv("../input/lish-moa/train_drug.csv", index_col=index_col).squeeze(
    "columns"
)

# Downstream code pairs X, Y and groups by position, so make the row order
# explicit (raises KeyError if a sample id is missing).
Y = Y.loc[X.index]
groups = groups.loc[X.index]

train_size, n_features = X.shape
test_size, _ = X_test.shape
_, n_classes = Y.shape

In [None]:
# Hyperparameters
n_seeds = 5  # number of seeds whose models are averaged
n_splits = 5  # number of cross-validation folds per seed
shuffle = True  # forwarded to the cross-validation splitter
params = dict(n_layers=4, rate=0.25)  # keyword arguments for build_model
callbacks_params = dict(patience_for_es=20, save_best_only=True)  # for build_callbacks
fit_params = dict(epochs=1_000)  # keyword arguments for model.fit

In [None]:
%%time
Y_pred = np.zeros((train_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, index=Y.index)

for i in range(n_seeds):
    set_seed(seed=i)

    cv = MultilabelStratifiedGroupKFold(
        n_splits=n_splits, random_state=i, shuffle=shuffle
    )

    for j, (train, valid) in enumerate(cv.split(X, Y, groups)):
        model_path = f"model_seed_{i}_fold_{j}.h5"

        model = build_model(n_features, n_classes, **params)
        callbacks = build_callbacks(model_path, **callbacks_params)

        model.fit(
            X.iloc[train],
            Y.iloc[train],
            callbacks=callbacks,
            validation_data=(X.iloc[valid], Y.iloc[valid]),
            **fit_params,
        )

        model.load_weights(model_path)

        Y_pred.iloc[valid] += model.predict(X.iloc[valid]) / n_seeds

Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

In [None]:
# Log loss of the predictions currently held in Y_pred against the training
# targets (the out-of-fold predictions when cells are run in order).
score(Y, Y_pred)

In [None]:
%%time
Y_pred = np.zeros((test_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, index=X_test.index)

for i in range(n_seeds):
    for j in range(n_splits):
        model = tf.keras.models.load_model(f"model_seed_{i}_fold_{j}.h5", compile=False)

        Y_pred += model.predict(X_test) / n_seeds / n_splits

Y_pred[test_features["cp_type"] == "ctl_vehicle"] = 0.0

In [None]:
# Write the predictions currently held in Y_pred (index included) to disk.
Y_pred.to_csv("submission.csv")