In [1]:
import sys

#sys.path.append("../input/iterative-stratification/iterative-stratification-master")

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

In [3]:
import os
import gc
import pickle
import joblib
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from tqdm.notebook import tqdm

warnings.simplefilter('ignore')

In [4]:
import os
import random as rn

import tensorflow as tf
import numpy as np


def set_seed(seed=0):
    os.environ["PYTHONHASHSEED"] = str(seed)

    rn.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    graph = tf.compat.v1.get_default_graph()
    session_conf = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1
    )
    sess = tf.compat.v1.Session(graph=graph, config=session_conf)

    tf.compat.v1.keras.backend.set_session(sess)

In [5]:
from sklearn.metrics import log_loss


def score(Y, Y_pred):
    _, n_classes = Y.shape

    losses = []

    for j in range(n_classes):
        loss = log_loss(Y.iloc[:, j], Y_pred.iloc[:, j], labels=[0, 1])

        losses.append(loss)

    return np.mean(losses)

In [6]:
from sklearn.metrics import roc_auc_score


def auc_score(Y, Y_pred):
    _, n_classes = Y.shape

    aucs = []

    for j in range(n_classes):
        auc = roc_auc_score(Y.iloc[:, j], Y_pred.iloc[:, j])

        aucs.append(auc)

    return np.mean(aucs)

In [7]:
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.model_selection._split import _BaseKFold


class MultilabelStratifiedGroupKFold(_BaseKFold):
    def __init__(self, n_splits=5, random_state=None, shuffle=False):
        super().__init__(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    def _iter_test_indices(self, X=None, y=None, groups=None):
        cv = MultilabelStratifiedKFold(
            n_splits=self.n_splits,
            random_state=self.random_state,
            shuffle=self.shuffle,
        )

        value_counts = groups.value_counts()
        regluar_indices = value_counts.loc[
            (value_counts == 6) | (value_counts == 12) | (value_counts == 18)
        ].index.sort_values()
        irregluar_indices = value_counts.loc[
            (value_counts != 6) & (value_counts != 12) & (value_counts != 18)
        ].index.sort_values()

        group_to_fold = {}
        tmp = y.groupby(groups).mean().loc[regluar_indices]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            group_to_fold.update({group: fold for group in tmp.index[test]})

        sample_to_fold = {}
        tmp = y.loc[groups.isin(irregluar_indices)]

        for fold, (_, test) in enumerate(cv.split(tmp, tmp)):
            sample_to_fold.update({sample: fold for sample in tmp.index[test]})

        folds = groups.map(group_to_fold)
        is_na = folds.isna()
        folds[is_na] = folds[is_na].index.map(sample_to_fold).values

        for i in range(self.n_splits):
            yield np.where(folds == i)[0]

In [8]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin


class ClippedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, copy=True, high=0.99, low=0.01):
        self.copy = copy
        self.high = high
        self.low = low

    def fit(self, X, y=None):
        self.data_max_ = X.quantile(q=self.high)
        self.data_min_ = X.quantile(q=self.low)

        return self

    def transform(self, X):
        if self.copy:
            X = X.copy()

        X.clip(self.data_min_, self.data_max_, axis=1, inplace=True)

        return X

In [9]:
import pandas as pd


def compute_row_statistics(X, prefix=""):
    Xt = pd.DataFrame()

    for agg_func in [
        # "min",
        # "max",
        "mean",
        "std",
        "kurtosis",
        "skew",
    ]:
        Xt[f"{prefix}{agg_func}"] = X.agg(agg_func, axis=1)

    return Xt

In [10]:
import tensorflow as tf


def build_callbacks(
    model_path, factor=0.1, mode="auto", monitor="val_loss", patience=0, verbose=0
):
    early_stopping = tf.keras.callbacks.EarlyStopping(
        mode=mode, monitor=monitor, patience=patience, verbose=verbose
    )
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        model_path, mode=mode, monitor=monitor, save_best_only=True, verbose=verbose
    )
    reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
        factor=factor, monitor=monitor, mode=mode, verbose=verbose
    )

    return [early_stopping, model_checkpoint, reduce_lr_on_plateau]

In [11]:
# https://arxiv.org/abs/1905.04899

import numpy as np
import tensorflow as tf


class Cutmix(tf.keras.utils.Sequence):
    def __init__(self, X, y=None, batch_size=32, alpha=1.0):
        self.X = np.asarray(X)

        if y is None:
            self.y = y
        else:
            self.y = np.asarray(y)

        self.batch_size = batch_size
        self.alpha = alpha

    def __getitem__(self, i):
        X_batch = self.X[i * self.batch_size : (i + 1) * self.batch_size]

        n_samples, n_features = self.X.shape
        batch_size = X_batch.shape[0]
        shuffle = np.random.choice(n_samples, batch_size)

        l = np.random.beta(self.alpha, self.alpha)
        mask = np.random.choice([0.0, 1.0], size=n_features, p=[1.0 - l, l])
        X_shuffle = self.X[shuffle]
        X_batch = mask * X_batch + (1.0 - mask) * X_shuffle

        if self.y is None:
            return X_batch, None

        y_batch = self.y[i * self.batch_size : (i + 1) * self.batch_size]
        y_shuffle = self.y[shuffle]
        y_batch = l * y_batch + (1.0 - l) * y_shuffle

        return X_batch, y_batch

    def __len__(self):
        n_samples = self.X.shape[0]

        return int(np.ceil(n_samples / self.batch_size))

In [12]:
import sys

sys.path.append(r"C:\Users\81908\jupyter_notebook\poetry_work\tfgpu\01_MoA_compe\code")
from tabnet_tf import *

# from tabnet import StackedTabNet

import tensorflow as tf
from adabelief_tf import AdaBeliefOptimizer


class StackedTabNetClassifier(tf.keras.Model):
    def __init__(
        self,
        num_classes,
        batch_momentum=0.98,
        epsilon=1e-05,
        feature_columns=None,
        feature_dim=64,
        norm_type="group",
        num_decision_steps=5,
        num_features=None,
        num_groups=2,
        num_layers=1,
        output_dim=64,
        relaxation_factor=1.5,
        sparsity_coefficient=1e-05,
        virtual_batch_size=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.stacked_tabnet = StackedTabNet(
            feature_columns,
            batch_momentum=batch_momentum,
            epsilon=epsilon,
            feature_dim=feature_dim,
            norm_type=norm_type,
            num_decision_steps=num_decision_steps,
            num_features=num_features,
            num_groups=num_groups,
            num_layers=num_layers,
            output_dim=output_dim,
            relaxation_factor=relaxation_factor,
            sparsity_coefficient=sparsity_coefficient,
            virtual_batch_size=virtual_batch_size,
        )

        self.classifier = tf.keras.layers.Dense(
            num_classes, activation="sigmoid", use_bias=False
        )

    def call(self, inputs, training=None):
        x = self.stacked_tabnet(inputs, training=training)

        return self.classifier(x)


def create_model_stacked_tabnet(
    n_features, num_classes=206, lr=0.001,
):
    model = StackedTabNetClassifier(
        num_classes=num_classes, num_features=n_features, **stacked_tabnet_params,
    )
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=1e-03)
    optimizer = AdaBeliefOptimizer(learning_rate=lr)
    model.compile(loss=loss, optimizer=optimizer)
    return model

Tensorflow version 2.3.1


# Data

In [13]:
#dtype = {"cp_type": "category", "cp_dose": "category"}
#index_col = "sig_id"
#
#train_features = pd.read_csv(
#   "../input/lish-moa/train_features.csv", dtype=dtype, index_col=index_col
#)
#X = train_features.select_dtypes("number")
#Y_nonscored = pd.read_csv(
#   "../input/lish-moa/train_targets_nonscored.csv", index_col=index_col
#)
#Y = pd.read_csv("../input/lish-moa/train_targets_scored.csv", index_col=index_col)
#groups = pd.read_csv(
#   "../input/lish-moa/train_drug.csv", index_col=index_col, squeeze=True
#)
#
#columns = Y.columns

In [14]:
dtype = {"cp_type": "category", "cp_dose": "category"}
index_col = "sig_id"

DATADIR = r"C:\Users\81908\jupyter_notebook\poetry_work\tfgpu\01_MoA_compe\input\lish-moa"

groups = pd.read_csv(
    f"{DATADIR}/train_drug.csv", dtype=dtype, index_col=index_col, squeeze=True
)
train_features = pd.read_csv(
    f"{DATADIR}/train_features.csv", dtype=dtype, index_col=index_col
)
# X_test = pd.read_csv(f"{DATADIR}/test_features.csv", dtype=dtype, index_col=index_col)
X = train_features.select_dtypes("number")
Y_nonscored = pd.read_csv(f"{DATADIR}/train_targets_nonscored.csv", index_col=index_col)
Y = pd.read_csv(f"{DATADIR}/train_targets_scored.csv", index_col=index_col)

columns = Y.columns

In [15]:
c_prefix = "c-"
g_prefix = "g-"
c_columns = X.columns.str.startswith(c_prefix)
g_columns = X.columns.str.startswith(g_prefix)
X_c = compute_row_statistics(X.loc[:, c_columns], prefix=c_prefix)
X_g = compute_row_statistics(X.loc[:, g_columns], prefix=g_prefix)

clipped_features = ClippedFeatures()
X = clipped_features.fit_transform(X)
with open("clipped_features.pkl", "wb") as f:
    pickle.dump(clipped_features, f)

X = pd.concat([X, X_c, X_g], axis=1)

# params

In [16]:
alpha = 4.0
factor = 0.5
n_splits = 5
#n_seeds = 5
n_seeds = 1
patience = 30
shuffle = True
fit_params = {"epochs": 50, "verbose": 0}
batch_size = 256

stacked_tabnet_params = dict(
    epsilon=1e-05,
    feature_columns=None,
    virtual_batch_size=None,
    num_layers=2,
    num_decision_steps=1,
    norm_type="batch",
    num_groups=-1,
    batch_momentum=0.9,
    relaxation_factor=1.2,
    sparsity_coefficient=0.0001,
    feature_dim=2560,
    output_dim=128,
)

#DEBUG = True
DEBUG = False
if DEBUG:
    columns = [
        "cyclooxygenase_inhibitor",  # 陽性ラベル435個
#        "atp-sensitive_potassium_channel_antagonist",  # 陽性ラベル1個だけ
        "erbb2_inhibitor",  # 陽性ラベル1個だけ
        "antiarrhythmic",  # 陽性ラベル6個だけ
#        "aldehyde_dehydrogenase_inhibitor",  # 陽性ラベル7個だけ
#        "lipase_inhibitor",  # 陽性ラベル12個だけ
#        "sphingosine_receptor_agonist",  # 陽性ラベル25個だけ
#        "igf-1_inhibitor",  # 陽性ラベル37個だけ
#        "potassium_channel_activator",  # 陽性ラベル55個だけ
#        "potassium_channel_antagonist",  # 陽性ラベル98個だけ
#        "dopamine_receptor_agonist",  # 陽性ラベル121個だけ
#        "nfkb_inhibitor",  # 陽性ラベル832個
#        "dna_inhibitor",  # 陽性ラベル402個
#        "glutamate_receptor_antagonist",  # 陽性ラベル367個
#        "tubulin_inhibitor",  # 陽性ラベル316個
#        "pdgfr_inhibitor",  # 陽性ラベル297個
#        "calcium_channel_blocker",  # 陽性ラベル281個
#        "flt3_inhibitor",  # 陽性ラベル279個
#        "progesterone_receptor_agonist",  # 陽性ラベル119個
#        "hdac_inhibitor",  # 陽性ラベル106個
    ]
    Y = Y[columns]
    n_seeds = 2
    n_splits = 2
    patience = 2
    fit_params = {"epochs": 5, "verbose": 1}
    print(f"DEBUG: {DEBUG}")

In [17]:
train_size, n_features = X.shape
_, n_classes = Y.shape

In [18]:
# モデルロード時に使うパラメ保存
save_params = stacked_tabnet_params.copy()
save_params["num_classes"] = 1
save_params["num_features"] = n_features
with open("params.pkl", "wb") as f:
    pickle.dump(save_params, f)

In [19]:
# ラベルの共起数カウント.共起数の順位高いものから学習してみる
co_occ = Y.T @ Y
co_occ_sum = pd.DataFrame(co_occ.sum(axis=1)).reset_index()
co_occ_sum = co_occ_sum.sort_values(0, ascending=False)  # 共起数多い順にする
display(co_occ_sum)

co_occ_column = co_occ_sum["index"].values
print(co_occ_column)

Unnamed: 0,index,0
136,nfkb_inhibitor,1646
163,proteasome_inhibitor,1444
149,pdgfr_inhibitor,869
119,kit_inhibitor,839
89,flt3_inhibitor,794
...,...,...
120,laxative,6
33,atm_kinase_inhibitor,6
53,calcineurin_inhibitor,6
82,erbb2_inhibitor,2


['nfkb_inhibitor' 'proteasome_inhibitor' 'pdgfr_inhibitor' 'kit_inhibitor'
 'flt3_inhibitor' 'dopamine_receptor_antagonist'
 'serotonin_receptor_antagonist' 'cyclooxygenase_inhibitor'
 'dna_inhibitor' 'adrenergic_receptor_antagonist'
 'glutamate_receptor_antagonist' 'egfr_inhibitor' 'cdk_inhibitor'
 'vegfr_inhibitor' 'tubulin_inhibitor' 'glucocorticoid_receptor_agonist'
 'acetylcholine_receptor_antagonist' 'calcium_channel_blocker'
 'adrenergic_receptor_agonist' 'phosphodiesterase_inhibitor'
 'hmgcr_inhibitor' 'sodium_channel_inhibitor' 'serotonin_receptor_agonist'
 'raf_inhibitor' 'histamine_receptor_antagonist'
 'estrogen_receptor_agonist' 'pi3k_inhibitor' 'mtor_inhibitor'
 'ppar_receptor_agonist' 'bacterial_cell_wall_synthesis_inhibitor'
 'acetylcholine_receptor_agonist' 'gaba_receptor_antagonist'
 'topoisomerase_inhibitor' 'progesterone_receptor_agonist'
 'cytochrome_p450_inhibitor' 'aurora_kinase_inhibitor'
 'dopamine_receptor_agonist' 'jak_inhibitor' 'gaba_receptor_agonist'
 'tyr

# Train

In [20]:
%%time

Y_pred = np.zeros((train_size, n_classes))
Y_pred = pd.DataFrame(Y_pred, columns=Y.columns, index=Y.index)

# 重み流用してみる
K.clear_session()
model = create_model_stacked_tabnet(n_features, num_classes=1)

counts = []
for i in tqdm(range(n_seeds)):
    set_seed(seed=i)

    cv = MultilabelStratifiedGroupKFold(n_splits=n_splits, random_state=i, shuffle=shuffle)
    cv_split = cv.split(X, Y, groups)
        
    for j, (trn_idx, val_idx) in enumerate(cv_split):

        print(f"\n------------ fold:{j} ------------")

        X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        Y_train_targets, Y_val_targets = Y.iloc[trn_idx], Y.iloc[val_idx]
    
        targets_counts = []

        #for tar, tar_col in enumerate(Y.columns):
        for tar, tar_col in enumerate(co_occ_column):  # 共起数の順位高いものから学習してみる
            
            Y_train, Y_val = Y_train_targets.values[:, tar], Y_val_targets.values[:, tar]           

            #K.clear_session()
            #model = create_model_stacked_tabnet(n_features, num_classes=1)
            
            model_path = f"model_seed_{i}_fold_{j}_{tar_col}.h5"
            generator = Cutmix(X_train, Y_train, alpha=alpha, batch_size=batch_size)
            callbacks = build_callbacks(model_path, factor=factor, patience=patience)
            history = model.fit(generator, callbacks=callbacks, validation_data=(X_val, Y_val), **fit_params)
            
            model.load_weights(model_path)
            Y_pred[tar_col][val_idx] += model.predict(X_val)[:, 0] / n_seeds

            #del model
            #gc.collect()
            
            if i + j + tar > 0:
                # 重み流用するから0回目以降は早めに学習終わらせる
                patience = 10
            
            targets_counts.append(Y_train.sum())

        counts.append(targets_counts)

counts = np.array(counts)

Y_pred[train_features["cp_type"] == "ctl_vehicle"] = 0.0

with open("counts.pkl", "wb") as f:
    pickle.dump(counts, f)

with open("Y_pred.pkl", "wb") as f:
    pickle.dump(Y_pred[columns], f)

[31mPlease check your arguments if you have upgraded adabelief-tf from version 0.0.1.
[31mModifications to default arguments:
[31m                           eps  weight_decouple    rectify
-----------------------  -----  -----------------  -------------
adabelief-tf=0.0.1       1e-08  Not supported      Not supported
Current version (0.1.0)  1e-14  supported          default: True
[31mFor a complete table of recommended hyperparameters, see
[31mhttps://github.com/juntang-zhuang/Adabelief-Optimizer
[0m


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


------------ fold:0 ------------


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.












KeyboardInterrupt: 

In [21]:
oof_score = score(Y[columns], Y_pred[columns])
print(f"oof_score: {oof_score}")

oof_auc_score = auc_score(Y[columns], Y_pred[columns])
print(f"oof_auc_score: {oof_auc_score}")

oof_score: 0.09969834318333405
oof_auc_score: 0.5043248084153513


# pkl check

In [22]:
path = r"counts.pkl"
with open(path, 'rb') as f:
    counts = pickle.load(f)
print(counts.shape)
counts

(4, 3)


array([[220,   1,   0],
       [215,   0,   6],
       [215,   0,   6],
       [220,   1,   0]], dtype=int64)

In [23]:
path = r"Y_pred.pkl"
with open(path, 'rb') as f:
    Y_pred = pickle.load(f)
Y_pred

Unnamed: 0_level_0,cyclooxygenase_inhibitor,erbb2_inhibitor,antiarrhythmic
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id_000644bb2,0.017325,0.001335,0.000709
id_000779bfc,0.014054,0.000858,0.001351
id_000a6266a,0.018434,0.000927,0.001168
id_0015fd391,0.012568,0.000719,0.000745
id_001626bd3,0.008036,0.000718,0.000528
...,...,...,...
id_fffb1ceed,0.010143,0.000773,0.000611
id_fffb70c0c,0.016152,0.000772,0.000686
id_fffc1c3f4,0.000000,0.000000,0.000000
id_fffcb9e7c,0.003412,0.000594,0.000762


# predict test

In [24]:
test_features = pd.read_csv(
    #"../input/lish-moa/test_features.csv", dtype=dtype, index_col=index_col
    f"{DATADIR}/test_features.csv", dtype=dtype, index_col=index_col
)
X_test = test_features.select_dtypes("number")

X_c = compute_row_statistics(X_test.loc[:, c_columns], prefix=c_prefix)
X_g = compute_row_statistics(X_test.loc[:, g_columns], prefix=g_prefix)

with open("./clipped_features.pkl", "rb") as f:
    clipped_features = pickle.load(f)
X_test = clipped_features.transform(X_test)
X_test = pd.concat([X_test, X_c, X_g], axis=1)

In [None]:
Y_test_pred = np.zeros((X_test.shape[0], len(columns)))
Y_test_pred = pd.DataFrame(Y_test_pred, columns=columns, index=test_features.index)

with open("params.pkl", "rb") as f:
    save_params = pickle.load(f)
K.clear_session()
model = StackedTabNetClassifier(**save_params)
model(np.zeros((1, n_features)))

for i in range(n_seeds):
    for j in range(n_splits):
        for tar, tar_col in enumerate(Y.columns):
            model_path = f"model_seed_{i}_fold_{j}_{tar_col}.h5"
            model.load_weights(model_path)
            Y_test_pred[tar_col] += model.predict(X_test)[:, 0] / (n_seeds * n_splits)
        
Y_test_pred[test_features["cp_type"] == "ctl_vehicle"] = 0.0

Y_test_pred.to_csv("submission.csv")



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [None]:
print(Y_test_pred.shape)
display(Y_test_pred)