Version list:
- v1/v2/v3: Testing local code
- v4: First working version, no clue why it doesn't work in commit
- v5: Turn off pin memory, ran fully through (too much weight decay?)
- v6: decay 0, learning rate 5e-4, drop connect 0.05
- v7: Dropout increased to 0.5
- v8: Patience increased to 3, epochs to 6, drop connect rate to 0.5
- v9: epochs to 10
- v10: Drop connect down to 0.4, patience up to 4, epochs up to 12. Next to change: swap noise and perplexity.
- v11: Add postprocessing; perplexity set to 12

In [None]:
# When True, extra package directories under ../input are put on sys.path.
kernel_mode = True

#!cp ../input/lish-moa-utils/utils.py .
import sys

if kernel_mode:
    # Prepend both directories at once; the resulting order matches two
    # successive insert(0, ...) calls (gen-efficientnet-pytorch first).
    #sys.path.insert(0, './')
    sys.path[:0] = [
        "../input/gen-efficientnet-pytorch",
        "../input/iterative-stratification",
    ]

import os
import numpy as np
import pandas as pd
import time
import random
import torch
import math
import pytorch_lightning as pl
import matplotlib
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sklearn.metrics import log_loss
pd.options.display.max_columns = None
#!pip install -q geffnet
import geffnet
#!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.manifold import TSNE
import pickle
import seaborn as sns
import cv2
sns.set(style="darkgrid")
from utils import DeepInsightTransformer, LogScaler
import gc
gc.enable()
# ---------------------------------------------------------------------------
# Experiment configuration (all values are plain module-level globals that the
# model, datasets and training loop below read directly).
# ---------------------------------------------------------------------------
experiment_name = 'pleasework'

# Folder holding previously fitted per-fold artifacts.
model_info = {
    "model_path": "../input/deepinsight-transformers-perplexity-5"
    #f"../input/deepinsight-efficientnet-v4-b3/{experiment_name}"
    #if kernel_mode else
    #f"/workspace/Kaggle/MoA/completed/deepinsight_efficientnet_v4_b3/{experiment_name}"
}

# EfficientNet variant; also selects the size preset further down.
model_type = 'b3'
pretrained_model = "tf_efficientnet_" + model_type + "_ns"

model_output_folder = '.'
rand_seed = 42
perplexity = 24
patience = 3
epochs = 12
weight_decay = 0.00  # 0.1 at first (too much)

drop_connect_rate = 0.2  # 0.2 at first
fc_size = 512

# Swap Noise
swap_prob = 0.15
swap_portion = 0.1

num_workers = 4
gpus = [0]

# Per-variant preset: batch sizes, input image size, dropout and the pixel
# resolution handed to the DeepInsight transformer.
if model_type == "b0":
    batch_size, infer_batch_size = 48, 256  # default train batch size: 128
    image_size = resolution = 224
    drop_rate = 0.2
elif model_type == "b3":
    batch_size, infer_batch_size = 48, 128  # infer batch of 256 results in OOM
    image_size = resolution = 300
    drop_rate = 0.3
elif model_type == "b5":
    batch_size, infer_batch_size = 8, 16
    image_size = resolution = 456
    drop_rate = 0.4
elif model_type == "b7":
    batch_size, infer_batch_size = 2, 4
    image_size = resolution = 772  # an earlier setting used 800
    drop_rate = 0.5

In [None]:
# Raw competition tables.
train_features = pd.read_csv("../input/lish-moa/train_features.csv")
train_labels = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
train_extra_labels = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
test_features = pd.read_csv("../input/lish-moa/test_features.csv")

# From here on the scored-target frame carries target columns only.
train_labels.drop(columns='sig_id', inplace=True)

# Feature groups. ``all_features`` lists the two categorical columns first,
# followed by every remaining non-id column in file order.
category_features = ["cp_type", "cp_dose"]
numeric_features = [
    col for col in train_features.columns
    if not (col == "sig_id" or col in category_features)
]
all_features = category_features + numeric_features
gene_experssion_features = [col for col in numeric_features if col.startswith("g-")]
cell_viability_features = [col for col in numeric_features if col.startswith("c-")]
len(numeric_features), len(gene_experssion_features), len(cell_viability_features)

# Target column names (the id column is excluded).
train_classes = [col for col in train_labels.columns if col != "sig_id"]
train_extra_classes = [col for col in train_extra_labels.columns if col != "sig_id"]
len(train_classes), len(train_extra_classes)

# Replace the categorical / time columns with numeric codes in both frames.
for df in (train_features, test_features):
    df['cp_type'] = df['cp_type'].map({'ctl_vehicle': 0, 'trt_cp': 1})
    df['cp_dose'] = df['cp_dose'].map({'D1': 0, 'D2': 1})
    df['cp_time'] = df['cp_time'].map({24: 0, 48: 0.5, 72: 1})

In [None]:
def create_folds(num_starts, num_splits, control=True):
    """Build drug-aware, multilabel-stratified fold assignments.

    Drugs that occur at most 18 times are assigned to a fold as a whole
    (stratified on the per-drug mean of the target columns); rows of drugs
    that occur more than 18 times are stratified individually.

    Args:
        num_starts: Number of independent fold assignments to produce; the
            values ``0 .. num_starts - 1`` are used as splitter seeds.
        num_splits: Number of folds per assignment.
        control: When False, only rows whose ``cp_type`` is ``'trt_cp'`` are
            kept before folds are computed.

    Returns:
        A 2-D ``int8`` array with one row per start and one column per row of
        the (possibly filtered) scored-target table.
    """
    folds = []

    # LOAD FILES
    train_feats = pd.read_csv('../input/lish-moa/train_features.csv')
    scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
    drug = pd.read_csv('../input/lish-moa/train_drug.csv')
    if not control:
        # Boolean masks come from train_feats, so the three files are assumed
        # to be row-aligned — TODO confirm.
        scored = scored.loc[train_feats['cp_type'] == 'trt_cp', :]
        drug = drug.loc[train_feats['cp_type'] == 'trt_cp', :]
    # Every column after the first (sig_id) is a target.
    targets = scored.columns[1:]
    scored = scored.merge(drug, on='sig_id', how='left') 

    # LOCATE DRUGS
    vc = scored.drug_id.value_counts()
    vc1 = vc.loc[vc <= 18].index
    vc2 = vc.loc[vc > 18].index

    for seed in range(num_starts):

        # STRATIFY DRUGS 18X OR LESS
        # dct1 maps drug_id -> fold, dct2 maps sig_id -> fold.
        dct1 = {}; dct2 = {}
        skf = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        # One row per rare drug, holding the mean of each target over its rows.
        tmp = scored.groupby('drug_id')[targets].mean().loc[vc1]
        for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[targets])):
            dd = {k:fold for k in tmp.index[idxV].values}
            dct1.update(dd)

        # STRATIFY DRUGS MORE THAN 18X
        skf = MultilabelStratifiedKFold(n_splits = num_splits, shuffle = True, random_state = seed)
        # Index is reset so the positional indices from split() can be used as labels.
        tmp = scored.loc[scored.drug_id.isin(vc2)].reset_index(drop = True)
        for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[targets])):
            dd = {k:fold for k in tmp.sig_id[idxV].values}
            dct2.update(dd)

        # ASSIGN FOLDS
        scored['fold'] = scored.drug_id.map(dct1)
        # Rows still without a fold belong to frequent drugs: look them up by sig_id.
        scored.loc[scored.fold.isna(),'fold'] =\
            scored.loc[scored.fold.isna(),'sig_id'].map(dct2)
        scored.fold = scored.fold.astype('int8')
        folds.append(scored.fold.values)

        # Drop the helper column so the next seed starts from a clean frame.
        del scored['fold']

    return np.stack(folds) 

def plot_embed_2D(X, title=None):
    """Show a scatter plot of 2-D points.

    Args:
        X: Array indexable as ``X[:, 0]`` and ``X[:, 1]``; those two columns
            are used as the x and y coordinates.
        title: Optional title drawn above the axes.
    """
    sns.set(style="darkgrid")

    # Create subplots
    fig, ax = plt.subplots(1, 1, figsize=(10, 7), squeeze=False)
    axis = ax[0, 0]

    # No per-point colour values (``c``) are supplied, so a ``cmap`` argument
    # would be ignored by Matplotlib (with a warning); it is not passed.
    axis.scatter(X[:, 0], X[:, 1], marker="x", alpha=1.0)
    axis.set_aspect('equal', adjustable='box')

    if title is not None:
        axis.set_title(title, fontsize=20)

    plt.rcParams.update({'font.size': 14})
    plt.show()

def tsne_transform(data, perplexity=30, plot=True):
    """Embed the columns of ``data`` into 2-D with exact, cosine-metric t-SNE.

    Args:
        data: 2-D array; it is transposed first, so each column of the input
            becomes one embedded point.
        perplexity: t-SNE perplexity.
        plot: When True, the embedding is also drawn with ``plot_embed_2D``.

    Returns:
        The array returned by ``TSNE.fit_transform`` on the transposed input.
    """
    # One row per original column after the transpose.
    per_feature = data.T

    embedder = TSNE(
        n_components=2,
        metric='cosine',
        perplexity=perplexity,
        n_iter=1000,
        method='exact',
        random_state=rand_seed,  # module-level seed
        n_jobs=-1,
    )
    embedding = embedder.fit_transform(per_feature)

    if not plot:
        return embedding

    heading = f"All Feature Location Matrix of Training Set (Perplexity: {perplexity})"
    plot_embed_2D(embedding, heading)
    return embedding

def save_pickle(obj, model_output_folder, fold_i, name):
    """Pickle ``obj`` to ``{model_output_folder}/fold{fold_i}_{name}.pkl``.

    Args:
        obj: Any picklable object.
        model_output_folder: Existing directory to write into.
        fold_i: Fold identifier embedded in the file name.
        name: Artifact name embedded in the file name.
    """
    target = f"{model_output_folder}/fold{fold_i}_{name}.pkl"
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way.
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

In [None]:
def initialize_weight_goog(m, n='', fix_group_fanout=True):
    """Initialise one module in place, following the TensorFlow MnasNet scheme.

    Only ``Conv2d``, ``BatchNorm2d`` and ``Linear`` modules are touched; any
    other module type is left unchanged. Missing parameters (``bias=False`` or
    ``affine=False``) are skipped instead of raising.

    Args:
        m: The module to initialise.
        n: Module name; when it contains ``'routing_fn'`` the fan-in of a
            ``Linear`` layer is included in its init range.
        fix_group_fanout: Divide the conv fan-out by the number of groups.
    """
    # weight init as per Tensorflow Official impl
    # https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py
    if isinstance(m, torch.nn.Conv2d):
        fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
        if fix_group_fanout:
            fan_out //= m.groups
        m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
        if m.bias is not None:
            m.bias.data.zero_()
    elif isinstance(m, torch.nn.BatchNorm2d):
        # With affine=False both parameters are None.
        if m.weight is not None:
            m.weight.data.fill_(1.0)
        if m.bias is not None:
            m.bias.data.zero_()
    elif isinstance(m, torch.nn.Linear):
        fan_out = m.weight.size(0)  # fan-out
        fan_in = m.weight.size(1) if 'routing_fn' in n else 0
        init_range = 1.0 / math.sqrt(fan_in + fan_out)
        m.weight.data.uniform_(-init_range, init_range)
        if m.bias is not None:
            m.bias.data.zero_()

def initialize_weight_default(m, n=''):
    """Initialise one module in place with Kaiming-style defaults.

    ``Conv2d`` weights get Kaiming-normal (fan-out, ReLU), ``BatchNorm2d``
    is reset to weight 1 / bias 0, and ``Linear`` weights get Kaiming-uniform
    (fan-in, linear). Other module types are left unchanged.

    Args:
        m: The module to initialise.
        n: Module name (unused).
    """
    init = torch.nn.init

    if isinstance(m, torch.nn.Conv2d):
        init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        return

    if isinstance(m, torch.nn.BatchNorm2d):
        m.weight.data.fill_(1.0)
        m.bias.data.zero_()
        return

    if isinstance(m, torch.nn.Linear):
        init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='linear')

class MoAEfficientNet(pl.LightningModule):
    """``geffnet`` EfficientNet backbone with a two-layer multi-label head.

    The module also owns its data: ``setup`` wraps the arrays given to the
    constructor in dataset objects and the ``*_dataloader`` hooks serve them.

    Several settings are read from module-level globals rather than from
    constructor arguments: ``swap_prob``, ``swap_portion``, ``batch_size``,
    ``infer_batch_size``, ``num_workers``, ``weight_decay`` and ``T_max``.
    """

    def __init__(
            self,
            pretrained_model_name,
            training_set=(None, None),  # tuple
            valid_set=(None, None),  # tuple
            test_set=None,
            transformer=None,
            num_classes=206,
            in_chans=3,
            drop_rate=0.,
            drop_connect_rate=0.,
            fc_size=512,
            learning_rate=1e-3,
            weight_init='goog'):
        """Build the backbone and replace its classifier.

        Args:
            pretrained_model_name: Name of the ``geffnet`` factory function
                used to create the backbone.
            training_set: ``(features, labels)`` pair for training.
            valid_set: ``(features, labels)`` pair for validation.
            test_set: Features served by ``test_dataloader``.
            transformer: Object whose ``transform`` turns feature rows into
                images (used by the dataset classes).
            num_classes: Number of output logits.
            in_chans: Number of input image channels.
            drop_rate: Forwarded to the backbone factory.
            drop_connect_rate: Forwarded to the backbone factory.
            fc_size: Width of the hidden layer in the classifier head.
            learning_rate: Initial Adam learning rate.
            weight_init: Forwarded to the backbone factory.
        """
        super().__init__()

        self.train_data, self.train_labels = training_set
        self.valid_data, self.valid_labels = valid_set
        self.test_data = test_set
        self.transformer = transformer

        # Use the constructor argument (the value that is also stored in
        # hparams) instead of a module-level global to pick the backbone.
        self.backbone = getattr(geffnet, pretrained_model_name)(
            pretrained=True,
            in_chans=in_chans,
            drop_rate=drop_rate,
            drop_connect_rate=drop_connect_rate,
            weight_init=weight_init)

        # Swap the stock classifier for Linear -> ELU -> Linear.
        self.backbone.classifier = torch.nn.Sequential(
            torch.nn.Linear(self.backbone.classifier.in_features, fc_size,
                            bias=True),
            torch.nn.ELU(),
            torch.nn.Linear(fc_size, num_classes, bias=True))

        if self.training:
            for m in self.backbone.classifier.modules():
                initialize_weight_goog(m)

        # Save passed hyperparameters
        self.save_hyperparameters("pretrained_model_name", "num_classes",
                                  "in_chans", "drop_rate", "drop_connect_rate",
                                  "weight_init", "fc_size", "learning_rate")

    def forward(self, x):
        """Return the backbone's logits for an image batch."""
        return self.backbone(x)

    def _bce_loss(self, batch):
        """Return the mean BCE-with-logits loss for one ``{"x", "y"}`` batch."""
        x = batch["x"].float()
        y = batch["y"].type_as(x)
        logits = self(x)
        return F.binary_cross_entropy_with_logits(logits, y, reduction="mean")

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._bce_loss(batch)

        self.log('train_loss',
                 loss,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        val_loss = self._bce_loss(batch)

        self.log('val_loss',
                 val_loss,
                 on_step=True,
                 on_epoch=True,
                 prog_bar=True,
                 logger=True)

        return val_loss

    def test_step(self, batch, batch_idx):
        """Return the logits for one test batch (labels are not used)."""
        logits = self(batch["x"].float())
        return {"pred_logits": logits}

    def test_epoch_end(self, output_results):
        """Concatenate all test logits and convert them to probabilities.

        Args:
            output_results: List of the dicts returned by ``test_step``.

        Returns:
            ``{"pred_probs": ndarray}`` with the sigmoid of every logit.
        """
        all_outputs = torch.cat([out["pred_logits"] for out in output_results],
                                dim=0)
        print("Logits:", all_outputs)
        # torch.sigmoid replaces the deprecated F.sigmoid; same values.
        pred_probs = torch.sigmoid(all_outputs).detach().cpu().numpy()
        print("Predictions: ", pred_probs)
        return {"pred_probs": pred_probs}

    def setup(self, stage=None):
        """Create the train (with swap noise), validation and test datasets."""
        self.train_dataset = MoAImageSwapDataset(self.train_data,
                                                 self.train_labels,
                                                 self.transformer,
                                                 swap_prob=swap_prob,
                                                 swap_portion=swap_portion)

        self.val_dataset = MoAImageDataset(self.valid_data, self.valid_labels,
                                           self.transformer)

        self.test_dataset = TestDataset(self.test_data, None, self.transformer)

    def train_dataloader(self):
        """Return a shuffled loader over the training dataset."""
        train_dataloader = torch.utils.data.DataLoader(self.train_dataset,
                                                       batch_size=batch_size,
                                                       shuffle=True,
                                                       num_workers=num_workers,
                                                       pin_memory=False,
                                                       drop_last=False)
        print(f"Train iterations: {len(train_dataloader)}")
        return train_dataloader

    def val_dataloader(self):
        """Return an ordered loader over the validation dataset."""
        val_dataloader = torch.utils.data.DataLoader(self.val_dataset,
                                                     batch_size=infer_batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers,
                                                     pin_memory=False,
                                                     drop_last=False)
        print(f"Validate iterations: {len(val_dataloader)}")
        return val_dataloader

    def test_dataloader(self):
        """Return an ordered loader over the test dataset."""
        test_dataloader = torch.utils.data.DataLoader(self.test_dataset,
                                                      batch_size=infer_batch_size,
                                                      shuffle=False,
                                                      num_workers=num_workers,
                                                      pin_memory=False,
                                                      drop_last=False)
        print(f"Test iterations: {len(test_dataloader)}")
        return test_dataloader

    def configure_optimizers(self):
        """Return Adam plus a cosine-annealing learning-rate scheduler."""
        print(f"Initial Learning Rate: {self.hparams.learning_rate:.6f}")
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=self.hparams.learning_rate,
                                     weight_decay=weight_decay)

        # NOTE(review): T_max is a module-level global that must be assigned
        # before training starts. Confirm that its unit matches how often
        # Lightning steps a scheduler returned in this plain-list form.
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                               T_max=T_max,
                                                               eta_min=0,
                                                               last_epoch=-1)

        return [optimizer], [scheduler]

class MoAImageSwapDataset(torch.utils.data.Dataset):
    """Training dataset that renders feature rows as images, with swap noise.

    With probability ``swap_prob`` a sample has a random subset of its
    feature values replaced by the values of another, randomly chosen row.
    The stored feature matrix itself is never modified.
    """

    def __init__(self,
                 features,
                 labels,
                 transformer,
                 swap_prob=0.15,
                 swap_portion=0.1):
        """Store the data and noise settings.

        Args:
            features: 2-D NumPy array, one row per sample.
            labels: 2-D array of targets, row-aligned with ``features``.
            transformer: Object whose ``transform`` maps rows to images.
            swap_prob: Probability that a sample receives swap noise.
            swap_portion: Fraction of the column count that is swapped.
        """
        self.features = features
        self.labels = labels
        self.transformer = transformer
        self.swap_prob = swap_prob
        self.swap_portion = swap_portion

    def __getitem__(self, index):
        """Return ``{"x": 3-channel image, "y": label row}`` for one sample."""
        normalized = self.features[index, :]

        # Swap row features randomly
        normalized = self.add_swap_noise(index, normalized)

        normalized = np.expand_dims(normalized, axis=0)

        # Note: we are setting empty_value=1 to follow the setup in the paper
        image = self.transformer.transform(normalized, empty_value=1)[0]

        # Resize to target size (``image_size`` is a module-level global)
        gene_cht = cv2.resize(image, (image_size, image_size),
                              interpolation=cv2.INTER_CUBIC)

        # Convert to 3 channels
        image = np.repeat(gene_cht[np.newaxis, :, :], 3, axis=0)

        return {"x": image.astype(float), "y": self.labels[index, :].astype(float)}

    def add_swap_noise(self, index, X):
        """Return ``X``, possibly with some values taken from another row.

        Args:
            index: Row index of ``X`` (unused).
            X: 1-D feature row; it may be a view into ``self.features``.

        Returns:
            ``X`` itself when no noise is applied, otherwise a noisy copy.
        """
        if np.random.rand() < self.swap_prob:
            n_columns = self.features.shape[1]
            swap_index = np.random.randint(self.features.shape[0], size=1)[0]
            # Select only gene expression and cell viability features
            # (the first three columns are never swapped).
            swap_features = np.random.choice(
                np.arange(3, n_columns),
                size=int(n_columns * self.swap_portion),
                replace=False)
            # Work on a copy: X is usually a view of self.features, and an
            # in-place write would permanently corrupt the training data.
            X = X.copy()
            X[swap_features] = self.features[swap_index, swap_features]

        return X

    def __len__(self):
        """Return the number of samples."""
        return self.features.shape[0]

class MoAImageDataset(torch.utils.data.Dataset):
    """Dataset that renders each feature row as a 3-channel image, with labels."""

    def __init__(self, features, labels, transformer):
        """Keep references to the feature matrix, label matrix and transformer."""
        self.features, self.labels, self.transformer = features, labels, transformer

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return ``{"x": image, "y": label row}`` (both cast to float)."""
        # The transformer is given a batch of one row.
        single_row = np.expand_dims(self.features[index, :], axis=0)

        # Note: we are setting empty_value=1 to follow the setup in the paper
        rendered = self.transformer.transform(single_row, empty_value=1)[0]

        # Resize to the module-level target size.
        target_shape = (image_size, image_size)
        resized = cv2.resize(rendered, target_shape, interpolation=cv2.INTER_CUBIC)

        # Stack the same plane three times along a new leading channel axis.
        stacked = np.repeat(resized[np.newaxis, :, :], 3, axis=0)

        target = self.labels[index, :].astype(float)
        return {"x": stacked.astype(float), "y": target}

class TestDataset(torch.utils.data.Dataset):
    """Dataset that renders each feature row as a 3-channel image, without labels."""

    def __init__(self, features, labels, transformer):
        """Keep references to the features, the (unused) labels and the transformer."""
        self.features, self.labels, self.transformer = features, labels, transformer

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, index):
        """Return ``{"x": image, "y": -1}``; ``-1`` stands in for the missing label."""
        # The transformer is given a batch of one row.
        single_row = np.expand_dims(self.features[index, :], axis=0)

        # Note: we are setting empty_value=1 to follow the setup in the paper
        rendered = self.transformer.transform(single_row, empty_value=1)[0]

        # Resize to the module-level target size.
        target_shape = (image_size, image_size)
        resized = cv2.resize(rendered, target_shape, interpolation=cv2.INTER_CUBIC)

        # Stack the same plane three times along a new leading channel axis.
        stacked = np.repeat(resized[np.newaxis, :, :], 3, axis=0)

        return {"x": stacked.astype(float), "y": -1}

def get_infer_model(model_path, test_set, transformer):
    """Load a checkpoint and return the model frozen and in eval mode.

    Args:
        model_path: Path of the checkpoint to load.
        test_set: Features the model will serve from its test dataloader.
        transformer: Feature-to-image transformer handed to the model.

    Returns:
        The restored ``MoAEfficientNet``; architecture settings come from the
        module-level configuration.
    """
    init_kwargs = dict(
        pretrained_model_name=pretrained_model,
        training_set=(None, None),  # tuple
        valid_set=(None, None),  # tuple
        test_set=test_set,
        transformer=transformer,
        drop_rate=drop_rate,
        drop_connect_rate=drop_connect_rate,
        fc_size=fc_size,
        weight_init='goog',
    )
    infer_model = MoAEfficientNet.load_from_checkpoint(model_path, **init_kwargs)

    infer_model.freeze()
    infer_model.eval()
    return infer_model

def get_train_model(training_set, valid_set, transformer, test_set=None):
    """Create a fresh ``MoAEfficientNet`` configured from the module-level settings.

    Args:
        training_set: ``(features, labels)`` pair for training.
        valid_set: ``(features, labels)`` pair for validation.
        transformer: Feature-to-image transformer handed to the model.
        test_set: Optional features served by the test dataloader.

    Returns:
        The new model, with an initial learning rate of ``1e-3``.
    """
    init_kwargs = dict(
        pretrained_model_name=pretrained_model,
        training_set=training_set,  # tuple
        valid_set=valid_set,  # tuple
        test_set=test_set,
        transformer=transformer,
        drop_rate=drop_rate,
        drop_connect_rate=drop_connect_rate,
        fc_size=fc_size,
        weight_init='goog',
        learning_rate=1e-3,
    )
    return MoAEfficientNet(**init_kwargs)

def mean_logloss(y_pred, y_true):
    """Return the mean binary cross-entropy over all elements.

    Args:
        y_pred: Predicted probabilities.
        y_true: Ground-truth values, same shape as ``y_pred``.

    Returns:
        The average negative log-likelihood; ``1e-15`` is added inside each
        logarithm to avoid ``log(0)``.
    """
    eps = 1e-15
    negative_part = (1 - y_true) * np.log(1 - y_pred + eps)
    positive_part = y_true * np.log(y_pred + eps)
    return np.mean(-(negative_part + positive_part))

def load_pickle(model_output_folder, fold_i, name):
    """Load the object stored in ``{model_output_folder}/fold{fold_i}_{name}.pkl``.

    Args:
        model_output_folder: Directory containing the pickle file.
        fold_i: Fold identifier embedded in the file name.
        name: Artifact name embedded in the file name.

    Returns:
        The unpickled object.
    """
    source = f"{model_output_folder}/fold{fold_i}_{name}.pkl"
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    # The context manager closes the handle even if unpickling fails.
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [None]:
# Cross-validated training: for every fold, fit (or load) the scaler and the
# DeepInsight transformer, train a model, and collect out-of-fold predictions.
kfolds = 5
# True: fit and save a new scaler/transformer per fold; False: load saved ones.
createtransformer=True

# create_folds returns one row per start; with a single start it has shape (1, n_rows).
fold = create_folds(1, kfolds)
# Out-of-fold prediction table, same layout as the labels, initialised to 0.
res = train_labels.copy()
res.loc[:, train_labels.columns] = 0

for i in range(kfolds):
    
    # [1] selects the column indices of the 2-D fold array, i.e. the row positions.
    train_index, val_index = np.where(fold!=i)[1], np.where(fold==i)[1]

    train_all_features = train_features.loc[train_index, all_features].copy().reset_index(drop=True).values
    t_labels = train_labels.iloc[train_index].copy().reset_index(drop=True).values
    valid_all_features = train_features.loc[val_index, all_features].copy().reset_index(drop=True).values
    v_labels = train_labels.iloc[val_index].copy().reset_index(drop=True).values
    # NOTE(review): this is built from train_features (not test_features) and is
    # not passed to the model below — confirm whether it is still needed.
    test_all_features = train_features[all_features].copy().reset_index(drop=True).values
    print(train_all_features.shape, t_labels.shape, valid_all_features.shape, v_labels.shape, test_all_features.shape)

    # Module-level global read by MoAEfficientNet.configure_optimizers.
    T_max = math.floor(len(t_labels)/batch_size)

    if createtransformer:
        # Fit the scaler on the training fold only, then apply it everywhere.
        all_scaler = LogScaler()
        train_all_features = all_scaler.fit_transform(train_all_features)
        valid_all_features = all_scaler.transform(valid_all_features)
        test_all_features = all_scaler.transform(test_all_features)
        save_pickle(all_scaler, model_output_folder, i, "log-scaler")

        transformer= DeepInsightTransformer(pixels=resolution,
                                        perplexity=perplexity)
        transformer=transformer.fit(train_all_features)
        save_pickle(transformer, model_output_folder, i, "deepinsight-transform")
    else: 
        # Reuse the artifacts stored for this fold under model_info['model_path'].
        # test_all_features is left unscaled in this branch.
        scaler = load_pickle(model_info['model_path'], i, "log-scaler")
        train_all_features = scaler.transform(train_all_features)
        valid_all_features = scaler.transform(valid_all_features)
        transformer = load_pickle(model_info['model_path'], i, "deepinsight-transform")
    # The validation fold doubles as the "test" set so that trainer.test()
    # below yields predictions for the held-out rows.
    model = get_train_model(training_set=(train_all_features, t_labels), valid_set=(valid_all_features, v_labels), test_set=valid_all_features, transformer=transformer)
    callbacks = [
        pl.callbacks.EarlyStopping(monitor='val_loss_epoch',
                    min_delta=1e-6,
                    patience=patience,
                    verbose=True,
                    mode='min',
                    strict=True),
        pl.callbacks.LearningRateMonitor(logging_interval='step')
    ]

    # NOTE(review): the filepath=/prefix= arguments here and the
    # checkpoint_callback=/distributed_backend= arguments below depend on the
    # installed pytorch_lightning version — confirm before upgrading.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath=f"{model_output_folder}/fold{i}" +
        "/{epoch}-{train_loss_epoch:.6f}-{val_loss_epoch:.6f}" +
        f"-image_size={image_size}-resolution={resolution}-perplexity={perplexity}-fc={fc_size}",
        save_top_k=1,
        save_weights_only=False,
        save_last=False,
        verbose=True,
        monitor='val_loss_epoch',
        mode='min',
        prefix='')

    trainer = pl.Trainer(
        gpus=gpus,
        distributed_backend="dp",  # multiple-gpus, 1 machine
        max_epochs=epochs,
        benchmark=False,
        deterministic=True,
        checkpoint_callback=checkpoint_callback,
        callbacks=callbacks,
        #accumulate_grad_batches=accumulate_grad_batches,
        #gradient_clip_val=gradient_clip_val,
        precision=16,
        #logger=logger
        )
    trainer.fit(model)

    # Predict the held-out fold and write the probabilities into its rows.
    output = trainer.test(model, verbose=False)[0]
    res.iloc[val_index] += output["pred_probs"]

# Saved before the post-processing step below, so the file holds raw predictions.
res.to_csv('res.csv', index=False)

# Post-processing: rows whose cp_type code is 0 get all-zero predictions.
res.loc[train_features['cp_type'] == 0, train_labels.columns] = 0

# Mean of the per-target log losses over all target columns.
metrics = []
for _target in train_labels.columns:
    metrics.append(log_loss(train_labels.loc[:, _target], res.loc[:, _target]))
print(f'OOF Metric with postprocessing: {np.mean(metrics)}')