In [None]:
from __future__ import print_function, division, absolute_import

%matplotlib inline
# %matplotlib notebook

import os, sys, gc
import numpy as np
import random
import glob

from tqdm import tqdm_notebook as tqdm

import torch
import ignite
import torch.nn as nn

import warnings
warnings.filterwarnings("ignore")

from rcic_pytorch_utils import *  ## our utility script https://www.kaggle.com/hmendonca/rcic-pytorch-utils
# !cat rcic-pytorch-utils.py

In [None]:
# --- run identity (also feeds the random seed) ---
fold, head_num, head_run = 1, 4, 3

# --- data / model hyper-parameters ---
target = 'sirna_ct'
resolution = 256
dropout_rate = 0.4   # regularisation prior to ArcNet features
weight_decay = 2e-4
channel_size = 200   # ArcNet feature size
margin = 0.45        # ArcNet margin
learning_rate = 0.035
warmup_learning_rate = learning_rate / 100
focal_gamma = 3.5

# --- fine-tuning strategy ---
remove_head = False           # reset ArcNet head (features and fc layers)
train_to = -18                # only train top half of the model
freeze_at_1st_n_last = False  # freeze all but the last layers, at the first and last epochs

# --- hardware dependent switches ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'
use_amp = (device == 'cuda')
training = (device == 'cuda')
train_batch_size = 96 if device == 'cuda' else 16
eval_batch_size = 16 if device == 'cuda' else 8

# debug = is_interactive()
debug = False
create_sub = (device == 'cuda') or debug

print(f'Interactive:{is_interactive()} Debug:{debug} Device:{device} Training:{training}')

In [None]:
path_data = '/kaggle/input/recursion-cellular-image-classification'

# Locate the checkpoint of this fold and the submission used for pseudo-labelling.
# The matches are sorted so the choice is deterministic when several files match
# (glob returns them in arbitrary order), and a missing file fails with a clear message
# instead of a bare IndexError.
_model_matches = sorted(glob.glob(f'/kaggle/input/*/fold{fold}*.pth'))
if not _model_matches:
    raise FileNotFoundError(f'No checkpoint matching /kaggle/input/*/fold{fold}*.pth')
model_path = _model_matches[0]

_sub_matches = sorted(glob.glob(f'/kaggle/input/*/submission.csv'))
if not _sub_matches:
    raise FileNotFoundError('No file matching /kaggle/input/*/submission.csv')
best_sub = _sub_matches[0]

print(f'Training from model  {model_path}')
print(f'with pseudo labels from  {best_sub}')

In [None]:
# Seed derived from the run identity, so each (head_num, head_run, fold) combination gets its own seed.
# NOTE(review): seed_everything comes from rcic_pytorch_utils; which RNGs it seeds is not visible here.
seed_everything(42000 + head_num*100 + head_run*10 + fold)

In [None]:
# Record library versions and the number of CPUs (and the GPU name, when running on CUDA).
print(torch.__version__, ignite.__version__, os.cpu_count())
if device == 'cuda': print(torch.cuda.get_device_name())

In [None]:
# Install NVIDIA Apex if needed to support mixed precision training.
# `amp` is only imported when both use_amp and training are set, so any later use of `amp`
# must be guarded by the same condition.
if use_amp and training:
    try:
        from apex import amp
    except ImportError:
        # Alternative (needs internet access): build Apex from the GitHub repository.
#         !git clone https://github.com/NVIDIA/apex
#         !pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" apex/
        # Build Apex with its C++/CUDA extensions from a copy found under ../input.
        !pip install  -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/*/*/NVIDIA-apex*
        from apex import amp

## Model


Let's define some helpful modules:
- Flatten 
- Swish 

The reason why Swish is not implemented in `torch.nn` can be found [here](https://github.com/pytorch/pytorch/pull/3182).


In [None]:
class Swish(nn.Module):
    """Swish activation: the input gated by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate


class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        batch_size = x.size(0)
        return x.reshape(batch_size, -1)

Let's visualize Swish transform vs ReLU:

In [None]:
import matplotlib.pylab as plt
%matplotlib inline

# Evaluate Swish and ReLU on the same points in [-10, 10] and overlay both curves.
d = torch.linspace(-10.0, 10.0)
s = Swish()
res = s(d)
res2 = torch.relu(d)

plt.title("Swish transformation")
plt.plot(d.numpy(), res.numpy(), label='Swish')
plt.plot(d.numpy(), res2.numpy(), label='ReLU')
_ = plt.legend()  # assign to `_` so the legend object is not echoed as cell output

Now let's define `SqueezeExcitation` module

In [None]:
class SqueezeExcitation(nn.Module):
    """Channel attention: rescale each channel by a gate computed from its spatial mean.

    Args:
        inplanes: number of channels of the input (and of the output).
        se_planes: number of channels of the bottleneck between the two 1x1 convolutions.
    """

    def __init__(self, inplanes, se_planes):
        super(SqueezeExcitation, self).__init__()
        squeeze = nn.Conv2d(inplanes, se_planes,
                            kernel_size=1, stride=1, padding=0, bias=True)
        excite = nn.Conv2d(se_planes, inplanes,
                           kernel_size=1, stride=1, padding=0, bias=True)
        # Layer order defines the parameter names (reduce_expand.0 / reduce_expand.2).
        self.reduce_expand = nn.Sequential(squeeze, Swish(), excite, nn.Sigmoid())

    def forward(self, x):
        # Mean over the two trailing (spatial) dimensions, kept as size-1 dims for broadcasting.
        pooled = x.mean(dim=(-2, -1), keepdim=True)
        gate = self.reduce_expand(pooled)
        return gate * x


Next, we can define `MBConv`.

**Note on implementation**: in Tensorflow (and PyTorch ports) convolutions use `SAME` padding option which in PyTorch requires
a specific padding computation and additional operation to apply. We will use built-in padding argument of the convolution.

In [None]:
from torch.nn import functional as F

class MBConv(nn.Module):
    """Mobile inverted bottleneck block with squeeze-and-excitation.

    Pipeline: optional 1x1 expansion conv -> depthwise conv -> squeeze-and-excitation ->
    1x1 projection conv. An identity skip connection is added when ``stride == 1`` and the
    output shape equals the input shape. During training the residual branch of each sample
    is randomly dropped with probability ``drop_connect_rate`` before the skip is added.

    Args:
        inplanes: number of input channels.
        planes: number of output channels.
        kernel_size: depthwise kernel size (padding is ``kernel_size // 2``).
        stride: stride of the depthwise convolution.
        expand_rate: channel multiplier of the expansion conv; no expansion conv when <= 1.
        se_rate: squeeze ratio, relative to ``inplanes``.
        drop_connect_rate: probability in [0, 1) of dropping the residual branch;
            ``None`` or 0 disables drop-connect.

    Raises:
        ValueError: if ``drop_connect_rate`` is outside [0, 1).
    """

    def __init__(self, inplanes, planes, kernel_size, stride, 
                 expand_rate=1.0, se_rate=0.25, 
                 drop_connect_rate=0.2):
        super(MBConv, self).__init__()

        expand_planes = int(inplanes * expand_rate)
        se_planes = max(1, int(inplanes * se_rate))

        self.expansion_conv = None
        if expand_rate > 1.0:
            self.expansion_conv = nn.Sequential(
                nn.Conv2d(inplanes, expand_planes, 
                          kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(expand_planes, momentum=0.01, eps=1e-3),
                Swish()
            )
            inplanes = expand_planes

        self.depthwise_conv = nn.Sequential(
            nn.Conv2d(inplanes, expand_planes,
                      kernel_size=kernel_size, stride=stride, 
                      padding=kernel_size // 2, groups=expand_planes,
                      bias=False),
            nn.BatchNorm2d(expand_planes, momentum=0.01, eps=1e-3),
            Swish()
        )

        self.squeeze_excitation = SqueezeExcitation(expand_planes, se_planes)

        self.project_conv = nn.Sequential(
            nn.Conv2d(expand_planes, planes, 
                      kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(planes, momentum=0.01, eps=1e-3),
        )

        self.with_skip = stride == 1
        # Stored as a plain float (not a tensor) so that the None / 0 checks in forward() are
        # meaningful; it is neither a parameter nor a buffer, so the state_dict is unaffected.
        if drop_connect_rate is None:
            self.drop_connect_rate = None
        else:
            rate = float(drop_connect_rate)
            if not 0.0 <= rate < 1.0:
                raise ValueError(f'drop_connect_rate must be in [0, 1), got {rate}')
            self.drop_connect_rate = rate

    def _drop_connect(self, x):
        """Zero the whole tensor of randomly chosen samples and rescale the survivors.

        Each sample is kept with probability ``keep_prob``; kept samples are divided by
        ``keep_prob`` so that the expected value is unchanged.
        """
        keep_prob = 1.0 - self.drop_connect_rate
        # floor(U[0,1) + keep_prob) is 1 with probability keep_prob, else 0. The mask is
        # floored in float32 *before* casting to x's dtype, so half-precision rounding
        # cannot push a value up to 2.
        drop_mask = torch.rand(x.shape[0], 1, 1, 1, device=x.device) + keep_prob
        drop_mask = drop_mask.floor_().type_as(x)
        return drop_mask * x / keep_prob

    def forward(self, x):
        z = x
        if self.expansion_conv is not None:
            x = self.expansion_conv(x)

        x = self.depthwise_conv(x)
        x = self.squeeze_excitation(x)
        x = self.project_conv(x)

        # Add identity skip
        if x.shape == z.shape and self.with_skip:
            if self.training and self.drop_connect_rate:
                # The result must be assigned: _drop_connect is not an in-place operation.
                x = self._drop_connect(x)
            x = x + z
        return x

And finally, we can implement a generic `EfficientNet`:

In [None]:
from collections import OrderedDict
import math


def init_weights(module):
    """Re-initialise the weight of a Conv2d or Linear module in place.

    Conv2d weights get Kaiming-normal init (fan-out mode); Linear weights are drawn
    uniformly from ``[-1/sqrt(in_features), 1/sqrt(in_features)]``. Biases and every other
    module type are left untouched. Intended to be used with ``nn.Module.apply``.
    """
    if isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(module.weight, a=0, mode='fan_out')
        return
    if not isinstance(module, nn.Linear):
        return
    fan_in = module.weight.size(1)
    bound = 1.0 / math.sqrt(fan_in)
    nn.init.uniform_(module.weight, a=-bound, b=bound)
        
        
class EfficientNet(nn.Module):
    """Generic EfficientNet: stem conv -> stack of MBConv blocks -> classification head.

    The base configuration (channels, repeats, expand rates, strides, kernel sizes) is scaled
    by ``width_coefficient`` (channels) and ``depth_coefficient`` (block repeats).

    NOTE: submodules are registered in the order stem, blocks, head. load_n_remap in this
    notebook maps checkpoint keys to model keys by position, so this order must not change.

    Args:
        num_classes: output size of the final Linear layer of the head.
        width_coefficient: multiplier applied to every channel count.
        depth_coefficient: multiplier applied to the number of repeats of each stage.
        se_rate: squeeze ratio passed to every MBConv block.
        dropout_rate: dropout probability before the final Linear layer.
        drop_connect_rate: maximum drop-connect rate; each block receives
            ``drop_connect_rate * block_index / num_blocks``.
    """
        
    def _setup_repeats(self, num_repeats):
        """Scale a stage's repeat count by the depth coefficient, rounding up."""
        return int(math.ceil(self.depth_coefficient * num_repeats))
    
    def _setup_channels(self, num_channels):
        """Scale a channel count by the width coefficient and snap it to a multiple of ``self.divisor``."""
        num_channels *= self.width_coefficient
        # Round to the nearest multiple of the divisor, but never below one divisor.
        new_num_channels = math.floor(num_channels / self.divisor + 0.5) * self.divisor
        new_num_channels = max(self.divisor, new_num_channels)
        # If rounding removed more than 10% of the channels, go up by one divisor.
        if new_num_channels < 0.9 * num_channels:
            new_num_channels += self.divisor
        return new_num_channels

    def __init__(self, num_classes, 
                 width_coefficient=1.0,
                 depth_coefficient=1.0,
                 se_rate=0.25,
                 dropout_rate=0.2,
                 drop_connect_rate=0.2):
        super(EfficientNet, self).__init__()
        
        self.width_coefficient = width_coefficient
        self.depth_coefficient = depth_coefficient
        self.divisor = 8
                
        # Base channel counts: stem output, the outputs of the 7 stages, head conv output.
        list_channels = [32, 16, 24, 40, 80, 112, 192, 320, 1280]
        list_channels = [self._setup_channels(c) for c in list_channels]
                
        # Base number of MBConv blocks per stage.
        list_num_repeats = [1, 2, 2, 3, 3, 4, 1]
        list_num_repeats = [self._setup_repeats(r) for r in list_num_repeats]        
        
        # Per-stage settings; the stride applies to the first block of a stage only.
        expand_rates = [1, 6, 6, 6, 6, 6, 6]
        strides = [1, 2, 2, 2, 1, 2, 1]
        kernel_sizes = [3, 3, 5, 3, 5, 5, 3]

        # Define stem:
        self.stem = nn.Sequential(
            nn.Conv2d(3, list_channels[0], kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(list_channels[0], momentum=0.01, eps=1e-3),
            Swish()
        )
        
        # Define MBConv blocks
        blocks = []
        counter = 0  # global block index: used in block names and to scale the drop-connect rate
        num_blocks = sum(list_num_repeats)
        for idx in range(7):
            
            num_channels = list_channels[idx]
            next_num_channels = list_channels[idx + 1]
            num_repeats = list_num_repeats[idx]
            expand_rate = expand_rates[idx]
            kernel_size = kernel_sizes[idx]
            stride = strides[idx]
            # Drop-connect rate grows linearly with the block index (0 for the first block).
            drop_rate = drop_connect_rate * counter / num_blocks
            
            # First block of the stage: changes the channel count and applies the stage stride.
            name = "MBConv{}_{}".format(expand_rate, counter)
            blocks.append((
                name,
                MBConv(num_channels, next_num_channels, 
                       kernel_size=kernel_size, stride=stride, expand_rate=expand_rate, 
                       se_rate=se_rate, drop_connect_rate=drop_rate)
            ))
            counter += 1
            # Remaining blocks of the stage: same channel count in and out, stride 1.
            for i in range(1, num_repeats):                
                name = "MBConv{}_{}".format(expand_rate, counter)
                drop_rate = drop_connect_rate * counter / num_blocks                
                blocks.append((
                    name,
                    MBConv(next_num_channels, next_num_channels, 
                           kernel_size=kernel_size, stride=1, expand_rate=expand_rate, 
                           se_rate=se_rate, drop_connect_rate=drop_rate)                                    
                ))
                counter += 1
        
        self.blocks = nn.Sequential(OrderedDict(blocks))
        
        # Define head
        self.head = nn.Sequential(
            nn.Conv2d(list_channels[-2], list_channels[-1], 
                      kernel_size=1, bias=False),
            nn.BatchNorm2d(list_channels[-1], momentum=0.01, eps=1e-3),
            Swish(),
            nn.AdaptiveAvgPool2d(1),
            Flatten(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(list_channels[-1], num_classes)
        )

        # Re-initialise all Conv2d / Linear weights (see init_weights).
        self.apply(init_weights)
        
    def forward(self, x):
        """Run stem, blocks and head in sequence and return the head output."""
        f = self.stem(x)
        f = self.blocks(f)
        y = self.head(f)
        return y

**All EfficientNet models can be defined using the following parametrization:**
```
# (width_coefficient, depth_coefficient, resolution, dropout_rate)
'efficientnet-b0': (1.0, 1.0, 224, 0.2),
'efficientnet-b1': (1.0, 1.1, 240, 0.2),
'efficientnet-b2': (1.1, 1.2, 260, 0.3),
'efficientnet-b3': (1.2, 1.4, 300, 0.3),
'efficientnet-b4': (1.4, 1.8, 380, 0.4),
'efficientnet-b5': (1.6, 2.2, 456, 0.4),
'efficientnet-b6': (1.8, 2.6, 528, 0.5),
'efficientnet-b7': (2.0, 3.1, 600, 0.5),
```    
Let's define and train the fifth one: `EfficientNet-B4`.

## Dataflow

Let's setup the dataflow:
- load train and test datasets
- setup train/test image transforms
- setup train/test data loaders

According to the EfficientNet paper, authors borrowed training settings from other publications and the dataflow for CIFAR100 is the following:

- input images to the network during training are resized to the model resolution
- horizontally flipped randomly and augmented using cutout.
- each mini-batch contained 256 examples


In [None]:
import torchvision.utils as vutils

import pandas as pd
from sklearn import preprocessing
from sklearn.utils import shuffle

In [None]:
# Integer-encode the four cell types. LabelEncoder sorts its classes, so the codes are
# HEPG2=0, HUVEC=1, RPE=2, U2OS=3 (the returned array is displayed as the cell output).
le = preprocessing.LabelEncoder()
le.fit_transform(['HEPG2', 'HUVEC', 'RPE', 'U2OS'])

In [None]:
# Train / test metadata. The 'set' column is the sub-directory ('train' or 'test') the images
# are read from (see ExpNormTwinDataset._get_img).
df = pd.read_csv(path_data+'/train.csv')
df['set'] = 'train'
df_test = pd.read_csv(path_data+'/test.csv')
df_test['set'] = 'test'
# Pseudo-labels: inner join of the test rows with the best submission on id_code, so only
# test rows present in the submission are kept.
best = pd.read_csv(best_sub, index_col=0)
df_test_pseudo = df_test.set_index('id_code').join(best, how='inner').reset_index()
#df_test['sirna'] = pd.read_csv(best_sub).sirna ## add pseudo-labels from best known submission

# Control wells of the train and test experiments, merged into a single frame.
# ignore_index=False keeps the original row labels, so df_ctrl's index contains duplicates.
df_ctrl = pd.read_csv(path_data+'/train_controls.csv')
df_ctrl['set'] = 'train'
df_ctrl_test = pd.read_csv(path_data+'/test_controls.csv')
df_ctrl_test['set'] = 'test'
df_ctrl = pd.concat([df_ctrl, df_ctrl_test], ignore_index=False, sort=False)
del df_ctrl_test

# Number of distinct siRNA ids: treatments plus controls.
n_targets = df.sirna.nunique() + df_ctrl.sirna.nunique()

for d in [df, df_test, df_test_pseudo, df_ctrl]:
    # Cell type = experiment name without its last 3 characters.
    d['cell_type'] = d.experiment.apply(lambda s: s[:-3])
    d['cell_type_n'] = le.transform(d.cell_type)
    if d is not df_test:
        # Cell-type specific class id: sirna offset by n_targets per cell type.
        # df_test is skipped here.
        d['sirna_ct'] = d.sirna + (d.cell_type_n * n_targets)

# Number of distinct (siRNA, cell type) classes seen in the train and control frames.
n_classes = df.sirna_ct.nunique() + df_ctrl.sirna_ct.nunique()
print(n_targets, n_classes)

In [None]:
# Number of pseudo-labelled test rows (and columns).
df_test_pseudo.shape

In [None]:
# Overview of the cell types and of the experiments in the train and test sets.
print('cell types:', df.cell_type.unique().tolist())
print('train experiments:\n', df.experiment.unique().tolist())
print('test experiments:\n', df_test.experiment.unique().tolist())

In [None]:
# Experiments held out for validation in this fold.
# NOTE(review): get_fold comes from rcic_pytorch_utils; how it picks experiments is not visible here.
valid_experiments = get_fold(df, fold)
print('validation experiments:\n', valid_experiments)

# add 3/4 of validation to the training set to mimic effects of pseudo-labelling in tests set?
#df_train = df[~df.experiment.isin(valid_experiments) | (df.plate != fold)]
# Split by experiment: no experiment contributes to both training and validation.
df_train = df[~df.experiment.isin(valid_experiments)]
df_valid = df[df.experiment.isin(valid_experiments)]

## add pseudo-labels to the oversampled training data (2/3)
# The labelled training rows are included twice and the pseudo-labelled test rows once.
df_train = pd.concat([df_train, df_train, df_test_pseudo], ignore_index=False, sort=False)

# Number of training rows per experiment.
_ = df_train.groupby('experiment').id_code.count().plot(kind='bar')
plt.tight_layout()

df_train.sample(9)

Oversample minority classes in training

In [None]:
# Histogram (log-scaled y axis) of the number of training rows per class.
plt.yscale('log')
target_dist = df_train.groupby(target)[target].count()
max_n = target_dist.max()  # size of the largest class; reference for the oversampling below
print(max_n)

_ = target_dist.hist(bins=np.arange(max_n+1))
target_dist.describe()

In [None]:
# Oversample each class towards the size of the largest one, adding at most 4 extra copies.
if training or not debug:
    addons = []
    for label, n in target_dist.items():
        copies = min(int(round(max_n/n) - 1), 4)
        if copies > 0:
    #         print(f'label:{label} n:{n} copies:{copies}')
            # Extra copies only contain rows of `label`, so selecting from the not-yet-extended
            # frame returns the same rows as selecting after every concat.
            addons.extend([df_train[df_train[target] == label]] * copies)
    if addons:
        # Single concat at the end: concatenating inside the loop re-copies the growing frame
        # once per label.
        df_train = pd.concat([df_train] + addons, ignore_index=False, sort=False)

In [None]:
# show some copies: print the new size and display every row of one example well,
# whose id depends on the fold
print(df_train.shape)
df_train[df_train.id_code == f'U2OS-0{5-fold}_1_D12']

In [None]:
# Class-size histogram again, after oversampling.
plt.yscale('log')
target_dist = df_train.groupby(target)[target].count()
_ = target_dist.hist(bins=np.arange(target_dist.max()+1))
target_dist.describe()

In [None]:
# Number of training rows per cell type.
_ = df_train.groupby('cell_type').id_code.count().plot(kind='bar')

Add controls

In [None]:
# Keep the same columns, in the same order, as the training frame so both can be concatenated.
df_ctrl = df_ctrl[df_train.columns]
_ = df_ctrl.groupby('cell_type').id_code.count().plot(kind='bar')

In [None]:
## train on some controls too. A different number per cell type is chosen to improve the type balance in the training set

# Cell type codes are looked up from the fitted encoder instead of being hard-coded.
# The encoder yields codes 0..3 (U2OS is 3); the previous hard-coded 4 matched no row, so
# the "add all U2OS controls" step silently added nothing.
ct_codes = dict(zip(le.classes_, le.transform(le.classes_)))
from_test_or_valid = (df_ctrl.experiment.isin(df_test.experiment.unique())
                      | df_ctrl.experiment.isin(valid_experiments))

# U2OS: add all controls
ctrl_u2os = df_ctrl[df_ctrl['cell_type_n'] == ct_codes['U2OS']]

# RPE & HEPG2: controls from plates other than `fold`, plus all controls of the test and
# validation experiments.
# NOTE(review): the original comment said "controls from only one plate", which does not
# match this condition (kept unchanged) - confirm the intent.
ctrl_rpe_hepg2 = df_ctrl[
    df_ctrl['cell_type_n'].isin([ct_codes['HEPG2'], ct_codes['RPE']])
    & ((df_ctrl['plate'] != fold) | from_test_or_valid)]

# HUVEC only controls from test and validation (plate `fold` only)
ctrl_huvec = df_ctrl[
    (df_ctrl['cell_type_n'] == ct_codes['HUVEC'])
    & (df_ctrl['plate'] == fold)
    & from_test_or_valid]

df_train = pd.concat([df_train, ctrl_u2os, ctrl_rpe_hepg2, ctrl_huvec],
                     ignore_index=False, sort=False)

In [None]:
# Class-size histogram once more, now including the control rows.
# This target_dist is the one the class weights are computed from.
plt.yscale('log')
target_dist = df_train.groupby(target)[target].count()
_ = target_dist.hist(bins=np.arange(target_dist.max()+1))
target_dist.describe()

In [None]:
# Rows per cell type after adding the controls.
_ = df_train.groupby('cell_type').id_code.count().plot(kind='bar')

In [None]:
# Debug mode: shrink the data to the classes of 50 randomly sampled rows.
# The 50 sampled rows may share classes, so fewer than 50 distinct classes can be selected.
if debug:
    df_train = df_train[df_train[target].isin(df_train[target].sample(50))] # some random classes
    df_valid = df_valid[df_valid[target].isin(df_train[target].unique())]   # same classes

print(df_train.shape, df_valid.shape, df_test.shape)

Calculate class weights to balance loss function

In [None]:
# Inverse-frequency class weights, rescaled so that they sum to n_classes.
# NOTE(review): target_dist only has entries for classes present in df_train; if that is
# fewer than n_classes the weight vector is shorter than the class range - verify.
class_weights = 1. / target_dist
class_weights *= n_classes / class_weights.sum()
# class_weights = class_weights**0.5  # smoothing non-linearity
class_weights.describe()

In [None]:
# Flat numpy array of weights, ordered by the (sorted) class ids of target_dist.
class_weights = class_weights.values.ravel()

Get pixels statistics by experiment

In [None]:
# Pixel statistics used to normalise images in ExpNormTwinDataset, which looks up the
# 'mean' and 'std' columns by experiment.
# NOTE(review): get_rcic_exp_stats comes from rcic_pytorch_utils; the exact layout of the
# returned frame is not visible here.
exp_stats = get_rcic_exp_stats(path_data)
exp_stats.describe()

In [None]:
# In non-interactive runs, drop the frames that are no longer needed to free memory.
if not is_interactive():
    del df; del df_ctrl; del target_dist
    gc.collect()

In [None]:
class ExpNormTwinDataset(Dataset):
    ''' Multi channel dataset normalised by experiment image stats
        During training, for every image a 'twin' image is also selected from the same class from another random experiment

        Args:
            df: frame with one row per well; columns set, experiment, well, plate and `target` are read.
            img_dir: root directory of the images.
            target: name of the label column.
            mode: 'train' -> (img, twin, label); 'eval' -> (img, label); 'test' -> list of images, one per site.
            sites: imaging sites available for every well.
            channels: image channels to load, stacked along the last axis.
            img_stats: optional frame indexed by experiment with 'mean' and 'std' columns;
                when given, each channel is normalised with the values of its experiment.
            transform: optional callable invoked as transform(image=img)['image'].
    '''
    def __init__(self, df, img_dir, target, mode='train', sites=(1, 2), channels=(1, 2, 3, 4, 5, 6),
                 img_stats=None, transform=None):
        self.df = df
        # Immutable defaults, copied into per-instance lists so instances never share state.
        self.channels = list(channels)
        self.sites = list(sites)
        self.target = target
        self.mode = mode
        self.img_dir = img_dir
        self.stats = img_stats
        self.transform = transform

    @staticmethod
    def _load_channel(file_name):
        ''' Read one single-channel image file as a float32 array '''
        img = cv2.imread(file_name, cv2.IMREAD_UNCHANGED)
        if img is None:
            # cv2.imread does not raise on a missing/unreadable file; without this check the
            # None would be turned into NaN by np.float32 and corrupt the sample silently.
            raise FileNotFoundError(f'Could not read image: {file_name}')
        return np.float32(img)

    def _get_img(self, rec, site):
        ''' Load all channels of one site of a well, normalise, stack and transform them '''
        dset, experiment, well, plate = rec.set, rec.experiment, rec.well, rec.plate
        paths = [os.path.join(self.img_dir, dset, experiment, f'Plate{plate}', f'{well}_s{site}_w{channel}.png')
                 for channel in self.channels]
        img = [self._load_channel(img_path) for img_path in paths]

        ## norm
        if self.stats is not None:
            stats = self.stats.loc[experiment, ['mean', 'std']]
            # mean subtract, then norm to 1 std (the stats are paired with the channels by position)
            img = [(i - m) / s for i, m, s in zip(img, stats['mean'].values, stats['std'].values)]

        img = np.stack(img, axis=-1)
        if self.transform:
            img = self.transform(image=img)['image']
        return img

    def _get_twin(self, rec, site):
        ''' Image of a random well with the same label as `rec` but from another experiment '''
        experiment, target = rec.experiment, rec[self.target]
        same_class = self.df[self.target] == target
        other_experiment = self.df.experiment != experiment
        candidates = self.df[same_class & other_experiment]
        if len(candidates) == 0:
            # No other experiment contains this class (e.g. heavily sub-sampled data):
            # fall back to the well itself instead of failing in sample().
            return self._get_img(rec, site)
        twin = candidates.sample(1).iloc[0]
        return self._get_img(twin, site)

    def __getitem__(self, index):
        rec = self.df.iloc[index]
        if self.mode == 'train':
            # returns a random site and a random twin
            img = self._get_img(rec, random.choice(self.sites))
            twin = self._get_twin(rec, random.choice(self.sites))
            return img, twin, rec[self.target]
        elif self.mode == 'eval':
            # returns a random site
            img = self._get_img(rec, random.choice(self.sites))
            return img, rec[self.target]
        elif self.mode == 'test':
            # returns images of all available sites
            return [self._get_img(rec, site) for site in self.sites]
        # Previously an unknown mode silently returned None.
        raise ValueError(f"Unknown mode '{self.mode}', expected 'train', 'eval' or 'test'")

    def __len__(self):
        """ Total number of samples in the dataset """
        return len(self.df)


In [None]:
from albumentations import *
from albumentations.pytorch import ToTensor

In [None]:
# Training augmentation: random rotation, crop to `resolution`, random flips / transpose.
# NOTE(review): AverageCrop is not defined in this notebook (presumably from rcic_pytorch_utils).
train_transform = Compose([
    Rotate(45, p=0.666),
#     GaussianBlur(blur_limit=3, p=0.5),
#     RandomBrightness(limit=0.01, p=0.5),
#     GaussNoise(var_limit=(0.001, 0.01), p=0.5),
#     OneOrOther(
#         RandomSizedCrop(min_max_height=np.int32([resolution*0.99, resolution*1.02]),
#                         height=resolution, width=resolution),
#     p=0.5),
    AverageCrop(resolution, resolution),
    HorizontalFlip(p=0.5),
    VerticalFlip(p=0.5),
    Transpose(p=0.5),
    ToTensor(),
])

# Validation: deterministic centre crop only.
valid_transform = Compose([
    CenterCrop(resolution, resolution),
    ToTensor(),
])


# 'train' mode yields (image, twin, label); 'eval' mode yields (image, label).
train_dataset = ExpNormTwinDataset(df_train, path_data, target=target, img_stats=exp_stats,
                                   mode='train', transform=train_transform)
train_eval_dataset = ExpNormTwinDataset(df_valid, path_data, target=target, img_stats=exp_stats,
                                   mode='eval', transform=valid_transform)

## TTA loaders
# 'test' mode yields the uncropped images of every site; only ToTensor is applied.
valid_dataset = ExpNormTwinDataset(df_valid, path_data, target=target, img_stats=exp_stats,
                                   mode='test', transform=ToTensor())
test_dataset = ExpNormTwinDataset(df_test, path_data, target=target, img_stats=exp_stats,
                                  mode='test', transform=ToTensor())

print(len(train_dataset), len(train_eval_dataset), len(valid_dataset), len(test_dataset))
print(test_dataset[0][0].shape, train_eval_dataset[0][0].shape, train_dataset[0][0].shape)

# Size of the last dimension of an uncropped test image (second site of the first test well).
raw_image_size = test_dataset[0][1].shape[-1]
raw_image_size

In [None]:
from torch.utils.data import DataLoader

# One loader worker per CPU reported by the OS.
num_workers = os.cpu_count()
print('num_workers:', num_workers)

# Training: shuffled; drop_last=True discards the final incomplete batch.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, num_workers=num_workers, 
                          shuffle=True, drop_last=True, pin_memory=True)

# Despite its name, this loader iterates the validation rows (train_eval_dataset wraps df_valid).
eval_train_loader = DataLoader(train_eval_dataset, batch_size=eval_batch_size, num_workers=num_workers, 
                               shuffle=False, drop_last=False, pin_memory=True)

## TTA loaders
valid_loader = DataLoader(valid_dataset, batch_size=eval_batch_size, num_workers=num_workers, 
                               shuffle=False, drop_last=False, pin_memory=True)

test_loader = DataLoader(test_dataset, batch_size=eval_batch_size, num_workers=num_workers, 
                         shuffle=False, drop_last=False, pin_memory=True)

# Number of batches per loader.
print(len(train_loader), len(eval_train_loader), len(valid_loader), len(test_loader))

In [None]:
# Plot some training images
# One training batch: images, their twins and the labels.
batch, twins, targets = next(iter(train_loader))

plt.figure(figsize=(16, 8))
plt.axis("off")
plt.title("Training Images")
# First 16 images of the batch, using every second channel so the grid has 3 channels.
_ = plt.imshow( # show every second channel
    normalize_channels(
        vutils.make_grid(batch[:16,::2], padding=2, normalize=False).cpu().numpy().transpose((1, 2, 0)),
    )
)

In [None]:
# Same grid for the twins of the 16 images shown in the previous cell.
plt.figure(figsize=(16, 8))
plt.axis("off")
plt.title("Training Twins")
_ = plt.imshow( # show every second channel
    normalize_channels(
        vutils.make_grid(twins[:16,::2], padding=2, normalize=False).cpu().numpy().transpose((1, 2, 0)),
    )
)

In [None]:
# Labels of the 16 plotted images, arranged in rows of 8.
targets = targets[:16]
targets.reshape([len(targets)//8,8])

In [None]:
## mean should be around 0 and std around 1
# Per-channel statistics of the batch (reduced over batch, height and width).
{'mean':batch.mean(dim=(0,2,3)), 'std':batch.std(dim=(0,2,3))}

In [None]:
# Drop the example batch and release cached memory before building the model.
del batch
torch.cuda.empty_cache()
gc.collect()

# Load pretrained weights

In [None]:
# Number of image channels fed to the network; used by get_model to size the stem convolution.
n_channels = len(test_dataset.channels)

In [None]:
def load_n_remap(model, model_path, device):
    """Load a checkpoint into ``model``, matching parameters by position instead of by name.

    The i-th entry of the checkpoint is stored under the i-th key of ``model.state_dict()``;
    entries beyond the shorter of the two key lists are ignored. Loading is non-strict, so
    model keys that receive no value keep their current weights.

    Args:
        model: module to load the weights into (modified in place).
        model_path: path of a file saved with ``torch.save`` containing a state dict.
        device: ``map_location`` used when reading the checkpoint.

    Returns:
        The same ``model`` object.
    """
    checkpoint = torch.load(model_path, map_location=device)

    # A basic remapping is required: pair checkpoint keys with model keys in order.
    remapped_state = OrderedDict()
    for source_key, model_key in zip(checkpoint.keys(), model.state_dict().keys()):
        remapped_state[model_key] = checkpoint[source_key]

    model.load_state_dict(remapped_state, strict=False)
    return model

In [None]:
from collections import OrderedDict

## from: https://github.com/filipradenovic/cnnimageretrieval-pytorch
def gem(x, p=3, eps=1e-6):
    """Generalised-mean pooling over the two trailing dimensions of ``x``.

    Values are clamped from below at ``eps``, raised to the power ``p``, averaged over the
    whole spatial extent and the result is raised to ``1/p``.
    """
    height, width = x.size(-2), x.size(-1)
    powered = x.clamp(min=eps).pow(p)
    pooled = F.avg_pool2d(powered, (height, width))
    return pooled.pow(1./p)

class GeM(nn.Module):
    """Generalised-mean pooling layer whose exponent ``p`` is a learnable parameter."""

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.p = nn.Parameter(torch.ones(1)*p)

    def forward(self, x):
        return gem(x, p=self.p, eps=self.eps)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f'{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps})'

def get_model(model_path=None, n_classes=n_classes):
    """Build the EfficientNet-B4 backbone (width 1.4, depth 1.8) used as feature extractor.

    The stem convolution is replaced by one taking ``n_channels`` input channels, and the
    final Linear layer of the head is removed, so the model outputs pooled features.

    Args:
        model_path: optional checkpoint loaded with ``load_n_remap`` (positional key mapping).
        n_classes: output size of the classifier created by EfficientNet; that layer is
            deleted again below.

    Returns:
        The modified EfficientNet module.
    """
    model = EfficientNet(num_classes=n_classes, 
                         width_coefficient=1.4, depth_coefficient=1.8,
                         dropout_rate=dropout_rate)

    # fix n_channels: same geometry as the original stem conv, different number of input channels
    orig_stem = model.stem[0]
    model.stem[0] = nn.Conv2d(
                      in_channels=n_channels,
                      out_channels=orig_stem.out_channels,
                      kernel_size=orig_stem.kernel_size,
                      stride=orig_stem.stride,
                      padding=orig_stem.padding,
                      # nn.Conv2d expects a bool here; passing the bias Parameter itself only
                      # worked because the original stem has no bias (None).
                      bias=orig_stem.bias is not None)
#     model.stem[0].weight.data = F.interpolate(orig_stem.weight.data.unsqueeze(0),
#                                           size=[n_channels]+list(orig_stem.kernel_size),
#                                           mode='trilinear', align_corners=False)[0]

#     model.head[3] = GeM(p=1.25)
    # remove head.fc
    del model.head[6]

    if model_path is not None:
        model = load_n_remap(model, model_path, device)

    return model

In [None]:
### ArcNet from: https://github.com/pudae/kaggle-humpback/blob/master/tasks/identifier.py  
import types
import math

class ArcModule(nn.Linear):
    """ArcFace classification layer: cosine logits with an additive angular margin.

    The weight rows are L2-normalised in ``forward``; the inputs are used as given, so the
    caller is expected to pass normalised features for the outputs to be cosines.

    Args:
        in_features: size of the input feature vectors.
        out_features: number of classes.
        s: scale applied to the cosine logits.
        m: angular margin (radians) added to the angle of the true class.
    """
    # Lower bound of 1 - cos^2 before the square root: d/dx sqrt(x) is infinite at 0, which
    # turned into NaN gradients whenever a cosine was exactly +-1 after clamping.
    _SIN_EPS = 1e-6

    def __init__(self, in_features, out_features, s, m):
        super().__init__(in_features=in_features, out_features=out_features, bias=False)
        self.s = s
        self.m = m
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Above theta = pi - m, cos(theta + m) stops decreasing; a linear penalty is used instead.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, inputs, labels=None):
        """Return scaled logits; the margin is applied only when ``labels`` are given.

        Args:
            inputs: feature tensor, one row per sample.
            labels: optional integer class indices (1-D, or 2-D with one column).
                ``None`` is only allowed in eval mode.
        """
        cos_th = F.linear(inputs, F.normalize(self.weight))
        if labels is None:
            assert not self.training
            return cos_th * self.s
        cos_th = cos_th.clamp(-1, 1)

        ## add margin: cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        sin_th = torch.sqrt((1.0 - torch.pow(cos_th, 2)).clamp(min=self._SIN_EPS))
        cos_th_m = cos_th * self.cos_m - sin_th * self.sin_m
        # Where theta + m would exceed pi, fall back to the linear penalty. (The former
        # in-place "truncate margin" step repeated exactly this selection and was removed.)
        cos_th_m = torch.where(cos_th > self.th, cos_th_m, cos_th - self.mm)

        ## add margin only to correct class
        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        onehot = torch.zeros_like(cos_th)
        onehot.scatter_(1, labels, 1)
        outputs = onehot * cos_th_m + (1.0 - onehot) * cos_th
        outputs = outputs * self.s
        return outputs


class ArcNet(nn.Module):
    """Backbone + normalised embedding + ArcFace classifier, trained on image/twin pairs.

    Called without labels the model returns the scaled cosine logits of ``images``; called
    with labels it returns the training loss of ``images`` and ``twins``.

    Args:
        criterion: classification loss applied to the ArcFace logits.
        backbone: feature extractor producing ``in_features`` values per image.
        in_features: size of the backbone output.
        num_classes: number of output classes (must be even).
        channel_size: size of the embedding fed to the ArcFace layer.
        s: ArcFace scale.
        m: ArcFace margin.
    """

    def __init__(self, criterion, backbone, in_features, num_classes, channel_size, s=65, m=margin):
        super().__init__()
        assert num_classes % 2 == 0
        self.num_classes = num_classes
        self.criterion = criterion
        self.backbone = backbone
        embedding_layers = [
            nn.Linear(in_features, channel_size, bias=False),
            nn.BatchNorm1d(channel_size, momentum=0.01),
        ]
        self.feat = nn.Sequential(*embedding_layers)
        self.arc = ArcModule(channel_size, num_classes, s=s, m=m)

    def features(self, images, labels=None):
        """L2-normalised embeddings of ``images`` (``labels`` is accepted but not used)."""
        embeddings = self.feat(self.backbone(images))
        return F.normalize(embeddings, dim=-1)

    def forward(self, images, twins=None, labels=None):
        if labels is not None:
            return self.loss(images, twins, labels)
        return self.arc(self.features(images))

    # add additional cost to separate features according to their siRNA (not part of the original ArcNet)
    def loss(self, images, twins, labels, cos_w=3):
        """Pairwise cosine cost between images and twins plus the mean ArcFace loss of both.

        Reads the notebook-level ``n_targets`` to reduce the labels to siRNA ids.
        """
        emb_img = self.features(images)
        emb_twin = self.features(twins)

        # Sign matrix: +1 where two samples share the siRNA (label modulo n_targets), else -1.
        sirna = (labels % n_targets)
        batch = len(sirna)
        sirna_grid = sirna.expand(size=(batch, batch))
        pair_sign = 2*(sirna_grid == sirna_grid.t()).type_as(emb_img) - 1

        # Squared cost focuses on large errors: pulls same-siRNA pairs together, pushes others apart.
        pair_cost = (1 - (torch.mm(emb_img, emb_twin.t()) * pair_sign))**2
        pair_loss = cos_w * (torch.mean(pair_cost))

        logits_img = self.arc(emb_img, labels)
        logits_twin = self.arc(emb_twin, labels)
        return pair_loss + 0.5 * (self.criterion(logits_img, labels) + self.criterion(logits_twin, labels))

In [None]:
# With remove_head the checkpoint is loaded into the backbone only, leaving the ArcNet head
# (feat + arc) freshly initialised; otherwise the checkpoint is loaded into the whole ArcNet.
backbone = get_model(model_path if remove_head else None, n_classes)
model = ArcNet(criterion=FocalLoss(gamma=focal_gamma, alpha=class_weights),
               backbone=backbone,
               in_features=backbone.head[0].out_channels,  # output channels of the head's 1x1 conv
               num_classes=n_classes,
               channel_size=channel_size)
if not remove_head:
    # Keys are matched by position, so the checkpoint must come from an identically built ArcNet.
    model = load_n_remap(model, model_path, device)

In [None]:
# batch = next(iter(train_loader))
# with torch.no_grad():
#     print(model.to(device).loss(batch[0].to(device), batch[1].to(device), batch[2].to(device)))
# del batch

In [None]:
# Visual check of the stem convolution weights (plot_first_kernels is not defined in this notebook).
plot_first_kernels(model.backbone.stem[0].weight)

In [None]:
# check norms of every parameter outside the MBConv blocks (stem, head, feat and arc)
plot_norms([(n,p) for n,p in model.named_parameters() if 'blocks' not in n])

In [None]:
# Display the module tree.
model

We will finetune the model on GPU with AMP fp32/fp16 using nvidia/apex package.

In [None]:
if device == "cuda":
    assert torch.cuda.is_available()
    assert torch.backends.cudnn.enabled, "NVIDIA/Apex:Amp requires cudnn backend to be enabled."
    # Let cuDNN benchmark and pick convolution algorithms (input sizes are fixed during training).
    torch.backends.cudnn.benchmark = True

model = model.to(device)

In [None]:
# Check cosine distance between features
# Uses one training batch; skipped in debug mode and when not training.
if training and not debug: cosine_distance_heatmap(model, next(iter(train_loader)))

torch.cuda.empty_cache()
gc.collect()

# Training
Let's setup focal loss as criterion and SGD as optimizer.

We will split model parameters into 3 groups: 

    1) feature extractor (pretrained weights)
    2) ArcNet normalised features
    3) classifier

and define different learning rates for these groups (via learning rate scheduler).

In [None]:
from itertools import chain

import torch.optim as optim
import torch.nn.functional as F

# Start from the (100x smaller) warm-up learning rate.
lr = warmup_learning_rate

## Only train layers after the last dimension reduction to save memory + time
# train_to is a negative index: only the last |train_to| MBConv blocks are optimised.
# The resulting iterator is consumed once, by the optimizer below.
if train_to:
    training_blocks = model.backbone.blocks[train_to:].parameters()
else:
    training_blocks = chain(model.backbone.stem.parameters(), model.backbone.blocks.parameters())

## freeze first layers prior to the warm-up run
params2freeze = None
if freeze_at_1st_n_last:
    # freeze the stem and every block
    params2freeze = chain(model.backbone.stem.named_parameters(), model.backbone.blocks.named_parameters())
elif train_to:
    # freeze the stem and the blocks that are not part of training_blocks
    params2freeze = chain(model.backbone.stem.named_parameters(), model.backbone.blocks[:train_to].named_parameters())

if params2freeze is not None:
    for name, param in params2freeze:
        print(f'"{name}" is frozen')
        param.requires_grad = False


# Three parameter groups with increasing learning rates:
# pretrained blocks (lr * 0.01) < embedding Linear + backbone head (lr * 0.2) < BatchNorm1d + ArcFace layer (lr).
# NOTE(review): weight_decay is the literal 1e-6 here; the `weight_decay` setting of the
# config cell (2e-4) is not used in this cell - confirm that this is intended.
optimizer = optim.SGD([
    {   "params": training_blocks,
        "lr": lr * 0.01,
    },
    {   "params": chain(model.feat[0].parameters(), model.backbone.head.parameters()),
        "lr": lr * 0.2,
    },
    {   "params": chain(model.feat[1].parameters(), model.arc.parameters()),
        "lr": lr
    }], 
    momentum=0.9, weight_decay=1e-6, nesterov=True)

# if not use_amp and head_run > 1: optimizer = Lookahead(optimizer)

In [None]:
# Same condition as the cell that imports `amp`, so `amp` is defined whenever this runs.
if use_amp and training:
    # Initialize Amp
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", num_losses=1)

Next, let's define a single iteration function `update_fn`. This function is then used by `ignite.engine.Engine` to update model while running over the input data.

In [None]:
from ignite.utils import convert_tensor


def update_fn(engine, batch):
    """Run one optimisation step on a single batch.

    Args:
        engine: the ignite engine driving the loop (unused here; may be None).
        batch: indexable whose items 0 and 1 are the two model inputs and item 2 the targets.

    Returns:
        dict with a single key "batchloss" holding the loss of this batch as a Python float.
    """
    model.train()

    # Move the three batch components to the training device.
    inputs0, inputs1, targets = (
        convert_tensor(batch[pos], device=device, non_blocking=True) for pos in range(3)
    )

    # Forward pass: the model is called with inputs and targets and returns the loss directly.
    batch_loss = model(inputs0, inputs1, targets)

    optimizer.zero_grad()
    if not use_amp:
        batch_loss.backward()
    else:
        # With AMP the loss is scaled before back-propagation.
        with amp.scale_loss(batch_loss, optimizer, loss_id=0) as scaled:
            scaled.backward()
    optimizer.step()

    return dict(batchloss=batch_loss.item())

Let's check `update_fn` and warm up the optimizer momentum

In [None]:
# Number of warm-up iterations: enough batches to cover about 3.5 samples per target, plus one.
warmup_iters = 1 + (3.5 * n_targets) // train_batch_size
# if hasattr(optimizer, 'k'): warmup_iters -= (warmup_iters % optimizer.k)  ## complete look ahead iteration

print(int(warmup_iters))
torch.cuda.empty_cache()
gc.collect()

if training:
    # In debug mode only two iterations are run, just to exercise `update_fn`.
    n_warmup_steps = 2 if debug else int(warmup_iters)
    loader = iter(train_loader)
    loss = []
    pbar = tqdm(range(n_warmup_steps))
    for i in pbar:
        res = update_fn(engine=None, batch=next(loader))
        loss.append(res['batchloss'])
        pbar.set_description(f"loss:{loss[-1]:.4f}"); pbar.refresh()

    if device == 'cuda':
        print('max_memory_allocated:', torch.cuda.max_memory_allocated())
    # Release cached GPU memory before the real training run.
    torch.cuda.empty_cache()
    gc.collect()
    print('mean loss:', np.mean(loss))

Now let's define a trainer and add some practical handlers:
- log to tensorboard: losses, metrics, lr
- progress bar
- models/optimizers checkpointing

In [None]:
from ignite.engine import Engine, Events, create_supervised_evaluator
from ignite.metrics import RunningAverage, Accuracy, Precision, Recall, Loss, TopKCategoricalAccuracy

from ignite.contrib.handlers import TensorboardLogger
from ignite.contrib.handlers.tensorboard_logger import OutputHandler, OptimizerParamsHandler

In [None]:
# Training engine: `update_fn(engine, batch)` is the per-batch processing function.
trainer = Engine(update_fn)

def siRNAaccuracyMax1108(pred, y):
    """Accuracy computed only over the classes present in the validation and test sets.

    The class axis of `pred` is folded into (n_classes // n_targets, n_targets) and reduced
    with a max over the first of those axes; the prediction is the argmax over the first
    1108 targets. Labels are mapped to the same range with `y % n_targets`.

    Args:
        pred: model outputs, reshaped here to (batch, n_classes // n_targets, n_targets).
        y: integer labels, one per row of `pred`.

    Returns:
        0-d numpy array holding the fraction of correct predictions in the batch.
    """
    batch = pred.shape[0]
    groups = n_classes // n_targets
    per_target = pred.reshape([batch, groups, n_targets]).max(dim=1).values
    predicted = per_target[..., :1108].argmax(dim=-1)
    expected = y % n_targets
    assert predicted.shape == expected.shape
    hits = (predicted == expected).float()
    return hits.mean().cpu().numpy()

# Metrics computed by the evaluator on every evaluation run.
metrics = {
#     'Loss': Loss(criterion),
    # 'Score' wraps the custom accuracy function defined earlier in this cell in ignite's `Loss` metric
    'Score': Loss(siRNAaccuracyMax1108),
    'Accuracy': Accuracy(),
#     'Precision': Precision(average=True),
#     'Recall': Recall(average=True),
    'Top5Accuracy': TopKCategoricalAccuracy(k=5),
}

# Evaluation engine running `model` on `device` with the metrics above.
evaluator = create_supervised_evaluator(model, metrics=metrics, device=device, non_blocking=True)

# `Metrics` is not defined in this notebook (it comes from the utility script). It receives the evaluator,
# `eval_train_loader` and a transform that extracts 'batchloss' from the trainer output.
# NOTE(review): presumably it records the loss history and triggers evaluation — confirm in rcic_pytorch_utils.
history = Metrics(evaluator, eval_train_loader, output_transform=lambda out: out['batchloss'], interactive=is_interactive())
history.attach(trainer, "batchloss")

In [None]:
from datetime import datetime

# Directory for the TensorBoard logs; the checkpoint handler created later also writes into it.
log_path = f"./log"
tb_logger = TensorboardLogger(log_dir=log_path)

# Log the trainer's 'batchloss' after every iteration under the 'training' tag.
tb_logger.attach(trainer, 
                 log_handler=OutputHandler('training', ['batchloss', ]), 
                 event_name=Events.ITERATION_COMPLETED)

Let's set up learning rate scheduling:

In [None]:
from ignite.contrib.handlers import CosineAnnealingScheduler, LinearCyclicalScheduler, ParamGroupScheduler

# Number of epochs simulated in the plot below (also used as the default number of training epochs).
sim_epochs = 2
epoch_size = len(train_loader)
cycle_mult = 1
# Shared cosine-annealing settings. LR and momentum cycles span two epochs,
# the weight-decay cycle spans `sim_epochs` epochs.
lr_sched_params = {'param_name':'lr', 'cycle_size':epoch_size*2, 'cycle_mult':cycle_mult,
                   'start_value_mult':0.7, 'end_value_mult':0.5}
mom_sched_params = {'param_name':'momentum', 'cycle_size':epoch_size*2, 'cycle_mult':cycle_mult,
                    'start_value':0.82, 'end_value':0.92}
wd_sched_params = {'param_name':'weight_decay', 'cycle_size':epoch_size*sim_epochs, 'cycle_mult':cycle_mult,
                   'start_value':weight_decay/10, 'end_value':weight_decay}

# One LR schedule per optimizer param group (0: body, 1: features, 2: head).
# End values shrink with `head_run`, so later runs anneal to lower learning rates.
lr = learning_rate
body_sched_params = {**lr_sched_params, 'start_value':lr/10, 'end_value':lr/(100*2**head_run)}
body_sched = CosineAnnealingScheduler(optimizer.param_groups[0], **body_sched_params)
feat_sched_params = {**lr_sched_params, 'start_value':lr/5, 'end_value':lr/(50*2**head_run)}
feat_sched = CosineAnnealingScheduler(optimizer.param_groups[1], **feat_sched_params)
head_sched_params = {**lr_sched_params, 'start_value':lr, 'end_value':lr/(20*2**head_run)}
# Fix: the head group must use its own parameters. It was previously built from `feat_sched_params`,
# so the head trained with the feature-group schedule while the plot below showed the intended one.
head_sched = CosineAnnealingScheduler(optimizer.param_groups[2], **head_sched_params)

# Momentum and weight decay are scheduled on the whole optimizer (all param groups).
mom_sched = CosineAnnealingScheduler(optimizer, **mom_sched_params)
wd_sched  = CosineAnnealingScheduler(optimizer, **wd_sched_params)
schedulers = [body_sched, feat_sched, head_sched, mom_sched, wd_sched]
names = ["lr (body)", "lr (feat)", "lr (head)", "momentum", "wd"]

# Simulate every schedule over `sim_epochs` epochs; each result is an array of [event_index, value] rows.
lr_values0 = np.array(body_sched.simulate_values(num_events=epoch_size*sim_epochs, **body_sched_params))
lr_values1 = np.array(feat_sched.simulate_values(num_events=epoch_size*sim_epochs, **feat_sched_params))
lr_values2 = np.array(head_sched.simulate_values(num_events=epoch_size*sim_epochs, **head_sched_params))
wd_values  = np.array(  wd_sched.simulate_values(num_events=epoch_size*sim_epochs, **wd_sched_params))
mom_values = np.array(mom_sched.simulate_values(num_events=epoch_size*sim_epochs, **mom_sched_params))

# Learning rates and weight decay share the left (log-scale) axis; momentum uses the right axis.
fig = plt.figure(figsize=(16, 4))
ax = plt.subplot()
plt.title(f"Cosine annealing with start_value_mult={lr_sched_params['start_value_mult']}")
ax.plot(lr_values0[:, 0], lr_values0[:, 1], label=names[0])
ax.plot(lr_values1[:, 0], lr_values1[:, 1], label=names[1])
ax.plot(lr_values2[:, 0], lr_values2[:, 1], label=names[2])
ax.plot( wd_values[:, 0],  wd_values[:, 1], label=names[4])
ax.set_yscale('log')
ax.set_xlabel('Batches processed')
ax.set_ylabel("learning rate")
ax.legend(frameon=False, loc='upper right')
ax2 = ax.twinx()
ax2.plot(mom_values[:, 0], mom_values[:, 1], label=names[3])
ax2.set_ylabel("momentum")
ax2.legend(frameon=False, loc='lower right')
# fig.tight_layout()
_ = ax.plot()

In [None]:
# Bundle the five schedulers (three learning rates, momentum, weight decay) into one handler.
scheduler = ParamGroupScheduler(schedulers=schedulers, names=names)

# Attach single scheduler to the trainer
# (it is invoked at the start of every iteration)
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

In [None]:
# Log optimizer parameters
# (the optimizer's "lr" is written to TensorBoard at the start of every epoch)
tb_logger.attach(trainer,
                 log_handler=OptimizerParamsHandler(optimizer, "lr"), 
                 event_name=Events.EPOCH_STARTED)

In [None]:
from ignite.contrib.handlers import ProgressBar

# Iteration-wise progress bar
# (kept in `pbar` because other handlers call `pbar.log_message` on it)
pbar = ProgressBar(bar_format="")
pbar.attach(trainer, metric_names=['batchloss',])

# Epoch-wise progress bar with display of training losses
# (persistent bar that advances on EPOCH_STARTED and closes when the run completes)
ProgressBar(persist=True, bar_format="").attach(trainer,
                                                event_name=Events.EPOCH_STARTED,
                                                closing_event_name=Events.COMPLETED)

In [None]:
# Log validation metrics:
# every metric in `metrics` is written under the "test" tag when the evaluator finishes an epoch;
# `another_engine=trainer` ties the logged values to the trainer's progress rather than the evaluator's.
tb_logger.attach(evaluator,
                 log_handler=OutputHandler(tag="test",
                                           metric_names=list(metrics.keys()),
                                           another_engine=trainer),
                 event_name=Events.EPOCH_COMPLETED)

Now let's set up logging and best-model checkpointing:

In [None]:
import logging

# Setup engine &  logger
def setup_logger(logger):
    """Configure `logger` to emit INFO-level records through a new stream handler.

    Each call adds one more `logging.StreamHandler` using the format
    "<time> <logger name> <level> <message>"; nothing is returned.
    """
    logger.setLevel(logging.INFO)
    stream = logging.StreamHandler()
    stream.setFormatter(
        logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s")
    )
    logger.addHandler(stream)

In [None]:
from ignite.handlers import ModelCheckpoint, EarlyStopping, TerminateOnNan

# After each iteration, let ignite's TerminateOnNan handler inspect the output and stop training on NaN/inf.
trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan())

# Store the best model
def default_score_fn(engine):
    """Return the 'Score' metric stored in the engine's state (used to rank checkpoints)."""
    return engine.state.metrics['Score']

# Checkpoint handler: keeps up to 10 files in `log_path`, prefixed "best" and ranked by the 'Score' metric.
# require_empty=False allows writing into a directory that already contains files.
best_model_handler = ModelCheckpoint(dirname=log_path,
                                     filename_prefix="best",
                                     n_saved=10,
                                     score_name="Score",
                                     score_function=default_score_fn,
                                     require_empty=False)
# Run the handler on `model` each time an evaluation run completes.
evaluator.add_event_handler(Events.COMPLETED, best_model_handler, {'model': model, })

# Clear cuda cache between training/testing
@trainer.on(Events.EPOCH_COMPLETED)
@evaluator.on(Events.COMPLETED)
def empty_cuda_cache(engine):
    """Free cached GPU memory and run the garbage collector.

    Registered on both engines: after every training epoch and after every evaluation run.
    In debug mode on CUDA it first prints the peak allocated memory.
    """
    if debug and device == 'cuda': print('max_memory_allocated:', torch.cuda.max_memory_allocated())
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# @trainer.on(Events.ITERATION_STARTED)
# def apply_weight_decay(engine):
#     with torch.no_grad():
#         for mod in model.modules():
#             if not isinstance(mod, nn.modules.batchnorm._BatchNorm):
#                 for p in mod.parameters():
#                     p.data -= p.data * weight_decay * lr

In [None]:
NUM_EPOCHS = (4 if debug else sim_epochs)

if freeze_at_1st_n_last:
    @trainer.on(Events.EPOCH_STARTED)
    def turn_on_layers(engine):
        """Freeze / un-freeze layers depending on the epoch that is about to start.

        * first and last epoch: only parameters outside the backbone, or with 'head' in
          their name, are trainable; everything else is frozen.
        * epoch 2: parameters in `training_blocks` that are still frozen are re-enabled.
        * any other epoch: nothing changes.
        """
        epoch = engine.state.epoch
        if epoch in (1, NUM_EPOCHS):
            ## first and last epoch
            for name, param in model.named_parameters():
                trainable = ('backbone' not in name) or ('head' in name)
                if trainable:
                    pbar.log_message(f'training "{name}"')
                param.requires_grad = trainable
        elif epoch == 2:
            # only re-enable blocks, as head is already training
            # NOTE(review): `training_blocks` is also passed to `optim.SGD` in an earlier cell; if it is
            # a one-shot iterator it is already exhausted by now and this loop does nothing — confirm.
            count = 0
            for param in training_blocks:
                if not param.requires_grad:
                    param.requires_grad = True
                    count += 1
            pbar.log_message(f"Epoch {epoch}: training all layers ({count})")

In [None]:
# Main training run: iterate over `train_loader` for NUM_EPOCHS epochs with all attached handlers.
if training:
    state = trainer.run(train_loader, max_epochs=NUM_EPOCHS)

In [None]:
# Visual check of the weights of the first stem layer after training (plot helper from the utility script).
if training: plot_first_kernels(model.backbone.stem[0].weight)

In [None]:
# check norms
# (only parameters whose name does not contain 'blocks' are passed to the plot helper)
if training: plot_norms([(n,p) for n,p in model.named_parameters() if 'blocks' not in n])

In [None]:
# check gradients (last batch)
# (only parameters whose name does not contain 'blocks' are passed to the plot helper)
if training: plot_grad_flow([(n,p) for n,p in model.named_parameters() if 'blocks' not in n])

Results on the validation set:

In [None]:
# Plot the recorded history, save the figure, and show the metrics of the last evaluation run.
if training: 
    history.plot(len(train_loader))
    plt.savefig(f'loss_history{evaluator.state.epoch}_fold{fold}.png')
    # A bare expression nested inside `if` is not echoed by the notebook (only a trailing
    # top-level expression is), so the metrics are printed explicitly.
    print(evaluator.state.metrics)

# Inference

Let's load the best model and recompute the evaluation metrics on the test dataset, using a very basic test-time augmentation (TTA) to boost performance.

In [None]:
# Alias, not a copy: loading a checkpoint into `best_model` below also changes `model`.
best_model = model

In [None]:
# Find the last checkpoint
!ls {log_path}
checkpoints = next(os.walk(log_path))[2]
checkpoints = sorted(filter(lambda f: f.endswith(".pth"), checkpoints))
scores = [c.split('=')[-1][:-4] for c in checkpoints]
best_epoch = 0
if len(scores) > 0:
    best_epoch = np.argmax(scores)
    print(best_epoch, scores)
    if not checkpoints:
        print('No weight files in {}'.format(log_path))
    else:
        model_path = f'fold{fold}h{head_num}r{head_run}_{best_epoch}_{scores[best_epoch]}.pth'
        !cp {os.path.join(log_path, checkpoints[best_epoch])} {model_path}
        best_model.load_state_dict(torch.load(model_path))

print(model_path)

if not is_interactive():
    !rm {log_path}/*.pth

In [None]:
# Move the model to the inference device and switch it to evaluation mode.
best_model = best_model.to(device).eval()

In [None]:
# Draw the cosine-distance heatmap (helper from the utility script) for one training batch and save the figure.
# `best_epoch` in the file name is the checkpoint index chosen in the previous cell.
cosine_distance_heatmap(best_model, next(iter(train_loader)))
_ = plt.savefig(f'cosine_distance_heatmap{best_epoch}_fold{fold}.png')

In [None]:
# Plot some test images
# (`batch` is reused by the next cell, which classifies its first sample)
batch = next(iter(test_loader))

plt.figure(figsize=(16, 8))
plt.axis("off")
plt.title("Test Images")
# Grid of the first 16 items of batch[0], keeping every second index along the following axis;
# the grid is moved to channels-last for imshow.
# NOTE(review): presumably that second axis is the image channel — confirm against the dataset.
_ = plt.imshow( normalize_channels(
    vutils.make_grid(batch[0][:16,::2], padding=2, normalize=False).cpu().numpy().transpose((1, 2, 0))
) )

In [None]:
# Classify the first sample of the batch plotted above
with torch.no_grad():
    y_pred = best_model(batch[0][:1].to(device))

# Print the ten highest-scoring class indices with their softmax probability (in percent)
probs = torch.softmax(y_pred, dim=-1)[0]
top10 = torch.topk(y_pred, k=10).indices.squeeze(0).tolist()
for idx in top10:
    prob = probs[idx].item()
    print('{label:<75} ({p:.8f}%)'.format(label=str(idx), p=prob*100))

# The batch is no longer needed
del batch

#### Create submission and OOF predictions

In [None]:
TTA = 9  # number of augmented views evaluated per site
def tta(sites):
    """Average the model's softmax predictions over all sites and TTA views.

    Each element of `sites` is moved to `device` and passed through `tta9crop` for view
    indices 0..TTA-1 (helper defined outside this notebook). The class axis of every output
    is folded into (n_classes // n_targets, n_targets) and reduced with a max; the first
    1108 targets are then soft-maxed and averaged across views.

    Returns:
        CPU tensor of averaged probabilities with 1108 columns, one row per sample.
    """
    logits = torch.stack([
        best_model(tta9crop(site_dev, view, resolution))
        for site_dev in (site.to(device) for site in sites)
        for view in range(TTA)
    ])
    n_views, batch = logits.shape[0], logits.shape[1]
    per_target = logits.reshape([n_views, batch, n_classes // n_targets, n_targets]).max(dim=-2).values
    # softmax per view, then mean over views
    probs = per_target[..., :1108].softmax(dim=-1)
    return probs.mean(dim=0).cpu()

In [None]:
from ignite.contrib.handlers import ProgressBar

def inference_update_with_tta(engine, batch):
    """Ignite process function: predict one batch with TTA and accumulate the results.

    Appends to two module-level accumulators, which must be initialised before the run:
      * `allpreds`: negative log-probabilities (float16), one row per sample;
      * `preds`: predicted class index per sample.
    Returns nothing.
    """
    global preds, allpreds
    with torch.no_grad():
        pred = tta(batch)
        neg_log = (-np.log(pred.numpy())).astype(np.float16)
        allpreds = np.concatenate([allpreds, neg_log])
        preds.extend((torch.argmax(pred, dim=-1) % n_targets).tolist())

In [None]:
# Out-of-fold predictions: run TTA inference over the validation loader.
if create_sub:
    # Reset the global accumulators filled by `inference_update_with_tta`.
    allpreds = np.empty([0,1108])
    preds = []

    inferencer = Engine(inference_update_with_tta)
    ProgressBar(desc="Inference").attach(inferencer)

    result_state = inferencer.run(valid_loader, max_epochs=1)

In [None]:
# Validation accuracy from the raw argmax predictions, overall and per cell type / experiment.
if create_sub:
    df_valid['pred'] = preds
    df_valid['acc']  = (df_valid.pred == df_valid.sirna).astype(int)

    print('validation accuracy:', df_valid.acc.mean())
    _ = df_valid.groupby('cell_type').acc.mean().plot(kind='bar'); plt.show()
    _ = df_valid.groupby('experiment').acc.mean().plot(kind='bar'); plt.show()

In [None]:
# Save the validation negative log-probabilities, indexed by id_code, as a compressed pickle.
if create_sub:
    pd.DataFrame(allpreds, index=df_valid.id_code).to_pickle(f'df_valid_fold{fold}_log.pkl.gz')
    print(glob.glob('*.gz'))

In [None]:
# Same TTA inference as for validation, now over the test loader.
if create_sub:
    # Reset the global accumulators filled by `inference_update_with_tta`.
    allpreds = np.empty([0,1108])
    preds = []

    inferencer = Engine(inference_update_with_tta)
    ProgressBar(desc="Inference").attach(inferencer)

    result_state = inferencer.run(test_loader, max_epochs=1)

    # Save the test negative log-probabilities, indexed by id_code.
    pd.DataFrame(allpreds, index=df_test.id_code).to_pickle('df_test_log.pkl.gz')
    print(glob.glob('*.gz'))

In [None]:
# Reload the stored negative log-probabilities and turn them back into probabilities with exp(-x);
# NaNs are replaced by 0.
if create_sub:
    test_y = np.exp(-pd.read_pickle('df_test_log.pkl.gz').astype(np.float64)).fillna(0)
    valid_y = np.exp(-pd.read_pickle(f'df_valid_fold{fold}_log.pkl.gz').astype(np.float64)).fillna(0)
else: # recover last model
    # No new predictions were created in this run: read the files from the first matching input dataset.
    test_y = np.exp(-pd.read_pickle(glob.glob(f'../input/*/df_test_log.pkl.gz')[0]).astype(np.float64)).fillna(0)
    valid_y = np.exp(-pd.read_pickle(glob.glob(f'../input/*/df_valid_fold{fold}_log.pkl.gz')[0]).astype(np.float64)).fillna(0)
    
# normalise across siRNA's (each siRNA is equally likely to appear)
# i.e. every column is divided by its sum over all rows
test_y = test_y / test_y.sum(axis=0)
valid_y = valid_y / valid_y.sum(axis=0)

# Validation accuracy after the column normalisation.
df_valid['pred'] = valid_y.values.argmax(axis=1)
df_valid['acc']  = (df_valid.pred == df_valid.sirna).astype(int)

print('validation accuracy (norm):', df_valid.acc.mean())
# _ = df_valid.groupby('cell_type').acc.mean().plot(kind='bar'); plt.show()
# _ = df_valid.groupby('experiment').acc.mean().plot(kind='bar'); plt.show()

In [None]:
## remove siRNA in the wrong plate and normalise again (both horizontally and vertically)
## (`remove_leaked_sirna` and `normalize_both_ways` are helpers defined outside this notebook)
preds = remove_leaked_sirna(df_valid.set_index('id_code'), valid_y.copy())

# Apply the normalisation up to 10 times and remember the iteration with the best validation score.
best_score = -1
bestpreds = None
scores = []
for k in tqdm(range(10)):
    # each pass feeds the previous pass's output back in
    preds, newscore, sirna_preds = normalize_both_ways(preds, df_valid.sirna.values)
    print(f'Newscore:{newscore} Iter:{k}')
    scores.append(newscore)
    if newscore > best_score:
        best_score = newscore
        best_iter = k  # reused later to repeat the same number of passes on the test predictions
        bestpreds = sirna_preds
#     elif device == 'cuda': break  ## stop to save GPU $$$

In [None]:
# Validation score after each normalisation pass.
plt.plot(range(len(scores)), scores)
plt.ylabel("score")
plt.xlabel("iteration")

# Trailing expression: displayed as the cell output.
best_score, best_iter

In [None]:
# Validation accuracy using the predictions from the best normalisation pass.
df_valid['pred'] = bestpreds
df_valid['acc'] = (df_valid.pred == df_valid.sirna).astype(int)

print('validation accuracy (norm):', df_valid.acc.mean())
_ = df_valid.groupby('cell_type').acc.mean().plot(kind='bar'); plt.show()
_ = df_valid.groupby('experiment').acc.mean().plot(kind='bar'); plt.show()

In [None]:
# Apply the same post-processing to the test predictions.
test_preds = remove_leaked_sirna(df_test.set_index('id_code'), test_y.copy())

# Repeat the normalisation for as many passes as scored best on validation (no labels are passed here).
for i in tqdm(range(best_iter+1)):
    test_preds, _, _ = normalize_both_ways(test_preds)

## assign the most likely unique treatment to each well
## (note: this rebinds `preds`, which the submission cell reads)
preds = assign_sirna(test_preds)

Finally, the submission csv:

In [None]:
# Write the submission file: id codes from the sample submission, siRNA predictions from `preds`.
if create_sub:
    submission = pd.DataFrame({'id_code': pd.read_csv(path_data+'/sample_submission.csv').id_code.values,
                               'sirna':   np.squeeze(preds).astype(int)})
    submission.to_csv('submission.csv', index=False)
    # Histogram of the predicted siRNA values as a quick sanity check.
    submission.hist(bins=111)
    # A bare expression nested inside `if` is not echoed by the notebook, so print the preview explicitly.
    print(submission.head())

In [None]:
# Outside interactive sessions, delete the `apex` folder and everything under /tmp, then list what remains.
if not is_interactive():
    # clean up folders
    !rm -rf apex /tmp/*
    !ls -lh *