Based on https://www.kaggle.com/dragonzhang/fastai-cell-tile-prototyping-training — thanks for the great kernel!

In [None]:
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/

In [None]:
import sys
# Add the EfficientNet-PyTorch source directory from the input folder to the
# module search path so it can be imported from there.
package_path = '../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master'
sys.path.append(package_path)



In [None]:
import pandas as pd
import numpy as np
from fastai.vision.all import *
import pickle
import os

In [None]:
# Root directory of the cell-tile dataset; the CSV path and the image paths
# used further down are built relative to it.
path = Path('../input/hpa-cell-tiles-sample-balanced-dataset')

In [None]:
# Load cell_df.csv; later cells read its image_id, cell_id and image_labels columns.
df = pd.read_csv(path/'cell_df.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Number of rows in the full table.
len(df)

In [None]:
# One label name per class index 0..18, kept as strings because they are
# compared against the '|'-separated tokens of `image_labels`.
labels = [str(i) for i in range(19)]

# Split each row's label string once and reuse the result for every class;
# the split does not depend on which class is being tested.
label_tokens = df['image_labels'].apply(lambda r: r.split('|'))

# Add one 0/1 indicator column per class: 1 when the class appears among the
# row's labels, 0 otherwise.
for x in labels:
    df[x] = label_tokens.apply(lambda tokens: int(x in tokens))

## Set `frac=1` below to run on the whole training sample (use a smaller value, e.g. `frac=0.1`, for a quick run)

In [None]:
# Fraction of `df` to keep: frac=1 keeps every row (in shuffled order); a
# smaller value such as frac=0.1 gives a quicker run, e.g. when trying
# EfficientNet-B5. random_state pins the shuffle, and reset_index(drop=True)
# renumbers the rows 0..n-1.
dfs = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

len(dfs)

In [None]:
# Per-label frequency table for the sampled cells.
#   full_count   - rows whose label set contains the label
#   unique_count - rows whose label string is exactly that single label

# Split every label string once instead of once per label.
label_sets = [set(row_label.split('|')) for row_label in dfs['image_labels']]

unique_counts = {lbl: int((dfs['image_labels'] == lbl).sum()) for lbl in labels}
full_counts = {lbl: sum(lbl in present for present in label_sets) for lbl in labels}

# Sort by full_count, largest first (ties keep the order of `labels`).
# The rows go straight into the DataFrame: routing them through np.array
# would coerce the mixed str/int tuples to strings and make the count
# columns text instead of integers.
count_rows = sorted(
    ((lbl, full_counts[lbl], unique_counts[lbl]) for lbl in labels),
    key=lambda row: -row[1],
)
counts = pd.DataFrame(count_rows, columns=['label', 'full_count', 'unique_count'])
counts.set_index('label').T


In [None]:
# Row count of the sampled table `dfs`.
len(dfs)

## Using multilabel stratification for the train-validation split.

There is some leakage in the code below: cells from the same image can land in both the training and the validation split, whereas they should all be kept in the same split. However, when I fixed that, I got a lower score... coincidence?

In [None]:
# Number of folds and the seed that makes the fold assignment reproducible.
nfold = 5
seed = 42

# Stratification targets: the per-class 0/1 indicator columns, one row per cell.
y = dfs[labels].values
X = dfs[['image_id', 'cell_id']].values

# -1 marks "not assigned yet"; every row receives a real fold index below.
dfs['fold'] = -1
# Look the column up by name. `fold` is only the last column the first time
# this cell runs; once further columns have been appended, a positional -1
# would write the fold ids into a different column on re-run.
fold_col = dfs.columns.get_loc('fold')

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# shuffle=True is needed for random_state to be accepted: scikit-learn's
# k-fold base class rejects a random_state when shuffle is False.
# NOTE: the split is made per row (per cell), so cells of the same image can
# end up in different folds.
mskf = MultilabelStratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
for i, (_, test_index) in enumerate(mskf.split(X, y)):
    dfs.iloc[test_index, fold_col] = i

assert (dfs['fold'] >= 0).all(), "every row must be assigned to a fold"
dfs['fold'] = dfs['fold'].astype('int')

In [None]:
# Fold 0 is flagged as the validation set; every other fold stays False.
# The boolean mask is assigned directly instead of via chained indexing
# (`dfs['is_valid'][mask] = True`), which pandas warns about and which does
# not modify `dfs` when copy-on-write is enabled.
dfs['is_valid'] = dfs['fold'] == 0

In [None]:
# How many rows carry is_valid=False vs. is_valid=True.
dfs.is_valid.value_counts()

In [None]:
def get_x(r):
    """Return the image path for row `r`.

    The file lives under `path/'cells'` and is named
    `<image_id>_<cell_id>.jpg`.
    """
    file_name = r['image_id'] + '_' + str(r['cell_id']) + '.jpg'
    return path / 'cells' / file_name
# Visual check: open and display the image that get_x resolves for row 12.
img = PILImage.create(get_x(dfs.loc[12]))
img.show();

In [None]:
def get_y(r):
    """Return the labels of row `r` as a list of strings.

    `r['image_labels']` holds the labels joined by '|'.
    """
    joined = r['image_labels']
    return joined.split('|')
# Example: the label list produced for row 12.
get_y(dfs.loc[12])

In [None]:
# Two three-value lists that are unpacked into Normalize.from_stats when the
# batch transforms are built — presumably per-channel means followed by
# per-channel standard deviations (confirm against how they were computed).
_stats_first = [0.07237246, 0.04476176, 0.07661699]
_stats_second = [0.17179589, 0.10284516, 0.14199627]
sample_stats = (_stats_first, _stats_second)

In [None]:
import albumentations

In [None]:
class AlbumentationsTransform(DisplayedTransform):
    """Transform that runs an albumentations pipeline on a `PILImage`."""
    # split_idx=0 presumably restricts the transform to the training subset
    # (fastai convention) — confirm against the fastai Transform docs.
    split_idx = 0
    order = 2

    def __init__(self, train_aug):
        # store_attr() keeps the constructor argument as self.train_aug.
        store_attr()

    def encodes(self, img: PILImage):
        """Apply `train_aug` to the image as an array and wrap the result as a `PILImage`."""
        as_array = np.array(img)
        augmented = self.train_aug(image=as_array)
        return PILImage.create(augmented['image'])

In [None]:
def get_train_aug():
    """Build the albumentations pipeline: CoarseDropout (p=0.5) then RandomContrast (p=0.6).

    HueSaturationValue (hue/sat/val shift limits of 0.2, p=0.5) is currently
    disabled and therefore not part of the pipeline.
    """
    steps = [
        albumentations.CoarseDropout(p=0.5),
        albumentations.RandomContrast(p=0.6),
    ]
    return albumentations.Compose(steps)

In [None]:
# Item transforms: RandomResizedCrop to 224 (min_scale=0.75, aspect ratio
# fixed at 1:1), followed by the albumentations pipeline.
crop_tfm = RandomResizedCrop(224, min_scale=0.75, ratio=(1., 1.))
albu_tfm = AlbumentationsTransform(get_train_aug())
item_tfms = [crop_tfm, albu_tfm]

# Batch transforms: aug_transforms with vertical flips enabled, warping
# disabled and size=128, followed by normalisation with sample_stats.
batch_tfms = [
    *aug_transforms(flip_vert=True, size=128, max_warp=0),
    Normalize.from_stats(*sample_stats),
]

# Batch size passed to the dataloaders.
bs = 256

In [None]:
# DataBlock wiring: image inputs, multi-category targets over `labels`,
# rows routed to train/validation by the boolean `is_valid` column, and the
# item/batch transforms defined above applied on load.
dblock = DataBlock(
    blocks=(ImageBlock, MultiCategoryBlock(vocab=labels)),
    get_x=get_x,
    get_y=get_y,
    splitter=ColSplitter(col='is_valid'),
    item_tfms=item_tfms,
    batch_tfms=batch_tfms,
)

# Build the dataloaders from the sampled table.
dls = dblock.dataloaders(dfs, bs=bs)

In [None]:
# Display a 3x3 grid of samples from a batch as a visual check of the pipeline.
dls.show_batch(nrows=3, ncols=3)