<img src="https://markdown.data-ensta.fr/uploads/upload_62aae64efd54f4a28389617d2ff98804.png" alt="drawing" width="1400"/>
<center><h1 style="color:Blue;font-size:40px;">HuBMAP 2022</h1> </center>
<center><h1 style="color:Blue;font-size:27px;">Vanilla Submission with Keras</h1> </center>

## Gist & acknowledgements

+ This notebook proposes a simple Keras pipeline to make a quick submission
+ It relies heavily on @qubvel's [segmentation library](https://github.com/qubvel/segmentation_models)
+ Thanks to @thedevastator's heavy lifting, this year's first [tiled dataset](https://www.kaggle.com/datasets/thedevastator/hubmap-2022-256x256) can be used early in the competition
+ This notebook is an adaptation of @joshi98kishan's awesome [notebook](https://www.kaggle.com/code/joshi98kishan/hubmap-keras-pipeline-training-inference).
+ Cover picture from [NIH's website](https://commonfund.nih.gov/hubmap/image-of-the-week).

In [None]:
# Offline installation of the Keras segmentation stack from local Kaggle inputs.
# The packages below must be attached through "Add data" in the side menu;
# if they are missing, these commands fail.
!pip install -U ../input/kerasapplications/Keras_Applications-1.0.8-py3-none-any.whl
!pip install ../input/qubvel/efficientnet-1.0.0-py3-none-any.whl
!pip install ../input/qubvel/image_classifiers-1.0.0-py3-none-any.whl
!pip install ../input/qubvel-segmentation-model-keras-v101/segmentation_models-master

# Select the tf.keras backend for segmentation_models; set here, before the
# library is imported in the next cell.
%env SM_FRAMEWORK=tf.keras

Imports

In [None]:
import os, gc
import numpy as np 
import pandas as pd 
import cv2
import glob
import numba
import pathlib
from tqdm.notebook import tqdm
import tifffile as tiff
import tensorflow as tf
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold

from albumentations import *
import segmentation_models as sm
import rasterio
from rasterio.windows import Window

import warnings
warnings.filterwarnings('ignore')

Loading and formatting utilities 

In [None]:
def get_settings(batch_size = 32, 
                 encoder = 'resnet34',
                 epochs = 5):
    """
    Bundles the training hyper-parameters into a single dictionary
    
    Arguments:
            batch_size : value stored under 'BATCH_SIZE'
            encoder : value stored under 'ENCODER'
            epochs : value stored under 'EPOCHS'
            
    Returns:
            dict with the keys 'BATCH_SIZE', 'ENCODER' and 'EPOCHS'
    """
    
    config = dict(BATCH_SIZE = batch_size,
                  ENCODER = encoder,
                  EPOCHS = epochs)
    return config


def get_path(pathof):
    """
    Returns mapping function based on the input argument
    
    The returned function joins a directory and a file name with '/'.
    The directory is the module-level `path_train` (for 'image') or
    `path_masks` (for 'mask'); it is looked up when the returned function
    is called, not when get_path is called.
    
    Arguments:
            pathof : value can be either 'image' or 'mask'
            
    Returns:
            Function taking a file name and returning its full path
            
    Raises:
            ValueError : if pathof is neither 'image' nor 'mask'
    """

    if pathof=='image':
        return lambda fname: tf.strings.join([path_train, fname], 
                                             separator = '/')
    
    if pathof=='mask':
        return lambda fname: tf.strings.join([path_masks, fname], 
                                             separator = '/')

    # Fail loudly here: silently returning None would only surface later as
    # an obscure error when the result is handed to Dataset.map.
    raise ValueError(f"pathof must be 'image' or 'mask', got {pathof!r}")

def get_normalised_tensor(path):
    """
    Loads a PNG file and converts it to a float32 tensor
    
    The conversion is done with tf.image.convert_image_dtype, which
    rescales integer pixel values to [0, 1].
    
    Arguments:
            path : location of the PNG file to read
            
    Returns:
            Decoded image as a tf.float32 tensor
    """
    
    raw_bytes = tf.io.read_file(path)
    decoded = tf.io.decode_png(raw_bytes)

    return tf.image.convert_image_dtype(decoded, tf.float32)

def get_tensor(path): 
    """
    Loads a PNG file (image or mask) without any rescaling
    
    Arguments:
            path : location of the PNG file to read
            
    Returns:
            Tensor holding the decoded PNG content
    """

    return tf.io.decode_png(tf.io.read_file(path))

def augment_data(image, mask):
    """
    Mapping function for a zipped (image, mask) dataset.
    
    The tensors are handed to the module-level albumentations pipeline
    `transforms` through tf.numpy_function. The augmented image is divided
    by 255 and cast to float32; the augmented mask is returned as produced
    by the pipeline (declared as uint8).
    
    Reference: https://albumentations.ai/docs/examples/tensorflow-example/
    """
    
    def _apply_transforms(img_arr, msk_arr):
        augmented = transforms(image = img_arr, 
                               mask = msk_arr)
        
        # Only the image is rescaled, the mask keeps its values
        scaled_img = tf.cast(augmented['image']/255.0, tf.float32)
        
        return scaled_img, augmented['mask']

    out_img, out_msk = tf.numpy_function(func = _apply_transforms, 
                                         inp = [image, mask],
                                         Tout = [tf.float32, tf.uint8])
    
    # Returned as a tuple so the dataset keeps two separate components
    return out_img, out_msk


def dice_coeff(y_true, y_pred, epsilon=1.):
    
    """
    Dice coefficient between ground truth and prediction
    
    Computed over the flattened tensors as
    (2 * sum(y_true * y_pred) + epsilon) / (sum(y_true) + sum(y_pred) + epsilon)

    Arguments: 
            y_true : tensor of ground truth values.
            y_pred : tensor of predicted values.
            epsilon : smoothing constant added to numerator and denominator
                      to avoid divide by 0 errors.
    
    Returns:
            dice_coefficient
    """
    
    truth = K.flatten(y_true)
    prediction = K.flatten(y_pred)

    overlap = K.sum(truth * prediction)
    numerator = 2. * overlap + epsilon
    denominator = K.sum(truth) + K.sum(prediction) + epsilon

    return numerator / denominator


# https://www.kaggle.com/leighplt/pytorch-fcn-resnet50
def make_grid(shape, window=256, min_overlap=32):
    """
        Return Array of size (N,4), where N - number of tiles,
        2nd axis represents slices: x1,x2,y1,y2 

        Arguments:
                shape : (x, y) extent of the area to tile
                window : side length of a tile
                min_overlap : minimum overlap between neighbouring tiles

        Returns:
                int64 array of shape (nx*ny, 4); rows are ordered with the
                first axis varying slowest. When a dimension is smaller
                than `window`, the tile starts at 0 and ends at that
                dimension instead of starting at a negative offset.
    """
    x, y = shape

    def _starts(length):
        # Evenly spaced tile origins; the last one is moved back so the
        # final tile ends exactly at `length`.
        count = length // (window - min_overlap) + 1
        starts = np.linspace(0, length, num=count, endpoint=False, dtype=np.int64)
        # Clamp at 0: for length < window the unclamped value is negative
        # and would be interpreted as an index from the end.
        starts[-1] = max(length - window, 0)
        return starts

    x1 = _starts(x)
    x2 = (x1 + window).clip(0, x)
    y1 = _starts(y)
    y2 = (y1 + window).clip(0, y)
    nx, ny = len(x1), len(y1)

    # Broadcast the per-axis bounds over the (nx, ny) grid of tiles
    slices = np.zeros((nx, ny, 4), dtype=np.int64)
    slices[..., 0] = x1[:, None]
    slices[..., 1] = x2[:, None]
    slices[..., 2] = y1[None, :]
    slices[..., 3] = y2[None, :]
    return slices.reshape(nx*ny, 4)

@numba.njit()
def rle_numba(pixels):
    """
    Run-length encodes a flat binary mask

    Arguments:
            pixels : 1-D array of 0/1 values

    Returns:
            List [start_1, length_1, start_2, length_2, ...] describing the
            runs of pixels equal to 1; starts are 1-indexed positions.
            An empty input yields an empty list.
    """
    size = len(pixels)
    points = []
    if size == 0:
        return points

    # run_open is True while the last entry of `points` is the start of a
    # run whose length has not been appended yet.
    run_open = False
    if pixels[0] == 1:
        # A mask that begins with foreground opens a run at position 1
        points.append(1)
        run_open = True

    for i in range(1, size):
        if pixels[i] != pixels[i-1]:
            if run_open:
                # The run covers 1-indexed positions points[-1] .. i
                points.append(i+1 - points[-1])
                run_open = False
            else:
                points.append(i+1)
                run_open = True

    # Close a run that extends to the last pixel
    if pixels[-1] == 1: points.append(size-points[-1]+1)    
    return points

def rle_numba_encode(image):
    """
    Encodes a mask as a space-separated run-length string

    The mask is flattened in column-major (Fortran) order, passed to
    rle_numba, and the resulting numbers are joined with single spaces.
    """
    column_major = image.flatten(order = 'F')
    runs = rle_numba(column_major)
    return ' '.join(map(str, runs))

### Training set up

In [None]:
# TRAINING SETTINGS
settings = get_settings(encoder = 'seresnext50',
                        batch_size = 32,
                        epochs = 30)

# Kaggle input folders: the original competition data and the tiled dataset
DATA_ORIG_PATH = '../input/hubmap-organ-segmentation'
DATA_PATH = '../input/hubmap-2022-256x256'

# Image tiles and their masks live in sibling folders of the tiled dataset
path_train, path_masks = (os.path.join(DATA_PATH, subdir)
                          for subdir in ('train', 'masks'))

path_submission_file = os.path.join(DATA_ORIG_PATH, 'sample_submission.csv')

# Quick sanity check on the number of files found in each folder
print(f'No. of training images: {len(os.listdir(path_train))}')
print(f'No. of masks: {len(os.listdir(path_masks))} \n')

In [None]:
BATCH_SIZE = settings['BATCH_SIZE']

# Size of the shuffle buffer, see
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle
BUFFER_SIZE = 1000

# The same file names are later joined with both the image and the mask folder
fnames = np.asarray(os.listdir(path_train))

# The first 9 characters of a file name serve as its group label, so tiles
# sharing that prefix always end up in the same fold.
# NOTE(review): presumably the prefix identifies the source image - confirm
# against the dataset's naming scheme.
groups = [tile_name[:9] for tile_name in fnames]
group_kfold = GroupKFold(n_splits = 4)

### Model

In [None]:
# Copy the pretrained ImageNet weights of the encoder selected in
# settings['ENCODER'] into ~/.keras/models. Only the three encoders listed
# below are handled; any other value copies nothing.
# NOTE(review): presumably segmentation_models picks the file up from this
# folder instead of downloading it - confirm.

!mkdir -p ~/.keras/models

if settings['ENCODER']=='resnet34':
    !cp ../input/keras-pretrained-imagenet-weights/resnet34_imagenet_1000_no_top.h5 ~/.keras/models/resnet34_imagenet_1000_no_top.h5
elif settings['ENCODER']=='resnet50':
    !cp ../input/keras-pretrained-imagenet-weights/resnet50_imagenet_1000_no_top.h5 ~/.keras/models/resnet50_imagenet_1000_no_top.h5
elif settings['ENCODER']=='seresnext50':
    !cp ../input/keras-pretrained-imagenet-weights/seresnext50_imagenet_1000_no_top.h5 ~/.keras/models/seresnext50_imagenet_1000_no_top.h5

### Data Augmentation

In [None]:
# https://www.kaggle.com/iafoss/hubmap-pytorch-fast-ai-starter
# Non-rigid deformations: with probability 0.3 one of them is applied
distortion_choice = OneOf([
    OpticalDistortion(p=0.3),
    GridDistortion(p=.1),
    IAAPiecewiseAffine(p=0.3),
], p=0.3)

# Colour / contrast changes: with probability 0.3 one of them is applied
colour_choice = OneOf([
    HueSaturationValue(10,15,10),
    CLAHE(clip_limit=2),
    RandomBrightnessContrast(),
], p=0.3)

# Full pipeline, called as transforms(image=..., mask=...) by augment_data
transforms = Compose([
    HorizontalFlip(),
    VerticalFlip(),
    RandomRotate90(),
    ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=15, p=0.9, 
                     border_mode=cv2.BORDER_REFLECT),
    distortion_choice,
    colour_choice,
], p = 1.0)

## Training

In [None]:
# One model is trained per fold of the grouped split; see the break at the
# bottom of the loop for how many folds actually run.
for fold, (t_idx, v_idx) in enumerate(group_kfold.split(fnames, 
                                                        groups = groups)):
    
    print(f'Fold: {fold+1}')
    
    # File-name datasets for the training and validation parts of this fold
    t_fnames_ds = tf.data.Dataset.from_tensor_slices(fnames[t_idx])
    v_fnames_ds = tf.data.Dataset.from_tensor_slices(fnames[v_idx])

    # Training images are read without rescaling because augment_data
    # divides by 255 after the augmentations have been applied
    t_img_ds = t_fnames_ds.map(get_path('image')).map(get_tensor)
    t_msk_ds = t_fnames_ds.map(get_path('mask')).map(get_tensor)
    
    # Validation images are not augmented, so they are rescaled on load
    v_img_ds = v_fnames_ds.map(get_path('image')).map(get_normalised_tensor)
    v_msk_ds = v_fnames_ds.map(get_path('mask')).map(get_tensor)
    
    train_ds = tf.data.Dataset.zip((t_img_ds, t_msk_ds))
    train_ds = train_ds.map(augment_data)  
    val_ds = tf.data.Dataset.zip((v_img_ds, v_msk_ds))
    del t_fnames_ds, v_fnames_ds, t_img_ds, t_msk_ds, v_img_ds, v_msk_ds
    gc.collect()

    # Both datasets repeat indefinitely; the number of batches per epoch is
    # fixed through steps_per_epoch / validation_steps below
    train_ds = train_ds.shuffle(BUFFER_SIZE)\
                       .batch(BATCH_SIZE)\
                       .repeat()
    val_ds = val_ds.batch(BATCH_SIZE)\
                   .repeat()

    model = sm.Unet(settings['ENCODER'], 
                encoder_weights='imagenet')
    model.compile(optimizer = 'adam',
                  loss = tf.keras.losses.BinaryCrossentropy(),
                  metrics = [dice_coeff])
    
    # Only the best model of the fold (save_best_only) is kept on disk
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(f'./saved_models/fold_model_{fold+1}.pb',
                                                            save_best_only = True)
    EPOCHS = settings['EPOCHS']

    TOTAL_TRAIN_SAMPLES = len(t_idx)
    # Validation size comes from the validation indices, not the training ones
    TOTAL_VAL_SAMPLES = len(v_idx)
    STEPS_PER_EPOCH = TOTAL_TRAIN_SAMPLES//BATCH_SIZE
    
    VALIDATION_STEPS = TOTAL_VAL_SAMPLES//BATCH_SIZE

    history = model.fit(train_ds, 
                        epochs = EPOCHS,
                        steps_per_epoch = STEPS_PER_EPOCH,
                        validation_data = val_ds,
                        validation_steps = VALIDATION_STEPS,
                        callbacks = [checkpoint_callback])
    del train_ds, val_ds, model
    gc.collect()
    
    # Only the first fold is trained; the remaining folds are skipped
    if fold==0:
        break

## Inference

In [None]:
# Side length of the square tiles read from the test images with rasterio
WINDOW = 1024

# Minimum overlap between neighbouring tiles, passed to make_grid
MIN_OVERLAP = 32

# Every tile is resized to NEW_SIZE x NEW_SIZE before prediction, which is
# the size of the images the model was trained on
NEW_SIZE = 256

# Identity affine transform handed to rasterio.open
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)

In [None]:
# List the checkpoints written to ./saved_models before loading them
!ls ./saved_models

In [None]:
# Load every checkpoint found in ./saved_models. dice_coeff has to be passed
# as a custom object because the models were compiled with it as a metric.
# Paths are sorted so the models are always loaded in the same order.
fold_model_paths = sorted(glob.glob('./saved_models/*.pb'))

# Stop here with a clear message: with no model the inference loop would
# otherwise fail much later, while averaging the (missing) predictions.
if not fold_model_paths:
    raise FileNotFoundError("No saved model matching './saved_models/*.pb' - run the training cell first")

fold_models = [tf.keras.models.load_model(fold_model_path,
                                          custom_objects={'dice_coeff': dice_coeff})
               for fold_model_path in fold_model_paths]

In [None]:
p = pathlib.Path(DATA_ORIG_PATH)
subm = {}

# Collect the test images once (sorted for a reproducible order) instead of
# running the same glob twice
test_files = sorted(p.glob('test_images/*.tiff'))

for i, filename in tqdm(enumerate(test_files), 
                        total = len(test_files)):
    
    print(f'{i+1} Predicting {filename.stem}')
    
    # The context manager closes the file once all its tiles have been read
    with rasterio.open(filename.as_posix(), transform = identity) as dataset:
        slices = make_grid(dataset.shape, window=WINDOW, min_overlap=MIN_OVERLAP)
        # Full-resolution binary mask, filled tile by tile
        preds = np.zeros(dataset.shape, dtype=np.uint8)
        
        for (x1,x2,y1,y2) in slices:
            # Read bands 1-3 of the tile and move the band axis last
            image = dataset.read([1,2,3],
                        window=Window.from_slices((x1,x2),(y1,y2)))
            image = np.moveaxis(image, 0, -1)
            
            # Same preprocessing as the validation images, then resize to
            # the training resolution and add the batch dimension
            image = tf.image.convert_image_dtype(image, 
                                     tf.float32)
            image = cv2.resize(image.numpy(), (NEW_SIZE, NEW_SIZE))
            image = np.expand_dims(image, 0)
            
            # Average the predictions of all fold models
            pred = None
            
            for fold_model in fold_models:
                if pred is None:
                    pred = np.squeeze(fold_model.predict(image))
                else:
                    pred += np.squeeze(fold_model.predict(image))
            
            pred = pred/len(fold_models)
            
            # Resize back to the tile's actual extent so the assignment
            # below always matches; cv2.resize expects (width, height),
            # i.e. (columns, rows)
            pred = cv2.resize(pred, (int(y2 - y1), int(x2 - x1)))
            # Where tiles overlap, the tile processed last wins
            preds[x1:x2,y1:y2] = (pred > 0.5).astype(np.uint8)
            
    subm[i] = {'id':filename.stem, 'rle': rle_numba_encode(preds)}
    del preds
    gc.collect();

In [None]:
# One row per test image. The column names are passed explicitly so the CSV
# keeps its 'id,rle' header even if no test image was processed.
submission = pd.DataFrame.from_dict(subm, orient='index', columns=['id', 'rle'])
submission.to_csv('../working/submission.csv', index=False)

submission.head()