# HuBMAP - Efficient Sampling Baseline (deepflash2, pytorch, fastai) [sub]

> Submission kernel for a model trained with efficient region-based sampling.

***

## Overview

1. Installation and package loading
2. Functions and classes for prediction
3. Configuration
4. Prediction
5. Submission

#### Related Kernels

- Train Notebook: https://www.kaggle.com/matjes/hubmap-efficient-sampling-deepflash2-train
- Sampling Notebook: https://www.kaggle.com/matjes/hubmap-labels-pdf-0-5-0-25-0-01

#### Versions
- V12: Minor changes in deepflash2 API to support albumentations (changes `apply` in `DeformationField` slightly, see patch below)
- V13: Adding prediction threshold 0.4
- V14: Threshold 0.2 for d488c759a - see discussion https://www.kaggle.com/c/hubmap-kidney-segmentation/discussion/228993 
- V15: **NEW PREDICTION** 
    - Using overlapping tiles and gaussian weighting from [nnunet](https://www.nature.com/articles/s41592-020-01008-z)/[github](https://github.com/MIC-DKFZ/nnUNet), which will also be part of the upcoming `deepflash2` release
    - Supporting model ensembles
    - Fixing submission to private LB using `rasterio` (thanks to @leighplt ([kernel](https://www.kaggle.com/leighplt/pytorch-fcn-resnet50)) and @iafoss ([kernel](https://www.kaggle.com/iafoss/hubmap-pytorch-fast-ai-starter-sub)))

### Installation and package loading

In [None]:
# Install deepflash2 and dependencies
! pip install ../input/kerasapplications/keras-team-keras-applications-3b180cb -f ./ --no-index -q
! pip install ../input/efficientnet/efficientnet-1.1.0/ -f ./ --no-index -q

import sys
sys.path.append("../input/segmentation-models-pytorch-install")
!pip install -q --no-deps ../input/deepflash2-lfs
import cv2, torch, gc, rasterio
import torch.nn.functional as F
import deepflash2.tta as tta
import matplotlib.pyplot as plt
import pandas as pd, numpy as np
import segmentation_models_pytorch as smp
from pathlib import Path
from rasterio.windows import Window
from torch.utils.data import Dataset, DataLoader
from scipy .ndimage.filters import gaussian_filter
from tqdm.notebook import tqdm

import os
import glob
import gc

import rasterio
from rasterio.windows import Window

import pathlib
from tqdm.notebook import tqdm
import cv2

import tensorflow as tf
import efficientnet as efn
import efficientnet.tfkeras
import tensorflow as tf
from tensorflow.keras import backend as K

import warnings
warnings.filterwarnings("ignore")

### Functions and classes for prediction

In [None]:
#https://www.kaggle.com/bguberfain/memory-aware-rle-encoding
#with transposed mask
def rle_encode_less_memory(img):
    """Run-length encode a binary mask into a space-separated string.

    The mask is traversed in column-major order (its transpose is
    flattened). The output alternates between the 1-based start position of
    a run and the length of that run.
    """
    # flatten() returns a copy, so zeroing the end pixels leaves `img` intact.
    flat = img.T.flatten()

    # The neighbour comparison below only pairs run starts with run ends
    # correctly when the sequence begins and finishes on background.
    flat[0] = 0
    flat[-1] = 0

    changes = np.flatnonzero(flat[1:] != flat[:-1]) + 2
    # Even slots are run starts, odd slots are run ends: turn ends into lengths.
    changes[1::2] -= changes[::2]

    return ' '.join(map(str, changes))

def load_model_weights(model, file, strict=True):
    """Load a checkpoint into `model` and return ``(model, stats)``.

    The checkpoint must be a mapping with a ``'stats'`` entry (returned
    unchanged) and a ``'model'`` entry holding the state dict. Tensors are
    mapped to the CPU while loading; `strict` is forwarded to
    ``load_state_dict``.
    """
    checkpoint = torch.load(file, map_location='cpu')
    stats, weights = checkpoint['stats'], checkpoint['model']
    model.load_state_dict(weights, strict=strict)
    return model, stats

# from https://github.com/MIC-DKFZ/nnUNet/blob/2fade8f32607220f8598544f0d5b5e5fa73768e5/nnunet/network_architecture/neural_network.py#L250
def _get_gaussian(patch_size, sigma_scale=1. / 8) -> np.ndarray:
    tmp = np.zeros(patch_size)
    center_coords = [i // 2 for i in patch_size]
    sigmas = [i * sigma_scale for i in patch_size]
    tmp[tuple(center_coords)] = 1
    gaussian_importance_map = gaussian_filter(tmp, sigmas, 0, mode='constant', cval=0)
    gaussian_importance_map = gaussian_importance_map / np.max(gaussian_importance_map) * 1
    gaussian_importance_map = gaussian_importance_map.astype(np.float32)

    # gaussian_importance_map cannot be 0, otherwise we may end up with nans!
    gaussian_importance_map[gaussian_importance_map == 0] = np.min(
        gaussian_importance_map[gaussian_importance_map != 0])

    return gaussian_importance_map

In [None]:
# Some code adapted from https://www.kaggle.com/iafoss/hubmap-pytorch-fast-ai-starter-sub
class HubmapDataset(Dataset):
    """Tile-wise dataset over one TIFF file that never loads the full image.

    The image is opened with rasterio and every item is read on demand through
    a window. Tiles of ``output_shape * scale`` pixels are read and resized to
    ``output_shape``. Tiles judged empty are flagged by returning ``idx == -1``.

    Attributes set up here and used by callers:
        slices:         per-tile slices into the full-resolution image.
        out_slices:     per-tile slices into the downscaled output array.
        out_data_shape: shape of the downscaled output (image shape // scale).
    """
    def __init__(self, file, stats, scale=3, shift=.8, output_shape=(512,512), s_th = 40):
        """Open `file` and precompute the tile grid.

        Args:
            file: path handed to ``rasterio.open``.
            stats: ``(mean, std)`` pair used to normalise each tile.
            scale: downscale factor between the file and the returned tiles.
            shift: enters the number of tile centres per axis
                (``size // (tile * shift) + 1``); smaller values give more tiles.
            output_shape: shape of the returned tiles.
            s_th: saturation threshold used by the empty-tile check.
        """
        
        self.mean, self.std = stats
        self.scale = scale
        self.shift = shift
        self.output_shape = output_shape
        # Size of the window that is actually read from the file.
        self.input_shape = tuple(int(t*scale) for t in self.output_shape)      
        self.s_th = s_th #saturation blanking threshold
        self.p_th = 1000*(self.output_shape[0]//256)**2 #threshold for the minimum number of pixels

        identity = rasterio.Affine(1, 0, 0, 0, 1, 0)
        self.data = rasterio.open(file, transform = identity, num_threads='all_cpus')
        # Files without exactly 3 bands: open each subdataset separately so
        # __getitem__ can read one channel per layer.
        if self.data.count != 3:
            subdatasets = self.data.subdatasets
            self.layers = []
            if len(subdatasets) > 0:
                for i, subdataset in enumerate(subdatasets, 0):
                    self.layers.append(rasterio.open(subdataset))
            
        # Tiling
        self.slices = []
        self.out_slices = []
        self.out_data_shape = tuple(int(x//self.scale) for x in self.data.shape)
        # Tile centres run from half a tile inside the first edge to half a
        # tile inside the last edge, evenly spaced (in output coordinates).
        start_points = [o//2 for o in self.output_shape]
        end_points = [(s - st) for s, st in zip(self.out_data_shape, start_points)]
        n_points = [int(s//(o*self.shift))+1 for s, o in zip(self.out_data_shape, self.output_shape)]
        center_points = [np.linspace(st, e, num=n, endpoint=True, dtype=np.int64) for st, e, n in zip(start_points, end_points, n_points)]
        # Outer loop walks axis 1, inner loop axis 0; slices are stored as (axis 0, axis 1).
        for cx in center_points[1]:
            for cy in center_points[0]:
                # Calculate output slices for whole image
                slices = tuple(slice(int((c*self.scale - o/2).clip(0, s)), int((c*self.scale + o/2).clip(max=s)))
                                 for (c, o, s) in zip((cy, cx), self.input_shape, self.data.shape))
                self.slices.append(slices)
                
                # Same tile expressed in downscaled (output) coordinates.
                out_slices = tuple(slice(int((c - o/2).clip(0, s)), int((c + o/2).clip(max=s)))
                                 for (c, o, s) in zip((cy, cx), self.output_shape, self.out_data_shape))
                self.out_slices.append(out_slices)
                

    def __len__(self):
        """Return the number of tiles."""
        return len(self.slices)

    def __getitem__(self, idx):
        """Return ``(tile, idx)`` for tile number `idx`.

        The tile is a normalised float32 tensor in channels-first layout.
        The returned index is replaced by ``-1`` when the tile is considered
        empty, so the caller can skip it.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        slices = self.slices[idx]
        if self.data.count == 3: # normal
            img = self.data.read([1, 2, 3], 
                window=Window.from_slices(*slices)
            )
            # rasterio returns bands first; move them to the last axis.
            img = np.moveaxis(img, 0, -1)
        else: # with subdatasets/layers
            img = np.zeros((*self.input_shape, 3), dtype=np.uint8)
            for fl in range(3):
                img[:, :, fl] = self.layers[fl].read(
                    window=Window.from_slices(*slices)
                )
        
        if self.scale!=1:
            # NOTE(review): cv2.resize takes dsize as (width, height) while
            # output_shape is used as (rows, cols) elsewhere; only equivalent
            # for square tiles.
            img = cv2.resize(img, self.output_shape, interpolation = cv2.INTER_AREA)
        
        #check for empty images
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        h,s,v = cv2.split(hsv)
        # Empty = too few saturated pixels, or an almost entirely black tile.
        if (s>self.s_th).sum() <= self.p_th or img.sum() <= self.p_th:
            # Remove if idx=-1
            idx = -1
        
        img = (img/255.0 - self.mean)/self.std
        img = img.transpose(2, 0, 1).astype('float32')
        
        return torch.from_numpy(img), idx
    
class Model_pred:
    """Tile-wise prediction with an ensemble of models and optional TTA.

    Relies on the module-level `device` for placing tensors.
    """
    def __init__(self, models, use_tta=True, batch_size=32):
        """Store the ensemble and the prediction settings.

        Args:
            models: iterable of models that map an image batch to class logits.
            use_tta: when true, predictions are averaged over a horizontal flip.
            batch_size: number of tiles per forward pass.
        """
        self.models = models
        self.bs = batch_size
        self.tfms = [tta.HorizontalFlip()] if use_tta else [] #, tta.VerticalFlip()]

    def predict(self, ds):
        """Predict the positive-class probability map for dataset `ds`.

        Args:
            ds: dataset exposing `out_data_shape`, `output_shape` and
                `out_slices`, and yielding ``(image, idx)`` pairs where
                ``idx < 0`` marks a tile to skip.

        Returns:
            float32 array of shape ``ds.out_data_shape``: the Gaussian-weighted
            average of the overlapping tile predictions. Pixels not covered by
            any predicted tile are 0.
        """
        #rasterio cannot be used with multiple workers
        dl = DataLoader(ds, self.bs, num_workers=0, shuffle=False, pin_memory=True)

        # Accumulators for the weighted predictions and the weights themselves
        pred = np.zeros(ds.out_data_shape, dtype='float32')
        merge_map = np.zeros(ds.out_data_shape, dtype='float32')

        # Gaussian weights
        gw_numpy = _get_gaussian(ds.output_shape)
        gw = torch.from_numpy(gw_numpy).to(device)

        with torch.no_grad():
            for images, idxs in tqdm(iter(dl), total=len(dl)):
                if ((idxs>=0).sum() > 0): #exclude empty images
                    images = images[idxs>=0].to(device)
                    idxs = idxs[idxs>=0]
                    merger = tta.Merger()
                    for t in tta.Compose(self.tfms):
                        aug_images = t.augment_image(images)
                        model_merger = tta.Merger()
                        for model in tqdm(self.models):
                            out = model(aug_images)
                            out = F.softmax(out, dim=1)
                            model_merger.append(out)
                            torch.cuda.empty_cache()
                        out = t.deaugment_mask(model_merger.result())
                        merger.append(out)

                    # Apply gaussian weighting
                    batch_smx = merger.result()*gw.view(1,1,*gw.shape)
                    batch_smx = [x for x in batch_smx.permute(0,2,3,1).cpu().numpy()]

                    for smx, idx in zip(batch_smx, idxs):
                        slcs = ds.out_slices[idx]
                        # Only using positive class here
                        pred[slcs] += smx[...,1]
                        merge_map[slcs] += gw_numpy

        # Regions where every overlapping tile was skipped have zero weight;
        # dividing there would give 0/0 = NaN, so they are left at 0 instead.
        np.divide(pred, merge_map, out=pred, where=merge_map > 0)
        return pred

### Configuration

In [None]:
class CONFIG:
    """Static settings for the PyTorch (deepflash2) part of the pipeline."""

    # --- input locations ---
    data_path = Path('../input/hubmap-kidney-segmentation')
    model_path = Path('../input/hubmap-single-fold-models-b1b5')

    # --- tiling ---
    # downscale factor: 3 turns a 1536 px window into a 512 px tile
    scale = 3
    # enters the number of tiles placed per axis during prediction
    shift = 0.8
    tile_shape = (512, 512)

    # --- network (https://github.com/qubvel/segmentation_models.pytorch) ---
    encoder_name = "efficientnet-b4"
    encoder_weights = None
    in_channels = 3
    classes = 2

    # --- inference ---
    # tiles per forward pass
    batch_size = 4
    # enable test time augmentation
    tta = True
    # probability above which a pixel counts as foreground
    threshold = 0.30


cfg = CONFIG()

In [None]:
import yaml
import pprint
    
# Settings for the TensorFlow/Keras part of the pipeline.
# Probability cut-off for turning predictions into a binary mask.
THRESHOLD = 0.3 # preds > THRESHOLD
# Minimum summed vote a pixel needs in the combined mask (compared with >=).
VOTERS = 0.5
# Edge length, in full-resolution pixels, of the tiles read for the Keras models.
WINDOW = 1024
# Passed to make_grid_tf as the minimum overlap between neighbouring tiles.
MIN_OVERLAP = 300
# Edge length the tiles are resized to before being fed to the Keras models.
NEW_SIZE = 256
# NOTE(review): SUM_PRED and CHECK are not referenced in the visible notebook.
SUM_PRED = 128
CHECK=False

In [None]:
def make_grid_tf(shape, window=1024, min_overlap=32):
    """Lay a grid of `window`-sized tiles over a 2-D extent.

    Returns an int64 array of shape (N, 4), where N is the number of tiles
    and each row holds the slice bounds ``x1, x2, y1, y2``. The last tile on
    each axis is moved back so that it ends exactly at the border.
    """
    def _axis_bounds(length):
        # Number of tiles, tile starts and tile stops along one axis.
        count = length // (window - min_overlap) + 1
        starts = np.linspace(0, length, num=count, endpoint=False, dtype=np.int64)
        starts[-1] = length - window
        stops = (starts + window).clip(0, length)
        return count, starts, stops

    x, y = shape
    nx, x1, x2 = _axis_bounds(x)
    ny, y1, y2 = _axis_bounds(y)

    # Broadcast the per-axis bounds over the full (nx, ny) grid.
    grid = np.zeros((nx, ny, 4), dtype=np.int64)
    grid[:, :, 0] = x1[:, None]
    grid[:, :, 1] = x2[:, None]
    grid[:, :, 2] = y1[None, :]
    grid[:, :, 3] = y2[None, :]
    return grid.reshape(nx * ny, 4)

In [None]:

# Checkpoints that make up the Keras ensemble.
models_path_list = [
    '../input/hubmap-efficientnet-b4/model-fold-4.h5',
    '../input/hubmap-efficientnet-b4/model-fold-0.h5',
    '../input/hubmap-efficientnet-b4/model-fold-3.h5',
    '../input/hubmap-efficientnet-b6-pseudo/model-fold-2.h5',
    '../input/hubmap-efficientnet-b6-pseudo/model-fold-1.h5',
    #'../input/hubmap-efficientnetb7-pseudo-labelled/model-fold-2.h5',
]

# Load for inference only, so the models are not compiled.
fold_models = [
    tf.keras.models.load_model(checkpoint, compile=False)
    for checkpoint in models_path_list
]
print(len(fold_models))


In [None]:
# PyTorch (segmentation_models_pytorch) models used by the prediction loop.
deepflash_models=[]

# NOTE: the triple-quoted string below is disabled code kept as an inert
# string literal; it is never executed.
'''
for fold in [1,2]:
        
        ###################### efficient net b4 #########################
        model = smp.Unet(encoder_name='efficientnet-b4', 
                         encoder_weights=cfg.encoder_weights, 
                         in_channels=cfg.in_channels, 
                         classes=cfg.classes)
        model, stats = load_model_weights(model, '../input/hubmap-deepflash-efficientnetb4/'+f'unet_efficientnet-b4_{fold}.pth')
        
        
        if torch.cuda.is_available():  model.cuda()
        
        deepflash_models.append(model)
        del model
        gc.collect()
        
        model = smp.Unet(encoder_name='efficientnet-b3', 
                         encoder_weights=cfg.encoder_weights, 
                         in_channels=cfg.in_channels, 
                         classes=cfg.classes)
        model, stats = load_model_weights(model, '../input/hubmap-deepflash-efficientnet-b3/'+f'unet_efficientnet-b3_{fold}.pth')
        
        if torch.cuda.is_available():  model.cuda()
        
        deepflash_models.append(model)
        del model
        gc.collect()
'''      
#         res_id=fold+2
#         model = smp.Unet(encoder_name='resnet50', 
#                          encoder_weights=cfg.encoder_weights, 
#                          in_channels=cfg.in_channels, 
#                          classes=cfg.classes)
#         model, stats = load_model_weights(model, '../input/hubmap-effiecient-sampling-deepflash-resnet/'+f'unet_resnet50_{res_id}.pth')
#         if torch.cuda.is_available():  model.cuda()
        
#         deepflash_models.append(model)
#         del model
#         gc.collect()
    
# Active model: U-Net with an EfficientNet-B6 encoder. The encoder name is
# hard-coded here, so cfg.encoder_name is not used for this model.
model = smp.Unet(encoder_name='efficientnet-b6', 
                 encoder_weights=cfg.encoder_weights, 
                 in_channels=cfg.in_channels, 
                 classes=cfg.classes)
# `stats` stays at module level and is later passed to HubmapDataset for
# normalisation.
model, stats = load_model_weights(model, '../input/hubmap-deepflash-efficientnetb6/'+f'unet_efficientnet-b6.pth')

if torch.cuda.is_available(): model.cuda()

deepflash_models.append(model)
# Drop the module-level name; the list keeps the model alive.
del model
gc.collect()

# model = smp.Unet(encoder_name='efficientnet-b7', 
#                  encoder_weights=cfg.encoder_weights, 
#                  in_channels=cfg.in_channels, 
#                  classes=cfg.classes)
# model, stats = load_model_weights(model, '../input/hubmap-deepflash-effiicentnetb7/'+f'unet_efficientnet-b7.pth')

# if torch.cuda.is_available(): model.cuda()

# deepflash_models.append(model)
# del model
# gc.collect()

In [None]:
# Build the table of test image ids from the files on disk
import pathlib
import glob
from tqdm.notebook import tqdm

p = pathlib.Path('../input/hubmap-kidney-segmentation')
test_files = list(p.glob('test/*.tiff'))
ids = [tiff_file.stem for tiff_file in tqdm(test_files, total=len(test_files))]

df_sample = pd.DataFrame()
df_sample['id'] = ids
df_sample['predicted'] = np.nan
df_sample = df_sample.set_index('id')

# When exactly five test images are present, only the second one is kept.
if len(df_sample) == 5:
    df_sample = df_sample.iloc[1:2, :]

print(len(deepflash_models))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Prediction

In [None]:
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)


def _predict_deepflash(tiff_path):
    """Return the binary mask of the PyTorch ensemble at 1/cfg.scale resolution."""
    ds = HubmapDataset(tiff_path, stats, scale=cfg.scale, shift=cfg.shift, output_shape=cfg.tile_shape)

    print('Predicting...')

    # Batch size 1: every batch corresponds to exactly one output tile.
    dl = DataLoader(ds, 1, num_workers=0, shuffle=False, pin_memory=True)

    mask = np.zeros(ds.out_data_shape, dtype=np.uint8)
    print(mask.shape)
    with torch.no_grad():
        for images, idxs in tqdm(iter(dl), total=len(dl)):
            keep = idxs >= 0
            if keep.sum() == 0:  # the dataset flags empty tiles with idx == -1
                continue
            images = images[keep].to(device)
            tile_idx = int(idxs[keep][0])

            merger = tta.Merger()
            for t in tta.Compose([tta.HorizontalFlip(), tta.VerticalFlip()]):
                aug_images = t.augment_image(images)
                model_merger = tta.Merger()
                for model in deepflash_models:
                    out = F.softmax(model(aug_images), dim=1)
                    model_merger.append(out)
                    torch.cuda.empty_cache()
                # Collect every de-augmented prediction so the TTA variants are
                # merged instead of each one overwriting the previous one.
                # The merger result is used as-is (no extra division by the
                # number of models), matching Model_pred.
                merger.append(t.deaugment_mask(model_merger.result()))

            prob = merger.result().detach().cpu().numpy()
            # Channel 1 is the positive class.
            mask[ds.out_slices[tile_idx]] = (prob[0, 1, :, :] > cfg.threshold).astype(np.uint8)

    # Release the file handles held by the dataset.
    ds.data.close()
    for layer in getattr(ds, 'layers', []):
        layer.close()
    del ds, dl
    gc.collect()
    torch.cuda.empty_cache()
    return mask


def _predict_keras(tiff_path):
    """Return the full-resolution per-pixel vote count of the Keras ensemble."""
    dataset_tf = rasterio.open(tiff_path, transform = identity)
    slices_tf = make_grid_tf(dataset_tf.shape, window=WINDOW, min_overlap=MIN_OVERLAP)
    print(dataset_tf.shape)
    layers_tf = []
    if dataset_tf.count != 3:
        print('Image file with subdatasets as channels')
        layers_tf = [rasterio.open(subd) for subd in dataset_tf.subdatasets]

    print(f'Dataset Shape: {dataset_tf.shape}')
    preds_tf = np.zeros(dataset_tf.shape, dtype=np.uint8)

    # Empty-tile check: saturation threshold and minimum number of pixels.
    s_th = 40
    p_th = 1000*(WINDOW//256)**2

    for (x1,x2,y1,y2) in tqdm(slices_tf):
        tile_window = Window.from_slices((x1,x2),(y1,y2))
        if dataset_tf.count == 3:
            image = dataset_tf.read([1,2,3], window=tile_window)
            image = np.moveaxis(image, 0, -1)
        else:
            image = np.zeros((WINDOW, WINDOW, 3), dtype=np.uint8)
            for fl in range(3):
                image[:,:,fl] = layers_tf[fl].read(window=tile_window)

        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        h,s,v = cv2.split(hsv)
        if (s>s_th).sum() <= p_th or image.sum() <= p_th:
            # Empty tile: it contributes no votes, so skip the forward passes.
            continue

        image = cv2.resize(image, (NEW_SIZE, NEW_SIZE),interpolation = cv2.INTER_AREA)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        image = np.expand_dims(image, 0)
        image = tf.cast(image, tf.float32)

        prob = None
        for fold_model in fold_models:
            fold_pred = np.squeeze(fold_model.predict(image))
            prob = fold_pred if prob is None else prob + fold_pred
            K.clear_session()
        prob = prob/len(fold_models)

        # Threshold the averaged probabilities before converting to uint8;
        # casting the raw [0, 1] floats truncates almost every value to 0.
        prob = cv2.resize(prob, (WINDOW, WINDOW))
        preds_tf[x1:x2,y1:y2] += (prob > THRESHOLD).astype(np.uint8)

        del image, hsv, prob
        gc.collect()

    dataset_tf.close()
    for layer in layers_tf:
        layer.close()
    gc.collect()
    K.clear_session()
    return preds_tf


names,predicts = [],[]
for idx in tqdm(df_sample.index,total=len(df_sample)):

    print(f'###### File {idx} ######')
    f = cfg.data_path/'test'/f'{idx}.tiff'

    preds = _predict_deepflash(f)
    preds_tf = _predict_keras(f)

    # Bring the downscaled mask to full resolution. cv2.resize expects dsize as
    # (width, height), i.e. (columns, rows); nearest-neighbour keeps it binary.
    preds = cv2.resize(preds, (preds_tf.shape[1], preds_tf.shape[0]), interpolation=cv2.INTER_NEAREST)
    # Sum the votes of both ensembles in place to avoid a third full-size array.
    preds += preds_tf
    del preds_tf
    gc.collect()
    preds = (preds >= VOTERS).astype(np.uint8)

    rle = rle_encode_less_memory(preds)
    names.append(idx)
    predicts.append(rle)

    del preds, rle
    gc.collect()
    

In [None]:
# Assemble the submission table (one row per predicted image) and write it out.
df = pd.DataFrame(data={'id': names, 'predicted': predicts}, columns=['id', 'predicted'])
df.to_csv('submission.csv', index=False)
df.head()