# Description
This kernel provides starter PyTorch code for inference that divides the images into tiles ([based on this kernel](https://www.kaggle.com/iafoss/256x256-images)), selects the tiles that contain tissue, evaluates the predictions of multiple models with TTA, combines the tile masks back into image-level masks, and converts them into RLE. The inference is based on models trained in the [fast.ai starter kernel](https://www.kaggle.com/iafoss/hubmap-fast-ai-starter), provided by me. I hope it will help you get started with this competition.

* Update (12/4): Fixed the problem with submission to the private LB by using **rasterio** (thanks to @leighplt for suggesting it in [his kernel](https://www.kaggle.com/leighplt/pytorch-fcn-resnet50)). On the public part of the test set the predictions are identical except for one image, where the new method predicts a mask that differs by several pixels, but the LB score is the same. I think the tiles loaded by rasterio may be slightly different from the ones loaded by tifffile for some image compressions.

In [None]:
!pip install --no-deps ../input/hubmap-packages/pretrained-models.pytorch-master/pretrained-models.pytorch-master > /dev/null

In [None]:
!pip install --no-deps ../input/hubmap-packages/efficientnet_pytorch-0.6.3/efficientnet_pytorch-0.6.3 > /dev/null

In [None]:
!pip install --no-deps ../input/hubmap-packages/timm-0.4.5-py3-none-any.whl > /dev/null

In [None]:
!pip install --no-deps ../input/hubmap-packages/segmentation_models.pytorch-master/segmentation_models.pytorch-master > /dev/null

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import tifffile as tiff
import cv2
import os
import gc
from tqdm.notebook import tqdm
import rasterio
from rasterio.windows import Window

from fastai.vision.all import *
from torch.cuda.amp import autocast
from torch.utils.data import Dataset, DataLoader
from typing import Optional, Union, List
import torch
import torch.nn as nn
import torch.nn.functional as F
import timm
from timm.models.layers import ConvBnAct, get_act_layer, create_act_layer, create_attn, create_conv2d
from timm.models.cspnet import _create_cspnet
import segmentation_models_pytorch as smp
import segmentation_models_pytorch.base.modules as md

import pytorch_lightning as pl

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Inference configuration shared by the cells below.
sz = 512   # side length of the tiles fed to the models (pixels after reduction)
step = 176 # stride of the sliding window (pixels after reduction)
reduce = 2 # per-side downscale factor applied to the original image (i.e. 4x fewer pixels)
TH = 0.4  # probability threshold for positive pixel predictions
S_TH = 0.55  # passed to reduce_fp as `s`: components whose peak probability is below it are dropped
DATA = '../input/hubmap-kidney-segmentation/test/'
df_sample = pd.read_csv('../input/hubmap-kidney-segmentation/sample_submission.csv')
bs = 8  # DataLoader batch size
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data

In [None]:
#functions to convert encoding to mask and mask to encoding
def enc2mask(encs, shape):
    """Decode run-length encodings into a single labelled mask.

    Args:
        encs: iterable of RLE strings of the form "start length start
            length ..." with 1-based starts. An entry that is a float NaN
            (no mask) is skipped.
        shape: two-element shape; a flat buffer of shape[0]*shape[1] pixels
            is filled, reshaped to `shape` and then transposed.

    Returns:
        uint8 array of shape (shape[1], shape[0]); pixels covered by the
        m-th encoding hold the value m + 1, all other pixels are 0.
    """
    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for m, enc in enumerate(encs):
        # `np.float` was only an alias of the builtin float and no longer
        # exists in NumPy >= 1.24, so test against the builtin directly.
        if isinstance(enc, float) and np.isnan(enc):
            continue
        tokens = enc.split()
        # tokens alternate between start and length; a dangling token is ignored
        for start, length in zip(tokens[0::2], tokens[1::2]):
            begin = int(start) - 1
            img[begin:begin + int(length)] = 1 + m
    return img.reshape(shape).T

def mask2enc(mask, n=1):
    """Run-length encode each label 1..n of a mask.

    The mask is transposed before flattening. For every label the result
    holds either a "start length ..." string (1-based starts) or NaN when
    the label does not occur in the mask.
    """
    flat = mask.T.flatten()
    encodings = []
    for label in range(1, n + 1):
        hits = (flat == label).astype(np.int8)
        if hits.sum() == 0:
            encodings.append(np.nan)
            continue
        # pad with zeros so that runs touching either end still produce an edge
        padded = np.concatenate([[0], hits, [0]])
        edges = np.where(padded[1:] != padded[:-1])[0] + 1
        # edges alternate run-start / run-end; turn every end into a length
        edges[1::2] -= edges[::2]
        encodings.append(' '.join(map(str, edges)))
    return encodings

#https://www.kaggle.com/bguberfain/memory-aware-rle-encoding
#with transposed mask
def rle_encode_less_memory(img):
    """Run-length encode a binary mask as a "start length ..." string.

    The mask is transposed and flattened into a copy, so the caller's array
    is not modified. The first and last pixel of that copy are forced to 0.
    """
    flat = img.T.flatten()

    # the shortcut below only works when both boundary pixels are background
    flat[0] = 0
    flat[-1] = 0

    # positions (1-based) where the value changes: run starts and run ends alternate
    edges = np.where(flat[1:] != flat[:-1])[0] + 2
    edges[1::2] -= edges[::2]  # convert every run end into a run length

    return ' '.join(map(str, edges))

In [None]:
# https://www.kaggle.com/iafoss/256x256-images
# Per-channel mean and std applied to tiles after scaling them to [0, 1]
mean = np.array([0.63701425, 0.47097038, 0.68173952])
std = np.array([0.15979014, 0.22442915, 0.14194921])

s_th = 40  # saturation blanking threshold: only pixels with saturation above it are counted
p_th = 200*sz//256 # minimum pixel count; tiles at or below it are flagged as empty
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)  # identity affine, passed as `transform` to rasterio.open

def img2tensor(img,dtype:np.dtype=np.float32):
    """Turn an HxW or HxWxC array into a CxHxW torch tensor of `dtype`.

    A 2-D input gets a trailing channel axis of size 1 first. The array is
    only copied when its dtype differs from `dtype`.
    """
    if img.ndim == 2:
        img = img[:, :, np.newaxis]
    channels_first = img.transpose(2, 0, 1)
    return torch.from_numpy(channels_first.astype(dtype, copy=False))

class HuBMAPDataset(Dataset):
    """Sliding-window tile dataset over a single TIFF image read with rasterio.

    The image is covered by a grid of n0max * n1max square tiles of side
    `reduce*sz`, taken every `reduce*step` pixels. Parts of a tile that fall
    outside the image are left zero-filled. Each item is the normalized tile
    (resized by 1/`reduce` when `reduce != 1`) together with the coordinates
    needed to paste a prediction back into the image.
    """
    def __init__(self, idx, sz=sz, step=step, reduce=reduce):
        """Open `DATA/<idx>.tiff` and precompute the tile grid.

        Args:
            idx: image id; the file name is `idx + '.tiff'` inside DATA.
            sz: tile side after reduction.
            step: sliding-window stride after reduction.
            reduce: per-side downscale factor applied to every tile.
        """
        self.data = rasterio.open(os.path.join(DATA,idx+'.tiff'), transform = identity,
                                  num_threads='all_cpus')
        # when the file does not expose exactly 3 bands, open each subdataset
        # separately; __getitem__ then reads one channel from each of the first 3
        if self.data.count != 3:
            print('Image file with subdatasets as channels')
            self.layers = [rasterio.open(subd) for subd in self.data.subdatasets]
        self.shape = self.data.shape
        self.reduce = reduce
        # tile side and stride expressed in full-resolution pixels
        self.sz = reduce*sz
        self.step = reduce*step
        # total padding added along axis 0 / axis 1 so the tile grid covers the image
        self.pad0 = (self.sz - self.shape[0]%self.step)%self.step
        self.pad1 = (self.sz - self.shape[1]%self.step)%self.step
        # number of tile positions along axis 0 / axis 1
        self.n0max = (self.shape[0] + self.pad0 - self.sz)//self.step + 1
        self.n1max = (self.shape[1] + self.pad1 - self.sz)//self.step + 1
        
    def __len__(self):
        """Return the total number of tile positions in the grid."""
        return self.n0max*self.n1max
    
    def __getitem__(self, idx):
        """Load tile number `idx`.

        Returns:
            (tensor, tile_id, x0, y0, p00, p01, p10, p11) where `tensor` is the
            normalized channel-first tile, `tile_id` is `idx` or -1 for a tile
            flagged as empty, (x0, y0) is the tile origin in image coordinates
            (may be negative inside the padding) and [p00:p01, p10:p11] is the
            part of the image that was actually read.
        """
        # the code below may be a little bit difficult to understand,
        # but the thing it does is mapping the original image to
        # tiles created with adding padding, as done in
        # https://www.kaggle.com/iafoss/256x256-images ,
        # and then the tiles are loaded with rasterio
        # n0,n1 - are the tile indices along axis 0 and axis 1 (idx = n0*self.n1max + n1)
        n0,n1 = idx//self.n1max, idx%self.n1max
        # x0,y0 - are the start coordinates of the tile along axis 0 and axis 1 of the image
        # negative numbers correspond to padding (which must not be loaded)
        # NOTE: `-self.pad0//2` parses as `(-self.pad0)//2`, i.e. it rounds towards -inf for odd padding
        x0,y0 = -self.pad0//2 + n0*self.step, -self.pad1//2 + n1*self.step
        # make sure that the region to read is within the image
        p00,p01 = max(0,x0), min(x0+self.sz,self.shape[0])
        p10,p11 = max(0,y0), min(y0+self.sz,self.shape[1])
        # zero-filled tile; only the part overlapping the image is overwritten
        img = np.zeros((self.sz,self.sz,3),np.uint8)
        # mapping the loaded region to the tile
        if self.data.count == 3:
            # bands come back channel-first, so move the band axis to the end
            img[(p00-x0):(p01-x0),(p10-y0):(p11-y0)] = np.moveaxis(self.data.read([1,2,3],
                    window=Window.from_slices((p00,p01),(p10,p11))), 0, -1)
        else:
            # one channel per subdataset
            tile = np.zeros((p01-p00, p11-p10, 3), dtype=np.uint8)
            for fl in range(3):
                tile[:,:,fl] = self.layers[fl].read(window=Window.from_slices((p00,p01),(p10,p11)))
            img[(p00-x0):(p01-x0),(p10-y0):(p11-y0)] = tile
        
        if self.reduce != 1:
            img = cv2.resize(img,(self.sz//self.reduce,self.sz//self.reduce),
                             interpolation = cv2.INTER_AREA)
        # check for empty images; only the saturation channel of the HSV
        # conversion is used below
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        h,s,v = cv2.split(hsv)
        # a tile is flagged as empty when too few pixels exceed the saturation
        # threshold or when its overall intensity sum is at most p_th
        if (s>s_th).sum() <= p_th or img.sum() <= p_th:
            #images with -1 will be skipped
            return img2tensor((img/255.0 - mean)/std), -1, x0, y0, p00, p01, p10, p11
        else: return img2tensor((img/255.0 - mean)/std), idx, x0, y0, p00, p01, p10, p11

In [None]:
#iterator like wrapper that returns predicted masks
class Model_pred:
    """Iterate over a tile DataLoader and yield ensemble predictions per tile.

    Tiles whose id is negative (flagged as empty by the dataset) are skipped.
    Every yielded item is (prediction, tile_id, x0, y0, p00, p01, p10, p11),
    where `prediction` is a channel-last half-precision CPU tensor holding the
    sigmoid output averaged over all models (and over the flip TTA variants
    when `tta` is enabled).
    """
    def __init__(self, models, dl, reduce, tta:bool=True):
        self.models, self.dl = models, dl
        self.reduce, self.tta = reduce, tta

    def __iter__(self):
        #x,y,xy flips as TTA
        flips = [[-1],[-2],[-2,-1]]
        with torch.no_grad():
            for x, y, x0, y0, p00, p01, p10, p11 in self.dl:
                keep = y >= 0
                if not keep.any():
                    # the whole batch consists of empty tiles
                    continue
                x = x[keep].to(device)
                x0, y0 = x0[keep], y0[keep]
                p00, p01 = p00[keep], p01[keep]
                p10, p11 = p10[keep], p11[keep]
                y = y[keep]

                with autocast(enabled=True):
                    # sum of sigmoid outputs, accumulated in place
                    py = None
                    for model in self.models:
                        probs = torch.sigmoid(model(x)).detach()
                        if py is None:
                            py = probs
                        else:
                            py += probs
                    if self.tta:
                        for dims in flips:
                            flipped = torch.flip(x, dims)
                            for model in self.models:
                                # undo the flip on the logits before the sigmoid
                                logits = torch.flip(model(flipped), dims)
                                py += torch.sigmoid(logits).detach()
                        py /= (1+len(flips))
                    py /= len(self.models)

                # predictions stay at the reduced tile resolution (no upsampling here)
                py = py.permute(0,2,3,1).half().cpu()

                for item in zip(py, y, x0, y0, p00, p01, p10, p11):
                    yield item

    def __len__(self):
        """Number of tiles in the underlying dataset (empty tiles included)."""
        return len(self.dl.dataset)

# Model

In [None]:
class FReLU(nn.Module):
    """FReLU activation: element-wise max of the input and BN(depthwise conv(input)).

    Args:
        c1: number of input (and output) channels.
        k: kernel size of the depthwise convolution (int or tuple). The
            padding is derived from it (k // 2), so every odd kernel size
            keeps the spatial size of the input; k=3 gives the same
            padding of 1 as before.
    """
    def __init__(self, c1, k=3):  # ch_in, kernel
        super().__init__()
        # a fixed padding of 1 only preserves the spatial size for k=3;
        # other kernel sizes would make torch.max fail on mismatched shapes
        if isinstance(k, int):
            padding = k // 2
        else:
            padding = tuple(side // 2 for side in k)
        self.conv = nn.Conv2d(c1, c1, k, 1, padding, groups=c1, bias=False)
        self.bn = nn.BatchNorm2d(c1)

    def forward(self, x):
        """Return max(x, bn(conv(x))), same shape as `x`."""
        return torch.max(x, self.bn(self.conv(x)))

class Conv2dAct(nn.Sequential):
    """Conv2d followed by BatchNorm2d (or Identity) and an activation.

    The convolution only carries a bias when batch norm is disabled.
    `act_layer='frelu'` selects the local FReLU module; any other name is
    resolved through timm's `create_act_layer`.
    """
    def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            padding=0,
            stride=1,
            use_batchnorm=True,
            act_layer='relu'
    ):
        convolution = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=not use_batchnorm,
        )
        activation = FReLU(out_channels) if act_layer == 'frelu' else create_act_layer(act_layer)
        norm = nn.BatchNorm2d(out_channels) if use_batchnorm else nn.Identity()
        # sequential order: conv -> norm -> activation
        super().__init__(convolution, norm, activation)


class DecoderBlockV2(nn.Module):
    """Decoder stage: 2x nearest upsampling, optional skip concat, two 3x3 Conv2dAct layers.

    `attention1` is applied to the concatenated tensor only when a skip
    connection is given; `attention2` is always applied to the output.
    """
    def __init__(
            self,
            in_channels,
            skip_channels,
            out_channels,
            use_batchnorm=True,
            act_layer='relu',
            attention_type=None,
    ):
        super().__init__()
        merged_channels = in_channels + skip_channels
        # both convolutions share everything except their input width
        conv_cfg = dict(kernel_size=3, padding=1, use_batchnorm=use_batchnorm, act_layer=act_layer)
        self.conv1 = Conv2dAct(merged_channels, out_channels, **conv_cfg)
        self.attention1 = md.Attention(attention_type, in_channels=merged_channels)
        self.conv2 = Conv2dAct(out_channels, out_channels, **conv_cfg)
        self.attention2 = md.Attention(attention_type, in_channels=out_channels)

    def forward(self, x, skip=None):
        """Upsample `x` by 2, merge `skip` along the channel axis if given, then convolve."""
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        if skip is not None:
            x = self.attention1(torch.cat([x, skip], dim=1))
        return self.attention2(self.conv2(self.conv1(x)))


class UnetDecoderV2(nn.Module):
    """U-Net style decoder built from DecoderBlockV2 stages.

    Args:
        encoder_channels: channel counts of the encoder features, shallowest
            first; the first entry is discarded.
        decoder_channels: output channels of each decoder block; its length
            must equal `n_blocks`.
        n_blocks: number of decoder blocks.
        use_batchnorm, act_layer, attention_type: forwarded to every block.

    Raises:
        ValueError: if `n_blocks` differs from `len(decoder_channels)`.
    """
    def __init__(
            self,
            encoder_channels,
            decoder_channels,
            n_blocks=5,
            use_batchnorm=True,
            act_layer='relu',
            attention_type=None,
    ):
        super().__init__()

        if n_blocks != len(decoder_channels):
            raise ValueError(
                "Model depth is {}, but you provide `decoder_channels` for {} blocks.".format(
                    n_blocks, len(decoder_channels)
                )
            )

        # drop the first skip (same spatial resolution as the input) and
        # reverse so that the deepest encoder feature comes first
        deepest_first = encoder_channels[1:][::-1]

        # per-block input, skip and output channel counts
        head_channels = deepest_first[0]
        block_inputs = [head_channels, *decoder_channels[:-1]]
        block_skips = [*deepest_first[1:], 0]  # the last block has no skip

        self.center = nn.Identity()

        block_cfg = dict(use_batchnorm=use_batchnorm, act_layer=act_layer, attention_type=attention_type)
        self.blocks = nn.ModuleList()
        for n_in, n_skip, n_out in zip(block_inputs, block_skips, decoder_channels):
            self.blocks.append(DecoderBlockV2(n_in, n_skip, n_out, **block_cfg))

    def forward(self, *features):
        """Decode encoder features (shallowest first) into one feature map."""
        # same preparation as in __init__: drop the first feature, deepest first
        deepest_first = features[1:][::-1]
        head = deepest_first[0]
        skips = deepest_first[1:]

        x = self.center(head)
        for level, block in enumerate(self.blocks):
            # blocks beyond the available skips run without one
            x = block(x, skips[level] if level < len(skips) else None)

        return x

In [None]:
class CSPDarkNet53Encoder(nn.Module):
    """Feature extractor wrapping the stem and the five stages of timm's `cspdarknet53`.

    `forward` returns the output of the stem followed by the output of every
    stage, i.e. `depth + 1` feature maps; `out_channels` lists the channel
    count declared for each of them.
    """
    def __init__(self, pretrain=True, act_layer='leaky_relu'):
        super().__init__()
        backbone = timm.create_model('cspdarknet53', pretrained=pretrain, act_layer=get_act_layer(act_layer))
        self.stem = backbone.stem
        # register the stages as layer0 .. layer4
        for level in range(5):
            setattr(self, f'layer{level}', backbone.stages[level])
        del backbone

        self.depth = 5
        self.out_channels = (32, 64, 128, 256, 512, 1024)
        self.in_channels = 3

    def get_stages(self):
        """Return the stem and layer0..layer4 in execution order."""
        return [self.stem] + [getattr(self, f'layer{level}') for level in range(5)]

    def forward(self, x):
        """Run `x` through the first `depth + 1` stages, collecting every intermediate output."""
        stages = self.get_stages()

        collected = []
        for level in range(self.depth + 1):
            x = stages[level](x)
            collected.append(x)

        return collected

class CSPDarkNet53FPN(smp.base.SegmentationModel):
    """FPN segmentation model on top of CSPDarkNet53Encoder.

    The encoder is built with its default activation. `in_channels` is
    accepted but not forwarded to the encoder. An auxiliary classification
    head is attached only when `aux_params` is given.
    """
    def __init__(
        self,
        pretrain=True,
        decoder_pyramid_channels: int = 256,
        decoder_segmentation_channels: int = 128,
        decoder_merge_policy: str = "add",
        decoder_dropout: float = 0.2,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        upsampling: int = 4,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        encoder = CSPDarkNet53Encoder(pretrain)
        self.encoder = encoder

        decoder = smp.fpn.decoder.FPNDecoder(
            encoder_channels=encoder.out_channels,
            encoder_depth=encoder.depth,
            pyramid_channels=decoder_pyramid_channels,
            segmentation_channels=decoder_segmentation_channels,
            dropout=decoder_dropout,
            merge_policy=decoder_merge_policy,
        )
        self.decoder = decoder

        self.segmentation_head = smp.base.SegmentationHead(
            in_channels=decoder.out_channels,
            out_channels=classes,
            activation=activation,
            kernel_size=1,
            upsampling=upsampling,
        )

        # optional classification head fed by the deepest encoder feature
        self.classification_head = (
            None
            if aux_params is None
            else smp.base.ClassificationHead(in_channels=encoder.out_channels[-1], **aux_params)
        )

        self.name = "fpn-cspdarknet53"
        self.initialize()


class CSPDarkNet53Unet(smp.base.SegmentationModel):
    """U-Net with the CSPDarkNet53 encoder and the stock smp U-Net decoder.

    `act_layer` selects the encoder activation. `in_channels` is accepted but
    not forwarded to the encoder. An auxiliary classification head is
    attached only when `aux_params` is given.
    """
    def __init__(
        self,
        pretrain=True,
        act_layer='leaky_relu',
        decoder_use_batchnorm: bool = True,
        decoder_channels: List[int] = (256, 128, 64, 32, 16),
        decoder_attention_type: Optional[str] = None,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        aux_params: Optional[dict] = None,
        
    ):
        super().__init__()

        encoder = CSPDarkNet53Encoder(pretrain, act_layer)
        self.encoder = encoder

        self.decoder = smp.unet.decoder.UnetDecoder(
            encoder_channels=encoder.out_channels,
            decoder_channels=decoder_channels,
            n_blocks=encoder.depth,
            use_batchnorm=decoder_use_batchnorm,
            center=False,
            attention_type=decoder_attention_type
        )

        # the head consumes the output of the last decoder block
        self.segmentation_head = smp.base.SegmentationHead(
            in_channels=decoder_channels[-1],
            out_channels=classes,
            activation=activation,
            kernel_size=3,
        )

        # optional classification head fed by the deepest encoder feature
        self.classification_head = (
            None
            if aux_params is None
            else smp.base.ClassificationHead(in_channels=encoder.out_channels[-1], **aux_params)
        )

        self.name = "unet-cspdarknet53"
        self.initialize()
        
class CSPDarkNet53UnetV2(smp.base.SegmentationModel):
    """U-Net with the CSPDarkNet53 encoder and the local UnetDecoderV2.

    `act_layer` selects the encoder activation and `decoder_act_layer` the
    decoder one. `in_channels` is accepted but not forwarded to the encoder.
    An auxiliary classification head is attached only when `aux_params` is
    given.
    """
    def __init__(
        self,
        pretrain=True,
        act_layer='leaky_relu',
        decoder_use_batchnorm: bool = True,
        decoder_channels: List[int] = (256, 128, 64, 32, 16),
        decoder_act_layer: str = 'relu',
        decoder_attention_type: Optional[str] = None,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        aux_params: Optional[dict] = None,
        
    ):
        super().__init__()

        encoder = CSPDarkNet53Encoder(pretrain, act_layer)
        self.encoder = encoder

        self.decoder = UnetDecoderV2(
            encoder_channels=encoder.out_channels,
            decoder_channels=decoder_channels,
            n_blocks=encoder.depth,
            use_batchnorm=decoder_use_batchnorm,
            act_layer=decoder_act_layer,
            attention_type=decoder_attention_type
        )

        # the head consumes the output of the last decoder block
        self.segmentation_head = smp.base.SegmentationHead(
            in_channels=decoder_channels[-1],
            out_channels=classes,
            activation=activation,
            kernel_size=3,
        )

        # optional classification head fed by the deepest encoder feature
        self.classification_head = (
            None
            if aux_params is None
            else smp.base.ClassificationHead(in_channels=encoder.out_channels[-1], **aux_params)
        )

        self.name = "unetv2-cspdarknet53"
        self.initialize()

In [None]:
# Lightning model
class PlModel(pl.LightningModule):
    """Lightning wrapper around an smp U-Net with a `timm-efficientnet-b3` encoder.

    Only what inference needs is defined: the network is created without
    encoder weights and no criterion is set.
    """
    def __init__(self, learning_rate = 1e-3):
        super().__init__()
        self.learning_rate = learning_rate
        self.criterion = None
        self.model = smp.Unet('timm-efficientnet-b3', decoder_attention_type = None, encoder_weights = None)

    def forward(self, x):
        """Return the output of the wrapped U-Net for `x`."""
        prediction = self.model(x)
        return prediction

In [None]:
# Ensemble members; the cells below append to this list.
models = []
B3_MODELS = [
    '../input/nfl-submissions/Mar291351_0_epoch20_val_metric0.940.ckpt',
    '../input/nfl-submissions/Mar291351_1_epoch20_val_metric0.940.ckpt',
    '../input/nfl-submissions/Mar291351_2_epoch18_val_metric0.947.ckpt',
    '../input/nfl-submissions/Mar291351_3_epoch20_val_metric0.939.ckpt',
]

for path in B3_MODELS:
    # load_from_checkpoint is a classmethod that builds the module itself, so
    # call it on the class instead of constructing a throwaway PlModel() first.
    # map_location lets checkpoints saved on GPU load on a CPU-only machine.
    model = PlModel.load_from_checkpoint(path, map_location=device)
    model.float()
    model.eval()
    model.to(device)
    models.append(model)

In [None]:
# (checkpoint path, decoder activation) pairs for the CSPDarkNet53 U-Net V2 models
D53_MODELS = [
    ('../input/hubmap-trained-models/unetv2-d53_sf-672-pseudo5-external_s2.pth', 'frelu'),
#     ('../input/hubmap-trained-models/unetv2-d53_sf-672-pseudo5-external_f1_s2.pth', 'frelu'),
    ('../input/hubmap-trained-models/unetv2-d53_sf_valid_8242609fa_best.pth', 'frelu'),
    ('../input/hubmap-trained-models/unetv2-d53_sf_valid_b2dc8411c_best.pth', 'frelu'),
    ('../input/hubmap-trained-models/unetv2-d53_sr_valid_4ef6695ce_best.pth', 'relu'),
    ('../input/hubmap-trained-models/unetv2-d53_sf_valid_2f6ecfcdf_best.pth', 'frelu')
]

for path, act in D53_MODELS:
    # weights are first loaded on the CPU and moved with the model afterwards
    state_dict = torch.load(path,map_location=torch.device('cpu'))
    # no hard-coded .cuda(): the model is placed on `device` below, which
    # falls back to the CPU when no GPU is available
    model = CSPDarkNet53UnetV2(
        pretrain=False,
        act_layer='swish',
        decoder_act_layer=act,
        in_channels=3, 
        classes=1)
    model.load_state_dict(state_dict['model'])
    # released inside the loop so an empty D53_MODELS list cannot hit an undefined name
    del state_dict
    model.float()
    model.eval()
    model.to(device)
    models.append(model)

In [None]:
# Sanity check: size of the ensemble collected in `models`
print(f'Total number of trained models {len(models)}')

# Prediction

In [None]:
def get_gauss_filter(half):
    """Build a (2*half, 2*half) weight map that grows linearly away from the border.

    Despite the name the profile is a pyramid, not a Gaussian: the weight of
    a pixel is the smaller of its two per-axis ramps, scaled so that the
    maximum is 1. The first row and the first column are 0.
    """
    # 1-D ramp: 0 at offset -half, `half` at offset 0, falling to 1 at offset half-1
    ramp = half - np.abs(np.arange(-half, half))
    w = np.minimum(ramp[:, None], ramp[None, :])
    w = w / w.max()
    # clip at 1 (a no-op after the normalization above)
    return np.minimum(w, 1)

In [None]:
import skimage.measure as ms

def reduce_fp(predict, shape, s, t=0.4):
    """Binarize a probability map and drop low-confidence connected components.

    Args:
        predict: 2-D array of per-pixel probabilities.
        shape: output size handed to cv2.resize.
        s: a connected component is removed when its highest probability
            is below `s`.
        t: pixels with probability above `t` are foreground before labelling.

    Returns:
        uint8 mask with values 0/1, resized to `shape` with nearest-neighbour
        interpolation.
    """
    mask = ms.label((predict > t).astype(np.uint8))

    # Peak probability of every component. Each lookup is restricted to the
    # component's bounding box, so the full image is not rescanned per label.
    regions = ms.regionprops(mask)
    l_probs = np.array([predict[region.slice][region.image].max() for region in regions])
    labels = np.array([region.label for region in regions], dtype=mask.dtype)

    # labels whose peak probability does not reach s are cleared
    mask[np.isin(mask, labels[l_probs < s])] = 0
    return cv2.resize((mask > 0).astype(np.uint8), shape, interpolation=cv2.INTER_NEAREST)

In [None]:
# Blend weights for one tile (same size as a reduced tile), in half precision
half = sz//2
w = get_gauss_filter(half).astype(np.half)

names,preds = [],[]
for _, row in tqdm(df_sample.iterrows(),total=len(df_sample)):
    idx = row['id']
    ds = HuBMAPDataset(idx, sz=sz, step=step, reduce=reduce)
    #rasterio cannot be used with multiple workers
    dl = DataLoader(ds,bs,num_workers=0,shuffle=False,pin_memory=True)
    mp = Model_pred(models,dl,reduce)

    # weighted sum of tile predictions and sum of weights, at reduced resolution
    predict = np.zeros((ds.shape[0]//reduce, ds.shape[1]//reduce), dtype=np.half)
    count = np.zeros_like(predict)

    #generate masks
    for p, _, x0, y0, p00, p01, p10, p11 in mp:
        tile = p.squeeze(-1).numpy()
        # bring the full-resolution coordinates to the reduced prediction grid
        x0, y0, p00, p01, p10, p11 = (int(v)//reduce for v in (x0, y0, p00, p01, p10, p11))
        # part of the tile that lies inside the image
        rows, cols = slice(p00-x0, p01-x0), slice(p10-y0, p11-y0)
        weights = w[rows, cols]
        predict[p00:p01,p10:p11] += tile[rows, cols] * weights
        count[p00:p01,p10:p11] += weights

    # normalize only where at least one tile contributed
    m = (count != 0)
    predict[m] /= count[m]

    del count
    gc.collect()

    # all tiles have been read: release the rasterio handles of this image
    for layer in getattr(ds, 'layers', []):
        layer.close()
    ds.data.close()

    #convert to rle
    #https://www.kaggle.com/bguberfain/memory-aware-rle-encoding
    predict = reduce_fp(predict, (ds.shape[1],ds.shape[0]), S_TH, t=TH)
    rle = rle_encode_less_memory(predict)
    names.append(idx)
    preds.append(rle)
    # `tile` is deliberately not deleted here: it is never bound for an image
    # without any non-empty tile, and deleting it would raise a NameError
    del predict, mp, dl, ds
    gc.collect()

In [None]:
# Assemble the submission table (image id and its RLE string) and write it without the index column
df = pd.DataFrame({'id':names,'predicted':preds})
df.to_csv('submission.csv',index=False)