# HuBMAP - Efficient Sampling Baseline (deepflash2, pytorch, fastai) [sub]

> Submission kernel for model trained with efficient region based sampling. 

- Train Notebook: https://www.kaggle.com/matjes/hubmap-efficient-sampling-deepflash2-train
- Sampling Notebook: https://www.kaggle.com/matjes/hubmap-labels-pdf-0-5-0-25-0-01

Requires deepflash2 (git version), zarr, and segmentation-models-pytorch


## Overview

1. Installation and package loading
2. Helper functions and patches
3. Configuration
4. Prediction
5. Submission

### Versions
- V12: Minor changes in the deepflash2 API to support albumentations (changes `apply` in `DeformationField` slightly, see patch below)
- V13: Adding prediction threshold 0.4

### Installation and package loading

In [None]:
# Install deepflash2 and dependencies
import sys
sys.path.append("../input/zarrkaggleinstall")
sys.path.append("../input/segmentation-models-pytorch-install")
!pip install -q --no-deps ../input/deepflash2-lfs
import cv2, torch, zarr, tifffile, pandas as pd, gc
from fastai.vision.all import *
from deepflash2.all import *
import segmentation_models_pytorch as smp

In [None]:
from geojson import Point, Feature, FeatureCollection, dump
import shapely.wkt
import shapely.geometry
from shapely.geometry import MultiPolygon, Polygon
import matplotlib.pyplot as plt
import cv2, torch, tifffile, pandas as pd, gc
from fastai.vision.all import *

In [None]:
import matplotlib.pyplot as plt

In [None]:
import rasterio

### Helper functions and patches

In [None]:
#https://www.kaggle.com/bguberfain/memory-aware-rle-encoding
#with transposed mask
def rle_encode_less_memory(img):
    """Run-length encode a binary mask as a space-separated string.

    The mask is transposed and flattened, so runs are counted along the
    columns of ``img``. The result alternates ``start length`` pairs, with
    1-based start positions.

    Args:
        img: 2D numpy array holding the mask (zero = background).

    Returns:
        The encoded string; an empty string when the mask has no foreground
        pixels or contains no pixels at all.

    Note:
        The first and last pixel of the flattened mask are forced to zero,
        because the run detection below only works when the sequence starts
        and ends with background.
    """
    # flatten() returns a copy, so the caller's array is never modified.
    pixels = img.T.flatten()

    # An empty mask has no first/last pixel to reset and nothing to encode.
    if pixels.size == 0:
        return ''

    # This simplified method requires first and last pixel to be zero
    pixels[0] = 0
    pixels[-1] = 0

    # Positions (1-based) where the value changes: run starts and run ends.
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 2
    # Turn every "end" position into a run length.
    runs[1::2] -= runs[::2]

    return ' '.join(map(str, runs))

def load_model_weights(model, file, strict=True):
    """Restore model weights from a checkpoint file.

    The checkpoint is loaded onto the CPU and must be a mapping with the
    keys ``'model'`` (a state dict) and ``'stats'``.

    Args:
        model: module that receives the stored state dict.
        file: checkpoint location handed to ``torch.load``.
        strict: forwarded to ``model.load_state_dict``.

    Returns:
        Tuple ``(model, stats)``: the same model object and the ``'stats'``
        entry of the checkpoint.
    """
    checkpoint = torch.load(file, map_location='cpu')
    # Read both entries up front so a malformed checkpoint fails before the
    # model is touched.
    norm_stats, weights = checkpoint['stats'], checkpoint['model']
    model.load_state_dict(weights, strict=strict)
    return model, norm_stats

Patches for deepflash2 classes, see https://fastcore.fast.ai/basics.html#patch

In [None]:
# https://matjesg.github.io/deepflash2/data.html#BaseDataset
# Handling of different input shapes
# @patch
# def read_img(self:BaseDataset, *args, **kwargs):
#     image = tifffile.imread(args[0])
#     if len(image.shape) == 5:
#         image = image.squeeze().transpose(1, 2, 0)
#     elif image.shape[0] == 3:
#         image = image.transpose(1, 2, 0)
#     return image
# Identity transform handed to rasterio when opening the image files.
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)
@patch
def read_img(self:BaseDataset, *args, **kwargs):
    """Read an image file with rasterio and return it channels-last.

    Args:
        *args: ``args[0]`` is the path of the image file; further positional
            arguments are ignored.
        **kwargs: accepted for interface compatibility and ignored.

    Returns:
        Array of shape ``(height, width, 3)``. For a 3-band file the bands
        are read directly; otherwise the first band of every subdataset is
        copied into one ``uint8`` channel each.

    Raises:
        ValueError: if the file has neither exactly 3 bands nor any
            subdatasets, so no image can be assembled.
    """
    # Context managers make sure every dataset handle is closed again.
    with rasterio.open(args[0], transform = identity, num_threads='all_cpus') as data:
        if data.count == 3:
            # Bands come back channels-first; move them to the last axis.
            return np.moveaxis(data.read([1, 2, 3]), 0, -1)

        subdatasets = data.subdatasets
        if not subdatasets:
            raise ValueError(
                f'Cannot read {args[0]}: expected 3 bands or subdatasets, '
                f'found {data.count} band(s) and no subdatasets'
            )

        # Channels are stored as separate subdatasets: stack them manually.
        image = np.zeros((data.shape[0], data.shape[1], 3), np.uint8)
        for i, subdataset in enumerate(subdatasets):
            with rasterio.open(subdataset) as layer:
                image[:, :, i] = layer.read(1)

    return image

# https://matjesg.github.io/deepflash2/data.html#DeformationField
# Adding normalization (divide by 255)
@patch
def apply(self:DeformationField, data, offset=(0, 0), pad=(0, 0), order=1):
    """Apply deformation field to image using interpolation

    Args:
        data: array-like image indexed as ``data[rows, cols]`` or
            ``data[rows, cols, channel]``; only the region needed for this
            tile is sliced out.
        offset: forwarded to ``self.get``.
        pad: subtracted from ``self.shape`` to obtain the output tile shape
            and forwarded to ``self.get``.
        order: passed to ``cv2.remap`` as its ``interpolation`` flag.

    Returns:
        The remapped tile. Multi-channel input is divided by 255 per channel
        before remapping; the single-channel branch is NOT rescaled.
    """
    # Output tile size: field shape reduced by the padding on each axis.
    outshape = tuple(int(s - p) for (s, p) in zip(self.shape, pad))
    # One float32 coordinate map per axis, each reshaped to the tile shape.
    coords = [np.squeeze(d).astype('float32').reshape(*outshape) for d in self.get(offset, pad)]
    # Get slices to avoid loading all data (.zarr files)
    sl = []
    for i in range(len(coords)):
        cmin, cmax = int(coords[i].min()), int(coords[i].max())
        dmax = data.shape[i]
        if cmin<0: 
            # Coordinates reach below 0: slice starts at 0 and is extended to
            # -cmin if that is larger (presumably to cover the reflected
            # region - TODO confirm). Coordinates stay unshifted here.
            cmax = max(-cmin, cmax)
            cmin = 0 
        elif cmax>dmax:
            # Coordinates reach past the end: clamp the slice end to the data
            # size, move the start down to at most 2*dmax-cmax, then shift
            # the coordinates so they are relative to the slice start.
            cmin = min(cmin, 2*dmax-cmax)
            cmax = dmax
            coords[i] -= cmin
        # Fully inside: only make the coordinates relative to the slice start.
        else: coords[i] -= cmin
        sl.append(slice(cmin, cmax))    
    # Data with one more dimension than the field carries a trailing channel axis.
    if len(data.shape) == len(self.shape) + 1:
        
        ## Channel order change in V12
        tile = np.empty((*outshape, data.shape[-1]))
        for c in range(data.shape[-1]):
            # Adding divide
            # Each channel is scaled by 1/255 and remapped on its own;
            # coords[1] is passed as the first map and coords[0] as the second.
            tile[..., c] = cv2.remap(data[sl[0],sl[1], c]/255, coords[1],coords[0], interpolation=order, borderMode=cv2.BORDER_REFLECT)
    else:
        # Single-channel data is remapped as-is (no division by 255).
        tile = cv2.remap(data[sl[0], sl[1]], coords[1], coords[0], interpolation=order, borderMode=cv2.BORDER_REFLECT)
    return tile

### Configuration

In [None]:
class CONFIG:
    """Static settings for the submission run: paths, tiling, model, inference."""

    # --- Data paths ---------------------------------------------------------
    data_path = Path('../input/hubmap-kidney-segmentation')
    # Checkpoint to load (unet_efficientnet-b4_1); a previously used
    # alternative was ../input/models/unet_efficientnet-b0_newlabel_eff0_1.pth
    model_file = '../input/models/unet_efficientnet-b4_1.pth'

    # --- deepflash2 dataset -------------------------------------------------
    # See https://matjesg.github.io/deepflash2/data.html#TileDataset
    scale = 1.5  # zoom factor (zoom out); 3 was used before
    tile_shape = (512, 512)
    padding = (100, 100)  # border overlap for prediction

    # --- PyTorch model ------------------------------------------------------
    # See https://github.com/qubvel/segmentation_models.pytorch
    encoder_name = "efficientnet-b4"
    encoder_weights = None
    in_channels = 3
    classes = 2

    # --- Dataloader ---------------------------------------------------------
    batch_size = 16

    # --- Prediction threshold -----------------------------------------------
    threshold = 0.5


cfg = CONFIG()

In [None]:
# The sample submission provides the ids of the images to predict.
df_sample = pd.read_csv(cfg.data_path / 'sample_submission.csv', index_col='id')

# Build the U-Net (see https://github.com/qubvel/segmentation_models.pytorch)
model = smp.Unet(
    encoder_name=cfg.encoder_name,
    encoder_weights=cfg.encoder_weights,
    in_channels=cfg.in_channels,
    classes=cfg.classes,
)
# Swap the first layer of the segmentation head for a single-output-channel
# convolution before the checkpoint is loaded.
model.segmentation_head[0] = nn.Conv2d(16, 1, kernel_size=3, stride=1, padding=1)

# Load the weights; the stats stored in the checkpoint drive the
# normalization that is applied to every batch.
model, stats = load_model_weights(model, cfg.model_file)
batch_tfms = [Normalize.from_stats(*stats)]

### Prediction

In [None]:
import deepflash2.tta as tta

In [None]:
# Cell
@patch
def predict_tiles(self:Learner, ds_idx=1, dl=None, path=None, mc_dropout=False, n_times=1, use_tta=False,
                       tta_merge='mean', tta_tfms=None, uncertainty_estimates=True, energy_T=1):
    """Make predictions and reconstruct tiles, optional with dropout and/or tta applied.

    Args:
        ds_idx: index into ``self.dls`` used when ``dl`` is not given.
        dl: dataloader whose dataset must be a ``TileDataset``.
        path: location of the zarr store; a zarr ``TempStore`` is used when falsy.
        mc_dropout: if true, ``self.apply_dropout()`` is called after ``eval()``.
        n_times: number of forward passes per test-time transform.
        use_tta: enable test-time augmentation.
        tta_merge: accepted but not used in this function body.
        tta_tfms: transforms used for TTA; defaults to a horizontal flip plus
            rotations by 90/180/270 degrees.
        uncertainty_estimates: also compute and store std and energy outputs.
        energy_T: temperature used in the energy score.

    Returns:
        Tuple of the four zarr groups ``(smx, seg, std, energy)``. The std and
        energy arrays are always created but only written when
        ``uncertainty_estimates`` is true.
    """

    # Default to an unshuffled copy of the selected dataloader that keeps the last batch.
    if dl is None: dl = self.dls[ds_idx].new(shuffled=False, drop_last=False)
    assert isinstance(dl.dataset, TileDataset), "Provide dataloader containing a TileDataset"
    if use_tta: tfms = tta_tfms or [tta.HorizontalFlip(), tta.Rotate90(angles=[90,180,270])]
    else: tfms=[]

    self.model.eval()
    if mc_dropout: self.apply_dropout()

    # The store is overwritten on every call, so earlier results at `path` are discarded.
    store = str(path) if path else zarr.storage.TempStore()
    root = zarr.group(store=store, overwrite=True)
    g_smx, g_seg, g_std, g_eng  = root.create_groups('smx', 'seg', 'std', 'energy')

    # i is the running index of the first tile in the current batch.
    i = 0
    last_file = None
    for data in progress_bar(dl, leave=False):
        # Batches are either a bare TensorImage or a 3-tuple whose first item is the images.
        if isinstance(data, TensorImage): images = data
        else: images, _, _ = data
        # Fresh mergers per batch collect the outputs of all transforms/passes.
        m_smx = tta.Merger()
        m_energy = tta.Merger()
        # NOTE(review): out_list_smx is never used below.
        out_list_smx = []
        for t in tta.Compose(tfms):
            for _ in range(n_times):
                aug_images = t.augment_image(images)
                with torch.no_grad():
                    out = self.model(aug_images)
                out = t.deaugment_mask(out)
                # Pad the output when its size difference to the input does not
                # match the dataloader padding.
                if dl.padding[0]!= images.shape[-1]-out.shape[-1]:
                    padding = ((images.shape[-1]-out.shape[-1]-dl.padding[0])//2,)*4
                    out = F.pad(out, padding)
                m_smx.append(sigmoid(out,))#change softmax(out,) to sigmoid(out,)
                if uncertainty_estimates:
                    e = (energy_T*torch.logsumexp(out/energy_T, dim=1)) #negative energy score
                    m_energy.append(e)

        # Collect per-tile numpy results: probabilities (channels-last) and,
        # optionally, channel-averaged std and energy.
        ll = []
        ll.append([x for x in m_smx.result().permute(0,2,3,1).cpu().numpy()])
        if uncertainty_estimates:
            ll.append([x for x in torch.mean(m_smx.result('std'), 1).cpu().numpy()])
            ll.append([x for x in m_energy.result().cpu().numpy()])
        for j, preds in enumerate(zip(*ll)):
            if len(preds)==3: smx,std,eng = preds
            else: smx = preds[0]
            # Global tile index -> source file and the slices mapping the tile into it.
            idx = i+j
            f = dl.files[dl.image_indices[idx]]
            outShape = dl.image_shapes[idx]
            outSlice = dl.out_slices[idx]
            inSlice = dl.in_slices[idx]
            # Allocate the output arrays whenever the tiles switch to a new file.
            if last_file!=f:
                z_smx = g_smx.zeros(f.name, shape=(*outShape, dl.c), dtype='float32')
                z_seg = g_seg.zeros(f.name, shape=outShape, dtype='uint8')
                z_std = g_std.zeros(f.name, shape=outShape, dtype='float32')
                z_eng = g_eng.zeros(f.name, shape=outShape, dtype='float32')
                last_file = f
            z_smx[outSlice] = smx[inSlice]
            # Segmentation is the argmax over the last (channel) axis.
            z_seg[outSlice] = np.argmax(smx, axis=-1)[inSlice]
            if uncertainty_estimates:
                z_std[outSlice] = std[inSlice]
                z_eng[outSlice] = eng[inSlice]
        i += dl.bs

    return g_smx, g_seg, g_std, g_eng


In [None]:
# Collected image ids and their run-length encoded masks.
names, preds = [], []

for idx in df_sample.index:
    print(f'###### File {idx} ######')
    f = cfg.data_path / 'test' / f'{idx}.tiff'

    # Build the deepflash2 dataset (handles tiling and file conversion).
    ds = TileDataset([f], scale=cfg.scale, tile_shape=cfg.tile_shape, padding=cfg.padding)
    shape = ds.data[f.name].shape
    print('Shape:', shape)

    # Wrap the dataset in fastai dataloaders and a learner.
    dls = DataLoaders.from_dsets(
        ds,
        batch_size=cfg.batch_size,
        after_batch=batch_tfms,
        shuffle=False,
        drop_last=False,
    )
    if torch.cuda.is_available():
        dls.cuda()
        model.cuda()
    learn = Learner(dls, model, loss_func='')

    # Tile-wise prediction, see
    # https://matjesg.github.io/deepflash2/learner.html#Learner.predict_tiles
    print('Prediction')
    dl_test = dls.train
    dl_test.c = 1  # single output channel
    res = learn.predict_tiles(
        dl=dl_test,
        path='/kaggle/temp/',
        use_tta=False,
        uncertainty_estimates=False,
    )

    th = cfg.threshold

    # Binarize the predicted probability map; [:, :, 0] drops the channel
    # axis, e.g. (7996, 15907, 1) -> (7996, 15907).
    msk = (res[0][f.name][:, :, 0] >= th).astype(np.uint8)
    print('Rezising')
    # Scale the mask back to the shape of the stored image data.
    msk = cv2.resize(msk, (shape[1], shape[0]))
    rle = rle_encode_less_memory(msk)
    names.append(idx)
    preds.append(rle)

    # Show a downsized preview of the mask.
    print('Plotting')
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.imshow(cv2.resize(msk, (1024, 1024)))
    plt.show()

    # Clear the prediction store and leftover zarr temp data to limit disk usage.
    for p in Path('/kaggle/temp/').iterdir():
        shutil.rmtree(p, ignore_errors=True)
    for p in Path('/tmp/').iterdir():
        if p.name.startswith('zarr'):
            shutil.rmtree(p, ignore_errors=True)

In [None]:
# Collect the predictions in a frame indexed by image id.
df = pd.DataFrame({'id':names,'predicted':preds}).set_index('id')
# Overwrite the matching rows of the sample submission with the predicted encodings.
df_sample.loc[df.index.values] = df.values  
# Write the submission file (the index is written as the id column) and show it.
df_sample.to_csv('submission.csv')
display(df_sample)

# Substitute my JSON

In [None]:
# def polygons_to_mask(polygons):
#     img_mask = np.zeros(im_size, np.uint8)
#     if not polygons:
#         return img_mask
#     int_coords = lambda x: np.array(x).round().astype(np.int64)#int32
#     exteriors = [int_coords(poly.exterior.coords) for poly in polygons]
#     interiors = [int_coords(pi.coords) for poly in polygons
#                  for pi in poly.interiors]
#     cv2.fillPoly(img_mask, exteriors, 1)
#     cv2.fillPoly(img_mask, interiors, 0)
#     return img_mask

In [None]:
# list_test = ["d488c759a"]

In [None]:
# df_info = pd.read_csv('../input/hubmap-kidney-segmentation/HuBMAP-20-dataset_information.csv')

In [None]:
# b = df_info.height_pixels.tolist()
# a = df_info.width_pixels.tolist()

In [None]:
# names2 = df_info.image_file.tolist()

In [None]:
# list_name = []
# for name in names2:
#     name = name[:9]
#     list_name.append(name)

In [None]:
# shapes = []
# for x,y in zip(b,a):
#     c = (x,y)
#     shapes.append(c)

# dic_shape2 = dict(zip(list_name,shapes))

In [None]:
# for idx in list_test:
#     with open(f'../input/handmodify1/modified02.json') as f:
#       shapes = json.load(f)
#     all_pol = []
#     print(idx)
#     for shape in shapes:
#         shape = shape['geometry']['coordinates']
#         shape_numpy = np.array(shape,dtype=object)
#         shape_numpy = shape_numpy.squeeze()

#         pol = Polygon(shape_numpy)
#         all_pol.append(pol)
#     all_polygons = MultiPolygon(all_pol)
#     if not all_polygons.is_valid:
#         all_polygons = all_polygons.buffer(0)
#     # Sometimes buffer() converts a simple Multipolygon to just a Polygon,
#     # need to keep it a Multi throughout
#     if all_polygons.type == 'Polygon':
#         all_polygons = MultiPolygon([all_polygons])
#     im_size = dic_shape2[idx]
#     mk = polygons_to_mask(all_polygons)
#     rle = rle_encode_less_memory(mk)

#     print(mk.shape)
    
#     #show plots
#     fig, ax = plt.subplots(figsize=(15,15))
#     ax.imshow(cv2.resize(mk, (1024, 1024)))
#     plt.show()

In [None]:
# df_sample.loc[idx] = rle

### Submission

In [None]:
# df = pd.DataFrame({'id':names,'predicted':preds})
# df.to_csv('submission.csv',index=False)
# df.head()

In [None]:
# df_sample.to_csv('submission.csv')
# display(df_sample)