### References

[Pytorch FCN-Resnet50 in 20 minute](https://www.kaggle.com/finlay/pytorch-fcn-resnet50-in-20-minute)

In [None]:
import numpy as np
import pathlib
import os
import time
import cv2
import gc
from tqdm.notebook import tqdm
import warnings

# data structure
import pandas as pd

# graphics
import matplotlib.pyplot as plt

# tiff file
import tifffile
import rasterio
from rasterio.windows import Window
from PIL import Image

# transforms
import albumentations as A
import torch.utils.data as D

warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Input data locations.
BASE_DIR = '../input/hubmap-kidney-segmentation'
TRAIN_DIR = BASE_DIR + "/train"
TEST_DIR = BASE_DIR + "/test"
# Directory holding the submission CSV whose 'predicted' column is used as
# the mask source for the test images.
TEST_PRED_DIR = "../input/submission-predictions"
# Default tile side length used by make_grid.
WINDOW = 1024
# Default side length of the mask decoded by rle2mask when no shape is given.
NEW_SIZE = 512
# Default minimum overlap between tiles used by make_grid.
OVERLAP = 32
# A tile is kept by HubDataset only if its mask sum exceeds this value.
THRESHOLD = 50

# Tools

In [None]:
def rle2mask(mask_rle, shape=(NEW_SIZE, NEW_SIZE)):
    '''
    Decode a run-length-encoded mask into a binary array.

    mask_rle: run-length as string formatted "start length start length ..."
              (starts are 1-based). A missing encoding (None or NaN, which
              is what pandas yields for an empty CSV cell) is decoded as an
              all-background mask instead of raising.
    shape: (width, height) of the encoded image. The flat buffer is reshaped
           to `shape` and then transposed, so the result has shape
           (shape[1], shape[0]).
    Returns numpy uint8 array, 1 - mask, 0 - background

    '''
    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)

    # A float NaN has no .split(); treat it (and None) as "no runs at all".
    missing = mask_rle is None or (
        isinstance(mask_rle, float) and np.isnan(mask_rle))
    if not missing:
        tokens = mask_rle.split()
        # even positions are the 1-based starts, odd positions the lengths
        starts = np.asarray(tokens[0::2], dtype=int) - 1
        lengths = np.asarray(tokens[1::2], dtype=int)
        for lo, hi in zip(starts, starts + lengths):
            img[lo:hi] = 1

    gc.collect()
    return img.reshape(shape).T

In [None]:
def make_grid(shape, window=WINDOW, min_overlap=OVERLAP):
    """
        Return Array of size (N,4), where N - number of tiles,
        2nd axis represents slices: x1,x2,y1,y2

        shape: 2-tuple with the extent of the two axes to tile
        window: tile side length
        min_overlap: overlap subtracted from `window` when computing how
                     many tiles are needed along each axis

        The last tile of each axis is aligned with the far edge of that
        axis. When an axis is shorter than `window`, the start of that last
        tile is clamped to 0 instead of becoming negative.
    """
    x, y = shape

    nx = x // (window - min_overlap) + 1
    x1 = np.linspace(0, x, num=nx, endpoint=False, dtype=np.int64)
    # max(..., 0): a negative start would be read as "from the end" when the
    # bounds are later used for slicing
    x1[-1] = max(x - window, 0)
    x2 = (x1 + window).clip(0, x)

    ny = y // (window - min_overlap) + 1
    y1 = np.linspace(0, y, num=ny, endpoint=False, dtype=np.int64)
    y1[-1] = max(y - window, 0)
    y2 = (y1 + window).clip(0, y)

    # Cartesian product of the per-axis bounds, filled by broadcasting:
    # x bounds vary along the first grid axis, y bounds along the second.
    slices = np.zeros((nx, ny, 4), dtype=np.int64)
    slices[:, :, 0] = x1[:, np.newaxis]
    slices[:, :, 1] = x2[:, np.newaxis]
    slices[:, :, 2] = y1[np.newaxis, :]
    slices[:, :, 3] = y2[np.newaxis, :]

    gc.collect()
    return slices.reshape(nx*ny, 4)

In [None]:
def read_image(item, path, scale=None, verbose=1, with_mask=True):
    """
    Load the TIFF image of `item` from `path` and optionally decode its mask.

    item: row-like object providing 'id' and the RLE column through .get()
    path: directory containing "<id>.tiff"; when it equals TRAIN_DIR the RLE
          is read from the 'encoding' column, otherwise from 'predicted'
    scale: if truthy, both image and mask are resized to
           (width // scale, height // scale)
    verbose: if truthy, print the shapes before and after resizing
    with_mask: if falsy, no mask is decoded

    Returns (image, mask); mask is None when `with_mask` is falsy.
    """
    image_id = item.get('id')
    tiff_path = os.path.join(path, f"{image_id}.tiff")
    image = tifffile.imread(tiff_path)
    if image.ndim == 5:
        # drop singleton axes, then move the leading axis to the end
        image = image.squeeze().transpose(1, 2, 0)

    mask = None
    if with_mask:
        rle_column = 'predicted' if path != TRAIN_DIR else 'encoding'
        mask = rle2mask(item.get(rle_column),
                        (image.shape[1], image.shape[0]))

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")
        if with_mask:
            print(f"[{image_id}] Mask shape: {mask.shape}")

    if not scale:
        return image, mask

    # cv2.resize takes the target size as (width, height)
    target = (image.shape[1] // scale, image.shape[0] // scale)
    image = cv2.resize(image, target)
    if with_mask:
        mask = cv2.resize(mask, target)

    if verbose:
        print(f"[{image_id}] Resized Image shape: {image.shape}")
        if with_mask:
            print(f"[{image_id}] Resized Mask shape: {mask.shape}")

    return image, mask

In [None]:
def read_mask_only(item, path, verbose=1):
    """
    Decode only the mask of `item`, releasing the image as soon as its shape
    is known.

    item: row-like object providing 'id' and the RLE column through .get()
    path: directory containing "<id>.tiff"; when it equals TRAIN_DIR the RLE
          is read from the 'encoding' column, otherwise from 'predicted'
    verbose: if truthy, print the decoded mask shape

    Returns the mask produced by rle2mask.
    """
    # special function with memory optimization
    image_id = item.get('id')  # also needed by the verbose message below
    image = tifffile.imread(os.path.join(path, f"{image_id}.tiff"))
    if len(image.shape) == 5:
        image = image.squeeze().transpose(1, 2, 0)
    image_shape = image.shape
    # the pixels are not needed any more, only the shape
    del image
    gc.collect()

    column_name = 'encoding' if path == TRAIN_DIR else 'predicted'
    mask = rle2mask(item.get(column_name), (image_shape[1], image_shape[0]))
    if verbose:
        print(f"[{image_id}] Mask shape: {mask.shape}")

    return mask

In [None]:
def plot_image(item, dir_path, scale=None, verbose=1):
    """
    Show an image, its mask and their overlay side by side, and save the
    figure as "full_image_mask_<id>.png".

    item, dir_path, scale, verbose are forwarded to read_image.
    """
    image, mask = read_image(item, dir_path, scale, verbose)
    image_id = item.get('id')

    plt.figure(figsize=(25, 10))

    # (subplot position, title, draw the image?, imshow options for the mask)
    panels = (
        (131, f"Image {image_id}", True, None),
        (132, "Mask", False, {"cmap": "hot"}),
        (133, "Image + mask", True, {"cmap": "hot", "alpha": 0.5}),
    )
    for position, title, draw_image, mask_options in panels:
        plt.subplot(position)
        if draw_image:
            plt.imshow(image)
        if mask_options is not None:
            plt.imshow(mask, **mask_options)
        plt.title(title)
        plt.axis("off")

    plt.savefig(f"full_image_mask_{image_id}.png", bbox_inches='tight')
    plt.show()

In [None]:
def to_tensor(x, **kwargs):
    """
    Move the last axis of a 3-D array to the front and cast to float32.

    Extra keyword arguments are accepted and ignored.
    """
    channels_first = x.transpose(2, 0, 1)
    return channels_first.astype('float32')


def get_preprocessing():
    """
    Build an albumentations pipeline made of a single Lambda step that
    applies to_tensor to both the image and the mask.
    """
    channel_first_step = A.Lambda(image=to_tensor, mask=to_tensor)
    return A.Compose([channel_first_step])

In [None]:
# Identity affine transform (unit scale, no shear, zero offset); passed as
# `transform` to rasterio.open in HubDataset.build_slices.
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)


class HubDataset(D.Dataset):
    """
    In-memory dataset of image tiles and the matching mask tiles.

    path: CSV file whose first column is the image id (used as index)
    mode: 'train' reads the RLE from the 'encoding' column, any other value
          reads it from 'predicted'; it is also the sub-directory of
          BASE_DIR in which "<id>.tiff" is looked up

    Tiles are cut on the grid returned by make_grid and kept only when their
    mask sum exceeds THRESHOLD. Only the first file of the CSV is tiled.
    """

    def __init__(self, path, mode):
        self.csv = pd.read_csv(path, index_col=[0])
        self.mode = mode
        self.x, self.y = [], []
        self.build_slices()

    def build_slices(self):
        """Fill self.x / self.y with the image / mask tiles worth keeping."""
        self.masks = []
        self.files = []
        self.slices = []
        column_name = 'encoding' if self.mode == 'train' else 'predicted'
        for i, file_id in enumerate(self.csv.index.values):
            filepath = f'{BASE_DIR}/{self.mode}/{file_id}.tiff'
            self.files.append(filepath)
            print('Transform', filepath)
            with rasterio.open(filepath,
                               transform=identity, driver='GTiff') as dataset:
                full_mask = rle2mask(
                    self.csv.loc[file_id, column_name],
                    (dataset.shape[1], dataset.shape[0])
                )
                self.masks.append(full_mask)
                slices = make_grid(dataset.shape,
                                   window=WINDOW,
                                   min_overlap=OVERLAP)
                for (x1, x2, y1, y2) in tqdm(slices):
                    mask = full_mask[x1:x2, y1:y2]
                    if mask.sum() <= THRESHOLD:
                        # not enough mask in this tile
                        continue
                    self.slices.append([i, x1, x2, y1, y2])
                    image = dataset.read(
                        [1, 2, 3],  # (r, g, b), h, w
                        window=Window.from_slices((x1, x2), (y1, y2)))
                    # h, w, (r, g, b)
                    self.x.append(np.moveaxis(image, 0, -1))
                    self.y.append(np.expand_dims(mask, axis=2))

            # No explicit `del` of the loop locals here: they are unbound
            # when no tile passes the threshold, and deleting them would
            # raise NameError.
            gc.collect()
            # visualise a few patches only: stop after the first file
            break

    # get data operation
    def __getitem__(self, index):
        preproc = get_preprocessing()(
            image=self.x[index],
            mask=self.y[index])  # (r, g, b), h, w
        image, mask = preproc['image'], preproc['mask']
        return image, mask

    def __len__(self):
        return len(self.x)

In [None]:
def stats_blobs(mask, connectivity):
    """
    Print summary statistics of the connected components of `mask` and
    return the per-component statistics table.

    mask: 2-D array passed to cv2.connectedComponentsWithStats
    connectivity: connectivity passed to cv2.connectedComponentsWithStats

    Returns the stats array from cv2.connectedComponentsWithStats; row 0 is
    skipped in every statistic printed here (treated as the background).
    The area statistics are skipped when no component besides row 0 exists.
    """
    output = cv2.connectedComponentsWithStats(mask, connectivity)
    num_glomerulus = output[0] - 1
    print("Number of glomerulus : {}".format(num_glomerulus))
    stats = output[2]

    # areas
    areas = stats[1:, cv2.CC_STAT_AREA]
    if areas.size:
        # np.max / np.min raise ValueError on an empty array
        largest_area = np.max(areas)
        print("Largest area of a glomerulus: {}".format(largest_area))
        smallest_area = np.min(areas)
        print("Smallest area of a glomerulus: {}".format(smallest_area))
        mean_area = np.mean(areas)
        print("Mean area of a glomerulus: {}".format(mean_area))
    covered_area = 100 * np.count_nonzero(mask) / (
        mask.shape[0]*mask.shape[1])
    print("Covered area by glomerulus: {}%\n".format(covered_area))

    del output
    gc.collect()

    return stats

# A. Train dataset 

In [None]:
# Load the train annotations, report the number of rows and order them by id.
train_csv_path = os.path.join(BASE_DIR, 'train.csv')
df_train = pd.read_csv(train_csv_path)
print("The train dataset has {} images".format(len(df_train)))
df_train = df_train.sort_values(by=['id'])
df_train

## A.1. Visualisation

Let's first visualize the full images of the train dataset.

In [None]:
# Plot every train image, downscaled by a factor of 20, with its mask.
for _label, item in df_train.iterrows():
    plot_image(item, TRAIN_DIR, scale=20, verbose=1)
    del item
gc.collect()

## A.2. Blob analysis

Let's now analyze the glomeruli in the train dataset to identify some features:
- size
- area
- distribution
- numbers

In [None]:
# One area histogram per train image, all drawn on the same figure.
plt.figure(figsize=(10, 7))
bboxes = []
for idx, (_label, item) in enumerate(df_train.iterrows()):
    image_id = item.get('id')
    print("Image id: {}".format(image_id))
    tmp_mask = read_mask_only(item, TRAIN_DIR, verbose=0)
    stats = stats_blobs(tmp_mask, 4)

    # plot histogram (row 0 of stats is skipped)
    plt.hist(stats[1:, cv2.CC_STAT_AREA], histtype='step', label=image_id)

    # get bboxes of the first image only, as [left, top, right, bottom]
    if idx == 0:
        head = stats[1:30]
        left = head[:, cv2.CC_STAT_LEFT]
        top = head[:, cv2.CC_STAT_TOP]
        right = left + head[:, cv2.CC_STAT_WIDTH]
        bottom = top + head[:, cv2.CC_STAT_HEIGHT]
        bboxes.append([left, top, right, bottom])

    del tmp_mask, item, stats
    gc.collect()

plt.xlabel('Area (pixels²)')
plt.ylabel('Frequency')
plt.legend()
plt.savefig('train_glomerulus_area_hist.png', bbox_inches='tight')

These features allow us to define a realistic range of areas for glomeruli and, therefore, to remove glomeruli that are too small.

### Zoom on glomerulus

In [None]:
# Crop ten bounding boxes (entries 10..19) out of the first train image and
# show each crop with its mask overlaid.
image, mask = read_image(df_train.iloc[0], TRAIN_DIR, verbose=0)
bbox = bboxes[0]  # [left, top, right, bottom] arrays

fig, axes = plt.subplots(2, 5, figsize=(20, 8))
# axes.flat walks the grid row by row, like the nested i/j loops would
for val, ax in enumerate(axes.flat, start=10):
    rows = slice(bbox[1][val], bbox[3][val])
    columns = slice(bbox[0][val], bbox[2][val])
    ax.imshow(image[rows, columns])
    ax.imshow(mask[rows, columns], alpha=0.3)
    ax.axis('off')
plt.savefig('train_glomerulus_zoom.png', bbox_inches='tight')
plt.show()

del image, mask, bbox, bboxes, df_train
gc.collect()

### Glomerulus in the train patches 

In [None]:
# Show tiles 30..41 of the train dataset: each tile takes two neighbouring
# cells of a 3x8 grid (first channel alone, then with the mask overlaid).
ds = HubDataset('../input/hubmap-kidney-segmentation/train.csv', 'train')

plt.figure(figsize=(30, 10))
for pair_no, i in enumerate(range(30, 42)):
    image, mask = ds[i]

    plt.subplot(3, 8, 2 * pair_no + 1)
    plt.imshow(image[0])

    plt.subplot(3, 8, 2 * pair_no + 2)
    plt.imshow(image[0])
    plt.imshow(mask[0], cmap='hot', alpha=0.5)

    del image, mask
del ds
gc.collect()
plt.savefig('train_patches.png', bbox_inches='tight')

# B. Test dataset 

We can do the same identification on the test dataset and compare our results

In [None]:
# Load the predictions for the test images, report the number of rows and
# order them by id.
submission_csv_path = os.path.join(TEST_PRED_DIR, 'submission.csv')
df_test = pd.read_csv(submission_csv_path)
print("The test dataset has {} images".format(len(df_test)))
df_test = df_test.sort_values(by=['id'])
df_test

## B.1. Visualisation 

In [None]:
# Plot every test image, downscaled by a factor of 20, with its predicted mask.
for _label, item in df_test.iterrows():
    plot_image(item, TEST_DIR, scale=20, verbose=0)
    del item
gc.collect()

## B.2 Blob analysis

In [None]:
# One area histogram per test image, all drawn on the same figure.
# Explicit figure with the same size as the train histogram, so the two
# saved plots can be compared directly.
plt.figure(figsize=(10, 7))
bboxes = []
for idx in range(df_test.shape[0]):
    item = df_test.iloc[idx]
    print("Image id: {}".format(item.get('id')))
    tmp_mask = read_mask_only(item, TEST_DIR, verbose=0)
    stats = stats_blobs(tmp_mask, 4)

    # plot histogram (row 0 of stats is skipped)
    plt.hist(stats[1:, cv2.CC_STAT_AREA],
             histtype='step',
             label=item.get('id'))

    # get bboxes of the first image only, as [left, top, right, bottom]
    if idx == 0:
        bboxes.append([
            stats[1:30, cv2.CC_STAT_LEFT],
            stats[1:30, cv2.CC_STAT_TOP],
            stats[1:30, cv2.CC_STAT_LEFT] + stats[1:30, cv2.CC_STAT_WIDTH],
            stats[1:30, cv2.CC_STAT_TOP] + stats[1:30, cv2.CC_STAT_HEIGHT]])

    del tmp_mask, item, stats
    gc.collect()
plt.xlabel('Area (pixels²)')
plt.ylabel('Frequency')
plt.legend()
plt.savefig('test_glomerulus_area_hist.png', bbox_inches='tight')

We can see that some of the glomeruli in our predictions are too small. We should apply some erosion/dilation to remove these glomeruli and get a better score. I didn't do that because of a lack of memory.

### Zoom on glomerulus 

In [None]:
# Crop ten bounding boxes (entries 10..19) out of the first test image and
# show each crop with its predicted mask overlaid.
image, mask = read_image(df_test.iloc[0], TEST_DIR, verbose=0)
bbox = bboxes[0]  # [left, top, right, bottom] arrays

fig, axes = plt.subplots(2, 5, figsize=(20, 8))
# axes.flat walks the grid row by row, like the nested i/j loops would
for val, ax in enumerate(axes.flat, start=10):
    rows = slice(bbox[1][val], bbox[3][val])
    columns = slice(bbox[0][val], bbox[2][val])
    ax.imshow(image[rows, columns])
    ax.imshow(mask[rows, columns], alpha=0.3)
    ax.axis('off')

del image, mask
gc.collect()
plt.savefig('test_glomerulus_zoom.png', bbox_inches='tight')
plt.show()

### Glomerulus on test patches 

In [None]:
# Show tiles 30..41 of the test dataset: each tile takes two neighbouring
# cells of a 3x8 grid (first channel alone, then with the mask overlaid).
ds = HubDataset('../input/submission-predictions/submission.csv', 'test')

plt.figure(figsize=(30, 10))
for pair_no, i in enumerate(range(30, 42)):
    image, mask = ds[i]

    plt.subplot(3, 8, 2 * pair_no + 1)
    plt.imshow(image[0])

    plt.subplot(3, 8, 2 * pair_no + 2)
    plt.imshow(image[0])
    plt.imshow(mask[0], cmap='hot', alpha=0.5)

    del image, mask
del ds
gc.collect()
plt.savefig('test_patches.png', bbox_inches='tight')

## B.3. Influence of probability threshold on predictions

While searching for the best model for this competition, I tried several binarization thresholds on my probability mask. You can visualize below the effect of these thresholds on the final mask.
My best results were with threshold=0.5, as shown in the pictures.

In [None]:
# Compare the masks predicted with several binarization thresholds: column 0
# shows the raw crop, each following column the crop with one mask overlaid.
p_files = pathlib.Path('../input/tresholds/')
thresholds = ['0.2', '0.5', '0.7']
cols = 4
rows = 5
margin = 50  # context (in pixels) added around each bounding box
image, mask = read_image(df_test.iloc[0],
                         TEST_DIR,
                         verbose=0,
                         with_mask=False)

fig, axes = plt.subplots(rows, cols, figsize=(20, 15))
# Path.glob yields files in arbitrary order; sort them so the pairing with
# `thresholds` is deterministic.
# NOTE(review): assumes the CSV file names sort in the same order as
# `thresholds` - confirm against the dataset.
csv_files = sorted(p_files.glob('*.csv'))
# zip() stops at the shorter sequence, so extra CSV files cannot index past
# `thresholds` or past the last axes column.
for j, (filename, threshold) in enumerate(zip(csv_files, thresholds)):
    print(filename)
    df_temp = pd.read_csv(filename)
    df_temp = df_temp.sort_values(by=['id'])
    mask = read_mask_only(df_temp.iloc[0], TEST_DIR, verbose=0)
    for i in range(rows):
        val = 10 + i
        # Clamp the crop origin at 0: a negative start would be read as
        # "from the end" and give a wrong (usually empty) crop. The upper
        # bounds need no clamping, slicing already stops at the array edge.
        crop_rows = slice(max(bbox[1][val] - margin, 0),
                          bbox[3][val] + margin)
        crop_cols = slice(max(bbox[0][val] - margin, 0),
                          bbox[2][val] + margin)

        if j == 0:
            axes[i, j].imshow(image[crop_rows, crop_cols])
            axes[i, j].axis('off')
            axes[i, j].set_title('original image')

        axes[i, j+1].imshow(image[crop_rows, crop_cols])
        axes[i, j+1].imshow(mask[crop_rows, crop_cols], alpha=0.3)
        axes[i, j+1].axis('off')
        axes[i, j+1].set_title(
            f'image + mask with th={threshold}')

    del mask, df_temp

In [None]:
# Drop the remaining references to the bounding boxes and the test image,
# then run the garbage collector.
del bboxes, image, bbox
gc.collect()