__Competition Challenge__

The challenge is to detect functional tissue units (FTUs) across different tissue preparation pipelines. An FTU is defined as a “three-dimensional block of cells centered around a capillary, such that each cell in this block is within diffusion distance from any other cell in the same block” (de Bono, 2013). The goal of this competition is the __implementation of a successful and robust glomeruli FTU detector.__

__Competition Metric__

The competition is evaluated on the mean Dice coefficient.

__Competition Rules__

- CPU Notebook <= 9 hours run-time
- GPU Notebook <= 9 hours run-time
- TPUs will not be available for making submissions, however TPUs can be used for model training
- Internet access disabled - it can be used for training but not for inference
- Submission file must be named "submission.csv"

In [None]:
%env SM_FRAMEWORK=tf.keras
!pip install -q segmentation-models=="1.0.1"

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os, json, re, math

from tqdm import tqdm
from glob import glob
import gc

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report

import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

import tifffile as tiff

plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams['axes.titlesize'] = 16

from kaggle_datasets import KaggleDatasets

import tensorflow as tf
import tensorflow.keras.layers as L

from tensorflow.data import Dataset
from tensorflow.keras.utils import get_custom_objects

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

from tensorflow import keras
tf.keras.backend.set_image_data_format('channels_last')

import segmentation_models as sm

print(os.listdir('/kaggle/input/'))
print(os.listdir('/kaggle/input/hubmap-kidney-segmentation/'))

print("Tensorflow version " + tf.__version__)

from time import time, strftime, gmtime
start = time()
import datetime
print(str(datetime.datetime.now()))

In [None]:
# Root folder of the competition data; later cells build file paths from it.
base_dir = '/kaggle/input/hubmap-kidney-segmentation/'
# Encoder name passed to sm.Unet when the model is built.
BACKBONE = 'efficientnetb7'


In [None]:
# List every entry of the train folder.
os.listdir(base_dir + 'train/')

In [None]:
# Count only the .tiff files: the train folder also holds JSON annotation
# files, so counting every directory entry overstates the number of images.
print('Number of train images: ', len(glob(base_dir + 'train/*.tiff')))

In [None]:
# Load the training table; read_tiff takes the run-length encoded masks from its 'encoding' column.
train = pd.read_csv(base_dir + 'train.csv')
train

In [None]:
# Load the dataset information table used by the exploratory plots below.
meta = pd.read_csv(base_dir + 'HuBMAP-20-dataset_information.csv')
meta

In [None]:
# Summarise the columns of the metadata table (dtypes and non-null counts).
meta.info()

__Any Missing values in train and meta data?__

In [None]:
# Number of missing values per column of the training table.
train.isnull().sum()

In [None]:
# Number of missing values per column of the metadata table.
meta.isnull().sum()

- Taking care of Missing values

In [None]:
# Fill missing numeric values with the column mean. numeric_only=True keeps
# the text columns out of the mean computation, which would otherwise raise
# a TypeError on recent pandas versions.
meta.fillna(meta.mean(numeric_only = True), inplace = True)

__Checking one json file__

In [None]:
# Parse one anatomical-structure annotation file and print it as a JSON string.
anatomy_path = os.path.join(base_dir, 'train/0486052bb-anatomical-structure.json')
with open(anatomy_path) as file:
    ana = json.load(file)

print(json.dumps(ana))

__Visualization__

In [None]:
# Distribution of image width and height with mean / median marker lines.
fig, ax = plt.subplots(1, 2)

for axis, column in zip(ax, ['width_pixels', 'height_pixels']):
    sns.distplot(meta[column], ax = axis, kde = True, rug = True)
    # Label the marker lines themselves: passing an unordered set of labels to
    # legend() attaches them, in arbitrary order, to the first artists on the
    # axes instead of to the mean / median lines.
    axis.axvline(np.mean(meta[column]), color = 'g', linestyle = '--', label = 'Mean')
    axis.axvline(np.median(meta[column]), color = 'b', linestyle = '-', label = 'Median')
    axis.legend()

plt.suptitle('Distribution Plot of Pixel Width and Height')

In [None]:
# Distribution of BMI and age with mean / median marker lines.
fig, ax = plt.subplots(1, 2)

for axis, column in zip(ax, ['bmi_kg/m^2', 'age']):
    sns.distplot(meta[column], ax = axis, kde = True, rug = True)
    # Label the marker lines themselves: passing an unordered set of labels to
    # legend() attaches them, in arbitrary order, to the first artists on the
    # axes instead of to the mean / median lines.
    axis.axvline(np.mean(meta[column]), color = 'g', linestyle = '--', label = 'Mean')
    axis.axvline(np.median(meta[column]), color = 'b', linestyle = '-', label = 'Median')
    axis.legend()

plt.suptitle('Distribution Plot of BMI and Age')

In [None]:
# Distribution of donor weight and height with mean / median marker lines.
fig, ax = plt.subplots(1, 2)

for axis, column in zip(ax, ['weight_kilograms', 'height_centimeters']):
    sns.distplot(meta[column], ax = axis, kde = True, rug = True)
    # Label the marker lines themselves: passing an unordered set of labels to
    # legend() attaches them, in arbitrary order, to the first artists on the
    # axes instead of to the mean / median lines.
    axis.axvline(np.mean(meta[column]), color = 'g', linestyle = '--', label = 'Mean')
    axis.axvline(np.median(meta[column]), color = 'b', linestyle = '-', label = 'Median')
    axis.legend()

plt.suptitle('Distribution Plot of Weight and Height')

In [None]:
# Category counts for sex and race.
fig, ax = plt.subplots(1, 2)

# Pass the column as x=: newer seaborn releases treat the only positional
# argument as `data`, not as the variable to count.
sns.countplot(x = meta['sex'], ax = ax[0])
sns.countplot(x = meta['race'], ax = ax[1])
plt.suptitle('Count Plot of Sex and Race')

In [None]:
# Category counts for ethnicity and race.
fig, ax = plt.subplots(1, 2)

# Pass the column as x=: newer seaborn releases treat the only positional
# argument as `data`, not as the variable to count.
sns.countplot(x = meta['ethnicity'], ax = ax[0])
sns.countplot(x = meta['race'], ax = ax[1])
plt.suptitle('Count Plot of Ethnicity and Race')

In [None]:
# Category counts for sex and laterality.
fig, ax = plt.subplots(1, 2)

# Pass the column as x=: newer seaborn releases treat the only positional
# argument as `data`, not as the variable to count.
sns.countplot(x = meta['sex'], ax = ax[0])
sns.countplot(x = meta['laterality'], ax = ax[1])
plt.suptitle('Count Plot of Sex and Laterality')

__Helper Functions__

In [None]:
# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode

def mask2rle(img):
    
    '''
    Run-length encode a binary mask.
    ----------------------------
    
    Arguments:
    img -- numpy array, 1 - mask, 0 - background
    
    Returns:
    string of space-separated "start length" pairs; starts are 1-based
    positions in the column-major (transposed, then flattened) pixel order
    '''
    # Pad with background on both sides so runs touching either end still
    # produce an opening and a closing transition.
    padded = np.concatenate([[0], img.T.flatten(), [0]])
    transitions = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Odd entries are run ends; turn them into run lengths in place.
    transitions[1::2] -= transitions[::2]
    return ' '.join(map(str, transitions))
 
def rle2mask(mask_rle, shape):
    
    '''
    Decode a run-length string into a binary mask.
    ----------------------------
    
    Arguments:
    mask_rle -- run-length string of "start length" pairs (1-based starts)
    shape -- (width, height) of the mask to rebuild
    
    Returns:
    numpy uint8 array of shape (height, width), 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based offsets
    lengths = np.asarray(tokens[1::2], dtype=int)
    stops = starts + lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(starts, stops):
        flat[begin:end] = 1
    # Runs are stored column-major, so reshape as (width, height) and transpose.
    return flat.reshape(shape).T

def read_tiff(image, encoding_index, resize=None):
    
    '''
    read a training tiff image and decode its mask.
    ----------------------------
    
    Arguments:
    image -- name used to build the path train/<image>.tiff under base_dir
    encoding_index -- index into train['encoding'] holding the RLE mask
    resize -- optional divisor; when truthy, image and mask are both
              resized to (width // resize, height // resize)
    
    Returns:
    tiff_image -- tiff image
    tiff_mask -- segmentation mask
    '''
    
    tiff_path = os.path.join(base_dir, f'train/{image}.tiff')
    tiff_image = tiff.imread(tiff_path)
    
    # 5-dimensional files: drop the singleton axes and move the leading axis last.
    if tiff_image.ndim == 5:
        tiff_image = tiff_image.squeeze().transpose(1, 2, 0)
    
    # rle2mask takes its shape as (width, height).
    mask_shape = (tiff_image.shape[1], tiff_image.shape[0])
    tiff_mask = rle2mask(train['encoding'][encoding_index], mask_shape)
    
    print(f'Image Shape: {tiff_image.shape}')
    print(f'Mask Shape: {tiff_mask.shape}')
    
    if not resize:
        return tiff_image, tiff_mask
    
    # cv2.resize takes the target size as (width, height).
    rescaled = (tiff_image.shape[1] // resize, tiff_image.shape[0] // resize)
    return cv2.resize(tiff_image, rescaled), cv2.resize(tiff_mask, rescaled)

def read_test_tiff(image, resize=None):
    
    '''
    read a test tiff image (no mask is produced).
    ----------------------------
    
    Arguments:
    image -- name used to build the path test/<image>.tiff under base_dir
    resize -- optional divisor; when truthy, the image is resized to
              (width // resize, height // resize)
    
    Returns:
    tiff_image -- tiff image
    '''
    
    tiff_path = os.path.join(base_dir, f'test/{image}.tiff')
    tiff_image = tiff.imread(tiff_path)
    
    # 5-dimensional files: drop the singleton axes and move the leading axis last.
    if tiff_image.ndim == 5:
        tiff_image = tiff_image.squeeze().transpose(1, 2, 0)
    
    if not resize:
        return tiff_image
    
    # cv2.resize takes the target size as (width, height).
    rescaled = (tiff_image.shape[1] // resize, tiff_image.shape[0] // resize)
    return cv2.resize(tiff_image, rescaled)

def plot(image, mask):
    
    '''
    show image, mask and their overlay side by side
    ---------------------
    
    Arguments:
    image -- tiff image 
    mask -- segmentation mask
    
    Returns:
    None; draws three panels on a new matplotlib figure
    '''
    plt.figure(figsize = (15, 15))

    # Each panel: title plus the layers drawn in order (array, imshow kwargs).
    panels = (
        ("Image", [(image, {})]),
        ("Image Mask", [(mask, {})]),
        ("Image + Mask", [(image, {}), (mask, {"alpha": 0.5})]),
    )
    for position, (title, layers) in enumerate(panels, start = 1):
        plt.subplot(1, 3, position)
        for data, options in layers:
            plt.imshow(data, **options)
        plt.title(title, fontsize = 16)
    
def plot_subset(image, mask, start_rh, end_rh, start_cw, end_cw):
    
    '''
    show a cropped region of image, mask and their overlay
    ---------------------
    
    Arguments:
    image -- tiff image 
    mask -- segmentation mask
    start_rh -- first row of the crop
    end_rh -- row at which the crop stops (exclusive)
    start_cw -- first column of the crop
    end_cw -- column at which the crop stops (exclusive)
    
    Returns:
    None; draws three panels on a new matplotlib figure
    '''

    plt.figure(figsize=(15, 15))

    # Crop the same window out of the image and the mask.
    rows = slice(start_rh, end_rh)
    cols = slice(start_cw, end_cw)
    subset_image = image[rows, cols, :]
    subset_mask = mask[rows, cols]

    # Each panel: title plus the layers drawn in order (array, imshow kwargs).
    panels = (
        ("Zoomed Image", [(subset_image, {})]),
        ("Zoomed Image Mask", [(subset_mask, {})]),
        ("Zoomed Image + Mask", [(subset_image, {}), (subset_mask, {"alpha": 0.5})]),
    )
    for position, (title, layers) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        for data, options in layers:
            plt.imshow(data, **options)
        plt.title(title, fontsize=16)

In [None]:
# Paths of the .tiff files in the train and test folders.
glob(base_dir + 'train/*.tiff'), glob(base_dir + 'test/*.tiff') 

In [None]:
# Read one training image at full resolution together with its decoded mask.
# NOTE(review): index 0 is assumed to be the train.csv row of '2f6ecfcdf' — confirm.
img, msk = read_tiff('2f6ecfcdf', 0, resize = None)
gc.collect()

In [None]:
# The full-image plot is left commented out; this cell only runs garbage collection.
#plot(img, msk)
gc.collect()

In [None]:
# Zoom into rows 5000-10000 and columns 10000-15000 of the image and mask.
plot_subset(img, msk, 5000, 10000, 10000, 15000)
gc.collect()

In [None]:
# Zoom into rows 4000-11000 and columns 8000-12000 of the image and mask.
plot_subset(img, msk, 4000, 11000, 8000, 12000)
gc.collect()

__TPU Config__

In [None]:
# Pick the distribution strategy: TPUStrategy when a TPU can be resolved,
# otherwise whatever strategy TensorFlow currently has active.
try:
    # detect and initialize tpu
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Connecting to tpu...')
    print('device running at:', tpu.master())
except ValueError:
    # The resolver raised ValueError; continue without a TPU.
    tpu = None

if tpu:
    print('Initializing TPU...')
    # Connect to the cluster and initialise the TPU system before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # instantiate a distribution strategy
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print("TPU initialized")
else:
    print('Using deafualt strategy...')
    strategy = tf.distribute.get_strategy()

# Number of replicas the strategy keeps in sync; the global batch size is scaled by it.
REPLICAS = strategy.num_replicas_in_sync
print(f"REPLICAS:  {REPLICAS}")

In [None]:
# Let tf.data choose parallelism and prefetch sizes.
AUTO = tf.data.experimental.AUTOTUNE

SEED = 42
BATCH_SIZE = 8 * REPLICAS # 8 per replica; ideal batch size is 128 per core. (128*8=1024)
IMAGE_DIM = [512, 512]
EPOCHS = 60
LR = 5e-4

# GCS location of the pre-tiled dataset.
GCS_PATH = KaggleDatasets().get_gcs_path('hubmap-512x512')
print(GCS_PATH)

# Collect the image and mask paths. Images and masks are paired by position
# further down, and glob gives no ordering guarantee, so sort both lists.
# NOTE(review): this assumes each mask file has the same name as its image — confirm.
TIFF = sorted(tf.io.gfile.glob(str(GCS_PATH + '/train/*')))
MASK = sorted(tf.io.gfile.glob(str(GCS_PATH + '/masks/*')))

# Position-based pairing is meaningless when the two lists differ in length.
if len(TIFF) != len(MASK):
    raise ValueError(f'Found {len(TIFF)} images but {len(MASK)} masks')

In [None]:
# Wrap the image and mask path lists in tf.data datasets and report their sizes.
TRAIN_TIFF = Dataset.from_tensor_slices(TIFF)
TRAIN_MASK = Dataset.from_tensor_slices(MASK)

TIFF_COUNT = tf.data.experimental.cardinality(TRAIN_TIFF).numpy()
MASK_COUNT = tf.data.experimental.cardinality(TRAIN_MASK).numpy()

print('Training Data')
print(f'Total Tiff Images: {TIFF_COUNT}')
print(f'Total Masks: {MASK_COUNT}')

In [None]:
# Print the first five image paths, then the first five mask paths.
for files in TRAIN_TIFF.take(5):
    print(files.numpy())
print('\n')
for files in TRAIN_MASK.take(5):
    print(files.numpy())

In [None]:
def _read_image_and_mask(image, mask):
    
    '''
    read one (image, mask) pair of png files and decode them.
    --------------------------------
    
    Arguments:
    image -- path of the image png.
    mask -- path of the mask png.
    
    Return:
    image -- decoded 3-channel uint8 tensor
    mask -- decoded uint8 tensor
    '''
    
    image = tf.io.decode_png(tf.io.read_file(image), channels = 3)
    mask = tf.io.decode_png(tf.io.read_file(mask))
    return image, mask


def _augment_image_and_mask(image, mask):
    
    '''
    randomly augment a decoded (image, mask) pair.
    --------------------------------
    
    Flips and the 90 degree rotation are applied to image and mask together;
    saturation and contrast changes touch the image only.
    
    Arguments:
    image -- decoded image tensor.
    mask -- decoded mask tensor.
    
    Return:
    image
    mask
    '''
    
    if tf.random.uniform(()) > 0.5:
        image = tf.image.flip_left_right(image)
        mask = tf.image.flip_left_right(mask)
        
    if tf.random.uniform(()) > 0.4:
        image = tf.image.flip_up_down(image)
        mask = tf.image.flip_up_down(mask)
        
    if tf.random.uniform(()) > 0.5:
        image = tf.image.rot90(image, k=1)
        mask = tf.image.rot90(mask, k=1)
        
    if tf.random.uniform(()) > 0.45:
        image = tf.image.random_saturation(image, 0.7, 1.3)
        
    if tf.random.uniform(()) > 0.45:
        image = tf.image.random_contrast(image, 0.8, 1.2)
    
    return image, mask


def _finalize_image_and_mask(image, mask):
    
    '''
    convert a decoded (image, mask) pair to fixed-size float tensors.
    --------------------------------
    
    Arguments:
    image -- decoded image tensor.
    mask -- decoded mask tensor.
    
    Return:
    image -- float32 tensor of shape [*IMAGE_DIM, 3]
    mask -- float32 tensor of shape [*IMAGE_DIM]
    '''
    
    image = tf.image.convert_image_dtype(image, tf.float32) # convert to floats in the [0,1] range
    mask = tf.cast(mask, tf.float32)  # cast only, mask values are not rescaled
    
    image = tf.image.resize(image, [*IMAGE_DIM])
    image = tf.reshape(image, [*IMAGE_DIM, 3])  # reshaping image tensor
    
    # nearest-neighbour keeps the mask values intact instead of blending them
    mask = tf.image.resize(mask, [*IMAGE_DIM], method = 'nearest')
    mask = tf.reshape(mask, [*IMAGE_DIM]) # reshaping mask tensor
    
    return image, mask


def decode_image_and_mask(image, mask, augment = True):
    
    '''
    decode image and mask in order to
    feed data to TPU.
    --------------------------------
    
    Arguments:
    image -- path of an image patch in png format.
    mask -- path of the matching mask patch in png format.
    augment -- apply random augmentations on image and mask.
    
    Return:
    image -- float32 tensor of shape [*IMAGE_DIM, 3]
    mask -- float32 tensor of shape [*IMAGE_DIM]
    '''
    
    image, mask = _read_image_and_mask(image, mask)
    
    if augment:
        image, mask = _augment_image_and_mask(image, mask)
    
    return _finalize_image_and_mask(image, mask)


def generate_dataset(tiff, masks, batch_size = 16, shuffle = True, augment = None):
    
    '''
    generate batches of tf.Dataset
    object
    --------------------------------
    
    Arguments:
    tiff -- tf.data.Dataset object of image paths
    masks -- tf.data.Dataset object of mask paths
    batch_size -- batches of image, mask pair
    shuffle -- generate train if True or validation data if False
    augment -- apply random augmentations; None (default) follows `shuffle`,
               so training data is augmented and validation data is not
    
    Return:
    dataset - tf.data.Dataset dataset that repeats indefinitely
    '''
    
    if augment is None:
        augment = shuffle
    
    dataset = Dataset.zip((tiff, masks)) # create dataset by zipping (image, mask) into pair
    dataset = dataset.map(_read_image_and_mask, num_parallel_calls = AUTO) # read and decode the png files
    
    # Cache only the deterministic decoding step (in memory, as uint8).
    # Caching after the random augmentation would freeze one augmented copy
    # of every sample and replay it on each epoch.
    dataset = dataset.cache()
    
    # shuffle while training else set to False; shuffling before repeat keeps
    # each pass over the data separate from the next
    if shuffle:
        dataset = dataset.shuffle(buffer_size = 1000)
        
    dataset = dataset.repeat()
    
    # fresh random augmentation every time a sample is drawn
    if augment:
        dataset = dataset.map(_augment_image_and_mask, num_parallel_calls = AUTO)
        
    dataset = dataset.map(_finalize_image_and_mask, num_parallel_calls = AUTO)
    dataset = dataset.batch(batch_size, drop_remainder = True) # generate batches of data
    dataset = dataset.prefetch(buffer_size = AUTO) # fetch dataset while model is training
    return dataset

In [None]:
# Build a batched pipeline over all image/mask paths, used below to inspect samples.
train_dataset = generate_dataset(TRAIN_TIFF, TRAIN_MASK, batch_size = BATCH_SIZE)

__Check image and mask size in the dataset__

In [None]:
# Pull a single batch and report the shapes of its image and mask tensors.
for img, msk in train_dataset.take(1):
    image_batch, mask_batch = img, msk
    print("Image shape: ", image_batch.numpy().shape)
    print("Mask shape: ", mask_batch.numpy().shape)

__Visualize images and masks__

In [None]:
# Show the first image of the batch, its mask and their overlay.
plot(image_batch[0], mask_batch[0])

In [None]:
# Overlay each mask on its image in an 8x8 grid, for at most 64 samples of the batch.
plt.figure(figsize = (16,16))
for i,(img, mask) in enumerate(zip(image_batch[:64], mask_batch[:64])):
    plt.subplot(8, 8, i + 1)  # subplot positions are 1-based
    plt.imshow(img, vmin = 0, vmax = 255)
    plt.imshow(mask, alpha = 0.4)
    plt.axis('off')
    plt.subplots_adjust(wspace = None, hspace = None)

__Split dataset into train and validation sets__

In [None]:
# Hold out 20% of the (image, mask) path pairs for validation; the fixed
# random_state makes the split repeatable.
train_img, valid_img, train_msk, valid_msk = train_test_split(TIFF, MASK, test_size = 0.2, random_state = 42)
print(len(train_img), len(train_msk))
print(len(valid_img), len(valid_msk))

In [None]:
# Rebuild the path datasets from the split lists (TRAIN_TIFF / TRAIN_MASK now
# cover the training part only) and report the size of every dataset.
TRAIN_TIFF = Dataset.from_tensor_slices(train_img)
TRAIN_MASK = Dataset.from_tensor_slices(train_msk)

VALID_TIFF = Dataset.from_tensor_slices(valid_img)
VALID_MASK = Dataset.from_tensor_slices(valid_msk)

TRAIN_TIFF_CNT = tf.data.experimental.cardinality(TRAIN_TIFF).numpy()
TRAIN_MASK_CNT = tf.data.experimental.cardinality(TRAIN_MASK).numpy()

VALID_TIFF_CNT = tf.data.experimental.cardinality(VALID_TIFF).numpy()
VALID_MASK_CNT = tf.data.experimental.cardinality(VALID_MASK).numpy()

print('Training Data Count')
print(f'Total Train Tiff Images: {TRAIN_TIFF_CNT}')
print(f'Total Train Masks: {TRAIN_MASK_CNT}')

print('Validation Data Count')
print(f'Total Valid Tiff Images: {VALID_TIFF_CNT}')
print(f'Total Valid Masks: {VALID_MASK_CNT}')

In [None]:
# Number of whole batches per pass over each split (integer division drops the remainder).
steps_per_epoch = tf.data.experimental.cardinality(TRAIN_TIFF).numpy() // BATCH_SIZE
valid_steps = tf.data.experimental.cardinality(VALID_TIFF).numpy() // BATCH_SIZE

In [None]:
# Training pipeline: shuffled (the generate_dataset default).
train_dataset = generate_dataset(TRAIN_TIFF, TRAIN_MASK, batch_size = BATCH_SIZE)
# Validation pipeline: shuffle = False, which generate_dataset documents as the
# validation setting, so every evaluation reads the samples in the same order.
valid_dataset = generate_dataset(VALID_TIFF, VALID_MASK, batch_size = BATCH_SIZE, shuffle = False)
train_dataset, valid_dataset

__Model using Segmentation Models by Qubvel__

In [None]:
# Create AND compile the model inside the strategy scope so that every
# variable (model weights, optimizer state, metric state) is created under
# the distribution strategy.
with strategy.scope():
    model = sm.Unet(BACKBONE)

    # Use the learning rate configured in LR; the string 'adam' silently
    # fell back to the optimizer's default learning rate.
    optimizer = Adam(learning_rate = LR)
    model.compile(optimizer = optimizer,
                  loss = tf.keras.losses.BinaryCrossentropy(),
                  metrics = [sm.metrics.iou_score, 'accuracy'])

# Stop when the validation IoU has not improved for 10 epochs.
early = tf.keras.callbacks.EarlyStopping(monitor = 'val_iou_score', patience = 10, mode = 'max')
# Keep only the weights of the epoch with the best validation IoU.
check = tf.keras.callbacks.ModelCheckpoint(filepath = 'sm_unet.h5', monitor = 'val_iou_score', save_weights_only = True, 
                                       save_best_only = True, mode = 'max')
# Divide the learning rate by 10 after 10 epochs without validation-loss improvement.
reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 10, 
                                              min_lr = 0.00001)
#model.summary()

In [None]:
# Train the model. Both datasets repeat indefinitely, so the number of steps
# per epoch and per validation run is given explicitly.
# NOTE(review): the `early` EarlyStopping callback is defined but not passed
# here — confirm whether that is intentional.
history = model.fit(train_dataset, 
                   epochs = EPOCHS, 
                   steps_per_epoch = steps_per_epoch, 
                   callbacks = [check, reduce], 
                   validation_data = valid_dataset, 
                   validation_steps = valid_steps, 
                   verbose = 1)

In [None]:
# Training vs. validation accuracy per epoch.
pd.DataFrame(history.history).plot(y = ['accuracy', 'val_accuracy'], logy = False)
plt.xlabel("Epochs")
plt.ylabel("Accuracy")

In [None]:
# Training vs. validation loss per epoch.
pd.DataFrame(history.history).plot(y = ['loss', 'val_loss'], logy = False)
plt.xlabel("Epochs")
plt.ylabel("Loss")

In [None]:
# Training vs. validation IoU score per epoch.
pd.DataFrame(history.history).plot(y = ['iou_score', 'val_iou_score'], logy = False)
plt.xlabel("Epochs")
plt.ylabel("IOU_Score")

In [None]:
# Restore the weights written by the ModelCheckpoint callback, then save the whole model.
model.load_weights('sm_unet.h5')
model.save('hubmap_sm.h5')

In [None]:
# Print the time elapsed since `start` was recorded, formatted as HH:MM:SS.
finish = time()
print(strftime("%H:%M:%S", gmtime(finish - start)))