In [None]:
from sklearn.utils import shuffle
from IPython.core.display import display, HTML, Javascript
from string import Template
import json, random
import IPython.display
from plotly.offline import init_notebook_mode, iplot
from plotly import subplots
import plotly.figure_factory as ff
import plotly as py
import plotly.graph_objects as go
init_notebook_mode(connected=True)


# Defining all our palette colors (hex RGB strings).
primary_blue = "#496595"
primary_blue2 = "#85a1c1"
primary_blue3 = "#3f4d63"
primary_grey = "#c6ccd8"
primary_black = "#202022"
primary_bgcolor = "#f4f0ea"

# "coffee" palette, turquoise-gold.
f1 = "#a2885e"
f2 = "#e9cf87"
f3 = "#f1efd9"
f4 = "#8eb3aa"
f5 = "#235f83"
f6 = "#b4cde3"

In [None]:
# Static HTML + CSS for the notebook's table of contents.
# The hex colors are written literally in the stylesheet; "#3f4d63" and "#235f83"
# are the same values as `primary_blue3` and `f5` in the palette defined above.
html_contents ="""
<!DOCTYPE html>
<html lang="en">
    <head>
    <style>
    .toc h2{
        color: white;
        background: #3f4d63;
        font-weight: 600;
        font-family: Helvetica;
        font-size: 23px;
        padding: 6px 12px;
        margin-bottom: 2px;
    }
    
    .toc ol li{
        list-style:none;
        line-height:normal;
        }
     
    .toc li{
        background: #235f83;
        color: white;
        font-weight: 600;
        font-family: Helvetica;
        font-size: 18px;
        margin-bottom: 2px;
        padding: 6px 12px;
    }

    .toc ol ol li{
        background: #fff;
        color: #4d4d4d;
        font-weight: 400;
        font-size: 15px;
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        margin-top: 0px;
        margin-bottom: 0px;
        padding: 3px 12px;
    } 
    
    .section_title{
        background-color: #3f4d63;
        color: white;
        font-family: Helvetica;
        font-size: 25px;
        padding: 6px 12px;
        margin-bottom: 5px;
    }
    .subsection_title{
        background: #235f83;
        color: white;
        font-family: Helvetica;
        font-size: 21px;
        padding: 6px 12px;
        margin-bottom: 0px;
    }
    .sidenote{
        font-size: 13px;
        border: 1px solid #d7d7d7;
        padding: 1px 10px 2px;
        box-shadow: 1px 1px 2px 1px rgba(0,0,0,0.3);
        margin-bottom: 3px;
    }
    </style>
    </head>
    <body>
        <div class="toc">
        
        <ol> 
        <h2> Table of Contents </h2>
        <li>1. Introduction </li> 
        <li>2. Basic Data Exploration</li>
        <li>3. Utility functions</li>
        <li>4. Image and Masks Visualizations</li>
        <ol> 
            <li>4.1 Train Images </li>
            <li>4.2 Train Images and Masks </li> 
            <li>4.3 Test Images </li> 
            <li>4.4 Plot Image and Mask </li> 
            <li>4.5 Plot Sliced Image and Mask </li> 
            <li>4.6 Plot Grid Image with Mask </li> 
        </ol>
        <li>5. Metadata Analysis</li>
        <li>6. References </li>
        </ol>
        </div>
    </body>
</html>
"""

# Wrap the markup in an IPython HTML object; as the cell's final expression it is
# what the notebook displays.
HTML(html_contents)

# 1. Introduction

The Human BioMolecular Atlas Program (HuBMAP) is working to catalyze the development of a framework for mapping the human body at the level of glomeruli functional tissue units (FTUs). HuBMAP aims to be an open map of the human body at the cellular level. 

In this challenge, we will detect the FTUs across different tissue preparation pipelines. An FTU is a three dimensional block of cells centered around a capillary, such that each cell in this block is within diffusion distance from any other cell in the same block (de Bono, 2013).

The HuBMAP data used in this competition includes 11 frozen and 9 Formalin fixed Paraffin Embedded (FFPE) PAS kidney images. Glomeruli FTU annotations exist for all 20 tissue samples.

In [None]:
!pip install -q -U pip
!pip install -q -U seaborn

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import tifffile
from PIL import Image
import cv2
import os
from tqdm.notebook import tqdm
import zipfile

# 2. Basic Data Exploration

In [None]:
# Root directory of the competition data and the folder holding the training images.
BASE_PATH = "../input/hubmap-kidney-segmentation/"
TRAIN_PATH = os.path.join(BASE_PATH, "train")

# List what the dataset root contains.
dataset_entries = os.listdir(BASE_PATH)
print(dataset_entries)

In [None]:
# Configuration for cutting the training images into tiles.
sz = 256       # Size of the tiles (pixels per side)
reduce = 4     # Reduce the original images by 4 times
# Input locations are derived from BASE_PATH so the dataset root is defined in
# exactly one place; the resulting strings are the same as the previous literals.
MASKS = os.path.join(BASE_PATH, 'train.csv')    # CSV with one RLE mask encoding per image id
DATA = os.path.join(BASE_PATH, 'train', '')     # training image folder (with trailing separator)
# Output archives written by the tiling cell.
OUT_TRAIN = 'train.zip'
OUT_MASKS = 'masks.zip'

**Train masks**

train.csv contains the unique IDs of each image, as well as RLE-encoded representation of the mask for the objects of the image.

In [None]:
# Load the training table (image ids and their mask encodings) and display it.
train_csv_path = os.path.join(BASE_PATH, "train.csv")
df_train = pd.read_csv(train_csv_path)
df_train

**Submission df**

In [None]:
# Load the sample submission table and display it.
submission_csv_path = os.path.join(BASE_PATH, "sample_submission.csv")
df_sub = pd.read_csv(submission_csv_path)
df_sub

**Number of Samples**

In [None]:
# One row per image in each table, so the row counts are the image counts.
for count_message in (
    f"Number of train images: {df_train.shape[0]}",
    f"Number of test images: {df_sub.shape[0]}",
):
    print(count_message)


**Train and test metadata**

HuBMAP-20-dataset_information.csv contains additional information about each image, including anonymized patient data.

In [None]:
# Load the per-image metadata table and show five randomly chosen rows.
info_csv_path = os.path.join(BASE_PATH, "HuBMAP-20-dataset_information.csv")
df_info = pd.read_csv(info_csv_path)
df_info.sample(5)

# 3. Utility functions

In [None]:
def rle2mask(mask_rle, shape):
    """
    Decode a run-length string into a binary mask.

    mask_rle : whitespace-separated "start length start length ..." string,
               with 1-based start positions
    shape    : (width, height) of the mask; the flat buffer is reshaped to
               this shape and then transposed
    Returns a uint8 numpy array of shape (height, width): 1 - mask, 0 - background
    """
    tokens = mask_rle.split()
    # Even-indexed tokens are run starts (1-based), odd-indexed tokens are run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(run_starts, run_ends):
        flat[begin:stop] = 1
    return flat.reshape(shape).T

def read_image(image_id, scale=None, verbose=1):
    """
    Load a training image and decode its mask from `df_train`.

    image_id : id of the image; the file is read from `train/{image_id}.tiff`
               under BASE_PATH and the mask from the matching `df_train` row
    scale    : optional integer divisor; when truthy, both image and mask are
               shrunk to (width // scale, height // scale)
    verbose  : when truthy, print the shapes before (and after) resizing
    Returns (image, mask)
    """
    image = tifffile.imread(
        os.path.join(BASE_PATH, f"train/{image_id}.tiff")
    )
    # Some files load as a 5-D array; drop the singleton axes and reorder the rest.
    # NOTE(review): assumes the squeezed array is channel-first (C, H, W) - confirm.
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    encoding = df_train.loc[df_train["id"] == image_id, "encoding"].iloc[0]
    # rle2mask takes (width, height) and returns an array of shape (height, width).
    mask = rle2mask(encoding, (image.shape[1], image.shape[0]))

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")
        print(f"[{image_id}] Mask shape: {mask.shape}")

    if scale:
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        # Area averaging is the interpolation OpenCV recommends for shrinking images.
        image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
        # The mask is a label map: nearest-neighbour keeps it strictly 0/1 instead
        # of blending labels the way the default bilinear interpolation does.
        mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)

        if verbose:
            print(f"[{image_id}] Resized Image shape: {image.shape}")
            print(f"[{image_id}] Resized Mask shape: {mask.shape}")

    return image, mask

def read_test_image(image_id, scale=None, verbose = 1):
    """
    Load a test image (test images have no mask).

    image_id : id of the image; the file is read from `test/{image_id}.tiff`
               under BASE_PATH
    scale    : optional integer divisor; when truthy, the image is shrunk to
               (width // scale, height // scale)
    verbose  : when truthy, print the shape before (and after) resizing
    Returns the image array
    """
    image = tifffile.imread(
        os.path.join(BASE_PATH, f"test/{image_id}.tiff")
    )
    # Some files load as a 5-D array; drop the singleton axes and reorder the rest.
    # NOTE(review): assumes the squeezed array is channel-first (C, H, W) - confirm.
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")

    if scale:
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        # Area averaging is the interpolation OpenCV recommends for shrinking images.
        image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)

        # Report the resized shape only when a resize actually happened
        # (previously this was printed even with scale=None).
        if verbose:
            print(f"[{image_id}] Resize Image shape: {image.shape}")

    return image

def plot_image_and_mask(image, mask, image_id):
    """
    Show three panels side by side: the image, the image with the mask
    overlaid at 50% opacity, and the mask alone.
    """
    plt.figure(figsize=(16, 10))

    # Each panel is (title, [(array, imshow keyword arguments), ...]) drawn in order.
    panels = [
        (f"Image {image_id}", [(image, {})]),
        (f"Image {image_id} + mask", [(image, {}), (mask, {"cmap": "hot", "alpha": 0.5})]),
        (f"Mask", [(mask, {"cmap": "hot"})]),
    ]
    for slot, (caption, layers) in enumerate(panels, start=1):
        plt.subplot(1, 3, slot)
        for pixels, imshow_kwargs in layers:
            plt.imshow(pixels, **imshow_kwargs)
        plt.title(caption, fontsize=18)

    plt.show()
    

def plot_grid_image_with_mask(image, mask):
    """
    Show the largest centered square crop of the image with the mask
    overlaid at 50% opacity, without axes.
    """
    plt.figure(figsize=(16, 16))

    n_rows, n_cols = image.shape[0], image.shape[1]
    side = min(n_rows, n_cols)
    top = (n_rows - side) // 2
    left = (n_cols - side) // 2
    # Same centered window is applied to both the image and the mask.
    window = (slice(top, top + side), slice(left, left + side))

    plt.imshow(image[window])
    plt.imshow(mask[window], cmap="hot", alpha=0.5)
    plt.axis("off")
    plt.show()
    

def plot_slice_image_and_mask(image, mask, start_h, end_h, start_w, end_w):
    """
    Crop rows start_h:end_h and columns start_w:end_w from the image and mask,
    then show the crop, the crop with the mask overlaid, and the mask crop alone.
    """
    plt.figure(figsize=(16, 5))

    row_span = slice(start_h, end_h)
    col_span = slice(start_w, end_w)
    sub_image = image[row_span, col_span, :]
    sub_mask = mask[row_span, col_span]

    # Each panel is a list of (array, imshow keyword arguments) drawn in order.
    panels = [
        [(sub_image, {})],
        [(sub_image, {}), (sub_mask, {"cmap": "hot", "alpha": 0.5})],
        [(sub_mask, {"cmap": "hot"})],
    ]
    for slot, layers in enumerate(panels, start=1):
        plt.subplot(1, 3, slot)
        for pixels, imshow_kwargs in layers:
            plt.imshow(pixels, **imshow_kwargs)
        plt.axis("off")

    plt.show()
    
def enc2mask(encs, shape):
    """
    Decode a sequence of run-length strings into one labelled mask.

    encs  : iterable of RLE strings ("start length ..." with 1-based starts);
            NaN entries are skipped
    shape : (width, height) of the mask; the flat buffer is reshaped to this
            shape and then transposed
    Returns an int8 numpy array of shape (height, width) in which pixels of the
    m-th encoding (0-based) hold the value m + 1 and background is 0.
    """
    img = np.zeros(shape[0] * shape[1], dtype=np.int8)
    for m, enc in enumerate(encs):
        # Missing encodings arrive as float NaN. `np.float` was only an alias of
        # the builtin `float` and was removed in NumPy 1.24, so test against
        # `float` directly (np.float64 is a subclass of it).
        if isinstance(enc, float) and np.isnan(enc):
            continue
        s = enc.split()
        for i in range(len(s) // 2):
            start = int(s[2 * i]) - 1       # RLE starts are 1-based
            length = int(s[2 * i + 1])
            img[start:start + length] = 1 + m
    return img.reshape(shape).T

def mask2enc(mask, n = 1):
    """
    Run-length encode the labels 1..n of a mask.

    The mask is transposed and flattened before encoding. For each label the
    result is a "start length start length ..." string with 1-based starts,
    or NaN when the label does not occur.
    Returns a list with one entry per label.
    """
    flat = mask.T.flatten()

    def _encode_label(label):
        hits = (flat == label).astype(np.int8)
        if hits.sum() == 0:
            return np.nan
        # Zero-pad both ends so every run has a rising and a falling edge.
        padded = np.concatenate([[0], hits, [0]])
        edges = np.where(padded[1:] != padded[:-1])[0] + 1
        # Turn each (start, end) pair into (start, length).
        edges[1::2] -= edges[::2]
        return ' '.join(str(value) for value in edges)

    return [_encode_label(label) for label in range(1, n + 1)]

# Load the mask table and index it by image id, so that iterating rows
# yields (image id, row of encodings) pairs.
masks_table = pd.read_csv(MASKS)
df_masks = masks_table.set_index('id')
df_masks.head()
            

In [None]:
# Cut every training image and its mask into sz x sz tiles (after shrinking by
# `reduce`), write the kept tiles as PNGs into two zip archives, and accumulate
# per-channel statistics over the kept tiles.
s_th = 40  # saturation blanking threshold: a pixel counts as "saturated" when its HSV S value exceeds this
p_th = 200*sz//256 # threshold for the minimum number of pixels (scales with the tile size)

# Per-tile channel means of im/255 and of (im/255)**2, used for the statistics below
x_tot,x2_tot = [],[]
with zipfile.ZipFile(OUT_TRAIN, 'w') as img_out,\
 zipfile.ZipFile(OUT_MASKS, 'w') as mask_out:
    # df_masks is indexed by image id, so `index` is the id and `encs` the row of encodings
    for index, encs in tqdm(df_masks.iterrows(),total=len(df_masks)):
        # read image and generate the mask
        img = tifffile.imread(os.path.join(DATA,index+'.tiff'))
        # 5-D arrays are squeezed and reordered
        # NOTE(review): assumes the squeezed array is channel-first (C, H, W) - confirm
        if len(img.shape) == 5: img = np.transpose(img.squeeze(), (1,2,0))
        # enc2mask takes (width, height) and returns an array of shape (height, width)
        mask = enc2mask(encs,(img.shape[1],img.shape[0]))

        # add padding to make the image divisible into tiles:
        # each side is padded up to the next multiple of reduce*sz, split evenly between both ends
        shape = img.shape
        pad0 = (reduce*sz - shape[0]%(reduce*sz))%(reduce*sz)
        pad1 = (reduce*sz - shape[1]%(reduce*sz))%(reduce*sz)
        img = np.pad(img,[[pad0//2,pad0-pad0//2],[pad1//2,pad1-pad1//2],[0,0]],
                    constant_values=0)
        mask = np.pad(mask,[[pad0//2,pad0-pad0//2],[pad1//2,pad1-pad1//2]],
                    constant_values=0)

        # split image and mask into tiles using the reshape+transpose trick:
        # shrink by `reduce`, view as (tile rows, sz, tile cols, sz, ...) and
        # reorder so that each sz x sz tile becomes one entry along the first axis
        img = cv2.resize(img,(img.shape[1]//reduce,img.shape[0]//reduce),
                         interpolation = cv2.INTER_AREA)
        img = img.reshape(img.shape[0]//sz,sz,img.shape[1]//sz,sz,3)
        img = img.transpose(0,2,1,3,4).reshape(-1,sz,sz,3)

        # nearest-neighbour interpolation keeps the mask values as labels
        mask = cv2.resize(mask,(mask.shape[1]//reduce,mask.shape[0]//reduce),
                          interpolation = cv2.INTER_NEAREST)
        mask = mask.reshape(mask.shape[0]//sz,sz,mask.shape[1]//sz,sz)
        mask = mask.transpose(0,2,1,3).reshape(-1,sz,sz)

        # write data
        for i,(im,m) in enumerate(zip(img,mask)):
            # remove black or gray images based on saturation check:
            # skip a tile with at most p_th saturated pixels or with a near-zero pixel sum
            # NOTE(review): the tile is converted with COLOR_BGR2HSV here but with
            # COLOR_RGB2BGR before encoding below - confirm the channel order of `im`
            hsv = cv2.cvtColor(im, cv2.COLOR_BGR2HSV)
            h, s, v = cv2.split(hsv)
            if (s>s_th).sum() <= p_th or im.sum() <= p_th: continue
            
            # per-channel mean and mean of squares of this tile, on a 0..1 scale
            x_tot.append((im/255.0).reshape(-1,3).mean(0))
            x2_tot.append(((im/255.0)**2).reshape(-1,3).mean(0))
            
            # image and mask tiles share the same entry name in their respective archives
            im = cv2.imencode('.png',cv2.cvtColor(im, cv2.COLOR_RGB2BGR))[1]
            img_out.writestr(f'{index}_{i}.png', im)
            m = cv2.imencode('.png',m)[1]
            mask_out.writestr(f'{index}_{i}.png', m)

# image stats: mean over tiles, and std computed as sqrt(E[x^2] - E[x]^2)
img_avr =  np.array(x_tot).mean(0)
img_std =  np.sqrt(np.array(x2_tot).mean(0) - img_avr**2)
print('mean:',img_avr, ', std:', img_std)

# 4. Image and Masks Visualizations

In [None]:
# Ids of the training images to preview.
small_ids = [
    "0486052bb", "095bf7a1f", "1e2425f28", "2f6ecfcdf",
    "54f2eec69", "aaa6a05cc", "cb2d976f4", "e79de561c",
]

# Load each image with its mask, shrunk 20x per side, without shape printouts.
small_pairs = [read_image(train_id, scale=20, verbose=0) for train_id in small_ids]
small_images = [thumbnail for thumbnail, _mask in small_pairs]
small_masks = [thumb_mask for _image, thumb_mask in small_pairs]

**4.1 Train Images**

In [None]:
# Draw the downscaled training images on a 3x3 grid, one image per slot.
plt.figure(figsize=(16, 16))
for slot, (thumb_id, thumbnail) in enumerate(zip(small_ids, small_images), start=1):
    plt.subplot(3, 3, slot)
    plt.imshow(thumbnail)
    plt.axis("off")

**4.2 Train images and masks**

In [None]:
# Same 3x3 grid as above, with each mask overlaid on its image at 50% opacity.
plt.figure(figsize=(16, 16))
thumbnail_triples = zip(small_ids, small_images, small_masks)
for slot, (thumb_id, thumbnail, thumb_mask) in enumerate(thumbnail_triples, start=1):
    plt.subplot(3, 3, slot)
    plt.imshow(thumbnail)
    plt.imshow(thumb_mask, cmap="hot", alpha=0.5)
    plt.axis("off")

In [None]:
# Ids of the test images to preview.
# NOTE: this rebinds `small_ids` and `small_images`, which held the training
# previews until now; `small_masks` is left untouched.
small_ids = [
    "26dc41664", "afa5e8098", "b2dc8411c", "b9a3865fc", "c68fe75ea",
]
# Load each test image shrunk 20x per side, without shape printouts.
small_images = [read_test_image(test_id, scale=20, verbose=0) for test_id in small_ids]

**4.3 Test Images**

In [None]:
# Draw the downscaled test images on a 2x3 grid, one image per slot.
plt.figure(figsize=(16, 11))
for slot, (thumb_id, thumbnail) in enumerate(zip(small_ids, small_images), start=1):
    plt.subplot(2, 3, slot)
    plt.imshow(thumbnail)
    plt.axis("off")

**4.4 Plot Image and Mask**

In [None]:
# Load one training image and its mask, shrunk 3x per side, and plot
# image / overlay / mask side by side.
image_id = "0486052bb"
image, mask = read_image(image_id, scale=3)
plot_image_and_mask(image, mask, image_id)

**4.5 Plot Sliced Image and Mask**

In [None]:
# Progressively tighter crops, given as (start_h, end_h, start_w, end_w).
slice_windows = [
    (5000, 7000, 2000, 5000),
    (5250, 6000, 3000, 4000),
    (5370, 5700, 3500, 3800),
]
for slice_window in slice_windows:
    plot_slice_image_and_mask(image, mask, *slice_window)


**4.6 Plot Grid Image with Mask**

In [None]:
# Show the centered square crop of the image loaded above with its mask overlaid.
plot_grid_image_with_mask(image, mask)

In [None]:
# Preview a rows x columns grid of the tiles written to the zip archives,
# each with its mask overlaid at 20% opacity.
columns, rows = 4, 4
idx0 = 20   # offset of the first previewed tile within `fnames`
fig = plt.figure(figsize=(columns*4, rows*4))
with zipfile.ZipFile(OUT_TRAIN, 'r') as img_arch, \
     zipfile.ZipFile(OUT_MASKS, 'r') as msk_arch:
    # Image and mask tiles share entry names, so one sorted list addresses both
    # archives; the first 8 names are skipped.
    fnames = sorted(img_arch.namelist())[8:]
    for i in range(rows):
        for j in range(columns):
            # Row-major slot index. The previous `i + j*columns` is only a valid
            # subplot position when rows == columns.
            idx = i * columns + j
            fname = fnames[idx0 + idx]
            img = cv2.imdecode(np.frombuffer(img_arch.read(fname), np.uint8), cv2.IMREAD_COLOR)
            # imdecode returns BGR; convert to RGB for matplotlib.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            mask = cv2.imdecode(np.frombuffer(msk_arch.read(fname), np.uint8), cv2.IMREAD_GRAYSCALE)

            ax = fig.add_subplot(rows, columns, idx + 1)
            ax.axis('off')
            ax.imshow(img)
            ax.imshow(mask, alpha=0.2)

plt.show()

# 5. Metadata Analysis

In [None]:
# Display the anatomical-structure JSON that accompanies training image 0486052bb.
anatomy_json_path = os.path.join(BASE_PATH, "train/0486052bb-anatomical-structure.json")
pd.read_json(anatomy_json_path)

In [None]:
# Display the JSON file stored next to training image 0486052bb.
image_json_path = os.path.join(BASE_PATH, "train/0486052bb.json")
pd.read_json(image_json_path)

In [None]:
# Tag every metadata row as "train" when its image file is present in the
# train folder, and as "test" otherwise.
train_folder_files = os.listdir(os.path.join(BASE_PATH, "train"))
in_train_folder = df_info["image_file"].isin(train_folder_files)
df_info["split"] = "test"
df_info.loc[in_train_folder, "split"] = "train"

In [None]:
# Image area in pixels: width times height.
df_info["area"] = df_info["width_pixels"].mul(df_info["height_pixels"])

In [None]:
# Show the first three metadata rows, including the new "split" and "area" columns.
df_info.iloc[:3]

# 6. References 

https://www.kaggle.com/ihelon/hubmap-exploratory-data-analysis

https://www.kaggle.com/iafoss/256x256-images
