# HuBMAP - Exploratory Data Analysis

A quick exploratory data analysis for the [HuBMAP: Hacking the Kidney](https://www.kaggle.com/c/hubmap-kidney-segmentation) challenge.

The HuBMAP data used in this hackathon includes 11 fresh frozen and 9 Formalin Fixed Paraffin Embedded (FFPE) PAS kidney images. Glomeruli FTU annotations exist for all 20 tissue samples; some of these will be shared for training, and others will be used to judge submissions.

![](https://storage.googleapis.com/kaggle-competitions/kaggle/22990/logos/header.png)

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:white; background:#EAA6D1; border:0' role="tab" aria-controls="home"><center>Quick Navigation</center></h3>

* [1. Basic Data Exploration](#1)
* [2. Image and Masks Visualizations](#2)
* [3. Metadata Analysis](#3)

In [None]:
!pip install -q -U pip
!pip install -q -U seaborn

<a id="1"></a>
<h2 style='background:#EAA6D1; border:0; color:white'><center>Basic Data Exploration</center></h2>

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import cv2
import tifffile

In [None]:
# Root directory of the competition data (relative path).
BASE_PATH = "../input/hubmap-kidney-segmentation/"
# Sub-directory holding the training files.
TRAIN_PATH = os.path.join(BASE_PATH, "train")

# Show what the dataset root contains.
print(os.listdir(BASE_PATH))

### Train masks

**train.csv** contains the unique IDs for each image, as well as an RLE-encoded representation of the mask for the objects in the image. See the evaluation tab for details of the RLE encoding scheme.

In [None]:
# Training table: image ids together with their RLE-encoded masks.
df_train = pd.read_csv(os.path.join(BASE_PATH, "train.csv"))
df_train

### Submission df

In [None]:
# Sample submission table.
df_sub = pd.read_csv(os.path.join(BASE_PATH, "sample_submission.csv"))
df_sub

### Number of samples

In [None]:
# Row counts of the train table and the sample submission.
n_train_rows = len(df_train)
n_test_rows = len(df_sub)
print(f"Number of train images: {n_train_rows}")
print(f"Number of test images: {n_test_rows}")

### Train and test metadata

**HuBMAP-20-dataset_information.csv** contains additional information (including anonymized patient data) about each image.

In [None]:
# Per-image metadata table; preview three randomly chosen rows.
df_info = pd.read_csv(os.path.join(BASE_PATH, "HuBMAP-20-dataset_information.csv"))
df_info.sample(3)

## Utility functions

In [None]:
# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def rle2mask(mask_rle, shape):
    '''
    Decode a run-length-encoded string into a binary mask.

    mask_rle: whitespace-separated "start length" pairs, where each start
        is a 1-based index into the flattened mask
    shape: (width, height) of the image the mask belongs to
    Returns a uint8 numpy array of shape (height, width):
        1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions are run starts, odd positions are run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # make 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(run_starts, run_starts + run_lengths):
        flat[begin:stop] = 1

    # The buffer is filled as (width, height); transpose to (height, width).
    return flat.reshape(shape).T


def read_image(image_id, scale=None, verbose=1):
    '''
    Load a training image together with its decoded mask.

    image_id: file stem of the TIFF under "train/" and the value looked up
        in the "id" column of df_train
    scale: optional downscale factor; when truthy, image and mask are both
        resized to (width // scale, height // scale)
    verbose: when truthy, print the image and mask shapes (before and
        after resizing)
    Returns (image, mask) as numpy arrays
    '''
    image = tifffile.imread(
        os.path.join(BASE_PATH, f"train/{image_id}.tiff")
    )
    # Some files load as 5-D arrays: drop the singleton axes and move the
    # leading axis last (presumably channels-first -> channels-last;
    # TODO confirm against the raw files).
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    # rle2mask expects (width, height) and returns a (height, width) mask.
    mask = rle2mask(
        df_train[df_train["id"] == image_id]["encoding"].values[0],
        (image.shape[1], image.shape[0]),
    )

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")
        print(f"[{image_id}] Mask shape: {mask.shape}")

    if scale:
        # cv2.resize takes the target size as (width, height).
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        image = cv2.resize(image, new_size)
        # The mask is a label map, not a photograph: use nearest-neighbour
        # so labels are picked from the source, never blended.
        mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)

        if verbose:
            print(f"[{image_id}] Resized Image shape: {image.shape}")
            print(f"[{image_id}] Resized Mask shape: {mask.shape}")

    return image, mask


def read_test_image(image_id, scale=None, verbose=1):
    '''
    Load a test image, optionally downscaled.

    image_id: file stem of the TIFF under "test/"
    scale: optional downscale factor; when truthy, the image is resized to
        (width // scale, height // scale)
    verbose: when truthy, print the image shape (before and after resizing)
    Returns the image as a numpy array
    '''
    tiff_path = os.path.join(BASE_PATH, f"test/{image_id}.tiff")
    image = tifffile.imread(tiff_path)

    # 5-D arrays: drop the singleton axes and move the leading axis last.
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")

    if not scale:
        return image

    n_rows, n_cols = image.shape[0], image.shape[1]
    # cv2.resize takes the target size as (width, height).
    image = cv2.resize(image, (n_cols // scale, n_rows // scale))

    if verbose:
        print(f"[{image_id}] Resized Image shape: {image.shape}")

    return image


def plot_image_and_mask(image, mask, image_id):
    '''
    Show three panels side by side: the image, the image with the mask
    overlaid at 50% opacity, and the mask alone.
    '''
    # (title, draw the image?, imshow keyword arguments for the mask or None)
    panels = [
        (f"Image {image_id}", True, None),
        (f"Image {image_id} + mask", True, {"cmap": "hot", "alpha": 0.5}),
        ("Mask", False, {"cmap": "hot"}),
    ]

    plt.figure(figsize=(16, 10))
    for position, (title, with_image, mask_style) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        if with_image:
            plt.imshow(image)
        if mask_style is not None:
            plt.imshow(mask, **mask_style)
        plt.title(title, fontsize=18)

    plt.show()
    
    
def plot_grid_image_with_mask(image, mask):
    '''
    Show the largest centred square crop of the image with the mask
    overlaid at 50% opacity, axes hidden.
    '''
    plt.figure(figsize=(16, 16))

    n_rows, n_cols = image.shape[0], image.shape[1]
    side = min(n_rows, n_cols)
    # Offsets that centre a side x side window on each axis.
    row0 = (n_rows - side) // 2
    col0 = (n_cols - side) // 2
    square = (slice(row0, row0 + side), slice(col0, col0 + side))

    plt.imshow(image[square])
    plt.imshow(mask[square], cmap="hot", alpha=0.5)
    plt.axis("off")

    plt.show()
    

def plot_slice_image_and_mask(image, mask, start_h, end_h, start_w, end_w):
    '''
    Show a rectangular crop in three panels: the image, the image with the
    mask overlaid at 50% opacity, and the mask alone.

    start_h, end_h: row range of the crop
    start_w, end_w: column range of the crop
    '''
    plt.figure(figsize=(16, 5))

    rows = slice(start_h, end_h)
    cols = slice(start_w, end_w)
    crop = image[rows, cols, :]
    crop_mask = mask[rows, cols]

    # (draw the image crop?, imshow keyword arguments for the mask or None)
    panels = [
        (True, None),
        (True, {"cmap": "hot", "alpha": 0.5}),
        (False, {"cmap": "hot"}),
    ]
    for position, (with_image, mask_style) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        if with_image:
            plt.imshow(crop)
        if mask_style is not None:
            plt.imshow(crop_mask, **mask_style)
        plt.axis("off")

    plt.show()

<a id="2"></a>
<h2 style='background:#EAA6D1; border:0; color:white'><center>Image and Masks Visualizations</center></h2>

In [None]:
# Training images to preview as thumbnails.
small_ids = [
    "0486052bb", "095bf7a1f", "1e2425f28", "2f6ecfcdf",
    "54f2eec69", "aaa6a05cc", "cb2d976f4", "e79de561c",
]

# Load each one downscaled by a factor of 20, without the shape printout.
small_pairs = [read_image(small_id, scale=20, verbose=0) for small_id in small_ids]
small_images = [pair[0] for pair in small_pairs]
small_masks = [pair[1] for pair in small_pairs]

## Train images

In [None]:
# Lay the thumbnails out on a 3x3 grid, one axes-free panel per image.
plt.figure(figsize=(16, 16))
for slot, thumbnail in enumerate(small_images, start=1):
    plt.subplot(3, 3, slot)
    plt.imshow(thumbnail)
    plt.axis("off")

## Train images + masks

In [None]:
# Same 3x3 grid, with each mask drawn over its thumbnail at 50% opacity.
plt.figure(figsize=(16, 16))
for slot, (thumbnail, thumbnail_mask) in enumerate(zip(small_images, small_masks), start=1):
    plt.subplot(3, 3, slot)
    plt.imshow(thumbnail)
    plt.imshow(thumbnail_mask, cmap="hot", alpha=0.5)
    plt.axis("off")

In [None]:
# Test images to preview; small_ids and small_images are rebound here,
# replacing the training thumbnails loaded earlier.
small_ids = [
    "26dc41664", "afa5e8098", "b2dc8411c", "b9a3865fc", "c68fe75ea",
]
# Load each one downscaled by a factor of 20, without the shape printout.
small_images = [
    read_test_image(small_id, scale=20, verbose=0) for small_id in small_ids
]

## Test images

In [None]:
# Lay the test thumbnails out on a 2x3 grid, one axes-free panel per image.
plt.figure(figsize=(16, 11))
for slot, thumbnail in enumerate(small_images, start=1):
    plt.subplot(2, 3, slot)
    plt.imshow(thumbnail)
    plt.axis("off")

## 0486052bb

In [None]:
# Load the image and its mask with both dimensions divided by 2.
image_id = "0486052bb"
image, mask = read_image(image_id, scale=2)

In [None]:
# Image, image + mask overlay, and mask alone, side by side.
plot_image_and_mask(image, mask, image_id)

In [None]:
# Three nested crops of the same region, each tighter than the last,
# given as (start_h, end_h, start_w, end_w).
zoom_windows = [
    (5000, 7500, 2500, 5000),
    (5250, 5720, 3500, 4000),
    (5375, 5575, 3650, 3850),
]
for zoom_window in zoom_windows:
    plot_slice_image_and_mask(image, mask, *zoom_window)

In [None]:
# Centred square crop of the loaded image with the mask overlaid.
plot_grid_image_with_mask(image, mask)

## 095bf7a1f

In [None]:
# Load the image and its mask with both dimensions divided by 2.
image_id = "095bf7a1f"
image, mask = read_image(image_id, scale=2)

In [None]:
# Image, image + mask overlay, and mask alone, side by side.
plot_image_and_mask(image, mask, image_id)

In [None]:
# Crop rows 7500-10000 and columns 10000-12500 of the currently loaded image.
plot_slice_image_and_mask(image, mask, 7500, 10000, 10000, 12500)

In [None]:
# Centred square crop of the loaded image with the mask overlaid.
plot_grid_image_with_mask(image, mask)

## 1e2425f28

In [None]:
# Load the image and its mask with both dimensions divided by 2.
image_id = "1e2425f28"
image, mask = read_image(image_id, scale=2)

In [None]:
# Image, image + mask overlay, and mask alone, side by side.
plot_image_and_mask(image, mask, image_id)

## 2f6ecfcdf

In [None]:
# Load the image and its mask with both dimensions divided by 2.
image_id = "2f6ecfcdf"
image, mask = read_image(image_id, scale=2)

In [None]:
# Image, image + mask overlay, and mask alone, side by side.
plot_image_and_mask(image, mask, image_id)

In [None]:
# Crop rows 10000-12000 and columns 8000-10000 of the currently loaded image.
plot_slice_image_and_mask(image, mask, 10000, 12000, 8000, 10000)

## aaa6a05cc

In [None]:
# No scale is passed, so the image and mask are not resized.
image_id = "aaa6a05cc"
image, mask = read_image(image_id)

In [None]:
# Image, image + mask overlay, and mask alone, side by side.
plot_image_and_mask(image, mask, image_id)

In [None]:
# Crop rows 6500-8500 and columns 7000-9000 of the currently loaded image.
plot_slice_image_and_mask(image, mask, 6500, 8500, 7000, 9000)

## e79de561c

In [None]:
# No scale is passed, so the image and mask are not resized.
image_id = "e79de561c"
image, mask = read_image(image_id)

In [None]:
# Image, image + mask overlay, and mask alone, side by side.
plot_image_and_mask(image, mask, image_id)

In [None]:
# Crop rows 4000-6000 and columns 2000-4000 of the currently loaded image.
plot_slice_image_and_mask(image, mask, 4000, 6000, 2000, 4000)

<a id="3"></a>
<h2 style='background:#EAA6D1; border:0; color:white'><center>Metadata Analysis</center></h2>

In [None]:
# Anatomical-structure JSON stored next to training image 0486052bb.
pd.read_json(os.path.join(BASE_PATH, "train/0486052bb-anatomical-structure.json"))

In [None]:
# The other JSON file stored next to training image 0486052bb.
pd.read_json(os.path.join(BASE_PATH, "train/0486052bb.json"))

In [None]:
# Label every row "test" first, then flip to "train" the rows whose
# image file is present in the train directory.
train_dir_files = os.listdir(os.path.join(BASE_PATH, "train"))
in_train_dir = df_info["image_file"].isin(train_dir_files)

df_info["split"] = "test"
df_info.loc[in_train_dir, "split"] = "train"

In [None]:
# Image area in pixels: width times height.
df_info["area"] = df_info["width_pixels"] * df_info["height_pixels"]

In [None]:
# Preview the first rows, now including the "split" and "area" columns.
df_info.head()

In [None]:
# One panel per metadata column, coloured by split: count plots for the
# first four columns, histograms for the remaining seven.
count_columns = ["race", "ethnicity", "sex", "laterality"]
hist_columns = [
    "age",
    "weight_kilograms",
    "height_centimeters",
    "bmi_kg/m^2",
    "percent_cortex",
    "percent_medulla",
    "area",
]
metadata_panels = [(sn.countplot, column) for column in count_columns]
metadata_panels += [(sn.histplot, column) for column in hist_columns]

plt.figure(figsize=(16, 35))
for slot, (draw_panel, column) in enumerate(metadata_panels, start=1):
    plt.subplot(6, 2, slot)
    draw_panel(x=column, hue="split", data=df_info)

# WORK IN PROGRESS...