# HuBMAP + HPA - Hacking the Human Body
The task in this competition is to segment the functional tissue units (FTUs) found in organs such as the lung, kidney, spleen, prostate and large intestine.

![Competition Image](https://storage.googleapis.com/kaggle-competitions/kaggle/34547/logos/header.png?t=2022-02-15-22-37-27)

## Table of Contents
1. [Goals](#Goals)
2. [Getting Started](#Getting-Started)
    1. [Train Data](#Train-Data)
    2. [Test Data](#Test-Data)
3. [Visualizations](#Visualizations)
    1. [Train Images](#Train-Images)
    2. [Test Image](#Test-Image)
4. [Detailed view of Train Images by category and sex](#Detailed-view-of-Train-Images-by-category-and-sex)
    1. [Spleen Male](#Spleen-Male:-19360)
    2. [Spleen Female](#Spleen-Female:-18792)
    3. [Kidney Male](#Kidney-Male:-15192)
    4. [Kidney Female](#Kidney-Female:-24522)
    5. [Lung Male](#Lung-Male:-24782)
    6. [Lung Female](#Lung-Female:-27232)
    7. [Prostate](#Prostate:-30424)
    8. [Large Intestine Male](#Large-Intestine-Male:-21812)
    9. [Large Intestine Female](#Large-Intestine-Female:-4062)
5. [Analyzing the Meta-Data](#Analyzing-the-Meta-Data)

## Goals
The goal of this competition is to identify the locations of each functional tissue unit (FTU) in biopsy slides from several different organs. The underlying data includes imagery from different sources prepared with different protocols at a variety of resolutions, reflecting typical challenges for working with medical data.

<br>

<font size=4 color='blue'>If you find this notebook useful, leave an upvote, that motivates me to write more such notebooks.</font>

<br>

---
**NOTE:**

<font size=4 color='red'> This notebook is still a work in progress! </font>

---

## Getting Started <a name="getting-started"></a>

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")

import seaborn as sns

import tifffile
import cv2

In [None]:
# Seed passed to seed_everything() and to every DataFrame.sample() call.
RANDOM_SEED = 42

# Dataset locations; the sub-folders are all derived from the dataset root.
BASE_DIR = "../input/hubmap-organ-segmentation"
TRAIN_DIR = f"{BASE_DIR}/train_images"
TEST_DIR = f"{BASE_DIR}/test_images"
LABEL_DIR = f"{BASE_DIR}/train_annotations"

In [None]:
def seed_everything(seed):
    """
    Fix the seeds this notebook relies on so results can be reproduced.

    Only the ``PYTHONHASHSEED`` environment variable and NumPy's global
    random generator are seeded here.

    Arg:
        seed {int} -- Number for the seed
    """
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

seed_everything(RANDOM_SEED)

### Train Data
`train.csv` contains the RLE encoded masks and some metadata which could be very useful. 

In [None]:
train_df = pd.read_csv(os.path.join(BASE_DIR, "train.csv"))
train_df.sample(5)

The number of images in the training set is only just over 350.

This might push us towards using external data from the HuBMAP portal [https://portal.hubmapconsortium.org](https://portal.hubmapconsortium.org/), transfer learning, heavy augmentation, etc.


In [None]:
train_df.info()

`pixel_size` and `tissue_thickness` might be more or less the same throughout the dataset.

In [None]:
train_df.describe()

### Test Data

This competition uses a hidden test dataset. It is mentioned in the data description that we can expect around 550 images in the test set.

In [None]:
test_df = pd.read_csv(os.path.join(BASE_DIR, "test.csv"))
test_df

In [None]:
# Credits: https://www.kaggle.com/code/ihelon/hubmap-exploratory-data-analysis

# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def rle2mask(mask_rle, shape):
    '''
    Decode a run-length encoded mask string into a 2-D array.

    mask_rle: whitespace-separated "start length" pairs, where each start
        is a 1-based index into the flattened mask
    shape: two-element size; the flat mask is reshaped to it and then
        transposed, so the result has shape (shape[1], shape[0])
    Returns numpy uint8 array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions hold the run starts, odd positions the run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # make 0-based
    run_stops = run_starts + np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(run_starts, run_stops):
        flat[begin:stop] = 1
    return flat.reshape(shape).T


def read_image(image_id, scale=None, verbose=1):
    """
    Load a training image and decode its segmentation mask.

    Args:
        image_id: value looked up in the ``id`` column of ``train_df``; also
            the stem of the ``.tiff`` file read from ``train_images``.
        scale: optional divisor; when truthy, the image and the mask are
            both resized to ``(width // scale, height // scale)``.
        verbose: when truthy, print the shapes before and after resizing.

    Returns:
        ``(image, mask)`` tuple. The mask comes from ``rle2mask`` and holds
        1 where the RLE marks tissue and 0 elsewhere.

    Raises:
        IndexError: if ``image_id`` has no row in ``train_df``.
    """
    image = tifffile.imread(
        os.path.join(BASE_DIR, f"train_images/{image_id}.tiff")
    )
    # NOTE(review): presumably 5-D files are (1, 1, channels, H, W); the
    # squeeze + transpose turns that into (H, W, channels) -- confirm.
    if image.ndim == 5:
        image = image.squeeze().transpose(1, 2, 0)

    mask = rle2mask(
        train_df[train_df["id"] == image_id]["rle"].values[0],
        (image.shape[1], image.shape[0]),
    )

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")
        print(f"[{image_id}] Mask shape: {mask.shape}")

    if scale:
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        image = cv2.resize(image, new_size)
        # A label mask must not be blended: nearest-neighbour copies source
        # pixels, whereas cv2.resize's default is bilinear interpolation.
        mask = cv2.resize(mask, new_size, interpolation=cv2.INTER_NEAREST)

        if verbose:
            print(f"[{image_id}] Resized Image shape: {image.shape}")
            print(f"[{image_id}] Resized Mask shape: {mask.shape}")

    return image, mask


def read_test_image(image_id, scale=None, verbose=1):
    """
    Load a test image, optionally shrinking it.

    Args:
        image_id: stem of the ``.tiff`` file read from ``test_images``.
        scale: optional divisor; when truthy, the image is resized to
            ``(width // scale, height // scale)``.
        verbose: when truthy, print the shape before and after resizing.

    Returns:
        The (possibly resized) image array.
    """
    tiff_path = os.path.join(BASE_DIR, f"test_images/{image_id}.tiff")
    image = tifffile.imread(tiff_path)
    if len(image.shape) == 5:
        image = image.squeeze().transpose(1, 2, 0)

    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")

    # Nothing left to do when no down-scaling was requested.
    if not scale:
        return image

    n_rows, n_cols = image.shape[0], image.shape[1]
    image = cv2.resize(image, (n_cols // scale, n_rows // scale))
    if verbose:
        print(f"[{image_id}] Resized Image shape: {image.shape}")

    return image


def plot_image_and_mask(image, mask, image_id, cmap):
    """
    Draw three panels in one row: the image, the image with the mask
    blended on top, and the mask on its own.

    Args:
        image: array shown with ``plt.imshow``.
        mask: array drawn with ``cmap``.
        image_id: identifier used in the panel titles.
        cmap: matplotlib colormap used for the mask.
    """
    title_size = 18
    overlay_alpha = 0.5
    plt.figure(figsize=(16, 10))

    # Left panel: the image alone.
    plt.subplot(1, 3, 1)
    plt.imshow(image)
    plt.grid(visible=False)
    plt.title(f"Image {image_id}", fontsize=title_size)

    # Middle panel: the image with the semi-transparent mask on top.
    plt.subplot(1, 3, 2)
    plt.imshow(image)
    plt.grid(visible=False)
    plt.imshow(mask, cmap=cmap, alpha=overlay_alpha)
    plt.title(f"Image {image_id} + mask", fontsize=title_size)

    # Right panel: the mask alone.
    plt.subplot(1, 3, 3)
    plt.grid(visible=False)
    plt.imshow(mask, cmap=cmap)
    plt.title("Mask", fontsize=title_size)

    plt.show()
    
    
def plot_grid_image_with_mask(image, mask):
    """
    Show the largest centred square crop of the image with the mask
    overlaid in the "hot" colormap at 50% opacity.

    Args:
        image: array whose first two axes are cropped.
        mask: array cropped with the same window as the image.
    """
    plt.figure(figsize=(16, 16))

    n_rows, n_cols = image.shape[0], image.shape[1]
    side = min(n_rows, n_cols)
    # Centre the square window along both axes.
    top = (n_rows - side) // 2
    left = (n_cols - side) // 2
    window = (slice(top, top + side), slice(left, left + side))

    plt.imshow(image[window])
    plt.imshow(mask[window], cmap="hot", alpha=0.5)
    plt.axis("off")

    plt.show()
    

def plot_slice_image_and_mask(image, mask, start_h, end_h, start_w, end_w, cmap):
    """
    Plot a rectangular crop three ways: image, image with mask, mask only.

    Args:
        image: 3-D array; cropped on its first two axes.
        mask: 2-D array cropped with the same window.
        start_h, end_h: bounds of the crop along the first axis.
        start_w, end_w: bounds of the crop along the second axis.
        cmap: matplotlib colormap used for the mask.
    """
    plt.figure(figsize=(16, 5))

    rows = slice(start_h, end_h)
    cols = slice(start_w, end_w)
    sub_image = image[rows, cols, :]
    sub_mask = mask[rows, cols]

    # Left panel: the cropped image alone.
    plt.subplot(1, 3, 1)
    plt.imshow(sub_image)
    plt.axis("off")

    # Middle panel: the cropped image with the mask at 50% opacity.
    plt.subplot(1, 3, 2)
    plt.imshow(sub_image)
    plt.imshow(sub_mask, cmap=cmap, alpha=0.5)
    plt.axis("off")

    # Right panel: the cropped mask alone.
    plt.subplot(1, 3, 3)
    plt.imshow(sub_mask, cmap=cmap)
    plt.axis("off")

    plt.show()

In [None]:
train_df[train_df.organ == "largeintestine"].sample(1)

In [None]:
sampled_ids = [24782, 24522, 19360, 29238, 27232, 18792, 30424, 21812]
train_df[train_df["id"].isin(sampled_ids)]

## Visualizations

### Train Images

In [None]:
# Load each hand-picked sample once, downscaled by a factor of 20, and keep
# the images and masks in two parallel lists ordered like `sampled_ids`.
sampled_images = []
sampled_masks = []

for sampled_id in sampled_ids:
    tmp_image, tmp_mask = read_image(sampled_id, scale=20, verbose=0)
    sampled_images.append(tmp_image)
    sampled_masks.append(tmp_mask)

def get_image_masks_with_id(sampled_ids, scale=20, verbose=0):
    """
    Load the image and mask for every id in ``sampled_ids``.

    Args:
        sampled_ids: iterable of image ids, each passed to ``read_image``.
        scale: down-scaling divisor forwarded to ``read_image``
            (defaults to 20, the value previously hard-coded here).
        verbose: verbosity flag forwarded to ``read_image`` (defaults to 0).

    Returns:
        ``(images, masks)``: two lists ordered like ``sampled_ids``.
    """
    loaded = [
        read_image(sampled_id, scale=scale, verbose=verbose)
        for sampled_id in sampled_ids
    ]
    sampled_images = [image for image, _ in loaded]
    sampled_masks = [mask for _, mask in loaded]
    return sampled_images, sampled_masks

In [None]:
plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image) in enumerate(zip(sampled_ids, sampled_images)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.axis("off")

In [None]:
plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(sampled_ids, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="bwr", alpha=0.5)
    plt.axis("off")

### Test Image

In [None]:
image_id = 10078
test_image = read_test_image(image_id, scale=2, verbose=0)

plt.figure(figsize=(16, 16))
plt.imshow(test_image)
plt.axis("off")

## Detailed view of Train Images by category and sex

### Spleen Male: 19360

In [None]:
image_id = 19360
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 625, 1300, 70, 400, "plasma")
plot_slice_image_and_mask(image, mask, 1200, 1450, 390, 600, "plasma")
plot_slice_image_and_mask(image, mask, 450, 950, 380, 720, "plasma")
plot_slice_image_and_mask(image, mask, 470, 900, 1050, 1450, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "spleen") & (train_df["sex"] == "Male")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Male Spleen: {tmp_id}")
    plt.axis("off")

### Spleen Female: 18792

In [None]:
image_id = 18792
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 550, 870, 180, 460, "plasma")
plot_slice_image_and_mask(image, mask, 500, 1150, 350, 1150, "plasma")
plot_slice_image_and_mask(image, mask, 710, 900, 950, 1400, "plasma")
plot_slice_image_and_mask(image, mask, 1000, 1400, 720, 1100, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "spleen") & (train_df["sex"] == "Female")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Female Spleen: {tmp_id}")
    plt.axis("off")

### Lung Male: 24782

In [None]:
image_id = 24782
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 400, 620, 750, 900, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "lung") & (train_df["sex"] == "Male")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Male Lung: {tmp_id}")
    plt.axis("off")

### Lung Female: 27232

In [None]:
image_id = 27232
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 1050, 1150, 250, 450, "plasma")
plot_slice_image_and_mask(image, mask, 1100, 1250, 450, 700, "plasma")
plot_slice_image_and_mask(image, mask, 400, 800, 480, 850, "plasma")
plot_slice_image_and_mask(image, mask, 1150, 1350, 750, 1050, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "lung") & (train_df["sex"] == "Female")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Female Lung: {tmp_id}")
    plt.axis("off")

### Kidney Male: 15192

In [None]:
image_id = 15192
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 1100, 1400, 400, 650, "plasma")
plot_slice_image_and_mask(image, mask, 1100, 1450, 720, 1100, "plasma")
plot_slice_image_and_mask(image, mask, 220, 450, 820, 1300, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "kidney") & (train_df["sex"] == "Male")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Male Kidney: {tmp_id}")
    plt.axis("off")

### Kidney Female: 24522

In [None]:
image_id = 24522
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 280, 550, 720, 1000, "plasma")
plot_slice_image_and_mask(image, mask, 400, 650, 1000, 1300, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "kidney") & (train_df["sex"] == "Female")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Female Kidney: {tmp_id}")
    plt.axis("off")

### Prostate: 30424
(Male Only)

In [None]:
image_id = 30424
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 780, 1500, 230, 850, "plasma")
plot_slice_image_and_mask(image, mask, 100, 800, 250, 1300, "plasma")
plot_slice_image_and_mask(image, mask, 750, 1100, 1200, 1500, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[train_df["organ"] == "prostate"].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Male Prostate: {tmp_id}")
    plt.axis("off")

### Large Intestine Male: 21812


In [None]:
image_id = 21812
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 10, 450, 100, 1000, "plasma")
plot_slice_image_and_mask(image, mask, 350, 1000, 50, 1500, "plasma")
plot_slice_image_and_mask(image, mask, 800, 1500, 400, 1500, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "largeintestine") & (train_df["sex"] == "Male")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Male Large-Intestine: {tmp_id}")
    plt.axis("off")

### Large Intestine Female: 4062

In [None]:
image_id = 4062
image, mask = read_image(image_id, 2)

In [None]:
plot_image_and_mask(image, mask, image_id, "bwr")

In [None]:
plot_slice_image_and_mask(image, mask, 50, 600, 200, 1500, "plasma")
plot_slice_image_and_mask(image, mask, 650, 1500, 600, 1500, "plasma")

In [None]:
plot_grid_image_with_mask(image, mask)

In [None]:
ids_sampled = train_df[(train_df["organ"] == "largeintestine") & (train_df["sex"] == "Female")].sample(6, random_state=RANDOM_SEED).id.tolist()

sampled_images, sampled_masks = get_image_masks_with_id(ids_sampled)

plt.figure(figsize=(16, 16))
for ind, (tmp_id, tmp_image, tmp_mask) in enumerate(zip(ids_sampled, sampled_images, sampled_masks)):
    plt.subplot(3, 3, ind + 1)
    plt.imshow(tmp_image)
    plt.imshow(tmp_mask, cmap="hot", alpha=0.5)
    plt.title(f"Female Large-Intestine: {tmp_id}")
    plt.axis("off")

## Analyzing the Meta Data

In [None]:
# Pixel area of each image: height times width.
train_df["area"] = train_df["img_height"] * train_df["img_width"]

In [None]:
train_df.info()

In [None]:
plt.figure(figsize=(8, 8))
train_df.organ.value_counts().plot(kind='bar')

In [None]:
plt.figure(figsize=(8, 8))
train_df.sex.value_counts().plot(kind='bar')

In [None]:
plt.figure(figsize=(8, 8))
train_df.data_source.value_counts().plot(kind='bar')

In [None]:
plt.figure(figsize=(12, 12))
sns.histplot(x="age", kde=True, data=train_df)
plt.show()

In [None]:
plt.figure(figsize=(12, 12))
sns.histplot(x="age", hue="sex", multiple="stack", kde=True, data=train_df)
plt.show()

In [None]:
plt.figure(figsize=(12, 12))
sns.histplot(x="age", hue="organ", multiple="stack", data=train_df)
plt.show()

In [None]:
# displot is a figure-level function: it creates its own figure (sized via
# `height`), so calling plt.figure() first would only add an empty figure.
sns.displot(x="age", hue="organ", kind="kde", multiple='stack', data=train_df, height=12)

In [None]:
sns.displot(x="age", col="sex", hue="organ", kind="kde", multiple='stack', data=train_df)

In [None]:
plt.figure(figsize=(12, 12))
sns.countplot(x="organ", hue="sex", data=train_df)

In [None]:
plt.figure(figsize=(12, 12))
sns.countplot(x="organ", hue="data_source", data=train_df)