In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
from PIL import Image

import matplotlib.pyplot as plt 
%matplotlib inline 
plt.style.use("bmh")

In [None]:
# Directory that holds the training images; get_random_img_mask looks up
# "<id>.tiff" files under this path.
train_imgs_root = Path("../input/hubmap-organ-segmentation/train_images")
# Per-image metadata table. Later cells read its id, organ, img_height,
# img_width, pixel_size, age, tissue_thickness, sex and rle columns.
df = pd.read_csv("../input/hubmap-organ-segmentation/train.csv")
print(df.shape)
# Last expression of the cell, so the first rows are rendered as its output.
df.head()

## Understanding Data
- We have 351 biopsy slides from several different organs (kidney, prostate, large intestine, spleen, lung).
- The images are very large and have high resolution.

In [None]:
# Overview of the metadata columns: one panel per column on a 3x2 grid.
fig, ax = plt.subplots(figsize=(8*2, 3.5*3), nrows=3, ncols=2)

# Number of slides per organ.
organ = df["organ"].value_counts()
ax.flat[0].bar(organ.index, organ.values)
ax.flat[0].set_title("organ counts")

# Height on the x-axis, width on the y-axis; labelled so the panel can be
# read without looking at the code.
ax.flat[1].scatter(df["img_height"].values, df["img_width"].values)
ax.flat[1].set_title("img height width")
ax.flat[1].set_xlabel("img_height")
ax.flat[1].set_ylabel("img_width")

ax.flat[2].hist(df["pixel_size"].values)
ax.flat[2].set_title("pixel_size")

ax.flat[3].hist(df["age"].values)
ax.flat[3].set_title("age")

# Title now matches the column name (was misspelled "tissue_thicknees").
ax.flat[4].hist(df["tissue_thickness"].values)
ax.flat[4].set_title("tissue_thickness")

# Number of slides per sex.
sex = df["sex"].value_counts()
ax.flat[5].bar(sex.index, sex.values)
ax.flat[5].set_title("sex")

# Keep titles and axis labels of neighbouring rows from overlapping.
fig.tight_layout()
plt.show()

In [None]:
def rle2mask(rle_string, img_shape):
    """Decode a run-length-encoded string into a binary mask.

    The string is a whitespace-separated sequence of ``start length`` pairs.
    Each pair marks ``length`` consecutive positions, beginning at ``start``,
    in a flattened column-major view of the mask (positions run down one
    column before moving on to the next column).

    Args:
        rle_string: Run-length encoding, e.g. ``"3 2 10 4"``. An empty or
            whitespace-only string decodes to an all-zero mask.
        img_shape: ``(n_cols, n_rows)`` of the mask. Note the order: the
            result has the shape of ``np.zeros(img_shape).T``.

    Returns:
        Float array of shape ``(img_shape[1], img_shape[0])`` holding 1.0 at
        the encoded positions and 0.0 everywhere else.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
        IndexError: If a run extends outside the mask.
    """
    n_cols, n_rows = img_shape
    # split() without an argument tolerates repeated/leading/trailing
    # whitespace and yields no tokens for an empty string.
    rle = [int(tok) for tok in rle_string.split()]

    # Fill runs on a flat buffer with slice assignment instead of building a
    # Python list with one coordinate tuple per pixel.
    flat = np.zeros(n_cols * n_rows)
    for start, length in zip(rle[0::2], rle[1::2]):
        if length <= 0:
            continue
        # NOTE(review): starts are used as 0-based offsets, exactly as the
        # previous implementation did. If the encoding in the data is
        # 1-based, the mask is shifted by one pixel -- confirm against the
        # dataset description.
        if start < 0 or start + length > flat.size:
            raise IndexError(
                f"run ({start}, {length}) is outside a mask of {flat.size} pixels"
            )
        flat[start:start + length] = 1.0

    # After reshape + transpose, flat position p sits at row p % n_rows and
    # column p // n_rows. Both use n_rows, so non-square masks decode
    # correctly (the old code divided by n_cols for the column index).
    return flat.reshape(n_cols, n_rows).T

## Visualizing images 

In [None]:
def get_random_img_mask(df):
    """Pick a random row of ``df`` and load its image and decoded mask.

    Args:
        df: Non-empty table with ``id``, ``img_height``, ``img_width`` and
            ``rle`` columns; rows are addressed by position.

    Returns:
        Tuple ``(img, mask, info)``: the image read from
        ``train_imgs_root/<id>.tiff`` as an array, the mask produced by
        ``rle2mask`` for that row, and the row itself as a dict.
    """
    idx = np.random.randint(len(df))
    info = df.iloc[idx].to_dict()

    # rle2mask takes the shape as (width, height).
    mask = rle2mask(info["rle"], (info["img_width"], info["img_height"]))

    img_path = train_imgs_root / (str(info["id"]) + ".tiff")
    # Image.open keeps the file open until the image is closed; convert to
    # an array inside the context manager so the handle is released right
    # away instead of leaking one per call.
    with Image.open(img_path) as pil_img:
        img = np.asarray(pil_img)
    return img, mask, info

def vis_mask_img(img, mask, info):
    """Show an image, its mask and a composite of both in one row.

    Args:
        img: Image array passed to ``imshow``; its last axis is reduced
            with ``argmax`` for the composite panel.
        mask: 2-D mask array, drawn on its own and used as the first
            channel of the composite.
        info: Mapping with an ``"organ"`` entry, used as the first title.
    """
    _, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 10.5))
    img_ax, mask_ax, combo_ax = axes.flat

    img_ax.imshow(img)
    img_ax.set_title(info["organ"])

    mask_ax.imshow(mask)
    mask_ax.set_title("mask")

    # Composite adapted from
    # https://www.kaggle.com/code/sohaibanwaar1203/polygons-and-masks-visualisation
    first_channel = mask
    second_channel = np.zeros(mask.shape)
    # NOTE(review): argmax yields, per pixel, the index of the largest value
    # along the last axis (not an intensity) -- confirm this is the intended
    # third channel.
    third_channel = np.argmax(img, axis=-1)
    combo_ax.imshow(np.dstack((first_channel, second_channel, third_channel)))
    plt.show()

In [None]:
# Show two randomly drawn image/mask pairs for every organ in the table.
organs = df["organ"].unique()
for organ in organs:
    # Rows of the current organ only, re-indexed from zero.
    is_current_organ = df["organ"] == organ
    dd = df.loc[is_current_organ].reset_index(drop=True)
    print(f"visualizing: {organ}")
    for _ in range(2):
        # Each draw is independent, so the same slide can appear twice.
        img, mask, info = get_random_img_mask(dd)
        vis_mask_img(img, mask, info)