In [None]:
import numpy as np
import pandas as pd
import os
import cv2 as cv

from PIL import Image, ImageEnhance
import matplotlib.pyplot as plt

import plotly.express as px

import json
from collections import defaultdict

In [None]:
!pip install pycocotools

In [None]:
from pycocotools import _mask as MeaskUtils

# References
1. https://www.kaggle.com/dschettler8845/train-sartorius-segmentation-eda-effdet-tf
2. LIVECell dataset: https://sartorius-research.github.io/LIVECell/

In [None]:
# Count the entries listed in the training image directory
!ls /kaggle/input/sartorius-cell-instance-segmentation/train | wc -l

In [None]:
# Load the training table (the cells below read its id, annotation, width and height columns)
train_df = pd.read_csv('/kaggle/input/sartorius-cell-instance-segmentation/train.csv')

In [None]:
# Display the raw training table
train_df

# Utils

In [None]:
# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
# modified from: https://www.kaggle.com/inversion/run-length-decoding-quick-start
def rle_decode(mask_rle, shape, color=1):
    """Decode a run-length-encoded string into a dense mask array.

    Args:
        mask_rle (str): space-separated "start length" pairs; each start is a
            1-based position in the flattened image.
        shape (tuple of ints): (height, width) or (height, width, depth) of
            the array to return.
        color: value written at every encoded position (default 1).

    Returns:
        np.array of dtype float32 with the requested shape
            - `color` on the encoded pixels
            - 0 on the background
    """
    tokens = np.array(mask_rle.split(), dtype=int)

    # Tokens alternate: start, length, start, length, ...
    run_starts = tokens[0::2] - 1            # 1-based -> 0-based
    run_stops = run_starts + tokens[1::2]    # exclusive end of each run

    # Runs index the flattened pixel grid, so paint on a flat canvas first;
    # an optional depth axis is carried along as a second dimension.
    if len(shape) == 3:
        h, w, d = shape
        canvas = np.zeros((h * w, d), dtype=np.float32)
    else:
        h, w = shape
        canvas = np.zeros((h * w,), dtype=np.float32)

    for begin, stop in zip(run_starts, run_stops):
        canvas[begin:stop] = color

    # Fold the flat canvas back into the requested shape
    return canvas.reshape(shape)

In [None]:
def get_img_and_mask(img_path, annotation, width, height, mask_only=False, rle_fn=rle_decode):
    """Build the instance-label mask of an image and optionally load the image.

    Args:
        img_path (str): path of the image file; only read when `mask_only` is False.
        annotation (iterable of str): one run-length-encoded string per instance.
        width (int): image width in pixels.
        height (int): image height in pixels.
        mask_only (bool, optional): if True, return the mask without reading the image.
        rle_fn (callable, optional): decoder called as rle_fn(rle, (height, width));
            the non-zero entries of its result mark the pixels of the instance.

    Returns:
        The (height, width) int32 label mask when `mask_only` is True, otherwise
        the tuple (img, img_mask) where img is whatever cv.imread returns.
        Background is 0 and the k-th annotation (counting from 1) is labelled k;
        where instances overlap, the later annotation wins.
    """
    # int32 rather than uint8 so that labels above 255 cannot wrap around
    img_mask = np.zeros((height, width), dtype=np.int32)

    # Labels start at 1: label 0 is reserved for the background, so starting
    # at 0 would make the first instance invisible.
    for label, annot in enumerate(annotation, start=1):
        img_mask[rle_fn(annot, (height, width)) != 0] = label

    # Early exit
    if mask_only:
        return img_mask

    img = cv.imread(img_path)
    return img, img_mask

def plot_img_and_mask(img, mask, bboxes=None, invert_img=True, boost_contrast=True):
    """ Plot an image, its instance mask and an overlay of both, side by side

    Args:
        img (np.arr): image with a trailing channel axis (for example the
            3-channel array returned by cv.imread). The overlay writes the
            mask into channel 0, so a plain 2-D image is not supported.
        mask (np.arr): 2-D instance mask, 0 for background and a non-zero
            label per instance. It is never modified.
        bboxes (list of tuples, optional): (tl, br) coordinates of enclosing bboxes;
            they are drawn on a copy of the mask
        invert_img (bool, optional): Whether or not to invert the base image
        boost_contrast (bool, optional): Whether or not to boost contrast of the base image

    Returns:
        None; Plots the two arrays and overlays them to create a merged image
    """
    _img = img

    # Flip black-->white ... white-->black
    if invert_img:
        _img = _img.max()-_img

    if boost_contrast:
        _img = np.asarray(ImageEnhance.Contrast(Image.fromarray(_img)).enhance(16))

    if bboxes:
        # cv.rectangle draws in place: work on a copy so the caller's mask is untouched
        mask = mask.copy()
        for i, bbox in enumerate(bboxes):
            mask = cv.rectangle(mask, bbox[0], bbox[1], (i+1, 0, 0), thickness=2)

    # Binary overlay in channel 0. Comparing against 0 (instead of copying the
    # labels into an image-dtype array) keeps instances whose label would wrap
    # to 0 when cast to the image dtype.
    _overlay = np.zeros_like(_img)
    _overlay[..., 0] = np.where(mask > 0, 255, 0)
    merged = cv.addWeighted(_img, 0.75, _overlay, 0.25, 0.0)

    plt.figure(figsize=(20,10))

    plt.subplot(1,3,1)
    plt.imshow(_img)
    plt.axis("off")
    plt.title("Cell Image", fontweight="bold")

    plt.subplot(1,3,2)
    plt.imshow(mask, cmap="inferno")
    plt.axis("off")
    plt.title("Instance Segmentation Mask", fontweight="bold")

    plt.subplot(1,3,3)
    plt.imshow(merged)
    plt.axis("off")
    plt.title("Cell Image w/ Instance Segmentation Mask Overlay", fontweight="bold")

    plt.tight_layout()
    plt.show()
    
# ref.: https://www.kaggle.com/stainsby/fast-tested-rle
def rle_encode(img):
    """ Run-length encode a binary mask

    Args:
        img (np.array):
            - 1 indicating mask
            - 0 indicating background

    Returns:
        str: space-separated "start length" pairs, each start being a 1-based
        position in the flattened mask; the empty string for an all-zero mask
    """
    flat = img.flatten()

    # Zero sentinels on both ends so that runs touching either border are detected
    padded = np.concatenate([[0], flat, [0]])

    # 1-based positions where the value changes: start, end, start, end, ...
    changes = np.flatnonzero(padded[1:] != padded[:-1]) + 1

    # Convert every (exclusive) end position into the length of its run
    changes[1::2] = changes[1::2] - changes[::2]
    return ' '.join(map(str, changes))

In [None]:
# https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py
def grab_contours(cnts):
    """ Extract the contour list from the value returned by cv2.findContours

    The return signature differs between OpenCV releases: a 2-tuple holds the
    contours first (OpenCV v2.4, v4-beta, v4-official) while a 3-tuple holds
    them in the middle (OpenCV v3, v4-pre, v4-alpha).

    Args:
        cnts: the tuple returned by cv2.findContours

    Returns:
        The contours element of `cnts`

    Raises:
        Exception: if `cnts` has neither 2 nor 3 elements
    """
    n_items = len(cnts)

    # Any other length means the findContours signature changed yet again
    if n_items not in (2, 3):
        raise Exception(("Contours tuple must have length 2 or 3, "
            "otherwise OpenCV changed their cv2.findContours return "
            "signature yet again. Refer to OpenCV's documentation "
            "in that case"))

    # 2-tuple -> first element, 3-tuple -> second element
    return cnts[0] if n_items == 2 else cnts[1]

def get_contour_bbox(msk):
    """ Return the bounding box (tl, br) enclosing every external contour of a mask

    Args:
        msk (np.arr): single mask handed to cv.findContours
            (presumably an 8-bit, single-channel array -- confirm against callers)

    Returns:
        None when no contour is found, otherwise ((x_min, y_min), (x_max, y_max))
        as tuples of plain Python ints
    """
    cnts = cv.findContours(msk.copy(), cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
    contours = grab_contours(cnts)

    if len(contours) == 0:
        return None

    # Each contour is indexed as (point, 0, xy). Stack the points of ALL contours
    # so that a mask split into several blobs still gets one box covering every
    # blob, instead of a box around the first contour only.
    points = np.concatenate(contours, axis=0).reshape(-1, 2)
    x_min, y_min = points.min(axis=0)
    x_max, y_max = points.max(axis=0)

    # Plain ints (not NumPy scalars) so the corners can be passed straight to
    # drawing functions that expect Python integers.
    tl = (int(x_min), int(y_min))
    br = (int(x_max), int(y_max))
    return tl, br

# Show Train Images

In [None]:
# Directory holding the training images; image paths are built as <TRAIN_DIR>/<id>.png
TRAIN_DIR = '/kaggle/input/sartorius-cell-instance-segmentation/train'

In [None]:
# Collapse the per-annotation rows into one row per image
train_df["img_path"] = train_df["id"].apply(lambda x: os.path.join(TRAIN_DIR, x+".png")) # Capture Image Path As Well
tmp_df = train_df.drop_duplicates(subset=["id", "img_path"]).reset_index(drop=True)
# groupby() returns the ids in sorted order whereas drop_duplicates() keeps the
# first-seen order, so attach the annotation lists by id rather than by position.
annotations_by_id = train_df.groupby("id")["annotation"].agg(list)
tmp_df["annotation"] = tmp_df["id"].map(annotations_by_id)
train_df = tmp_df.copy()
train_df

In [None]:
# Show the first `nums` image(s) of every directory found under TRAIN_DIR
nums = 1
for _root, _dirname, files in os.walk(TRAIN_DIR):
    for item in files[:nums]:
        # Join with the directory currently being walked: TRAIN_DIR would give
        # a wrong path for files that live in a sub-directory.
        _filepath = os.path.join(_root, item)
        print(_filepath)
        img = cv.imread(_filepath)
        plt.figure()
        plt.imshow(img)

## Show the mask of a single image

In [None]:
# Pick one row of train_df (by position) to inspect in the following cells
_idx = 18
_one = train_df.iloc[_idx]
_one

In [None]:
# Load the selected image from disk
_filepath = os.path.join(TRAIN_DIR, _one.id + ".png")

_img = cv.imread(_filepath)

# Boost the contrast (factor 16) through PIL, then convert back to an array
_img = np.asarray(ImageEnhance.Contrast(Image.fromarray(_img)).enhance(16))


plt.figure()
plt.imshow(_img)

In [None]:
def getAllAnnotation(annotation, height, width, rle_fn):
    """Merge the run-length-encoded instances of one image into a label mask.

    Args:
        annotation (iterable): one encoded instance per element.
        height (int): mask height in pixels.
        width (int): mask width in pixels.
        rle_fn (callable): decoder called as rle_fn(item, (height, width));
            the non-zero entries of its result mark the pixels of the instance.

    Returns:
        np.ndarray of shape (height, width) and dtype int32. Background is 0 and
        the k-th annotation (counting from 1) is labelled k; where instances
        overlap, the later annotation wins.
    """
    # int32 rather than uint8 so that labels above 255 cannot wrap around
    img_mask = np.zeros((height, width), dtype=np.int32)
    # Start at 1: label 0 means background, so the first instance must not use it
    for label, annot in enumerate(annotation, start=1):
        img_mask[rle_fn(annot, (height, width)) != 0] = label
    return img_mask

In [None]:
# Decode every annotation of the selected row into a single label mask and show it
#_mask = rle_decode(_one.annotation, (_one.height, _one.width))
_mask = getAllAnnotation(_one.annotation, _one.height, _one.width, rle_decode)
plt.figure()
plt.imshow(_mask, cmap="inferno")

In [None]:
# Overlay: copy the mask into channel 0 of an image-shaped array, binarise it
# to 0/255 and blend it with the image (weights 0.75 / 0.25).
_mask2 = np.zeros_like(_img)
# NOTE(review): the labels are cast to the dtype of _img on assignment -- confirm
# that no label can overflow that dtype.
_mask2[..., 0] = _mask
_merged = cv.addWeighted(_img, 0.75, np.clip(_mask2, 0, 1)*255, 0.25, 0.0,)
plt.figure()
plt.imshow(_merged, cmap="inferno")

In [None]:
# Tabulate how many images share each width, height and area (width * height)
print("\n\n... WIDTH VALUE COUNTS ...")
for k,v in train_df.width.value_counts().items():
    print(f"\t--> There are {v} images with WIDTH={k}")

print("\n\n... HEIGHT VALUE COUNTS ...")
for k,v in train_df.height.value_counts().items():
    print(f"\t--> There are {v} images with HEIGHT={k}")

print("\n\n... AREA COUNTS ...")
for k,v in (train_df.width*train_df.height).value_counts().items():
    print(f"\t--> There are {v} images with AREA={k}")

# NOTE(review): this message is printed unconditionally; it is not derived from the counts above
print("\n\n... NOTE: ALL THE IMAGES ARE THE SAME SIZE ...\n")

In [None]:
# Count images per plate_time value, then plot the distribution coloured by cell_type
print("\n\n... PLATE TIME VALUE COUNTS ...")
for k,v in train_df.plate_time.value_counts().items():
    print(f"\t--> There are {v} images with PLATE_TIME={k}")
fig = px.histogram(train_df, x="plate_time", color="cell_type", title="<b>Plate Time Histogram</b>")
fig.show()

In [None]:
# Count images per sample_date value, then plot the distribution coloured by cell_type
print("\n\n... SAMPLE DATE VALUE COUNTS ...")
for k,v in train_df.sample_date.value_counts().items():
    print(f"\t--> There are {v} images with SAMPLE_DATE={k}")
# The histogram's x values are the sample dates with "-" replaced by "_"
fig = px.histogram(train_df, train_df.sample_date.apply(lambda x: x.replace("-", "_")), color="cell_type", title="<b>Sample Date Value Histogram</b>")
fig.show()

In [None]:
# Count images per elapsed_timedelta value, then plot the distribution coloured by cell_type
print("\n\n... ELAPSED TIME DELTA VALUE COUNTS ...")
for k,v in train_df.elapsed_timedelta.value_counts().items():
    # The label names the column being counted (it used to say SAMPLE_DATE)
    print(f"\t--> There are {v} images with ELAPSED_TIMEDELTA={k}")
fig = px.histogram(train_df, "elapsed_timedelta", color="cell_type", title="<b>Elapsed Time Delta Value Histogram</b>")
fig.show()

In [None]:
# Rows whose sample_id is shared by more than one image (computed once, reused below)
_sample_id_counts = train_df.sample_id.value_counts()
_repeated_sample_ids = [x for x,v in _sample_id_counts.items() if v>1]
_repeated_df = train_df[train_df.sample_id.isin(_repeated_sample_ids)].reset_index()

print("\n\n... SAMPLE ID VALUE COUNTS (>1) ...")
# Report the number of distinct sample_ids; the length of the filtered frame
# would be the number of images instead.
print(f"\t--> There are {len(_repeated_sample_ids)} SAMPLE_IDs with more than one image\n")
for k,v in _repeated_df["sample_id"].value_counts().items():
    print(f"\t--> There are {v} images with SAMPLE_ID={k}")
fig = px.histogram(_repeated_df, "sample_id", color="cell_type", title="<b>Sample ID Value Histogram</b>")
fig.show()

In [None]:
# Count images per cell_type value, then plot the distribution
print("\n\n... CELL TYPE VALUE COUNTS ...")
for k,v in train_df.cell_type.value_counts().items():
    print(f"\t--> There are {v} images with CELL_TYPE={k}")

fig = px.histogram(train_df, x="cell_type", title="<b>Cell Type Histogram</b>")
fig.show()

In [None]:
CELL_TYPES = list(train_df.cell_type.unique())
# Columns that map one-to-one onto the keyword arguments of get_img_and_mask
_example_cols = ["img_path", "annotation", "width", "height"]
for ct in CELL_TYPES:
    print(f"\n\n... SHOWING THREE EXAMPLES OF CELL TYPE {ct.upper()} ...\n")
    # Draw the three examples in a single sample so they are distinct images;
    # re-sampling before every plot could show the same image more than once.
    _examples = train_df.loc[train_df.cell_type==ct, _example_cols].sample(3)
    for _example in _examples.to_dict("records"):
        img, msk = get_img_and_mask(**_example)
        plot_img_and_mask(img, msk)

# Show External Data

In [None]:
# Locations of the LIVECell annotation and image folders inside the competition data
DATA_DIR = "/kaggle/input/sartorius-cell-instance-segmentation"
LC_DIR = os.path.join(DATA_DIR, "LIVECell_dataset_2021")
LC_ANN_DIR = os.path.join(LC_DIR, "annotations")
LC_IMG_DIR = os.path.join(LC_DIR, "images")

In [None]:
# Folder names found under LIVECell_single_cells (one per cell type)
_LC_SC_ROOT = os.path.join(LC_ANN_DIR, "LIVECell_single_cells")
LC_CELL_TYPES = os.listdir(_LC_SC_ROOT)


def _single_cell_json_paths(split):
    """Map every entry of LC_CELL_TYPES to its single-cell annotation path for `split`."""
    return {
        name: os.path.join(_LC_SC_ROOT, name, f"livecell_{name}_{split}.json")
        for name in LC_CELL_TYPES
    }


# Only the paths are assembled here; no JSON file is read in this cell
_LC_COCO_ROOT = os.path.join(LC_ANN_DIR, "LIVECell")

print("\n... LOADING TRAIN COCO JSON ...\n")
LC_COCO_TRAIN = os.path.join(_LC_COCO_ROOT, "livecell_coco_train.json")

print("\n... LOADING VALIDATION COCO JSON ...\n")
LC_COCO_VAL = os.path.join(_LC_COCO_ROOT, "livecell_coco_val.json")

print("\n... LOADING TEST COCO JSON ...\n")
LC_COCO_TEST = os.path.join(_LC_COCO_ROOT, "livecell_coco_test.json")

LC_SC_TRAIN = _single_cell_json_paths("train")
LC_SC_VAL = _single_cell_json_paths("val")
LC_SC_TEST = _single_cell_json_paths("test")

print(LC_SC_TRAIN)
#print(LC_SC_VAL)
#print(LC_SC_TEST)

In [None]:
import tifffile

# LIVECell

Dataset paper: https://www.nature.com/articles/s41592-021-01249-6

In [None]:
LC_DIR_IMGS = os.path.join(LC_DIR, 'images')

# file name -> full path of every file found two levels below LC_DIR_IMGS.
# A plain dict (rather than defaultdict(list)) so that looking up an unknown
# file name raises KeyError instead of silently returning an empty list.
tiffiles = {}
# first-level folder -> {second-level folder -> number of files in it}
class_count = {}

for _dirs in os.listdir(LC_DIR_IMGS):
    _split = os.path.join(LC_DIR_IMGS, _dirs)
    class_count[_dirs] = {}
    for foldlist in os.listdir(_split):
        _cellfold = os.path.join(_split, foldlist)
        _files = os.listdir(_cellfold)

        class_count[_dirs][foldlist] = len(_files)

        for file in _files:
            _tiffilepath = os.path.join(_cellfold, file)
            # NOTE(review): a file name present in several folders keeps only the last path seen
            tiffiles[file] = _tiffilepath

In [None]:
# Display the per-folder file counts collected above
class_count

In [None]:
# Look up one LIVECell image by file name, read it with tifffile and display it
# TODO convert tiff to img
filename = 'SHSY5Y_Phase_B10_2_03d08h00m_4.tif'
f = tiffiles[filename]
image = tifffile.imread(f)
plt.figure()
plt.imshow(image)

In [None]:
# Build the annotation paths from LC_ANN_DIR (like the other LIVECell paths in
# this notebook) instead of a path relative to the current working directory.
_live_cell_annotation_file = os.path.join(LC_ANN_DIR, "LIVECell", "livecell_coco_train.json")
_shsy5y_ann_dir = os.path.join(LC_ANN_DIR, "LIVECell_single_cells", "shsy5y")
_live_cell_shsy5y_train = os.path.join(_shsy5y_ann_dir, "livecell_shsy5y_train.json")
_live_cell_shsy5y_val = os.path.join(_shsy5y_ann_dir, "livecell_shsy5y_val.json")
_live_cell_shsy5y_test = os.path.join(_shsy5y_ann_dir, "livecell_shsy5y_test.json")

In [None]:
# Parse the LIVECell training annotations; the context manager closes the file
# handle, which open(...).read() left open.
with open(_live_cell_annotation_file) as _fh:
    _dataset = json.load(_fh)

In [None]:
def _read_json(path):
    """Parse the JSON file at `path`, closing the file once it has been read."""
    with open(path) as fh:
        return json.load(fh)


# shsy5y single-cell annotations for the train / val / test splits
_dataset_shsy5y_train = _read_json(_live_cell_shsy5y_train)
_dataset_shsy5y_val = _read_json(_live_cell_shsy5y_val)
_dataset_shsy5y_test = _read_json(_live_cell_shsy5y_test)

In [None]:
# Print the metadata sections of the annotation file
print(_dataset['info'])
print(_dataset['licenses'])
print(_dataset['categories'])

In [None]:
# Inspect the structure: top-level keys, keys of one image record and keys of
# one annotation record (annotations are looked up by the string key '2').
print(_dataset.keys())
print(_dataset['images'][0].keys())
print(_dataset['annotations']['2'].keys())

In [None]:
# Number of image records in the shsy5y train / val / test annotation files
print(len(_dataset_shsy5y_train['images']))
print(len(_dataset_shsy5y_val['images']))
print(len(_dataset_shsy5y_test['images']))

In [None]:
# Show the first image record in full
print(_dataset['images'][0])

In [None]:
# Segmentation entry of the annotation stored under key '2'
_dataset['annotations']['2']['segmentation']

In [None]:
# Bounding-box entry of the annotation stored under key '2'
_dataset['annotations']['2']['bbox']

In [None]:
# Pick the third image record as the example used in the following cells
_check = _dataset['images'][2]
_check

In [None]:
# Path on disk registered for the example image's file name
tiffiles[_check['file_name']]

In [None]:
# Keep the annotation table and the example image's id for the mask cell below
_dataset_anno = _dataset['annotations']
_image_id = _check['id']

# Read the example image from the path registered in tiffiles
_image = tifffile.imread(tiffiles[_check['file_name']])

In [None]:
# Mask canvas size (height, width) used for the LIVECell example image
_mask_h, _mask_w = 520, 704

# Sum the masks of every annotation that belongs to the example image, so each
# pixel holds the number of instances covering it.
allMask = np.zeros((_mask_h, _mask_w))
for key in _dataset_anno.keys():
    if _dataset_anno[key]['image_id'] == _image_id :
        rle = MeaskUtils.frPoly(_dataset_anno[key]['segmentation'], _mask_h, _mask_w)
        mask = MeaskUtils.decode(rle)
        # The decoded array has a third axis (presumably one plane per polygon);
        # merge all planes instead of keeping only the first one.
        allMask += mask.any(axis=2)

plt.figure(figsize=(20,10))
plt.subplot(1,3,1)
plt.imshow(_image)

plt.subplot(1,3,2)
# Pixels covered by exactly one instance
plt.imshow(allMask == 1, cmap='jet', alpha=0.5)

plt.subplot(1,3,3)
# Overlay the instance pixels on the image itself; blending the mask with
# itself would only reproduce the mask. Background pixels are masked out so
# the image stays visible there.
plt.imshow(_image, cmap='gray')
plt.imshow(np.ma.masked_equal(allMask, 0), cmap='jet', alpha=0.5)

plt.show()

# Show Semi-supervised Images

In [None]:
# Folder with the unlabeled (semi-supervised) images.
# NOTE(review): relative to the working directory, unlike TRAIN_DIR which is absolute
SEMI_DIR = '../input/sartorius-cell-instance-segmentation/train_semi_supervised'

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the file list
# (and the images previewed from it) is the same on every run.
unsupervised_files = sorted(os.listdir(SEMI_DIR))
unsupervised_files_path = [os.path.join(SEMI_DIR, _name) for _name in unsupervised_files]

In [None]:
# Derive metadata from the file names: the text before the first "[" becomes
# cell_type and the text between the first "[" and the first "]" becomes compound.
_cell_types = [_name.split("[", 1)[0] for _name in unsupervised_files]
_compounds = [_name.split("]", 1)[0].split("[", 1)[-1] for _name in unsupervised_files]

semi_df = pd.DataFrame({
    "cell_type": _cell_types,
    "compound": _compounds,
    "img_path": unsupervised_files_path,
})

In [None]:
# Distribution of cell types, coloured by compound
fig = px.histogram(semi_df, "cell_type", color="compound")
fig.show()

# Distribution of compounds, coloured by cell type
fig = px.histogram(semi_df, "compound", color="cell_type")
fig.show()

In [None]:
# Display the metadata table built from the file names
semi_df

In [None]:
# Preview up to 15 semi-supervised images on a 5x3 grid
plt.figure(figsize=(20,26))
for i, img_path in enumerate(semi_df.img_path.to_list()[:15]):
    plt.subplot(5,3,i+1)

    # Read, boost the contrast (factor 16) and invert the intensities
    _raw = cv.imread(img_path)
    _boosted = ImageEnhance.Contrast(Image.fromarray(_raw)).enhance(16)
    plt.imshow((255-np.asarray(_boosted)), cmap="inferno")

    plt.axis(False)
    # Title: the file name without its directory and extension
    _file_name = img_path.rsplit("/", 1)[-1]
    plt.title(_file_name.rsplit(".", 1)[0], fontweight="bold")

plt.tight_layout()
plt.show()