In [None]:
!pip install pycocotools

In [None]:
import os
import re
import json
from glob import glob
from tqdm.notebook import tqdm
from tqdm.contrib.concurrent import process_map
from multiprocessing import Manager

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import cv2
import scipy.ndimage
from pycocotools import _mask

import matplotlib.pyplot as plt
import seaborn as sns

## Sartorius - Cell Instance Segmentation

## 1. Introduction

The objective of this competition is to detect masks of different cell objects in phase contrast microscopy images. There are 606 images and 73585 annotations in the training set, and there are roughly 240 images in the hidden test set. The average number of annotations per image is 121.42 in the training set, and the same ratio is expected in the hidden test set. In addition, there are 1972 images without annotations in the train_semi_supervised directory. Their metadata isn't listed in the train.csv file.

There are 9 columns in the image metadata file. elapsed_timedelta is initially dropped because it has the same values as plate_time.

* `id` - Unique ID of the image
* `annotation` - Run length encoded segmentation masks
* `width` - Width of the image
* `height` - Height of the image
* `cell_type` - Type of the cell line
* `plate_time` - Plate creation time
* `sample_date` - Timestamp of the sample
* `sample_id` - Unique ID of the sample
* `elapsed_timedelta` - Time since first image taken of sample

In [None]:
# Load the annotation-level metadata and discard elapsed_timedelta right away
df_train = pd.read_csv(
    '../input/sartorius-cell-instance-segmentation/train.csv'
).drop(columns=['elapsed_timedelta'])
print(
    f'Training Set Shape: {df_train.shape} - '
    f'{df_train["id"].nunique()} Images - '
    f'Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB'
)

Files in train_semi_supervised directory can be appended to metadata file for further analysis. Their filenames contain all the information in metadata except width and height. Fortunately, width and height of all images are already known, and they are 704 and 520 respectively.

In [None]:
def parse_filename(filename):

    """
    Split an image filename into its metadata fields

    Parameters
    ----------
    filename (str): Image filename whose fields are separated by underscores

    Returns
    -------
    image_id (str): Filename up to its first dot
    cell_type (str): Filename up to its first opening square bracket
    plate_time (str): Third underscore-separated field counted from the end
    sample_date (str): Fourth underscore-separated field counted from the end
    sample_id (str): First three fields and last two fields (cut at the first dot) joined with underscores
    """

    parts = filename.split('_')

    image_id, _, _ = filename.partition('.')
    cell_type, _, _ = filename.partition('[')
    sample_date, plate_time = parts[-4], parts[-3]

    # The last two fields still carry the file extension, so cut them at the first dot
    sample_prefix = '_'.join(parts[:3])
    sample_suffix = '_'.join(parts[-2:]).split('.')[0]
    sample_id = f'{sample_prefix}_{sample_suffix}'

    return image_id, cell_type, plate_time, sample_date, sample_id


# Collect one metadata record per unannotated image and concatenate them in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the whole dataframe on every call.
# NOTE: re-running this cell appends the same rows to df_train again.
train_semi_supervised_images = os.listdir('../input/sartorius-cell-instance-segmentation/train_semi_supervised/')
semi_supervised_records = []
for filename in tqdm(train_semi_supervised_images):
    image_id, cell_type, plate_time, sample_date, sample_id = parse_filename(filename)
    semi_supervised_records.append({
        'id': image_id,
        'annotation': np.nan,
        'width': 704,
        'height': 520,
        'cell_type': cell_type,
        'plate_time': plate_time,
        'sample_date': sample_date,
        'sample_id': sample_id
    })

# Skip the concatenation when the directory is empty so df_train is left untouched
if semi_supervised_records:
    df_train = pd.concat([df_train, pd.DataFrame(semi_supervised_records)], ignore_index=True)

# Strip trailing "s" characters so that cell type names parsed from filenames lose their plural suffix
df_train['cell_type'] = df_train['cell_type'].str.rstrip('s')
print(f'Training Set Shape: {df_train.shape} - {df_train["id"].nunique()} Images - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')

## 2. Metadata

For advanced data analysis and feature extraction, masks have to be decoded into 2 dimensional arrays. Since the training annotations are provided as run length encoded strings, they can be decoded with the function defined below. 

In [None]:
def decode_rle_mask(rle_mask, shape):

    """
    Turn a run-length encoded mask string into a binary 2d array

    The string alternates 1-based start positions and run lengths. Runs are written
    into a flat array which is then reshaped to (height, width) in row-major order.

    Parameters
    ----------
    rle_mask (str): Run-length encoded segmentation mask string
    shape (tuple): Height and width of the mask

    Returns
    -------
    mask [numpy.ndarray of shape (height, width)]: Decoded 2d segmentation mask with dtype uint8
    """

    tokens = rle_mask.split()

    # Even positions hold 1-based starts, odd positions hold run lengths
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    height, width = shape[0], shape[1]
    flat_mask = np.zeros(height * width, dtype=np.uint8)
    for run_start, run_end in zip(run_starts, run_ends):
        flat_mask[run_start:run_end] = 1

    return flat_mask.reshape(height, width)


Additional metadata features such as image mean and image standard deviation are extracted from both annotated and unannotated images. Mask area and annotation count are extracted only for annotated images. The final dataframe with unannotated images and extracted metadata is saved as a csv file.

In [None]:
def _compute_image_statistics(image_path):

    """
    Read an image, convert it to grayscale and compute its pixel statistics

    Parameters
    ----------
    image_path (str): Path of the image file

    Returns
    -------
    image_mean (float): Mean of the grayscale pixel values
    image_std (float): Standard deviation of the grayscale pixel values

    Raises
    ------
    FileNotFoundError: If OpenCV cannot read the file
    """

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising when the file is missing or unreadable
        raise FileNotFoundError(f'Image could not be read: {image_path}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    return np.mean(image), np.std(image)


is_annotated = df_train['annotation'].notnull()
image_means, image_stds = {}, {}

# Image statistics are computed once per image and mapped back onto every row of that image
for image_id in tqdm(df_train.loc[is_annotated, 'id'].unique()):
    image_means[image_id], image_stds[image_id] = _compute_image_statistics(
        f'../input/sartorius-cell-instance-segmentation/train/{image_id}.png'
    )

for image_id in tqdm(df_train.loc[~is_annotated, 'id'].unique()):
    image_means[image_id], image_stds[image_id] = _compute_image_statistics(
        f'../input/sartorius-cell-instance-segmentation/train_semi_supervised/{image_id}.png'
    )

df_train['image_mean'] = df_train['id'].map(image_means)
df_train['image_std'] = df_train['id'].map(image_stds)

# Mask area is a per-annotation feature, so every annotated row is decoded exactly once.
# Looking rows up again by (id, annotation) for each mask would scan the whole dataframe per annotation.
annotations = df_train.loc[is_annotated, 'annotation']
mask_areas = pd.Series(
    [np.sum(decode_rle_mask(rle_mask, (520, 704))) for rle_mask in tqdm(annotations)],
    index=annotations.index,
    dtype='float64'
)
# Index alignment leaves mask_area as NaN on rows without an annotation
df_train['mask_area'] = mask_areas

annotation_counts = df_train.loc[is_annotated].groupby('id')['annotation'].count()
df_train['annotation_count'] = df_train['id'].map(annotation_counts)
df_train.to_csv('train_processed.csv', index=False)

## 3. Cell Types

There are 3 types of cell lines in the images, and each image contains only a single cell type. Those types are cort (neurons), shsy5y (neuroblastoma) and astro (astrocytes). Each cell line differs from the others in terms of characteristics and statistics, so each type might require its own unique processing techniques.

Distributions of cell types are different in the annotated and unannotated training sets. The annotated training set has a higher number of cort cell lines, but the unannotated training set has a higher number of astro cell lines.

In [None]:
def visualize_cell_type_distributions(df, title):

    """
    Draw a bar chart of the number of rows per cell type

    Parameters
    ----------
    df [pandas.DataFrame]: Dataframe with a cell_type column
    title (str): Title of the plot
    """

    cell_type_counts = df['cell_type'].value_counts()
    # Tick labels show the cell type followed by its thousands-separated count
    tick_labels = [
        f'{cell_type} ({count:,})'
        for cell_type, count in zip(cell_type_counts.index, cell_type_counts.values)
    ]

    fig, ax = plt.subplots(figsize=(24, 5), dpi=100)
    sns.barplot(x=cell_type_counts.index, y=cell_type_counts.values, ax=ax)

    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticklabels(tick_labels)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=15, pad=10)
    ax.set_title(title, size=20, pad=15)

    plt.show()


# Reduce each subset to one row per image (first cell_type of every id) before counting cell types
df_train_supervised_cell_types = (
    df_train[df_train['annotation'].notnull()]
    .groupby('id')['cell_type']
    .first()
    .reset_index()
)
visualize_cell_type_distributions(
    df=df_train_supervised_cell_types,
    title='Cell Type Distribution in Annotated Training Set'
)

df_train_unsupervised_cell_types = (
    df_train[df_train['annotation'].isnull()]
    .groupby('id')['cell_type']
    .first()
    .reset_index()
)
visualize_cell_type_distributions(
    df=df_train_unsupervised_cell_types,
    title='Cell Type Distribution in Unannotated Training Set'
)

df_train_all_cell_types = (
    df_train
    .groupby('id')['cell_type']
    .first()
    .reset_index()
)
visualize_cell_type_distributions(
    df=df_train_all_cell_types,
    title='Cell Type Distribution in All Training Set'
)

Metadata distributions among cell types are quite different, as expected. The distribution of the number of annotations spans a small range for cort cell lines, but the range increases for astro cell lines and increases even more for shsy5y cell lines. Annotation area distributions are quite similar in cort and shsy5y cell lines, since the cells from both of those cell lines are very alike in terms of size. The average size of astrocytes is larger than that of those two cell types, but the size of an astrocyte can be anywhere between 37 and 13327 pixels. Image mean and image standard deviation distributions are somewhat similar in cort and shsy5y cell lines because of their similar cell sizes, but the astro cell line has a very different distribution in those features. Based on those observations, cutoff points for discarding small detected objects, or the number of masks to predict, can be identified for different cell types.

In [None]:
def visualize_feature_distribution_for_cell_types(df, feature, title):

    """
    Print summary statistics and plot kernel density estimates of a feature for cort, shsy5y and astro cell types

    Parameters
    ----------
    df [pandas.DataFrame]: Dataframe with id, cell_type and the given feature column
    feature (str): Name of the feature column
    title (str): Title of the plot
    """

    # These features are reduced to the first row of every image id before plotting
    if feature in ('annotation_count', 'image_mean', 'image_std'):
        df = df.groupby('id').first()

    print(f'{feature}\n{"-" * len(feature)}')

    values_by_cell_type = {
        cell_type: df.loc[df['cell_type'] == cell_type, feature]
        for cell_type in ('cort', 'shsy5y', 'astro')
    }
    for cell_type, values in values_by_cell_type.items():
        print(f'{cell_type} Mean: {values.mean():.4f}  -  Median: {values.median():.4f}  -  Std: {values.std():.4f} - Min: {values.min():.4f} -  Max: {values.max():.4f}')

    fig, ax = plt.subplots(figsize=(24, 8), dpi=100)
    for cell_type, values in values_by_cell_type.items():
        sns.kdeplot(values, label=cell_type, fill=True, ax=ax)

    ax.legend(prop={'size': 16})
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=12.5, pad=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(title, fontsize=20, pad=15)

    plt.show()
    

# Plot every extracted metadata feature split by cell type
features = ['annotation_count', 'mask_area', 'image_mean', 'image_std']
for feature in features:
    visualize_feature_distribution_for_cell_types(
        df=df_train,
        feature=feature,
        title=f'{feature} Distribution in Training Set'
    )

## 4. Images

Image mean and image standard deviation distributions are slightly different between images in train and train_semi_supervised directories. This doesn't necessarily mean the images are completely different and can't be used. It can be explained with number of objects being different in those two datasets and images in train_semi_supervised directory can still be useful.

In [None]:
def visualize_feature_distribution_for_images(df, feature):

    """
    Print summary statistics and plot kernel density estimates of a feature for annotated and unannotated images

    Rows with an annotation are labelled as train and rows without one as train_semi_supervised.

    Parameters
    ----------
    df [pandas.DataFrame]: Dataframe with id, annotation and the given feature column
    feature (str): Name of the feature column
    """

    # These features are reduced to the first row of every image id before plotting
    if feature in ('image_mean', 'image_std'):
        df = df.groupby('id').first()

    is_unannotated = df['annotation'].isnull()

    print(f'{feature}\n{"-" * len(feature)}')

    values_by_dataset = {
        'train': df.loc[~is_unannotated, feature],
        'train_semi_supervised': df.loc[is_unannotated, feature]
    }
    for dataset, values in values_by_dataset.items():
        print(f'{dataset} Mean: {values.mean():.4f}  -  Median: {values.median():.4f}  -  Std: {values.std():.4f} - Min: {values.min():.4f} -  Max: {values.max():.4f}')

    fig, ax = plt.subplots(figsize=(24, 8), dpi=100)
    for dataset, values in values_by_dataset.items():
        sns.kdeplot(values, label=dataset, fill=True, ax=ax)

    ax.legend(prop={'size': 16})
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=12.5, pad=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(f'{feature} Distribution in Training Set', fontsize=20, pad=15)

    plt.show()
    

# Compare image-level statistics between the train and train_semi_supervised images
features = ['image_mean', 'image_std']
for feature in features:
    visualize_feature_distribution_for_images(
        df=df_train,
        feature=feature
    )

Since there are very few annotated images, all of them can be visualized with their annotations. In the next sections, cort, shsy5y and astro images from the train directory are visualized with their annotations and metadata using the function defined below.

In [None]:
def visualize_image(df, image_id):

    """
    Show a training image next to the same image overlaid with the union of its segmentation masks

    Parameters
    ----------
    df [pandas.DataFrame]: Training dataframe with id, annotation, cell_type, annotation_count, plate_time, sample_date and sample_id columns
    image_id (str): Image ID (filename without extension)
    """

    # Every row of the image carries the same metadata, so the first row is used for it
    image_rows = df.loc[df['id'] == image_id]
    image_path = image_rows['id'].values[0]
    cell_type = image_rows['cell_type'].values[0]
    annotation_count = image_rows['annotation_count'].values[0]
    plate_time = image_rows['plate_time'].values[0]
    sample_date = image_rows['sample_date'].values[0]
    sample_id = image_rows['sample_id'].values[0]

    image = cv2.cvtColor(
        cv2.imread(f'../input/sartorius-cell-instance-segmentation/train/{image_path}.png'),
        cv2.COLOR_BGR2GRAY
    )

    print(f'{image_id}\n{"-" * len(image_id)}')
    print(f'Image Mean: {np.mean(image):.4f}  -  Median: {np.median(image):.4f}  -  Std: {np.std(image):.4f} - Min: {np.min(image):.4f} -  Max: {np.max(image):.4f}')

    fig, (image_ax, overlay_ax) = plt.subplots(figsize=(20, 20), ncols=2)
    fig.tight_layout(pad=5.0)

    image_ax.imshow(image, cmap='gray')

    # Decode every annotation of the image and merge them into a single boolean mask
    decoded_masks = [
        decode_rle_mask(rle_mask=rle_mask, shape=image.shape)
        for rle_mask in image_rows['annotation'].values
    ]
    combined_mask = np.any(np.stack(decoded_masks) == 1, axis=0)
    overlay_ax.imshow(image, cmap='gray')
    overlay_ax.imshow(combined_mask, alpha=0.4)

    for ax in (image_ax, overlay_ax):
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params(axis='x', labelsize=15, pad=10)
        ax.tick_params(axis='y', labelsize=15, pad=10)

    image_ax.set_title(f'{image_path} - {cell_type} - {int(annotation_count)} Annotations\n{plate_time} - {sample_date} - {sample_id}', fontsize=20, pad=15)
    overlay_ax.set_title('Segmentation Mask', fontsize=20, pad=15)
    plt.show()
    plt.close(fig)


## 5. Neurons (cort) Images

In [None]:
# Show every annotated cort image together with its masks
for image_id in df_train.loc[
    df_train['annotation'].notnull() & (df_train['cell_type'] == 'cort'),
    'id'
].unique():
    visualize_image(df=df_train, image_id=image_id)

## 6. Neuroblastoma (shsy5y) Images

In [None]:
# Show every annotated shsy5y image together with its masks
for image_id in df_train.loc[
    df_train['annotation'].notnull() & (df_train['cell_type'] == 'shsy5y'),
    'id'
].unique():
    visualize_image(df=df_train, image_id=image_id)

## 7. Astrocytes (astro) Images

In [None]:
# Show every annotated astro image together with its masks
for image_id in df_train.loc[
    df_train['annotation'].notnull() & (df_train['cell_type'] == 'astro'),
    'id'
].unique():
    visualize_image(df=df_train, image_id=image_id)

## 8. LIVECell Dataset

[LIVECell](https://github.com/sartorius-research/LIVECell) is the predecessor dataset to this competition. It is a high-quality, manually annotated and expert-validated dataset with 1.6 million annotations of 8 different cell types. LIVECell dataset is provided in json format but it is converted to a csv file for convenience.

There are 5239 images and 1662447 annotations in LIVECell dataset. Average annotations per image is 317.32 which is much higher than average annotations per image in competition dataset since the cell cultures are more confluent in LIVECell dataset.

As the LIVECell dataset is the predecessor of the competition dataset, they are very similar. Images have width of 704 and height of 520 pixels in both datasets and they are probably taken from the same source.

In [None]:
# Load the LIVECell annotations that were converted from json to csv
df_livecell = pd.read_csv(
    '../input/sartorius-cell-instance-segmentation-dataset/livecell.csv'
)
print(
    f'LIVECell Dataset Shape: {df_livecell.shape} - '
    f'{df_livecell["id"].nunique()} Images - '
    f'Memory Usage: {df_livecell.memory_usage().sum() / 1024 ** 2:.2f} MB'
)

There are 8 types of cell lines in LIVECell dataset and each image contains only a single cell type. Only shsy5y cell line exists in both competition and LIVECell dataset. Other 7 cell lines are completely different.

In [None]:
# One row per LIVECell image (first cell_type of every id) before counting cell types
df_livecell_cell_types = (
    df_livecell
    .groupby('id')['cell_type']
    .first()
    .reset_index()
)
visualize_cell_type_distributions(
    df=df_livecell_cell_types,
    title='Cell Type Distribution in LIVECell Dataset'
)

shsy5y cell line is the second most common cell line after mcf7. There are 704 shsy5y images in LIVECell dataset but their annotation counts are much higher than the ones in competition dataset because there are lots of overconfluent images in it. 

In [None]:
def visualize_shsy5y_annotation_count_in_datasets():

    """
    Compare annotation counts of shsy5y images between the competition and LIVECell datasets

    Reads the module-level df_train and df_livecell dataframes, prints summary statistics of
    annotation_count for the shsy5y images of each dataset and plots both kernel density
    estimates on a single figure.
    """

    def shsy5y_annotation_counts(df):
        # First row of every image id, restricted to shsy5y images
        df_images = df.groupby('id').first()
        return df_images.loc[df_images['cell_type'] == 'shsy5y']['annotation_count']

    counts_by_dataset = {
        'Competition': shsy5y_annotation_counts(df_train),
        'LIVECell': shsy5y_annotation_counts(df_livecell)
    }

    print(f'shsy5y Annotation Count\n{"-" * 24}')
    for dataset, counts in counts_by_dataset.items():
        print(f'{dataset} Mean: {counts.mean():.4f}  -  Median: {counts.median():.4f}  -  Std: {counts.std():.4f} - Min: {counts.min():.4f} -  Max: {counts.max():.4f}')

    fig, ax = plt.subplots(figsize=(24, 8), dpi=100)
    for dataset, counts in counts_by_dataset.items():
        sns.kdeplot(counts, label=f'{dataset} Dataset', fill=True, ax=ax)

    ax.legend(prop={'size': 16})
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=12.5, pad=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title('Annotation Count Distribution of shsy5y Cell Lines', fontsize=20, pad=15)

    plt.show()


# Count the non-null annotations of every LIVECell image and copy the count onto each of its rows
annotation_counts = (
    df_livecell
    .groupby('id')['annotation']
    .count()
)
df_livecell['annotation_count'] = df_livecell['id'].map(annotation_counts)
visualize_shsy5y_annotation_count_in_datasets()