In [None]:
! pip install https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip

In [None]:
import os
import random
from glob import glob

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Utils

In [None]:
def prep_images(ID, img_dir='../input/hpa-single-cell-image-classification/train/'):
    """
    Load the four channel images that make up one sample.

    Parameters
    ----------
    ID : str
        Sample identifier; files are looked up as ``{ID}_{color}.png``.
    img_dir : str
        Directory holding the PNG files (a trailing slash is optional).

    Returns
    -------
    dict
        Maps 'protein' (green file), 'microtubules' (red file),
        'endoplasmic_reticulum' (yellow file) and 'nucleus' (blue file)
        to the arrays returned by ``cv2.imread``, in that key order.

    Raises
    ------
    FileNotFoundError
        If any of the four files cannot be read.
    """
    # (dict key, file colour suffix); the order here fixes the key order of
    # the returned dict.
    channel_colors = (('protein', 'green'),
                      ('microtubules', 'red'),
                      ('endoplasmic_reticulum', 'yellow'),
                      ('nucleus', 'blue'))

    images = {}
    for channel, color in channel_colors:
        path = os.path.join(img_dir, f'{ID}_{color}.png')
        image = cv2.imread(path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than later on a subscript.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {path}')
        images[channel] = image

    return images


def merge_image(images):
    """
    Build a single 3-channel image from the reference channels.

    The first plane of each of ``images['microtubules']``,
    ``images['endoplasmic_reticulum']`` and ``images['nucleus']`` is taken
    and the three planes are stacked, in that order, along the last axis.
    """
    channel_order = ('microtubules', 'endoplasmic_reticulum', 'nucleus')
    planes = [images[name][:, :, 0] for name in channel_order]
    return np.dstack(planes)


def visualize(images, show_merge=True, ID=None, df=None):
    """
    Show each image in ``images`` side by side in a single-row figure.

    Parameters
    ----------
    images : dict
        Maps a panel name to an image array. Names are turned into titles by
        replacing underscores with spaces and title-casing.
    show_merge : bool
        If True, an extra 'merge' panel built with ``merge_image(images)`` is
        appended. The caller's dict is not modified.
    ID : str, optional
        Sample identifier. When given, the sample's label names are shown as
        the figure title.
    df : pandas.DataFrame, optional
        Frame with 'ID' and 'Label' columns ('Label' is a '|'-separated
        string of keys into the module-level ``label_map``). Required when
        ``ID`` is given.

    Raises
    ------
    ValueError
        If ``ID`` is given without ``df``.
    """
    if ID is not None and df is None:
        raise ValueError('df must be provided when ID is given')

    # Work on a shallow copy so the caller's dict does not gain a 'merge' key.
    panels = dict(images)
    if show_merge:
        panels['merge'] = merge_image(images)

    n = len(panels)
    f = plt.figure(figsize=(12, 4))
    for i, (key, image) in enumerate(panels.items()):
        plt.subplot(1, n, i + 1)

        title = ' '.join(key.split('_')).title()
        plt.imshow(image)
        plt.title(title)
        plt.xticks([])
        plt.yticks([])

    # The figure title does not depend on the panel, so set it once.
    if ID is not None:
        label_keys = df.set_index('ID').loc[ID, 'Label'].split('|')
        label_names = [label_map[label_key] for label_key in label_keys]
        # suptitle expects a string, so join the names instead of passing a list.
        f.suptitle(', '.join(label_names))

    f.tight_layout()

In [None]:
# Image-level training labels (one row per sample).
train = pd.read_csv(
    filepath_or_buffer='../input/hpa-single-cell-image-classification/train.csv',
)

# Display the frame as the cell output.
train

# What should I expect the data format to be?

The training image-level labels are provided for each sample in train.csv. The bulk of the image data is in train.zip. Each sample consists of four files, and each file represents a different filter on the subcellular protein patterns of that sample. The PNG files are named [filename]_[filter color].png. The colors are red for the microtubule channel, blue for the nuclei channel, yellow for the endoplasmic reticulum (ER) channel, and green for the protein of interest.

# What am I predicting?

You are predicting protein organelle localization labels for each cell in the image. Border cells are included when there is enough information to decide on the labels.

There are in total 19 different labels present in the dataset (18 labels for specific locations, and label 18 for negative and unspecific signal). The dataset is acquired in a highly standardized way using one imaging modality (confocal microscopy). However, the dataset comprises 17 different cell types of highly different morphology, which affect the protein patterns of the different organelles. All image samples are represented by four filters (stored as individual files), the protein of interest (green) plus three cellular landmarks: nucleus (blue), microtubules (red), endoplasmic reticulum (yellow). The green filter should hence be used to predict the label, and the other filters are used as references. The labels are represented as integers that map to the following:

In [None]:
# Collect every PNG file in the training image directory.
image_paths = glob(
    pathname='../input/hpa-single-cell-image-classification/train/*.png',
)

# Number of PNG files found.
len(image_paths)

In [None]:
# Maps the string form of each integer label ('0' .. '18') to its name.
# The position of a name in the list below is its label number.
label_map = {
    str(label_index): label_name
    for label_index, label_name in enumerate([
        'Nucleoplasm',
        'Nuclear membrane',
        'Nucleoli',
        'Nucleoli fibrillar center',
        'Nuclear speckles',
        'Nuclear bodies',
        'Endoplasmic reticulum',
        'Golgi apparatus',
        'Intermediate filaments',
        'Actin filaments',
        'Microtubules',
        'Mitotic spindle',
        'Centrosome',
        'Plasma membrane',
        'Mitochondria',
        'Aggresome',
        'Cytosol',
        'Vesicles and punctate cytosolic patterns',
        'Negative',
    ])
}

In [None]:
# Show five randomly chosen training samples together with their labels.
# random.choice expects a plain sequence: on a pandas Series the positional
# lookup goes through the index labels, so draw from a list of IDs instead.
sample_ids = train.ID.tolist()
for _ in range(5):
    ID = random.choice(sample_ids)
    images = prep_images(ID)
    visualize(images, ID=ID, df=train)

# Cell Segmentation

In [None]:
from hpacellseg import cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei

In [None]:
# Pick five random sample IDs. The unique IDs are computed once and turned
# into a plain list, which is what random.choice expects.
unique_ids = train.ID.unique().tolist()
image_ids = [random.choice(unique_ids) for _ in range(5)]

# Load all four channel images for every selected sample.
image_dicts = [prep_images(image_id) for image_id in image_ids]

# One list of single-plane images per reference channel (the protein channel
# is left out). The order is spelled out rather than taken from the key order
# of the dict returned by prep_images, so index 2 is always the nucleus.
# NOTE(review): presumably this is the channel order pred_cells expects —
# confirm against the hpacellseg documentation.
reference_channels = ('microtubules', 'endoplasmic_reticulum', 'nucleus')
image_lists = [
    [image_dict[channel][:, :, 0] for image_dict in image_dicts]
    for channel in reference_channels
]

In [None]:
# Create the segmentator object used for the cell and nuclei predictions.
segmentator = cellsegmentator.CellSegmentator(
    nuclei_model='.nuclei_model.pth',
    cell_model='.cell_model.pth',
    scale_factor=0.25,
    device='cuda',  # NOTE(review): presumably needs a GPU runtime — confirm
    padding=False,
    multi_channel_model=True,
)

In [None]:
# Cell prediction takes all reference-channel lists.
cell_segmentations = segmentator.pred_cells(image_lists)

# Nuclei prediction takes only the nucleus planes; selecting them by name
# yields the same images that are stored at image_lists[2].
nuc_segmentations = segmentator.pred_nuclei(
    [image_dict['nucleus'][:, :, 0] for image_dict in image_dicts]
)

In [None]:
# For every loaded sample, show the merged reference image next to its
# labelled nuclei and cell masks. Iterating over image_dicts keeps this cell
# in step with however many samples were loaded, instead of a fixed count.
for i, image_dict in enumerate(image_dicts):
    img = merge_image(image_dict)
    nuclei_mask, cell_mask = label_cell(nuc_segmentations[i], cell_segmentations[i])
    visualize(
        dict(image=img, nuclei_mask=nuclei_mask, cell_mask=cell_mask),
        show_merge=False,  # the panels are already complete; no extra merge panel
    )