# Introduction

The goal of this notebook is to introduce several functions that load the data and create instance masks for every cell present in the training images. By generating instance segmentation masks, we make it possible to analyze cells individually and potentially associate each of them with one or several of the labels given for the entire image.

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Root of the competition dataset.
dataset_folder = "/kaggle/input/hpa-single-cell-image-classification/"
# Training image folder; the trailing slash is kept because image paths are
# built later by plain string concatenation with the image ID.
training_image_folder = f"{dataset_folder}train/"
# Training table; its ID column is used to build the image paths.
train_df = pd.read_csv(f"{dataset_folder}train.csv")
train_df

# Load the images and apply a binary mask

In [None]:
def get_binary_mask(img):
    '''
    Build a binary foreground mask from a stacked multi-channel image.

    The image is smoothed with a 25x25 Gaussian blur, converted to
    grayscale with cv2.COLOR_RGBA2GRAY, binarized with an Otsu
    threshold (foreground pixels set to 255) and finally cleaned with
    a morphological closing using a 40x40 kernel of ones.

    Returns the closed mask.
    '''

    smoothed = cv2.GaussianBlur(img, (25, 25), 0)
    # NOTE(review): the image is converted as if it were RGBA, so the 4th
    # channel presumably sits in the alpha slot and does not contribute to
    # the grayscale value - confirm this is intended.
    grayscale = cv2.cvtColor(smoothed, cv2.COLOR_RGBA2GRAY)

    # cv2.threshold returns (threshold_used, mask); only the mask is needed.
    otsu_mask = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    structuring_element = np.ones((40, 40), dtype=np.uint8)
    return cv2.morphologyEx(otsu_mask, cv2.MORPH_CLOSE, structuring_element)

In [None]:
def load_RGBY_image(image_id_path):
    '''
    Load and stack the channels that are stored separately.

    image_id_path: path prefix of the image (folder + image ID), to which
        "_red.png", "_green.png", "_blue.png" and "_yellow.png" are appended.

    Returns an array whose last axis holds the red, green, blue and
    yellow channels, in that order.

    Raises FileNotFoundError if one of the channel files cannot be read.
    '''

    channels = []
    for color in ("red", "green", "blue", "yellow"):
        channel_path = image_id_path + "_" + color + ".png"
        channel = cv2.imread(channel_path, cv2.IMREAD_UNCHANGED)
        # cv2.imread does not raise on a missing/unreadable file, it returns
        # None; fail here with the offending path instead of letting the
        # stacking step below fail with an unrelated-looking shape error.
        if channel is None:
            raise FileNotFoundError(
                "Could not read image channel: " + channel_path)
        channels.append(channel)

    # (channel, row, column) -> (row, column, channel)
    stacked_images = np.transpose(np.array(channels), (1, 2, 0))
    return stacked_images

In [None]:
# Path prefix (folder + ID) of the first row of the training table.
image_id_path = training_image_folder + train_df["ID"].iloc[0]
# Stack the four channel files, then derive the binary mask from the stack.
stacked_images = load_RGBY_image(image_id_path)
binary_mask = get_binary_mask(stacked_images)

In [None]:
# Show only the first three channels of the stacked image (red, green, blue
# in the order used by load_RGBY_image).
plt.imshow(stacked_images[:,:,:3])
plt.show()

In [None]:
# Show the binary mask computed by get_binary_mask for the same image.
plt.imshow(binary_mask)
plt.show()

# Generate instance masks and convert to RLE encoding

In [None]:
def rle_encoding(x):
    '''
    Encode a mask as a run-length string so it can be stored easily
    and fed into models later on.
    https://en.wikipedia.org/wiki/Run-length_encoding

    Pixels equal to 255 are the foreground. The mask is read in
    column-major order (the transpose is flattened) and positions are
    1-based. The result is a space-separated sequence of
    "start length" pairs, or an empty string if no pixel equals 255.
    '''

    pixel_idx = np.flatnonzero(x.T.flatten() == 255)
    if pixel_idx.size == 0:
        return ''

    # A new run begins wherever two consecutive foreground positions are
    # not adjacent; `gaps` holds the index of the last pixel of each run
    # except the final one.
    gaps = np.flatnonzero(np.diff(pixel_idx) > 1)
    run_first = np.concatenate(([0], gaps + 1))
    run_last = np.concatenate((gaps, [pixel_idx.size - 1]))

    starts = pixel_idx[run_first] + 1  # 1-based start positions
    lengths = run_last - run_first + 1

    # Interleave as start, length, start, length, ...
    pairs = np.column_stack((starts, lengths)).ravel()
    return ' '.join(str(value) for value in pairs.tolist())

In [None]:
def get_instance_masks(binary_mask, min_area=100):
    '''
    Using a binary mask, this function filters out small items and
    creates a separate RLE-encoded mask for each remaining blob.

    binary_mask: single-channel mask passed to cv2.findContours.
    min_area: contours whose area is not strictly greater than this
        value are discarded (default 100).

    Returns a list of RLE strings, one per kept contour.
    '''

    # findContours returns (contours, hierarchy) in OpenCV 4.x but
    # (image, contours, hierarchy) in OpenCV 3.x; indexing with [-2]
    # selects the contour list in both cases.
    contours = cv2.findContours(binary_mask,
                                cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
    instance_masks = []
    for contour in contours:
        if cv2.contourArea(contour) <= min_area:
            continue

        # One uint8 canvas per instance: the mask only ever holds 0 or 255,
        # so there is no need for the default float64 array.
        instance_mask = np.zeros(binary_mask.shape, dtype=np.uint8)
        cv2.drawContours(instance_mask, [contour],
                         0, 255, thickness=cv2.FILLED)

        instance_masks.append(rle_encoding(instance_mask))

    return instance_masks

# Add the RLE encoding to the existing training dataframe

In [None]:
# Number of leading rows of train_df that get an RLE encoding.
process_RLE_for = 20
# Every row starts with an empty encoding; only the processed rows are filled.
train_df["RLE_encoding"] = ""

rows_to_encode = train_df[:process_RLE_for].iterrows()
for idx, item in tqdm(rows_to_encode, total=process_RLE_for):
    image_id_path = training_image_folder + item.ID

    # Load -> binarize -> split into per-instance RLE strings.
    stacked_images = load_RGBY_image(image_id_path)
    binary_mask = get_binary_mask(stacked_images)
    instance_masks = get_instance_masks(binary_mask)

    # The list of RLE strings is stored as its string representation.
    train_df.at[idx, "RLE_encoding"] = str(instance_masks)

In [None]:
# Display the dataframe, which now includes the RLE_encoding column.
train_df

By creating individual masks for every cell in the training images, I can now proceed to image analysis. Below, I display every cell along with the intensity distribution of its four channels (red, green, blue and yellow). A preliminary method to identify the cells' classes could be to cluster them based on their color distribution signature.

In [None]:
def plot_color_distribution(isolated_cell_img):
    '''
    Plot one intensity histogram per channel for the first four
    channels of the image, drawn in red, green, blue and yellow.

    Only intensities 1 to 255 are counted, so zero-valued pixels (such
    as those blanked out by a mask) are left out of the curves.
    NOTE(review): the 1-255 range assumes 8-bit channels - confirm.
    '''
    # x positions are the real intensity values, so the curves line up
    # with the axis limits set below.
    intensities = np.arange(1, 256)
    for channel_idx, line_color in enumerate(('r', 'g', 'b', 'y')):
        # 255 unit-wide bins over [1, 256): bin k counts intensity k + 1.
        # (256 bins over the same range would leave the last bin empty.)
        histogram = cv2.calcHist([isolated_cell_img], [channel_idx],
                                 None, [255], [1, 256])
        plt.plot(intensities, histogram, color=line_color)
    plt.xlim([1, 256])
    plt.show()

def analyze_individual_cells(binary_mask, original_image, min_area=100):
    '''
    Display every blob of the binary mask as an isolated cell, followed
    by the per-channel intensity distribution of that cell.

    binary_mask: single-channel mask passed to cv2.findContours.
    original_image: stacked image the mask was computed from; pixels
        outside the current blob are zeroed before display.
    min_area: contours whose area is not strictly greater than this
        value are skipped (default 100).
    '''

    # findContours returns (contours, hierarchy) in OpenCV 4.x but
    # (image, contours, hierarchy) in OpenCV 3.x; indexing with [-2]
    # selects the contour list in both cases.
    contours = cv2.findContours(binary_mask,
                                cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    for contour in contours:
        if cv2.contourArea(contour) <= min_area:
            continue

        x, y, width, height = cv2.boundingRect(contour)

        # Filled mask of this single blob, built directly as uint8 because
        # that is what the `mask` argument of bitwise_and receives.
        instance_mask = np.zeros(binary_mask.shape, dtype=np.uint8)
        cv2.drawContours(instance_mask, [contour],
                         0, 255, thickness=cv2.FILLED)

        # AND-ing the image with itself under the mask keeps the pixels of
        # this blob and zeroes everything else.
        isolated_cell_image = cv2.bitwise_and(original_image, original_image,
                                              mask=instance_mask)

        # Crop to the blob's bounding box before plotting.
        cell_crop = isolated_cell_image[y:y+height, x:x+width]
        plt.imshow(cell_crop[:, :, :3])
        plt.show()
        plot_color_distribution(cell_crop)

As seen below, when attempting to isolate individual cells using the functions defined previously, we can observe some issues when cells are too close to each other. Nonetheless, this appears to be a promising technique for generating instance masks.

In [None]:
# Run the per-cell analysis on the first row of the training table.
image_id_path = training_image_folder + train_df["ID"].iloc[0]
stacked_images = load_RGBY_image(image_id_path)
binary_mask = get_binary_mask(stacked_images)
analyze_individual_cells(binary_mask, stacked_images)

In [None]:
# Run the per-cell analysis on the row at position 55 of the training table.
image_id_path = training_image_folder + train_df["ID"].iloc[55]
stacked_images = load_RGBY_image(image_id_path)
binary_mask = get_binary_mask(stacked_images)
analyze_individual_cells(binary_mask, stacked_images)

## Thanks for reading this notebook! If you found this notebook helpful, please give it an upvote. It is always greatly appreciated!