# Project Overview

For this competition, the majority of my time was spent on inference. The initial cell segmentation used [this notebook](https://www.kaggle.com/rdizzl3/hpa-segmentation-masks-no-internet) and its associated datasets, which are inputs for this work. The key to the segmentator is setting the scale factor to 0.25 and padding to True, which gave the cells the best masking.

# Cell Segmentation

For the competition, I used the cell segmentator and then a modified version of the label_cell function from [this notebook](https://www.kaggle.com/samusram/even-faster-hpa-cell-segmentation). This was combined with my OpenCV code to extract images for my different models. See the get_image_masks and create_cell_images functions for those details.

In [None]:
# Install pycocotools, pytorch_zoo and the HPA cell segmentator from local input datasets.
!pip install -q "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"
!pip install -q "../input/hpapytorchzoozip/pytorch_zoo-master"
!pip install -q "../input/hpacellsegmentatormaster/HPA-Cell-Segmentation-master"

# Weight files for the nuclei and cell segmentation networks; both are passed to CellSegmentator.
NUC_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_nuclei_v1.pth'
CELL_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_cell_3ch_v1.pth'

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import imageio
import torch
from tqdm import tqdm
import time
import gc

import matplotlib.pyplot as plt
import cv2

import hpacellseg.cellsegmentator as cellseg
# Module-level segmentator shared by get_image_masks. scale_factor=0.25 together with
# padding=True is the combination the author reports as giving the best cell masks.
segmentator = cellseg.CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    scale_factor=0.25,
    padding=True,
    multi_channel_model=True
)


#from hpacellseg.utils import label_cell, label_nuclei

from pycocotools import _mask as coco_mask
import typing as t
import zlib
import base64

import keras
import keras.backend as K
import tensorflow as tf
from tensorflow.keras import backend, layers
from keras.preprocessing.image import img_to_array



With the cell segmentation processes taking up a significant amount of GPU memory, I used 2 different methods to keep TensorFlow's GPU memory use in check when loading my classification models for the competition. Both can be found in the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf/config/experimental/set_memory_growth).

In [None]:
# Method 1: let TensorFlow grow its GPU allocation on demand instead of
# grabbing all memory up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth flag has to be identical on every GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "... Physical GPUs,", len(logical_gpus), "Logical GPUs ...\n")
    except RuntimeError as e:
        # Raised when the GPUs were already initialised before this point.
        print(e)

# Method 2: cap the first GPU at a 1024 MB virtual device.
# NOTE(review): the GPUs may already be initialised by the block above, in
# which case this only prints the RuntimeError -- confirm on the target runtime.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)],
        )
    except RuntimeError as e:
        print(e)

In [None]:
#https://github.com/qubvel/efficientnet/blob/8984e988ecccd9c3a15be2e793991845619a8a26/efficientnet/model.py#L591
class FixedDropout(layers.Dropout):
    """Dropout variant that resolves ``None`` entries of ``noise_shape``.

    Mirrors the ``FixedDropout`` layer of the efficientnet package linked
    above; it is registered as a custom object when the saved models are
    loaded.
    """

    def _get_noise_shape(self, inputs):
        """Return the noise shape with ``None`` dims taken from ``inputs``."""
        if self.noise_shape is None:
            return None

        dynamic_dims = backend.shape(inputs)
        # A None placeholder means "use the input's size on that axis".
        return tuple(
            dynamic_dims[idx] if dim is None else dim
            for idx, dim in enumerate(self.noise_shape)
        )

In [None]:
# Single-cell classifiers: one fed RGB tiles, one fed green-channel tiles.
RGB_model = keras.models.load_model('../input/hpa-models-2021/ProteinModelRGB_rev_18.h5')
G_model = keras.models.load_model('../input/hpa-models-2021/GreentileProteinModel_rev_2.h5')
# Whole-image classifiers. They need the FixedDropout custom layer to deserialise.
# NOTE(review): custom_objects receives a FixedDropout *instance* rather than the class -- confirm intended.
multicellmodel = keras.models.load_model('../input/hpa-models-2021/Full_image_greenModelRev9.h5', custom_objects={'FixedDropout':FixedDropout(rate=0.5)})
img_type = 'g' #'rgb','g', 'ryb_g' per model above
multicell2model = keras.models.load_model('../input/hpa-models-2021/Full_image_RYB_GModelRev11.h5', custom_objects={'FixedDropout':FixedDropout(rate=0.5)})
img2_type = 'ryb_g' #'rgb','g', 'ryb_g' per model above

In [None]:
import scipy.ndimage as ndi
from skimage import filters, measure, segmentation, transform, util, morphology, feature
from skimage.morphology import (binary_erosion, closing, disk,
                                    remove_small_holes, remove_small_objects)


def label_cell(nuclei_pred, cell_pred):
    """Label the cells and the nuclei.

    Keyword arguments:
    nuclei_pred -- a 3D numpy array of a prediction from a nuclei image.
    cell_pred -- a 3D numpy array of a prediction from a cell image.

    Returns:
    A tuple containing:
    nuclei-label -- A nuclei mask data array.
    cell-label  -- A cell mask data array (uint16).

    0's in the data arrays indicate background while a continuous
    stretch of a specific number indicates the area for a specific
    cell.
    The same value in cell mask and nuclei mask refers to the identical cell.

    NOTE: The nuclei labeling from this function will be slightly
    different from the values in :func:`label_nuclei` as this version
    will use information from the cell-predictions to make better
    estimates.
    """
    def __wsh(
        mask_img,
        threshold,
        border_img,
        seeds,
        threshold_adjustment=0.35,
        small_object_size_cutoff=10,
    ):
        # Watershed helper: seeds come from thresholding `seeds * border_img`,
        # the region to flood comes from thresholding `mask_img`.
        img_copy = np.copy(mask_img)
        m = seeds * border_img  # * dt
        img_copy[m <= threshold + threshold_adjustment] = 0
        img_copy[m > threshold + threshold_adjustment] = 1
        # Builtin bool instead of the np.bool alias, which NumPy 1.24 removed.
        img_copy = img_copy.astype(bool)
        img_copy = remove_small_objects(img_copy, small_object_size_cutoff).astype(
            np.uint8
        )

        mask_img[mask_img <= threshold] = 0
        mask_img[mask_img > threshold] = 1
        mask_img = mask_img.astype(bool)
        mask_img = remove_small_holes(mask_img, 63)
        mask_img = remove_small_objects(mask_img, 1).astype(np.uint8)
        markers = ndi.label(img_copy, output=np.uint32)[0]
        labeled_array = segmentation.watershed(
            mask_img, markers, mask=mask_img, watershed_line=True
        )
        return labeled_array

    nuclei_label = __wsh(
        nuclei_pred[..., 2] / 255.0,
        0.4,
        1 - (nuclei_pred[..., 1] + cell_pred[..., 1]) / 255.0 > 0.05,
        nuclei_pred[..., 2] / 255,
        threshold_adjustment=-0.25,
        small_object_size_cutoff=32,
    )

    # for hpa_image, to remove the small pseudo nuclei
    nuclei_label = remove_small_objects(nuclei_label, 157)
    nuclei_label = measure.label(nuclei_label)
    # this is to remove the cell borders' signal from cell mask.
    # could use np.logical_and with some revision, to replace this func.
    # Tuned for segmentation hpa images
    threshold_value = max(0.22, filters.threshold_otsu(cell_pred[..., 2] / 255) * 0.5)
    # exclude the green area first
    cell_region = np.multiply(
        cell_pred[..., 2] / 255 > threshold_value,
        np.invert(np.asarray(cell_pred[..., 1] / 255 > 0.05, dtype=np.int8)),
    )
    sk = np.asarray(cell_region, dtype=np.int8)
    distance = np.clip(cell_pred[..., 2], 255 * threshold_value, cell_pred[..., 2])
    # Grow the nuclei seeds outwards over the thresholded cell region.
    cell_label = segmentation.watershed(-distance, nuclei_label, mask=sk)
    cell_label = remove_small_objects(cell_label, 344).astype(np.uint8)
    selem = disk(2)
    cell_label = closing(cell_label, selem)
    cell_label = __fill_holes(cell_label)

    # this part is to use green channel, and extend cell label to green channel
    # benefit is to exclude cells clear on border but without nucleus
    sk = np.asarray(
        np.add(
            np.asarray(cell_label > 0, dtype=np.int8),
            np.asarray(cell_pred[..., 1] / 255 > 0.05, dtype=np.int8),
        )
        > 0,
        dtype=np.int8,
    )
    cell_label = segmentation.watershed(-distance, cell_label, mask=sk)
    cell_label = __fill_holes(cell_label)
    # Binarise, relabel, drop small fragments and relabel again so the final
    # labels are consecutive.
    cell_label = np.asarray(cell_label > 0, dtype=np.uint8)
    cell_label = measure.label(cell_label)
    cell_label = remove_small_objects(cell_label, 344)
    cell_label = measure.label(cell_label)
    cell_label = np.asarray(cell_label, dtype=np.uint16)

    return nuclei_label, cell_label

def __fill_holes(image):
    """Fill the holes of a labelled image and relabel the filled regions."""
    # Blank out the boundaries between labels so touching regions stay apart.
    edge_mask = segmentation.find_boundaries(image)
    separated = np.multiply(image, np.invert(edge_mask))
    filled = ndi.binary_fill_holes(separated > 0)
    relabelled, _ = ndi.label(filled)
    return relabelled

In [None]:
def build_image_names(image_id: str, image_dir: str = '/kaggle/input/hpa-single-cell-image-classification/test') -> tuple:
    """Build the per-channel file paths for one image.

    Args:
        image_id: image identifier, i.e. the file name without the
            ``_<colour>.png`` suffix.
        image_dir: directory holding the channel PNGs. Defaults to the
            competition test folder.

    Returns:
        A 5-tuple ``([mt], [er], [nu], [high], [[mt], [er], [nu]])``. The
        first four items are single-element path lists for the red, yellow,
        blue and green channels; the last groups the three reference
        channels in the order passed to the segmentator.
    """
    # mt: red channel (microtubules)
    mt = f'{image_dir}/{image_id}_red.png'

    # er: yellow channel (endoplasmic reticulum)
    er = f'{image_dir}/{image_id}_yellow.png'

    # nu: blue channel (nuclei)
    nu = f'{image_dir}/{image_id}_blue.png'

    # high: green channel (protein of interest)
    high = f'{image_dir}/{image_id}_green.png'

    return [mt], [er], [nu], [high], [[mt], [er], [nu]]

def grab_contours(cnts):
    """Return the contour list from a ``cv2.findContours`` result.

    A 2-tuple (OpenCV v2.4, v4-beta, v4-official) carries the contours first;
    a 3-tuple (OpenCV v3, v4-pre, v4-alpha) carries them second. Any other
    length raises ``Exception``.
    """
    n_items = len(cnts)
    if n_items not in (2, 3):
        # Unknown return signature: refuse to guess where the contours are.
        raise Exception(("Contours tuple must have length 2 or 3, "
            "otherwise OpenCV changed their cv2.findContours return "
            "signature yet again. Refer to OpenCV's documentation "
            "in that case"))

    return cnts[0] if n_items == 2 else cnts[1]

def create_cell_images(RGB, RYB_G, G, cell_masks, size):
    """Cut every labelled cell out of the three composite images.

    Each cell is masked by its contour, cropped to its bounding box, resized
    so the longer side equals ``size`` (aspect ratio kept) and zero-padded to
    a ``size`` x ``size`` square.

    Args:
        RGB, RYB_G, G: composite images of identical shape.
        cell_masks: integer label image; 0 is background and each cell has
            its own positive label.
        size: side length of the square tiles.

    Returns:
        Three lists ``(RGBs, RYB_Gs, Gs)`` with one tile per cell label, in
        ascending label order.
    """
    def clipimgtosquare(group_img, inside, box):
        # Mask, crop, resize and pad one cell to a square tile.
        x, y, w, h = box
        cnt_img = np.zeros_like(group_img)
        cnt_img[inside] = group_img[inside]
        cnt_img = cnt_img[y:y+h, x:x+w]
        # Resize so that the longer side matches the requested size.
        old_size = cnt_img.shape[:2]
        ratio = float(size)/max(old_size)
        # Clamp to 1 px so a very thin cell cannot yield a zero-sized target.
        new_size = tuple(max(1, int(dim*ratio)) for dim in old_size)
        resized = cv2.resize(cnt_img, (new_size[1], new_size[0]))
        # Pad evenly with black to reach the final square.
        delta_w = size - new_size[1]
        delta_h = size - new_size[0]
        top, bottom = delta_h//2, delta_h-(delta_h//2)
        left, right = delta_w//2, delta_w-(delta_w//2)
        color = [0, 0, 0]
        square = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
        return img_to_array(square)

    RGBs = []
    RYB_Gs = []
    Gs = []
    # Labels run from 1 to max() inclusive; the previous range(1, max())
    # silently dropped the highest-labelled cell.
    for i in range(1, int(cell_masks.max()) + 1):
        mask = cv2.convertScaleAbs(np.where(cell_masks==i, 1, 0))
        c = grab_contours(cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE))
        box = cv2.boundingRect(c[0])
        # Paint the filled contour white on black, across all channels.
        cover = np.zeros_like(RGB)
        cv2.drawContours(cover, [c[0]], 0, (255,255,255), -1)
        # The three composites share a shape, so one mask serves all of them.
        inside = cover == 255

        RGBs.append(clipimgtosquare(RGB, inside, box))
        RYB_Gs.append(clipimgtosquare(RYB_G, inside, box))
        Gs.append(clipimgtosquare(G, inside, box))

    return RGBs, RYB_Gs, Gs


def image_predictions(images, model, TTArepeat=0, batch_size=8):
    """Predict on a stack of images, optionally averaging over TTA rounds.

    ``images`` is stacked with ``np.vstack`` and passed to ``model.predict``.
    When ``TTArepeat`` is positive the prediction is averaged with
    ``TTArepeat`` further predictions on repeatedly augmented copies.

    NOTE(review): the TTA path calls ``data_augmentation``, which is not
    defined in the visible part of this file -- confirm before using
    ``TTArepeat > 0``.

    Returns:
        A one-element list holding the confidence array.
    """
    batch = np.vstack(images)
    confidence = model.predict(batch, batch_size=batch_size)
    if TTArepeat > 0:
        tta_outputs = [confidence]
        augmented = data_augmentation(batch)
        for _ in range(TTArepeat):
            # Each round augments the previous round's output again.
            augmented = data_augmentation(augmented)
            tta_outputs.append(model.predict(augmented))
        confidence = np.mean(tta_outputs, axis=0)

    return [confidence]

# Code obtained form competition main page:
#https://www.kaggle.com/c/hpa-single-cell-image-classification/overview/evaluation
def create_cell_masks(mask):
    """Convert a binary mask into OID challenge encoding ascii text.

    Args:
        mask: boolean array that squeezes to two dimensions.

    Returns:
        The mask as base64-encoded, zlib-compressed COCO RLE ``bytes``.

    Raises:
        ValueError: if ``mask`` is not boolean or is not 2-D after squeezing.
    """
    # check input mask --
    # Compare against the builtin bool; the np.bool alias was removed in
    # NumPy 1.24.
    if mask.dtype != bool:
        raise ValueError(
            "encode_binary_mask expects a binary mask, received dtype == %s" %
            mask.dtype)

    mask = np.squeeze(mask)
    if len(mask.shape) != 2:
        # Wrap the shape in a 1-tuple: "%s" % (a, b, c) would otherwise raise
        # TypeError instead of producing this message.
        raise ValueError(
            "encode_binary_mask expects a 2d mask, received shape == %s" %
            (mask.shape,))

    # convert input mask to expected COCO API input --
    mask_to_encode = mask.reshape(mask.shape[0], mask.shape[1], 1)
    mask_to_encode = mask_to_encode.astype(np.uint8)
    mask_to_encode = np.asfortranarray(mask_to_encode)

    # RLE encode mask --
    encoded_mask = coco_mask.encode(mask_to_encode)[0]["counts"]

    # compress and base64 encoding --
    binary_str = zlib.compress(encoded_mask, zlib.Z_BEST_COMPRESSION)
    base64_str = base64.b64encode(binary_str)
    return base64_str

def get_image_masks(image_id):
    """Segment one image and build its three composite views.

    Returns:
        ``(RGB, RYB_G, G, cell_mask)``: the red/green/blue stack, the
        red+green / yellow+green / blue+green stack, the green channel
        repeated three times, and the cell label image from ``label_cell``.
    """
    mt, er, nu, high, images = build_image_names(image_id=image_id)

    # Nuclei are predicted from the blue channel, whole cells from all three
    # reference channels.
    nuc_segmentations = segmentator.pred_nuclei(images[2])
    cell_segmentations = segmentator.pred_cells(images)

    # Post-processing: only the cell labels are used further on.
    _, cell_mask = label_cell(nuc_segmentations[0], cell_segmentations[0])

    def _load(paths):
        # Read one channel file and rescale it with normalization().
        return normalization(plt.imread(paths[0]))

    blue = _load(nu)
    green = _load(high)
    red = _load(mt)
    yellow = _load(er)

    RGB = np.dstack((red, green, blue))
    RYB_G = np.dstack((red + green, yellow + green, blue + green))
    G = np.stack([green, green, green], axis=-1)

    return RGB, RYB_G, G, cell_mask

def flatten_list_of_lists(l_o_l, to_string=False):
    """Flatten one nesting level; with ``to_string`` every item goes through ``str``."""
    flat = [item for sublist in l_o_l for item in sublist]
    if to_string:
        return [str(item) for item in flat]
    return flat
    
def image_prediction_string(confidences, masks, labelqty=19, threshold=.00):
    """Build the space-separated prediction string for one image.

    For every (prediction, mask) pair this emits a ``label confidence mask``
    triple for each of the first ``labelqty`` labels whose confidence exceeds
    ``threshold``, followed by one triple for label ``labelqty`` with
    confidence ``1 - pred.max()``.
    """
    rows = []
    for pred, mask in zip(confidences, masks):
        code = mask.decode('UTF-8')
        rows.extend(
            (label, pred[label], code)
            for label in range(labelqty)
            if pred[label] > threshold
        )
        # Extra label: the less confident the best class, the higher it scores.
        rows.append((labelqty, 1 - pred.max(), code))

    return " ".join(
        " ".join(flatten_list_of_lists(zip(row), to_string=True)) for row in rows
    )
   
def data_aug_exp(img, modeltype, size):
    """Build the fixed set of test-time-augmentation views of ``img``.

    Views, in order: the input, then a 90-degree rotation, a left/right flip
    and an up/down flip, plus a contrast-adjusted copy when ``modeltype`` is
    ``'ryb_g'``. Every view is followed by its 80 % centre crop resized back
    to ``size`` x ``size``.

    The centre crop of the untouched input used to be computed and then
    overwritten before it was appended; it is now part of the output, so the
    result has one more view than before.

    Args:
        img: image or batch of images accepted by the ``tf.image`` ops.
        modeltype: ``'ryb_g'`` adds the contrast views; other values do not.
        size: side length the centre crops are resized to.

    Returns:
        A NumPy array with the views stacked along axis 0.
    """
    def _zoom(view):
        # Centre crop resized back to full size, i.e. a zoom.
        cropped = tf.image.central_crop(view, central_fraction=.8)
        return tf.image.resize(cropped, [size, size])

    base_views = [
        img,
        tf.image.rot90(img, k=1),
        tf.image.flip_left_right(img),
        tf.image.flip_up_down(img),
    ]
    if modeltype == 'ryb_g':
        base_views.append(tf.image.adjust_contrast(img, 0.51))

    images = []
    for view in base_views:
        images.append(view)
        images.append(_zoom(view))

    images = tf.expand_dims(images, axis=0)
    images = np.vstack(images)
    return images

def image_predictions_exp(images, model, modeltype, TTArepeat=False, batch_size=8, size=128):
    """Predict class confidences for a batch of images, optionally with TTA.

    Args:
        images: sequence of equally shaped images.
        model: object exposing a Keras-style ``predict``.
        modeltype: forwarded to ``data_aug_exp``.
        TTArepeat: when truthy, average the predictions over every view
            returned by ``data_aug_exp``.
        batch_size: batch size of the plain (non-TTA) prediction.
        size: tile size forwarded to ``data_aug_exp``.

    Returns:
        The confidence array from ``model.predict``, or its mean over the
        TTA views.
    """
    images = np.vstack(np.expand_dims(images, axis=0))
    if not TTArepeat:
        return model.predict(images, batch_size=batch_size)

    # The un-augmented batch is the first TTA view, so a separate plain
    # prediction would only be thrown away.
    aug_images = data_aug_exp(images, modeltype, size)
    # batch_size is deliberately not forwarded here, as in the original code.
    TTApred = [model.predict(img) for img in aug_images]
    return np.mean(TTApred, axis=0)

def weighted_predictions(pred1, wt1, pred2=0, wt2=0, pred3=0, wt3=0):
    """Return the weighted sum of up to three predictions as a NumPy array."""
    first = pred1 * wt1
    second = pred2 * wt2
    third = pred3 * wt3
    return np.array(first + second + third)

def normalization(array):
    """Min-max scale ``array`` to the range [0, 1].

    A constant array (max equal to min) is mapped to all zeros rather than
    the NaNs that a 0/0 division would produce.
    """
    low = array.min()
    span = array.max() - low
    if span == 0:
        # Constant input: divide the all-zero numerator by 1 instead of 0.
        span = 1
    return (array - low) / span

def add_neglabel(array, labelqty):
    """Insert a column at index ``labelqty`` holding ``1 - row maximum``."""
    negative_score = 1 - array.max(axis=1)
    return np.insert(array, labelqty, negative_score, axis=1)
    
def print_cell(image, title, index):
    """Draw ``image`` in slot ``index`` of a 1 x 10 subplot row, without axes.

    The figure is neither created nor shown here; the caller does both.
    """
    plt.subplot(1, 10, index)
    plt.imshow(image)
    plt.axis('off')
    plt.title(title)
    

Here are the different 'viewpoints' that are supplied for each image during model training and inference prediction. Test-time augmentation was important for accurate prediction of the images. The most important augmentation proved to be a random crop that acts like a zoom into the features of the images.

In [None]:
# Visual check of the segmentation and of every test-time-augmentation view
# for a single test image.
train_dir = '../input/hpa-single-cell-image-classification/test/'
image_id = '01a14326-67b8-43b0-ac7a-ba6dfb3c38ad'
RGB, RYB_G, G, cell_masks = get_image_masks(image_id)

RGBs, RYB_Gs, Gs = create_cell_images(RGB, RYB_G, G, cell_masks, 128)

# Overview row: label mask plus the three whole-image composites.
plt.figure(figsize=(25,4))
overview = ((cell_masks, 'masks'), (RGB, 'RGB cells'), (RYB_G, 'RYB_G cells'), (G, 'Green cells'))
for i, (picture, caption) in enumerate(overview, start=1):
    print_cell(picture, caption, i)
plt.tight_layout()
plt.show()

rgb = data_aug_exp(RGBs, 'rgb', 128)
ryb_g = data_aug_exp(RYB_Gs, 'ryb_g', 128)
g = data_aug_exp(Gs, 'g', 128)
title = ''

s=multicellmodel.input_shape[1]
full = tf.image.resize(RGB, [s,s])
full = data_aug_exp(full, img_type, s)

for j in range(2):#len(cells)):
    # For each tile family: one row with all channels of cell j, then one
    # row with only channel 1 (green) of the same views.
    for views in (rgb, ryb_g, g):
        for green_only in (False, True):
            plt.figure(figsize =(25,10))
            for i, img in enumerate(views, start=1):
                if green_only:
                    print_cell(img[j][:,:,1], '', i)
                else:
                    print_cell(img[j], title, i)
            plt.tight_layout()
            plt.show()
    # Finally the augmented views of the whole image.
    plt.figure(figsize =(25,10))
    for i, img in enumerate(full, start=1):
        print_cell(img, title, i)
    plt.tight_layout()
    plt.show()

To save overall time and GPU resources, my inference only predicted 2 images before submission.

In [None]:
start = time.time()

test_dir = '../input/hpa-single-cell-image-classification/test/'
test_images = os.listdir(test_dir)

# Keep the part of each file name before the first underscore (the image id)
# and reduce the list to one entry per image.
images = [file_name.split("_")[0] for file_name in test_images]
names = np.unique(images)

# Exactly 559 ids is treated as "only the public test set is present"; in
# that case just the first two images are predicted.
public = (len(names) == 559)
if public:
    print('...only public testset...')
    names = names[:2]

In [None]:
# Main inference loop: per image, segment the cells, classify every cell tile
# with the two single-cell models, blend in the two whole-image models and
# encode the result as a submission row.
multicellpredict = []
multicellpredmax = []
iterations = 1
size = 128
imghs = []
imgws = []
predictionstrings = []
for image_id in tqdm(names):
    RGB, RYB_G, G, cell_masks = get_image_masks(image_id)
    shape = RGB.shape
    imghs.append(shape[0])
    imgws.append(shape[1])
    RGBs, RYB_Gs, Gs = create_cell_images(RGB, RYB_G, G, cell_masks, size)

    if len(RGBs) == 0:
        # No cell tiles: there is nothing to classify and predict() would
        # fail on an empty batch, so emit an empty prediction string.
        predictionstrings.append('')
        iterations+=1
        continue

    RGBpred = image_predictions_exp(RGBs, RGB_model, 'rgb', TTArepeat=True, batch_size=1)
    Gpred = image_predictions_exp(Gs, G_model, 'g', TTArepeat=True, batch_size=1)
    confidences  = weighted_predictions(RGBpred, .5, Gpred, .5)

    if multicellmodel:
        s = multicellmodel.input_shape[1]
        img = G
        TTA = True
        img = tf.image.resize(img, [s,s])
        img = np.expand_dims(img, axis=0)
        if TTA:
            img = data_aug_exp(img,img_type,s)[:,0]
            multicellpredict = multicellmodel.predict(img)
            multicellpredict  = np.mean(multicellpredict,axis=0)
        else:
            multicellpredict = multicellmodel.predict(img)[0]

        if multicell2model:
            s = multicell2model.input_shape[1]
            img = RYB_G
            img = tf.image.resize(img, [s,s])
            img = np.expand_dims(img, axis=0)
            if TTA:
                img = data_aug_exp(img,img2_type,s)[:,0]
                multicell2predict = multicell2model.predict(img)
                multicell2predict  = np.mean(multicell2predict,axis=0)
            else:
                multicell2predict = multicell2model.predict(img)[0]
            multicellpredict = weighted_predictions(multicell2predict, .4, multicellpredict, .6)
        # Blend the image-level prediction into every cell (weights are estimates).
        confidences = weighted_predictions(multicellpredict[:18], .45, confidences, .55)

    # One encoded mask per cell label. Labels run from 1 to max() inclusive;
    # `cell_masks == i` is already boolean, so the removed np.bool alias is
    # not needed.
    masks = [create_cell_masks(cell_masks == i) for i in range(1, int(cell_masks.max()) + 1)]
    string = image_prediction_string(confidences, masks, labelqty = 18, threshold = .00)
    predictionstrings.append(string)
    iterations+=1

# shape[0] is the image height and shape[1] the width; the two columns were
# previously swapped.
sub=pd.DataFrame({'ID':names, 'ImageWidth':imgws , 'ImageHeight':imghs, 'PredictionString':predictionstrings})

end = time.time()
print(f"All {len(names)} images complete at {round((end-start)/60,1)} mins.")

In [None]:
# Preview the first rows of the submission frame.
sub.head(8)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv("/kaggle/working/submission.csv", index=False)