**References**
Below are a few of the references that helped me understand and participate in this competition.
- https://www.kaggle.com/dschettler8845/hpa-cellwise-classification-inference/data
- https://www.kaggle.com/its7171/hpa-mask
- https://www.kaggle.com/thedrcat/hpa-single-cell-classification-eda
- https://towardsdatascience.com/multi-label-image-classification-in-tensorflow-2-0-7d4cf8a4bc72
- Sechidis, K., Tsoumakas, G., & Vlahavas, I. (2011). On the stratification of multi-label data. Machine Learning and Knowledge Discovery in Databases, 145-158
- And many more!!

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import cv2
from pathlib import Path
import imageio
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing import image
from tensorflow.keras import layers
import matplotlib.style as style
from keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense

In [None]:
# Input format expected by the classification model.
IMG_SIZE = 256 # Height and width every image is resized to, matching the input format of the model
CHANNELS = 3 # Keep RGB color channels to match the input format of the model

N_LABELS = 19
EPOCHS = 1
TF_BATCH_SIZE = 256 # Big enough to measure an F1-score
AUTOTUNE = tf.data.experimental.AUTOTUNE # Adapt preprocessing and prefetching dynamically to reduce GPU and CPU idle time
SHUFFLE_BUFFER_SIZE = 1024 # Shuffle the training data by a chunk of 1024 observations

In [None]:
def parse_function(filename, label):
    """Load one JPEG from disk and turn it into a model-ready tensor.

    Args:
        filename: string representing path to image
        label: value paired with the image; it is returned untouched

    Returns:
        A tuple ``(image, label)`` where ``image`` has been decoded with
        CHANNELS channels, resized to IMG_SIZE x IMG_SIZE and divided by 255.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    resized = tf.image.resize(pixels, [IMG_SIZE, IMG_SIZE])
    # Scale pixel values from [0, 255] down to [0.0, 1.0]
    return resized / 255.0, label

In [None]:
def create_dataset(filenames, labels, is_training=True):
    """Build a batched, prefetching tf.data pipeline over image files.

    Args:
        filenames: list of image paths
        labels: one entry per filename (e.g. an array of shape
            (len(filenames), N_LABELS)); passed through parse_function unchanged
        is_training: accepted for API compatibility; it currently has no
            effect because caching and shuffling are disabled

    Returns:
        A tf.data.Dataset yielding (images, labels) batches of up to
        TF_BATCH_SIZE elements, in the same order as ``filenames``.
    """
    return (
        tf.data.Dataset.from_tensor_slices((filenames, labels))
        # Parse and preprocess observations in parallel
        .map(parse_function, num_parallel_calls=AUTOTUNE)
        .batch(TF_BATCH_SIZE)
        # Fetch upcoming batches in the background while the model is busy
        .prefetch(buffer_size=AUTOTUNE)
    )

In [None]:
@tf.function
def macro_soft_f1(y, y_hat):
    """Macro soft-F1 cost: the mean over labels of (1 - soft F1).

    Probabilities are used directly instead of thresholded predictions, which
    keeps the cost differentiable.

    Args:
        y (int32 Tensor): targets array of shape (TF_BATCH_SIZE, N_LABELS)
        y_hat (float32 Tensor): probability matrix from forward propagation of shape (TF_BATCH_SIZE, N_LABELS)

    Returns:
        cost (scalar Tensor): value of the cost function for the batch
    """
    targets = tf.cast(y, tf.float32)
    probs = tf.cast(y_hat, tf.float32)

    # Soft confusion-matrix counts, summed over the batch axis (one per label)
    true_pos = tf.reduce_sum(probs * targets, axis=0)
    false_pos = tf.reduce_sum(probs * (1 - targets), axis=0)
    false_neg = tf.reduce_sum((1 - probs) * targets, axis=0)

    # The small epsilon keeps the division defined when a label has no counts
    per_label_f1 = 2*true_pos / (2*true_pos + false_neg + false_pos + 1e-16)

    # Minimising 1 - soft-F1 maximises soft-F1; average over all labels
    return tf.reduce_mean(1 - per_label_f1)

In [None]:
# Let TensorFlow grow its GPU memory allocation on demand instead of reserving
# it all up front. This is best-effort: failures are reported, not fatal.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except (ValueError, RuntimeError) as err:
        # Invalid device or cannot modify virtual devices once initialized.
        print(f'Could not enable GPU memory growth: {err}')

In [None]:
# Install the segmentation dependencies from the local ../input datasets
# (presumably because the notebook runs without internet access — TODO confirm).
# NOTE(review): the pycocotools wheel is tagged cp37, i.e. built for CPython 3.7.
!pip install "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"
!pip install "../input/hpapytorchzoozip/pytorch_zoo-master"
!pip install "../input/hpacellsegmentatorraman/HPA-Cell-Segmentation/"

In [None]:
from pycocotools import _mask as coco_mask
import typing as t
import base64
import zlib

def binary_mask_to_ascii(mask, mask_val=1):
    """Converts a binary mask into OID challenge encoding ascii text.

    Args:
        mask: array-like label mask; after squeezing singleton axes it must
            be two-dimensional.
        mask_val: the label value in ``mask`` that marks the object to encode.

    Returns:
        str: the COCO RLE counts of the selected region, zlib-compressed and
        base64-encoded.

    Raises:
        ValueError: if the squeezed mask is not two-dimensional.
    """
    # Select the requested label. The comparison already yields a boolean
    # array, so the removed ``np.bool`` alias (gone since NumPy 1.24) is not needed.
    binary = np.squeeze(np.asarray(mask) == mask_val)
    if binary.ndim != 2:
        raise ValueError(f"encode_binary_mask expects a 2d mask, received shape == {binary.shape}")

    # convert input mask to expected COCO API input --
    # (height, width, 1) uint8 array in Fortran (column-major) order
    mask_to_encode = np.asfortranarray(binary[:, :, np.newaxis].astype(np.uint8))

    # RLE encode mask --
    encoded_mask = coco_mask.encode(mask_to_encode)[0]["counts"]

    # compress and base64 encoding --
    binary_str = zlib.compress(encoded_mask, zlib.Z_BEST_COMPRESSION)
    return base64.b64encode(binary_str).decode()

In [None]:
import os

# Working directories for the cropped cell images and the segmentation masks.
# exist_ok=True makes this cell safe to re-run, and creating the nested path
# also creates its 'cells-segmented' parent.
os.makedirs('cells-segmented/test', exist_ok=True)
os.makedirs('test-masks', exist_ok=True)

In [None]:
# import glob
# red_test_globs = glob.glob('../input/hpa-single-cell-image-classification/test/*_red.png')
# test_imageids = []
# for name in red_test_globs:
#     tokens = name.split('/')
#     image_id = tokens[len(tokens) - 1].split('_red')[0]
#     test_imageids.append({'ID': image_id})
# test_df = pd.DataFrame(test_imageids)

In [None]:
# The sample submission provides the test image IDs (its `ID` column is read by the steps below).
test_df = pd.read_csv('../input/hpa-single-cell-image-classification/sample_submission.csv')

In [None]:
# Show the frame in the notebook output for a quick visual check.
test_df

In [None]:
TEST_IMAGE_SIZE = 1024  # side length (pixels) images are resized to before segmentation and cell cropping
BATCH_SIZE = 24  # number of images handed to the segmentator per batch in generate_cell_masks

In [None]:
def crop_cell(img, mask):
    """Black out everything outside ``mask`` and crop to what remains.

    Args:
        img: image array of shape (height, width, channels).
        mask: (height, width) array that is non-zero/True on the cell.

    Returns:
        The masked image cropped to the tightest row/column bounding box
        around its non-zero pixels (all channels kept).

    Raises:
        ValueError: if the masked image contains no non-zero pixel.
    """
    # Add a trailing channel axis so the mask broadcasts over the channels
    masked = img * np.reshape(mask, (mask.shape[0], mask.shape[1], 1))

    # The bounding box is taken from the pixels that survive masking
    nonzero = np.nonzero(masked)
    rows, cols = nonzero[0], nonzero[1]
    top, bottom = rows.min(), rows.max()
    left, right = cols.min(), cols.max()
    return masked[top:bottom + 1, left:right + 1, :]

In [None]:
def read_img(image_id, color, train_or_test='test', image_size=None):
    """Read one colour channel of a competition image as a grayscale array.

    Args:
        image_id: ID of the image (the file name without the colour suffix).
        color: channel suffix used in the file name, e.g. 'red', 'green', 'blue'.
        train_or_test: sub-directory of the competition data to read from.
        image_size: if given, the image is resized to (image_size, image_size).

    Returns:
        The image as returned by ``cv2.imread`` in grayscale mode, optionally resized.

    Raises:
        FileNotFoundError: if the image file does not exist.
        ValueError: if the file exists but OpenCV cannot decode it.
    """
    filename = f'../input/hpa-single-cell-image-classification/{train_or_test}/{image_id}_{color}.png'
    # Raise explicitly rather than assert: assertions are stripped under `python -O`
    if not os.path.exists(filename):
        raise FileNotFoundError(f'not found {filename}')
    img = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals a decoding failure by returning None, not by raising
    if img is None:
        raise ValueError(f'could not decode {filename}')
    if image_size is not None:
        img = cv2.resize(img, (image_size, image_size))
    # Safety net for images deeper than 8 bits
    # NOTE(review): IMREAD_GRAYSCALE presumably already yields 8-bit data, so
    # this branch may never run — confirm before relying on it.
    if img.max() > 255:
        img = (img/255).astype('uint8')
    return img

In [None]:
def load_images(df, train_or_test = 'test'):
    """Load the red/green/blue channels for every image listed in ``df``.

    Args:
        df: DataFrame with an ``ID`` column of image IDs.
        train_or_test: sub-directory of the competition data to read from.

    Returns:
        A tuple ``(blue_image_scaled, rgb_image_scaled, height_widths)``:
        the blue channels and the stacked red/green/blue images, each resized
        to TEST_IMAGE_SIZE x TEST_IMAGE_SIZE and divided by 255, plus the
        ``.shape`` of each red channel as it was read from disk.
    """
    image_path_prefix = f'../input/hpa-single-cell-image-classification/{train_or_test}/'
    target_size = (TEST_IMAGE_SIZE, TEST_IMAGE_SIZE)

    def read_channel(color):
        # One grayscale image per row of df, in row order
        return [
            cv2.imread(f'{image_path_prefix}/{row.ID}_{color}.png', cv2.IMREAD_GRAYSCALE)
            for _, row in df.iterrows()
        ]

    red_images = read_channel('red')
    green_images = read_channel('green')
    blue_images = read_channel('blue')

    # Shapes are recorded before any resizing
    height_widths = [red.shape for red in red_images]
    blue_image_scaled = [cv2.resize(blue, target_size) / 255. for blue in blue_images]
    rgb_image_scaled = [
        cv2.resize(np.stack(channels, axis=2), target_size) / 255.
        for channels in zip(red_images, green_images, blue_images)
    ]
    return blue_image_scaled, rgb_image_scaled, height_widths

In [None]:
def segment_cells(df, train_or_test='test'):
    """Crop every segmented cell out of each image and save it as a JPEG.

    For each row of ``df`` the cell mask is loaded, the red/green/blue
    channels are read at TEST_IMAGE_SIZE, and one crop per mask label
    (1..max label) is written to ``./cells-segmented/{train_or_test}/``.

    Args:
        df: DataFrame with an ``ID`` column (and a ``Label`` column in
            'train' mode).
        train_or_test: 'train' reads masks from DATASET_HPA_MASK and records
            the image labels; anything else reads masks from ``test-masks/``.

    Returns:
        DataFrame with one row per cell: ``image_id``, ``cell_no`` and, in
        'train' mode, ``labels``.
    """
    is_train = train_or_test == 'train'
    records = []
    for _, row in df.iterrows():
        image_id = row.ID
        if is_train:
            mask_path = f'{DATASET_HPA_MASK}/hpa_cell_mask/{image_id}.npz'
        else:
            mask_path = f'test-masks/{image_id}.npz'
        cell_mask = np.load(mask_path)['arr_0']

        channels = {
            color: read_img(image_id, color=color, train_or_test=train_or_test, image_size=TEST_IMAGE_SIZE)
            for color in ('red', 'green', 'blue')
        }
        # Stacked blue-green-red, the channel order cv2.imwrite expects
        img = np.dstack((channels['blue'], channels['green'], channels['red']))

        for cell_no in range(1, np.max(cell_mask) + 1):
            record = {'image_id': image_id, 'cell_no': cell_no}
            if is_train:
                record['labels'] = row.Label
            records.append(record)

            cell = crop_cell(img, cell_mask == cell_no)
            cv2.imwrite(f'./cells-segmented/{train_or_test}/{image_id}_{cell_no}.jpg', cell)
    return pd.DataFrame(records)

In [None]:
def generate_cell_masks(df, train_or_test='test'):
    """Run the cell segmentator over ``df`` in batches of BATCH_SIZE images.

    In 'test' mode each labelled cell mask is saved to
    ``test-masks/{image_id}.npz``; in any other mode the masks are computed
    but not stored.

    Args:
        df: DataFrame with an ``ID`` column of image IDs.
        train_or_test: sub-directory of the competition data to read from.

    Returns:
        List with the shape of every image as read from disk, in row order
        (as reported by load_images).
    """
    sizes = []
    n_rows = len(df)
    for start in range(0, n_rows, BATCH_SIZE):
        stop = min(start + BATCH_SIZE, n_rows)
        blue_batch, rgb_batch, batch_sizes = load_images(df[start:stop], train_or_test=train_or_test)
        sizes.extend(batch_sizes)

        nuclei_preds = segmentator.pred_nuclei(blue_batch)
        cell_preds = segmentator.pred_cells(rgb_batch, precombined=True)

        # Post-processing: turn the raw predictions into labelled masks
        for offset, cell_pred in enumerate(cell_preds):
            _, cell_mask = utils.label_cell(nuclei_preds[offset], cell_pred)
            if train_or_test == 'test':
                # offset is relative to the batch; start makes it a row position in df
                image_id = df.iloc[start + offset].ID
                np.savez_compressed(f'test-masks/{image_id}', cell_mask)
    return sizes

In [None]:
from hpacellseg import cellsegmentator, utils
    
# Pre-trained weight files for the nuclei and whole-cell segmentation models.
NUC_MODEL = "../input/hpacellsegmentatormodelweights/dpn_unet_nuclei_v1.pth"
CELL_MODEL = "../input/hpacellsegmentatormodelweights/dpn_unet_cell_3ch_v1.pth"
# Module-level segmentator used by generate_cell_masks.
# NOTE(review): device="cuda" is hard-coded, so this presumably requires a GPU session — confirm.
segmentator = cellsegmentator.CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    device="cuda",
    multi_channel_model=True,
)

In [None]:
print('Starting cell segmentation')
# Step 1: segment every test image and save its cell mask under test-masks/;
# the returned list holds the shape of each image as read from disk.
test_height_widths = generate_cell_masks(test_df, train_or_test='test')
# Step 2: crop each cell using the saved masks; one row per cell (image_id, cell_no).
test_cells_df = segment_cells(test_df, train_or_test='test')

In [None]:
# plt.imshow(np.load(f'test-masks/020a29cf-2c24-478b-8603-c22a90dc3e31.npz')['arr_0'])

In [None]:
# from IPython.display import Image
# # Image(f'../input/hpa-single-cell-image-classification/test/020a29cf-2c24-478b-8603-c22a90dc3e31_yellow.png')
# Image(f'./cells-segmented/test/84895b21-d582-4fc8-b8a4-7ffcbab587a8_40.jpg')

In [None]:
import torch
# Release PyTorch's cached GPU memory now that segmentation is finished
# (presumably to leave room for the TensorFlow model loaded below — TODO confirm).
torch.cuda.empty_cache()

In [None]:
# (number of cropped cells, number of columns)
test_cells_df.shape

In [None]:
# Paths of the cropped cell images, in the same order as the rows of test_cells_df.
X_test_sub = [
    f'./cells-segmented/test/{cell.image_id}_{cell.cell_no}.jpg'
    for cell in test_cells_df.itertuples()
]
print(X_test_sub[:2])
# Lookup from '<image_id>_<cell_no>' to the row's index label in test_cells_df.
X_test_dict = {
    cell.image_id + '_' + str(cell.cell_no): cell.Index
    for cell in test_cells_df.itertuples()
}

In [None]:
from tensorflow import keras
# Load the trained classifier; macro_soft_f1 is passed via custom_objects so
# Keras can resolve that name when deserialising the saved model.
model = keras.models.load_model('../input/hpa-train-model/model-dl.h5', custom_objects={'macro_soft_f1':macro_soft_f1})

In [None]:
# model.weights[12]

In [None]:
# X_test_sub

In [None]:
# plt.imshow(parse_function('./cells-segmented/test/9b1d4b27-6946-4b86-a818-8e91029c3dfa_1.jpg', None)[0].numpy())

In [None]:
# Build the inference dataset (is_training=False). The file paths are passed a
# second time only to fill the `labels` slot of create_dataset; they are not
# real labels (presumably ignored by model.predict — TODO confirm).
test_ds = create_dataset(X_test_sub, X_test_sub, False)

In [None]:
# model.weights

In [None]:
# for z in test_ds:
#   print(plt.imshow(z[0][10]))

In [None]:
# One prediction row per cropped cell, in the order of X_test_sub (the dataset is not shuffled).
test_predictions = model.predict(test_ds, verbose=1)

In [None]:
# test_predictions

In [None]:
# Minimum predicted probability for a label to be written to the submission.
THRESHOLD = 0.05

# Build one submission row per test image. The PredictionString is a
# space-separated sequence of "<label> <confidence> <encoded mask>" triples,
# one triple per (cell, label) pair whose confidence reaches THRESHOLD.
submission_data = []
for position, image_id in enumerate(test_df.ID):
    # load_images recorded ndarray.shape, which is (rows, cols) = (height, width).
    # Sizes are looked up by row position, matching how the list was built.
    image_height, image_width = test_height_widths[position][:2]

    # NOTE(review): this mask was produced from images resized to
    # TEST_IMAGE_SIZE, while ImageWidth/ImageHeight report the size read from
    # disk — confirm whether the mask must be resized back before encoding.
    mask = np.load(f'./test-masks/{image_id}.npz')['arr_0']

    predictions = []
    cell_id = 1
    # Cells are numbered 1..N per image; stop at the first number with no crop
    while f'{image_id}_{cell_id}' in X_test_dict:
        pred = test_predictions[X_test_dict[f'{image_id}_{cell_id}']]
        submission_rle = binary_mask_to_ascii(mask, mask_val=cell_id)
        predictions += [f'{i} {p} {submission_rle}' for i, p in enumerate(pred) if p >= THRESHOLD]
        cell_id += 1

    submission_data.append({
        'ID': image_id,
        'ImageWidth': image_width,
        'ImageHeight': image_height,
        'PredictionString': ' '.join(predictions)
    })

In [None]:
# submission_data

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
pd.DataFrame(submission_data).to_csv('./submission.csv', index=False)

In [None]:
# fig, ax = plt.subplots(5, 2, figsize=(20,50))

# for i, data_id in enumerate(train_df.ID.to_list()[:5]):
    
#     cell_image = np.stack([
#         cv2.imread(f'../input/hpa-single-cell-image-classification/train/{data_id}_red.png', 0),
#         cv2.imread(f'../input/hpa-single-cell-image-classification/train/{data_id}_yellow.png', 0),
#         cv2.imread(f'../input/hpa-single-cell-image-classification/train/{data_id}_blue.png', 0)], axis=2)
#     cell_image = cv2.resize(cell_image, (512, 512))
#     ax[i, 0].imshow(cell_image)
#     ax[i, 0].imshow(train_cell_masks[i], alpha=0.5)
#     ax[i, 0].axis('off')
#     ax[i, 0].set_title('Faster')
    
#     ax[i, 1].imshow(train_cell_masks[i])
#     ax[i, 1].axis('off')
#     ax[i, 1].set_title('Just cell masks')

In [None]:
# Remove the intermediate masks and cell crops created earlier in this notebook;
# submission.csv is left in place.
!rm -rf test-masks
!rm -rf cells-segmented