<h1 style="text-align: center; font-family: Verdana; font-size: 32px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; font-variant: small-caps; letter-spacing: 3px; color: #74d5dd; background-color: #ffffff;">Human Protein Atlas - Single Cell Classification</h1>
<h2 style="text-align: center; font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: underline; text-transform: none; letter-spacing: 2px; color: navy; background-color: #ffffff;">Categorical Classification At a Cellular Level [TRAINING]</h2>
<h5 style="text-align: center; font-family: Verdana; font-size: 12px; font-style: normal; font-weight: bold; text-decoration: None; text-transform: none; letter-spacing: 1px; color: black; background-color: #ffffff;">CREATED BY: DARIEN SCHETTLER</h5>

<br><br>

<h1 style="font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; letter-spacing: 3px; background-color: #ffffff; color: navy;" id="imports">0&nbsp;&nbsp;IMPORTS&nbsp;&nbsp;&nbsp;&nbsp;<a href="#toc">&#10514;</a></h1>

In [None]:
print("\n... OTHER IMPORTS STARTING ...\n")
print("\n\tVERSION INFORMATION")

# Machine Learning and Data Science Imports
import tensorflow_addons as tfa; print(f"\t\t– TENSORFLOW ADDONS VERSION: {tfa.__version__}");
import tensorflow as tf; print(f"\t\t– TENSORFLOW VERSION: {tf.__version__}");
import pandas as pd; pd.options.mode.chained_assignment = None;
import numpy as np; print(f"\t\t– NUMPY VERSION: {np.__version__}");
import scipy; print(f"\t\t– SCIPY VERSION: {scipy.__version__}");

# Built-In and General Utility Imports (requests, imageio, IPython and tqdm are third-party)
from collections import Counter
from datetime import datetime
import multiprocessing
from glob import glob
import warnings
import requests
import imageio
import IPython
import urllib
import zipfile
import pickle
import random
import shutil
import string
import math
import tqdm
import time
import gzip
import io
import os
import gc
import re

# Visualization Imports
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import plotly.express as px
import seaborn as sns
from PIL import Image
import matplotlib; print(f"\t\t– MATPLOTLIB VERSION: {matplotlib.__version__}");
import plotly
import PIL
import cv2
import ast

# PRESETS
# Class names for the competition; a name's position in the list is its integer label.
LBL_NAMES = [
    "Nucleoplasm", "Nuclear Membrane", "Nucleoli", "Nucleoli Fibrillar Center",
    "Nuclear Speckles", "Nuclear Bodies", "Endoplasmic Reticulum", "Golgi Apparatus",
    "Intermediate Filaments", "Actin Filaments", "Microtubules", "Mitotic Spindle",
    "Centrosome", "Plasma Membrane", "Mitochondria", "Aggresome", "Cytosol",
    "Vesicles", "Negative",
]

# Integer label -> display name (keys are the NumPy integers yielded by np.arange)
INT_2_STR = dict(zip(np.arange(len(LBL_NAMES)), LBL_NAMES))

# Integer label -> snake_case name (lower-cased, spaces replaced by underscores)
INT_2_STR_LOWER = {
    idx: name.lower().replace(" ", "_") for idx, name in INT_2_STR.items()
}

# Reverse lookups: snake_case name -> integer label, display name -> integer label
STR_2_INT_LOWER = {name: idx for idx, name in INT_2_STR_LOWER.items()}
STR_2_INT = {name: idx for idx, name in INT_2_STR.items()}

# Shared font settings for figures
FIG_FONT = dict(family="Helvetica, Arial", size=14, color="#7f7f7f")

# One "rgb(r, g, b)" colour string per class, sampled from seaborn's "Spectral" palette
LABEL_COLORS = [
    px.colors.label_rgb(px.colors.convert_to_RGB_255(palette_color))
    for palette_color in sns.color_palette("Spectral", len(LBL_NAMES))
]

# Stringified integer label -> colour string
LABEL_COL_MAP = {str(idx): color for idx, color in enumerate(LABEL_COLORS)}

print("\n\n... IMPORTS COMPLETE ...\n")

<h1 style="font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; letter-spacing: 3px; color: navy; background-color: #ffffff;" id="setup">2&nbsp;&nbsp;NOTEBOOK SETUP&nbsp;&nbsp;&nbsp;&nbsp;<a href="#toc">&#10514;</a></h1>

In [None]:
# Define the path to the root data directory
ROOT_DIR = "/kaggle/input"
OUTPUT_DIR = "/kaggle"

# Define the path to the competition data directory
COMP_DIR = os.path.join(ROOT_DIR, "hpa-single-cell-image-classification")

# One tile directory per colour channel, each containing one sub-directory per class name
COLORS = ["red", "green", "blue", "yellow"]
TILE_OUTPUT_DIRS = [os.path.join(OUTPUT_DIR, f"{c}_tiles") for c in COLORS]

# The loop variable must not be called OUTPUT_DIR: reusing that name would
# overwrite the constant above with the last tile directory.
for tile_dir in TILE_OUTPUT_DIRS:
    for lbl_name in STR_2_INT_LOWER:
        os.makedirs(os.path.join(tile_dir, lbl_name), exist_ok=True)

# Define the path to the competition training-image folder
TRAIN_IMG_DIR = os.path.join(COMP_DIR, "train")

# Capture all the relevant full image paths for the competition dataset
TRAIN_IMG_PATHS = sorted(
    os.path.join(TRAIN_IMG_DIR, f_name) for f_name in os.listdir(TRAIN_IMG_DIR)
)

# Define paths to the relevant csv files
TRAIN_CSV = os.path.join(ROOT_DIR, "hpa-train-data-with-additional-metadata/updated_train.csv")

print("\n... Loading Massive Train Dataframe ...\n")
# Create the relevant dataframe objects (the mask_rles column is not used in this notebook)
train_df = pd.read_csv(TRAIN_CSV).drop(columns=["mask_rles"])

# Keep only rows whose Label contains no "|" so that int(Label) is valid downstream.
# Raw string: "\|" is an invalid escape sequence in a normal string literal.
train_df = train_df[train_df.Label.str.count(r"\|") == 0].reset_index(drop=True)

# mask_bboxes is stored in the CSV as a Python-literal string; parse it back into objects
train_df["mask_bboxes"] = train_df["mask_bboxes"].apply(ast.literal_eval)

print("\n\nTRAIN DATAFRAME\n\n")
display(train_df.head(3))

<h1 style="font-family: Verdana; font-size: 24px; font-style: normal; font-weight: bold; text-decoration: none; text-transform: none; letter-spacing: 3px; color: navy; background-color: #ffffff;" id="helper_functions">3&nbsp;&nbsp;HELPER FUNCTIONS&nbsp;&nbsp;&nbsp;&nbsp;<a href="#toc">&#10514;</a></h1>

In [None]:
def load_image_color(img_id, img_dir, color):
    """Read a single colour-channel file of an image in grayscale mode.

    Args:
        img_id (str): Image identifier; the file read is `<img_id>_<color>.png`.
        img_dir (str): Directory containing the channel files.
        color (str): Channel name used as the file-name suffix.

    Returns:
        The result of `cv2.imread` for that path with the grayscale flag.
    """
    file_name = img_id + f"_{color}.png"
    channel_path = os.path.join(img_dir, file_name)
    return cv2.imread(channel_path, cv2.IMREAD_GRAYSCALE)


def load_image(img_id, img_dir):
    """Load the red, green, blue and yellow channel files of one image and stack them.

    Each channel is read in grayscale mode, divided by 255 and placed on the
    last axis in the order red, green, blue, yellow.

    Args:
        img_id (str): Image identifier; files read are `<img_id>_<channel>.png`.
        img_dir (str): Directory containing the channel files.

    Returns:
        numpy array with the four scaled channels stacked along the last axis.
    """
    scaled_channels = []
    for channel_name in ["red", "green", "blue", "yellow"]:
        channel_path = os.path.join(img_dir, img_id + f"_{channel_name}.png")
        raw_channel = cv2.imread(channel_path, 0)
        scaled_channels.append(np.asarray(raw_channel / 255.))
    return np.stack(scaled_channels, axis=-1)

    
def convert_rgby_to_rgb(arr):
    """ Blend a 4 channel (RGBY) image down to a 3 channel RGB image.

    Background (advice from competition host/user lnhtrang): for annotation and
    modelling, individual channels with full-range pixel values are preferable;
    blending is suggested for visualization only, for example
        - red = red + yellow
        - green = green + yellow/2
        - blue = blue

    What this function actually computes:
        - red   = red            (copied unchanged; yellow is NOT added here)
        - green = green + yellow/2
        - blue  = blue

    The output has the dtype of `arr`, so for integer inputs the blended green
    value is cast back to that integer dtype on assignment.

    Args:
        arr (numpy array): The RGBY, 4 channel numpy array for a given image
            (channels on the last axis)

    Returns:
        RGB Image (same leading shape as `arr`, one fewer channel)
    """
    red, green = arr[..., 0], arr[..., 1]
    blue, yellow = arr[..., 2], arr[..., 3]

    # Same shape/dtype as the input minus its last channel
    merged = np.zeros_like(arr[..., :-1])
    merged[..., 0] = red
    merged[..., 1] = green + yellow / 2
    merged[..., 2] = blue
    return merged
    
    
def plot_ex(arr, figsize=(20,6), title=None, plot_merged=True, rgb_only=False):
    """ Plot each channel of an image side by side, optionally followed by a merged view.

    Args:
        arr (numpy array): Image with channels on the last axis. With
            `rgb_only=False` the last channel is treated as yellow and is
            dropped when building the per-channel RGB panels; with
            `rgb_only=True` the array is used as-is.
        figsize (tuple): Figure size passed to `plt.figure`.
        title (str, optional): Figure-level title; ignored unless it is a string.
        plot_merged (bool): Append a final panel showing the merged image.
        rgb_only (bool): Skip the yellow panel and treat `arr` as plain RGB.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    channel_titles = ["Red Channel – Microtubles", "Green Channel – Protein of Interest", "Blue - Nucleus"]
    if not rgb_only:
        channel_titles.append("Yellow – Endoplasmic Reticulum")

    # One subplot per channel actually drawn, plus one for the merged view.
    # (The previous hard-coded counts were swapped for the two plot_merged=False
    # cases, which made plt.subplot fail when four channels were drawn.)
    n_images = len(channel_titles) + (1 if plot_merged else 0)

    plt.figure(figsize=figsize)
    if isinstance(title, str):
        plt.suptitle(title, fontsize=20, fontweight="bold")

    for i, c in enumerate(channel_titles):
        # Blank RGB canvas: the full array for RGB input, otherwise all but the last channel
        ch_arr = np.zeros_like(arr) if rgb_only else np.zeros_like(arr[..., :-1])
        if i < 3:
            # Red/green/blue: show the channel in its own colour
            ch_arr[..., i] = arr[..., i]
        else:
            # Yellow: rendered by writing the channel into both red and green
            ch_arr[..., 0] = arr[..., i]
            ch_arr[..., 1] = arr[..., i]
        plt.subplot(1,n_images,i+1)
        plt.title(f"{c.title()}", fontweight="bold")
        plt.imshow(ch_arr)
        plt.axis(False)

    if plot_merged:
        # The merged view always occupies the last subplot slot
        plt.subplot(1,n_images,n_images)

        if rgb_only:
            plt.title(f"Merged RGB", fontweight="bold")
            plt.imshow(arr)
        else:
            plt.title(f"Merged RGBY into RGB", fontweight="bold")
            plt.imshow(convert_rgby_to_rgb(arr))
        plt.axis(False)

    plt.tight_layout(rect=[0, 0.2, 1, 0.97])
    plt.show()
    
    
def flatten_list_of_lists(l_o_l):
    """ Flatten one level of nesting: concatenate the items of each sub-iterable into a new list """
    flat = []
    for sublist in l_o_l:
        flat.extend(sublist)
    return flat


def pad_to_square(a, is_2d=False):
    """ Zero-pad an array `a` evenly until its first two axes have equal length.

    The shorter of the first two axes is padded with constant zeros, split as
    evenly as possible (any odd leftover goes to the bottom/right side). Axes
    beyond the first two are never padded.

    Args:
        a (numpy array): Array with at least two axes (height, width, ...).
        is_2d (bool): Kept for backward compatibility only. The pad layout is
            now derived from `a.ndim`, so both 2-D and N-D inputs work
            regardless of this flag.

    Returns:
        The padded array, or `a` itself (not a copy) when it is already square.
    """
    height, width = a.shape[0], a.shape[1]
    if height == width:
        return a

    n_to_add = abs(width - height)
    pad_before = n_to_add // 2
    pad_after = n_to_add - pad_before

    # Start with "no padding" on every axis, then pad only the shorter spatial axis
    pad_width = [(0, 0)] * a.ndim
    short_axis = 0 if width > height else 1
    pad_width[short_axis] = (pad_before, pad_after)

    return np.pad(a, pad_width, mode='constant')


def get_cell_tiles_from_id(img_id, bboxes, tile_size=(128,128), color="red"):
    """ Cut one square, fixed-size tile per bounding box out of a single colour channel.

    The channel image is loaded from TRAIN_IMG_DIR. For each bbox the region
    rows bbox[1]:bbox[3], columns bbox[0]:bbox[2] is cropped, padded to a square
    and resized to `tile_size` with bicubic interpolation.

    Args:
        img_id (str): Image identifier.
        bboxes (iterable): Bounding boxes, each indexable at positions 0-3
            (presumably x_min, y_min, x_max, y_max; verify against the CSV).
        tile_size (tuple): Output size passed to `cv2.resize`.
        color (str): Which colour-channel file to load.

    Returns:
        list: One resized tile per bounding box, in input order.
    """
    img = load_image_color(img_id, TRAIN_IMG_DIR, color)

    batch_cell_tiles = []
    for bbox in bboxes:
        cell_crop = img[bbox[1]:bbox[3], bbox[0]:bbox[2], ...]
        square_crop = pad_to_square(cell_crop, is_2d=True)
        resized_tile = cv2.resize(square_crop, tile_size, interpolation=cv2.INTER_CUBIC)
        batch_cell_tiles.append(resized_tile)
    return batch_cell_tiles


def img_id_to_save_files(output_dir, img_id, bboxes, lbl, tile_size=(128,128), color="red"):
    """ Write every cell tile of one image into the sub-directory for its label.

    Tiles are saved as `<output_dir>/<label_name>/<img_id>_<NN>.png`, where
    `<label_name>` is the snake_case class name for `int(lbl)` and `NN` is the
    1-based, zero-padded position of the tile's bounding box.

    Args:
        output_dir (str): Root tile directory for the chosen colour.
        img_id (str): Image identifier.
        bboxes (iterable): Bounding boxes of the cells to cut out.
        lbl: Class label, convertible with `int()`.
        tile_size (tuple): Size each tile is resized to.
        color (str): Which colour-channel file to tile.
    """
    label_dir_name = INT_2_STR_LOWER[int(lbl)]
    out_dir_path = os.path.join(output_dir, label_dir_name)

    cell_tiles = get_cell_tiles_from_id(img_id, bboxes, tile_size, color)
    for tile_no, tile in enumerate(cell_tiles, start=1):
        tile_path = os.path.join(out_dir_path, f"{img_id}_{tile_no:02}.png")
        cv2.imwrite(tile_path, tile)

In [None]:
# Pull the three columns needed for tiling out of the dataframe as one array,
# then split it column-wise: image ids, labels, and per-image bounding boxes
train_arr = train_df[["ID", "Label", "mask_bboxes"]].values
train_ids, train_labels, train_bboxes = (train_arr[:, col] for col in range(3))

# For every colour channel (outer bar), walk every training image (inner bar)
# and write that channel's cell tiles into the channel's output directory
for clr, out_dir in tqdm(zip(COLORS, TILE_OUTPUT_DIRS), total=len(COLORS)):
    for _id, _bboxes, _lbl in tqdm(
        zip(train_ids, train_bboxes, train_labels), total=len(train_ids)
    ):
        img_id_to_save_files(out_dir, _id, _bboxes, _lbl, color=clr)

In [None]:
# Report the disk usage of the current directory before archiving
!du -sh ./
# Recursively bundle the four per-channel tile directories into tile_dataset.zip
# (written to the current directory)
!zip -r tile_dataset.zip /kaggle/red_tiles /kaggle/green_tiles /kaggle/blue_tiles /kaggle/yellow_tiles
# Remove the unpacked tile directories now that they have been archived
!rm -rf /kaggle/*tiles
# Report the disk usage of the current directory again after archiving and clean-up
!du -sh ./