In [None]:
!pip install https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import tensorflow as tf
from matplotlib import pyplot as plt
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei

# **FUNCTIONS**

In [None]:
def image_to_arrays(path):
    """Load a collection of image files as NumPy arrays.

    Args:
        path: Iterable of image files (despite the singular name); every
            entry is opened with ``PIL.Image.open``.

    Returns:
        list: One array per entry of ``path``, in the same order.
    """
    return [np.asarray(Image.open(image_file)) for image_file in path]

> Reference: [Human Protein Atlas - Segmentation](https://www.kaggle.com/christopherworley/human-protein-atlas-segmentation#Functions)

In [None]:
def get_blended_image(images):
    """Combine all but the last channel array into a single PIL image.

    Args:
        images: Sequence of same-shaped arrays, e.g. the output of
            ``image_to_arrays``. The final entry is left out of the blend.

    Returns:
        PIL image created from the remaining arrays stacked along axis 2
        and cast to ``uint8``.
    """
    kept_channels = images[:-1]
    stacked = np.stack(kept_channels, axis=2)
    as_bytes = np.uint8(stacked)
    return Image.fromarray(as_bytes)

In [None]:
def image_prep(paths, label, num_classes=19):
    """Load one sample's channel images and build its (image, label) tensors.

    Args:
        paths: List of image files for a single sample, one file per channel;
            they are loaded with ``image_to_arrays``.
        label: String of class indices separated by ``'|'``, e.g. ``"0|16"``.
        num_classes: Length of the multi-hot label vector. Defaults to 19.

    Returns:
        tuple: ``(img, label)``.
        ``img`` is a float32 tensor shaped
        ``(1, IMG_SIZE[0], IMG_SIZE[1], len(paths))``.
        ``label`` is a float32 multi-hot tensor shaped ``(1, num_classes)``.
        Both keep a leading axis of length 1.
    """
    channels = image_to_arrays(paths)

    # Put the channels on the LAST axis by stacking. Reshaping the
    # (channels, H, W) data directly to (1, H, W, channels) keeps the memory
    # order and therefore mixes pixels of different rows/channels together.
    img = np.stack(channels, axis=-1)

    # Integer inputs are rescaled to [0, 1] by convert_image_dtype.
    # NOTE(review): Keras EfficientNet documents a [0, 255] input range -
    # confirm that the [0, 1] scaling is what the downstream model should see.
    img = tf.image.convert_image_dtype(img, dtype=tf.float32)
    img = tf.expand_dims(img, axis=0)
    img = tf.image.resize(img, IMG_SIZE)

    # "a|b|c" -> [a, b, c] -> sum of one-hot rows -> multi-hot vector.
    label = tf.strings.split(label, sep='|')
    label = tf.strings.to_number(label, out_type=tf.int32)
    label = tf.reduce_sum(tf.one_hot(indices=label, depth=num_classes), axis=0)
    label = tf.reshape(label, (1, num_classes))

    return img, label

In [None]:
def apply_augmentation(image, label):
    """Augment ``image`` through ``aug_fn``; ``label`` is returned unchanged.

    ``aug_fn`` is wrapped with ``tf.numpy_function`` and must return float32
    data. NOTE(review): ``aug_fn`` is not defined in the cells visible here -
    confirm it exists before using this function.

    Args:
        image: Image tensor handed to ``aug_fn``.
        label: Label passed through untouched.

    Returns:
        tuple: ``(augmented_image, label)``; the image's static shape is set
        to ``(IMG_SIZE[0], IMG_SIZE[0], 3)``.
    """
    side = IMG_SIZE[0]
    augmented = tf.numpy_function(func=aug_fn, inp=[image], Tout=tf.float32)
    # Declare the static shape explicitly on the numpy_function output.
    augmented.set_shape((side, side, 3))
    return augmented, label

In [None]:
def plot_hist(hist, metric="accuracy"):
    """Plot a training metric, and its validation counterpart if recorded.

    Args:
        hist: Object exposing a ``history`` mapping of metric name to a
            per-epoch list of values (e.g. the return value of ``model.fit``).
        metric: Key of the training metric to plot. The validation curve is
            read from ``"val_" + metric``. Defaults to ``"accuracy"``.

    Raises:
        KeyError: If ``metric`` was not recorded in ``hist.history``.
    """
    history = hist.history
    if metric not in history:
        raise KeyError(
            f"metric {metric!r} not found in history; available: {sorted(history)}"
        )

    plt.plot(history[metric])
    legend = ["train"]

    # Validation values only exist when the model was fit with validation data.
    val_key = f"val_{metric}"
    if val_key in history:
        plt.plot(history[val_key])
        legend.append("validation")

    plt.title(f"model {metric}")
    plt.ylabel(metric)
    plt.xlabel("epoch")
    plt.legend(legend, loc="upper left")
    plt.show()

In [None]:
# Root folder of the competition data.
DATA_DIR = "/kaggle/input/hpa-single-cell-image-classification"

# Training table: the first column is used to build image file names and the
# 'Label' column holds '|'-separated class indices.
TRAIN_CSV = os.path.join(DATA_DIR, 'train.csv')
train = pd.read_csv(TRAIN_CSV)

In [None]:
# File-name suffix of every channel image belonging to a sample.
colours = ['_red.png', '_blue.png', '_yellow.png', '_green.png']
TRAIN = '../input/hpa-single-cell-image-classification/train'

# paths[k] holds the channel files of the k-th row of `train`, in the order
# given by `colours`. The file stem comes from the first column of `train`.
paths = [
    [os.path.join(TRAIN, sample_id) + suffix for suffix in colours]
    for sample_id in train.iloc[:, 0]
]

# *Data Analysis...*

In [None]:
# Label distribution: how often each class index appears in the training table.
# Every entry of train['Label'] is a '|'-separated list of integer class indices,
# so split each entry and collect all indices in one flat list.
labels = [int(num) for label in train['Label'] for num in label.split('|')]
counts = pd.Series(labels).value_counts()

# Simple bar chart: one bar per class index.
plt.bar(x=counts.index, height=counts)
plt.xticks(counts.index)
plt.xlabel("class index")
plt.ylabel("occurrences")
plt.title("Label frequency")
plt.show()

In [None]:
# Grid of raw channel images: one row per sample (first three samples),
# one column per channel file; column k is titled titles[k].
titles = ['microtubules', 'nuclei', 'endoplasmic reticulum', 'protein of interest']
n_rows, n_cols = 3, 4
fig, axs = plt.subplots(n_rows, n_cols, figsize =(16,8))

# Column headers go on the top row only.
for col in range(n_cols):
    axs[0, col].set_title(titles[col])

for row in range(n_rows):
    for col in range(n_cols):
        axs[row, col].imshow(plt.imread(paths[row][col]))

# **Segmentation using [HPA-Cell-Segmentation](https://github.com/CellProfiling/HPA-Cell-Segmentation)**

In [None]:
# Weight files used by the nuclei and cell segmentation networks.
NUC_MODEL = "./nuclei-model.pth"
CELL_MODEL = "./cell-model.pth"
segmentator = cellsegmentator.CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    scale_factor=0.25,
    device="cuda",
    padding=False,
    multi_channel_model=True,
)

# Sample to segment. Its channel files follow the order of `colours`
# (red, blue, yellow, green), so index 1 is the blue/nuclei channel.
image = paths[4]
arrays = image_to_arrays(image)
nuclei = arrays[1]
cell = arrays[:-1]  # not used by the visible cells; kept for later experiments

# Nuclei segmentation
nuc_segmentations = segmentator.pred_nuclei([nuclei])

f, ax = plt.subplots(1, 2, figsize=(16,16))
ax[0].imshow(arrays[1])
ax[0].set_title('Original Nucleis', size=20)
ax[1].imshow(nuc_segmentations[0])
ax[1].set_title('Segmented Nucleis', size=20)
plt.show()

# Cell segmentation
# pred_cells expects [[microtubule images], [ER images], [nuclei images]],
# i.e. red, yellow, blue - NOT the red, blue, yellow order stored in `image`.
red_path, blue_path, yellow_path = image[:3]
inter_step = [[red_path], [yellow_path], [blue_path]]
cell_segmentations = segmentator.pred_cells(inter_step)

f, ax = plt.subplots(1, 2, figsize=(16,16))
ax[0].imshow(get_blended_image(arrays))
ax[0].set_title('Original Cells', size=20)
ax[1].imshow(cell_segmentations[0])
ax[1].set_title('Segmented Cells', size=20)
plt.show()

# **Visualizing the masks**

In [None]:
# Post-process the raw predictions of the first (only) image into masks.
nuclei_mask = label_nuclei(nuc_segmentations[0])
cell_nuclei_mask, cell_mask = label_cell(nuc_segmentations[0], cell_segmentations[0])

# Show the three masks side by side, each under its own title.
panels = [
    ('Nuclei Mask', nuclei_mask),
    ('Cell Nuclei Mask', cell_nuclei_mask),
    ('Cell Mask', cell_mask),
]
f, ax = plt.subplots(1, len(panels), figsize=(16,16))
for axis, (panel_title, mask) in zip(ax, panels):
    axis.imshow(mask)
    axis.set_title(panel_title, size=20)
plt.show()

Let's check the results of the segmentation

In [None]:
# Draw the cell mask half-transparently on top of the blended image to judge
# how well the segmentation follows the cells.
overlay_fig, overlay_ax = plt.subplots(figsize=(20,20))
overlay_ax.imshow(get_blended_image(arrays))
overlay_ax.imshow(cell_mask, alpha=0.5)
overlay_ax.set_title('Segmentation results', size=40)
overlay_ax.axis('off')
plt.show()

# **Cell separation**

The objective of this project is to label each cell in the image. Therefore each cell in the image must be separated.

In [None]:
# Distinct non-zero values of cell_mask, in ascending order. The value 0 is
# excluded from the per-segment panels (no error if it is absent).
numbers = [value for value in np.unique(cell_mask) if value != 0]

# Grid of 4 columns: the first panel shows the whole mask, then one panel per
# segment. len(numbers)//4 + 1 rows always fit len(numbers) + 1 panels.
n_cols = 4
n_rows = len(numbers) // n_cols + 1

# Six inches of height per row, so the figure never collapses for few cells.
fig = plt.figure(figsize=(25, 6 * n_rows))

ax = fig.add_subplot(n_rows, n_cols, 1)
ax.set_title("Complete Cell Mask", size=20)
ax.imshow(cell_mask)

for index, number in enumerate(numbers, start=2):
    # Keep only the pixels of this segment; everything else becomes 0.
    isolated_cell = np.where(cell_mask == number, cell_mask, 0)
    ax = fig.add_subplot(n_rows, n_cols, index)
    # f-string so the title shows the actual segment value.
    ax.set_title(f"Segment {number}", size=20)
    ax.imshow(isolated_cell)

Now that the segmentation is complete, we should be able to train an image classification model to identify each cell within the image.

The main problem is that the labels are given per image, so we don't really know which of the cells in the image actually show a given label.
Maybe the CNN can still learn the pattern if every cell in an image is assigned that image's labels, although this can lead to a high rate of mislabeling.

# **TRAINING MODEL SETUP**

Reference: [HPA: Multi-Label Classification with TF and W&B](https://www.kaggle.com/ayuraj/hpa-multi-label-classification-with-tf-and-w-b)

Imports.

In [None]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras import Model
import wandb

Parameter setting

In [None]:
# Class index -> class name. The position in the tuple is the class index
# (0 to 18), which is also the index used in the 'Label' column.
LABELS = dict(enumerate((
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Intermediate filaments",
    "Actin filaments",
    "Microtubules",
    "Mitotic spindle",
    "Centrosome",
    "Plasma membrane",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Vesicles and punctate cytosolic patterns",
    "Negative",
)))

In [None]:
# The classifier is EfficientNetB0, fed with 224x224 images of 3 channels.
# Only three of the four channel images therefore fit; the yellow channel is
# left out for now.
IMG_SIZE = [224, 224]
BATCH_SIZE = 64
AUTOTUNE = tf.data.experimental.AUTOTUNE

# NOTE: this rebinds `colours`, `TRAIN` and `paths`. From here on every entry
# of `paths` lists three files (red, blue, green) instead of four.
colours = ['_red.png', '_blue.png', '_green.png']
TRAIN = '../input/hpa-single-cell-image-classification/train'
paths = [
    [os.path.join(TRAIN, sample_id) + suffix for suffix in colours]
    for sample_id in train.iloc[:, 0]
]

# **Training data**

In [None]:
# Training set: the first 500 samples, each turned into an [image, label]
# pair by image_prep.
training_data = [
    list(image_prep(sample_paths, train['Label'][row]))
    for row, sample_paths in enumerate(paths[:500])
]

# image_prep returns tensors with a leading axis of length 1, so every dataset
# element already has the form of a batch holding one sample.
train_images = [pair[0] for pair in training_data]
train_targets = [pair[1] for pair in training_data]
train_ds = tf.data.Dataset.from_tensor_slices((train_images, train_targets))
len(train_ds)

# **Validation data**

In [None]:
# Validation set: `val_num` samples starting right after the training slice.
start_img = 500
val_num = 100

# Enumerating from start_img keeps `row` equal to the sample's position in
# `train`, so the label always matches the image paths.
val_data = [
    list(image_prep(sample_paths, train['Label'][row]))
    for row, sample_paths in enumerate(paths[start_img:start_img+val_num], start=start_img)
]

val_images = [pair[0] for pair in val_data]
val_targets = [pair[1] for pair in val_data]
val_ds = tf.data.Dataset.from_tensor_slices((val_images, val_targets))

# **CNN Model**

In [None]:
# Reset Keras' global state BEFORE any layer is created, rather than halfway
# through building the model.
tf.keras.backend.clear_session()

# ImageNet-pretrained backbone without its classification head; every layer
# stays trainable (full fine-tuning).
base_model = EfficientNetB0(include_top=False, weights='imagenet')
base_model.trainable = True

inputs = layers.Input((IMG_SIZE[0], IMG_SIZE[1], 3))

# Do not pin `training=True` here: Keras then supplies the flag itself, so
# BatchNormalization and dropout run in training mode during fit() but in
# inference mode during validation, evaluate() and predict().
x = base_model(inputs)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dropout(0.5)(x)
# One sigmoid unit per class: multi-label output, classes are not exclusive.
outputs = layers.Dense(len(LABELS), activation='sigmoid')(x)

model = Model(inputs, outputs)
model.summary()

In [None]:
tf.keras.backend.clear_session()

# Stop when the validation loss has not improved for 10 epochs and roll the
# model back to its best weights.
earlystopper = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
    verbose=0,
)

# Multi-label setup: binary cross-entropy per class, tracked with a
# per-label AUC.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(multi_label=True)],
)

hist = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=50,
    callbacks=[earlystopper],
    verbose=1,
)
# plot_hist(hist) is left disabled: it reads the "accuracy" history keys,
# which are not recorded when the model is compiled with AUC only.