In [None]:
import os
import sys
import cv2
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

%matplotlib inline

In [None]:
# Directory holding the competition data, relative to the notebook's working directory.
root_dir = '../input/hpa-single-cell-image-classification'

# Training metadata table; later cells read its 'ID' and 'Label' columns.
train_df = pd.read_csv(f'{root_dir}/train.csv')
train_df.head()

In [None]:
# Dimensions of the training table as a (rows, columns) tuple.
train_df.shape

In [None]:
# Class index -> human-readable label name; the position in the list is the
# class index used in the 'Label' column of train.csv.
label_dict = dict(enumerate([
    'Nucleoplasm',                               # 0
    'Nuclear membrane',                          # 1
    'Nucleoli',                                  # 2
    'Nucleoli fibrillar center',                 # 3
    'Nuclear speckles',                          # 4
    'Nuclear bodies',                            # 5
    'Endoplasmic reticulum',                     # 6
    'Golgi apparatus',                           # 7
    'Intermediate filaments',                    # 8
    'Actin filaments',                           # 9
    'Microtubules',                              # 10
    'Mitotic spindle',                           # 11
    'Centrosome',                                # 12
    'Plasma membrane',                           # 13
    'Mitochondria',                              # 14
    'Aggresome',                                 # 15
    'Cytosol',                                   # 16
    'Vesicles and punctate cytosolic patterns',  # 17
    'Negative',                                  # 18
]))

# Inverse lookup: label name -> class index.
reverse_labels = dict(zip(label_dict.values(), label_dict.keys()))

In [None]:
# One-hot encode the multi-label 'Label' column ('|'-separated class indices).
label_dummies = train_df['Label'].str.get_dummies(sep='|')

# str.get_dummies sorts its columns as strings ('0', '1', '10', '11', ..., '2'),
# which scrambles the class order in every downstream plot. Reindex into numeric
# class order instead; classes that never occur get an all-zero column, and any
# column that is not a known class index is kept at the end rather than dropped.
known_ids = [str(idx) for idx in label_dict]
other_cols = [col for col in label_dummies.columns if col not in known_ids]
label_dummies = (
    label_dummies
    .reindex(columns=known_ids + other_cols, fill_value=0)
    .rename(columns={str(idx): name for idx, name in label_dict.items()})
)

# Keep the image ID next to its indicator columns.
ohe_labels = pd.concat([train_df['ID'], label_dummies], axis=1)
ohe_labels.head()

In [None]:
# Column-wise sum of the indicator columns = number of images carrying each label.
label_totals = ohe_labels.drop(columns='ID').sum()

fig = px.bar(label_totals)
fig.show()

In [None]:
# Pick one training image to inspect. A fixed random_state keeps the choice
# (and every figure derived from it) identical across Restart & Run All;
# change SAMPLE_SEED to look at a different image.
SAMPLE_SEED = 42
sample_row = train_df.sample(1, random_state=SAMPLE_SEED).iloc[0]

# Select by column name instead of unpacking the row positionally, so the cell
# does not depend on train_df having exactly two columns in a fixed order.
sample_img, sample_label = sample_row['ID'], sample_row['Label']

In [None]:
def load_channel(image_id, color):
    """Read one colour channel of a training image as a grayscale array.

    Parameters
    ----------
    image_id : str
        Image identifier (file name prefix).
    color : str
        Channel suffix: 'red', 'green', 'blue' or 'yellow'.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the file. ``cv2.imread`` returns None instead of
        raising, which would otherwise only surface later as an obscure error.
    """
    path = f'{root_dir}/train/{image_id}_{color}.png'
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise FileNotFoundError(f'Could not read image file: {path}')
    return img


sample_img_r = load_channel(sample_img, 'red')
sample_img_g = load_channel(sample_img, 'green')
sample_img_b = load_channel(sample_img, 'blue')
sample_img_y = load_channel(sample_img, 'yellow')

fig, ax = plt.subplots(2, 2, figsize=(10,10))

# (axes, image, title) for each panel of the 2x2 grid.
panels = [
    (ax[0][0], sample_img_r, 'red'),
    (ax[1][0], sample_img_g, 'green'),
    (ax[0][1], sample_img_b, 'blue'),
    (ax[1][1], sample_img_y, 'yellow'),
]
for panel_ax, panel_img, panel_title in panels:
    panel_ax.imshow(panel_img, cmap='gray')
    panel_ax.set_title(panel_title)

# Figure title: the names of all labels attached to the sampled image.
fig.suptitle(', '.join([label_dict[int(x)] for x in sample_label.split('|')]));

In [None]:
# for fun's sake :)
# Build false-colour composites by assigning three of the four grayscale
# channels to the R, G and B planes of an image. Each panel is titled with its
# mapping so the figure can be read on its own.
fig, ax = plt.subplots(2, 2, figsize=(10,10))

sample_planes = {
    'red': sample_img_r,
    'green': sample_img_g,
    'blue': sample_img_b,
    'yellow': sample_img_y,
}

# composite_specs[row][col] = channel names used as the (R, G, B) planes.
composite_specs = [
    [('red', 'green', 'blue'), ('red', 'yellow', 'blue')],
    [('red', 'green', 'yellow'), ('yellow', 'blue', 'red')],
]

for row, row_specs in enumerate(composite_specs):
    for col, plane_names in enumerate(row_specs):
        composite = np.stack([sample_planes[name] for name in plane_names], axis=-1)
        ax[row][col].imshow(composite)
        ax[row][col].set_title('R={}, G={}, B={}'.format(*plane_names))

From the previous competition, <a href='https://www.kaggle.com/c/human-protein-atlas-image-classification'>Human Protein Atlas Image Classification</a>:
> All image samples are represented by four filters (stored as individual files), the protein of interest (green) plus three cellular landmarks: nucleus (blue), microtubules (red), endoplasmic reticulum (yellow). The green filter should hence be used to predict the label, and the other filters are used as references.

Hypothesis #1: The green channel alone carries significant predictive power.<br>
Hypothesis #2: There is some correlation between the yellow–green and red–blue channel pairs.

In [None]:
def correlate(arr1, arr2):
    """Return the Pearson correlation coefficient of two arrays.

    Computed as the mean of the element-wise product of the mean-centred
    arrays, divided by the product of their standard deviations (NumPy's
    default ``std``, i.e. ddof=0).

    Returns the integer 0 when either array has zero standard deviation,
    where the coefficient would otherwise be undefined.
    """
    scale = arr1.std() * arr2.std()
    if scale == 0:
        # At least one array is constant: avoid dividing by zero.
        return 0

    centred_1 = arr1 - arr1.mean()
    centred_2 = arr2 - arr2.mean()
    covariance = np.mean(centred_1 * centred_2)
    return covariance / scale

In [None]:
# Colour name -> grayscale plane of the sampled image.
channels = dict(
    red=sample_img_r,
    green=sample_img_g,
    blue=sample_img_b,
    yellow=sample_img_y,
)

# Correlation for every unordered pair of channels.
for first, second in itertools.combinations(channels, 2):
    pair_corr = correlate(channels[first], channels[second])
    print(f'Correlation {first} - {second}: {pair_corr}')

In [None]:
# Basic intensity statistics of each channel of the sampled image.
for name, plane in channels.items():
    print(f'{name}: \n min: {plane.min()}, max: {plane.max()}, '
          f'    mean: {plane.mean()}, median: {np.median(plane)}, std: {plane.std()}')

Common sense tells us we should find the same number of nuclei as cells. The competition's data description confirms that we should pay particular attention to the blue channel if we want to locate a cell precisely. To see how we might approach this task, let's try to find the contours of the nuclei and compare them with the contours found in the other channels.

In [None]:
def plot_channel(channel, thresh=10, blur_kernel=5, draw_box=False, use_otsu=True):
    """Show one channel of the sample image next to a mask of its external contours.

    The image is looked up in the module-level ``channels`` dict, optionally
    median-blurred, binarised, and its external contours are drawn in white on
    a black mask of the same shape.

    Parameters
    ----------
    channel : str
        Key into ``channels``.
    thresh : int, default 10
        Fixed binarisation threshold. Only used when ``use_otsu`` is False:
        with the Otsu flag OpenCV computes the threshold itself and ignores
        the value passed in.
    blur_kernel : int, default 5
        Aperture size passed to ``cv2.medianBlur``; values <= 0 skip blurring.
    draw_box : bool, default False
        If truthy, also draw the bounding rectangle of every contour.
    use_otsu : bool, default True
        Use Otsu's automatic threshold (the behaviour this function always
        had). Set to False to binarise at ``thresh`` instead.

    Returns
    -------
    None
        The function only creates a matplotlib figure.
    """
    source = channels[channel]
    img = cv2.medianBlur(source, blur_kernel) if blur_kernel > 0 else source

    thresh_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU if use_otsu else cv2.THRESH_BINARY
    _, thresh_img = cv2.threshold(img, thresh, 255, thresh_mode)
    contours, _ = cv2.findContours(thresh_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # 8-bit mask: only the values 0 and 255 are ever drawn on it.
    channel_mask = np.zeros(source.shape, dtype=np.uint8)
    if len(contours) > 0:
        channel_mask = cv2.drawContours(channel_mask, contours, -1, 255, 2)
    if draw_box:
        for cont in contours:
            # x/y rather than px/py: `px` is the plotly.express alias in this notebook.
            x, y, w, h = cv2.boundingRect(cont)
            channel_mask = cv2.rectangle(channel_mask, (x, y), (x + w, y + h), 255, 2)

    fig, ax = plt.subplots(1, 2, figsize=(8,11))

    ax[0].imshow(source, cmap='gray')
    ax[0].set_title(f'{channel} image')
    ax[1].imshow(channel_mask, cmap='gray')
    ax[1].set_title(f'{channel} mask ({len(contours)} contours)')
    
# One image/mask figure per channel, with bounding boxes drawn on the mask.
for channel_name in ('red', 'green', 'blue', 'yellow'):
    plot_channel(channel_name, draw_box=True)

Easier said than done, huh? For the blue channel, luckily, false positives are not much of an issue, but merged contours are.

Let's turn our attention to some statistics on label co-occurrence; maybe we will discover something interesting along the way.

In [None]:
# Number of labels assigned to each image, then how many images share each
# label count. The 'ID' column is dropped first: it holds strings, so a
# row-wise sum over the whole frame is not a numeric reduction (pandas >= 2.0
# no longer silently skips non-numeric columns and raises TypeError instead).
labels_per_image = ohe_labels.drop(columns='ID').sum(axis=1)
px.bar(labels_per_image.value_counts())

In [None]:
data = []  # not used by this cell; kept in case a later cell relies on it

# Label columns only (everything except the image ID).
cols = ohe_labels.drop(columns='ID').columns
label_totals = [ohe_labels[col].sum() for col in cols]

frequency_bar = go.Bar(
    name='Frequency of labels',
    x=cols,
    y=label_totals,
    marker_color='limegreen',
)
fig = go.Figure(data=[frequency_bar])

fig.show()