# Exploratory data analysis - Human Protein Atlas competition 
Exploratory data analysis (EDA) for the Human Protein Atlas (HPA) competition.

This notebook is based on:

1. [Human Protein Atlas Cell Segmentation + [EDA] by Kuldeep Singh Chouhan](https://www.kaggle.com/kool777/human-protein-atlas-cell-segmentation-eda)

Additional Kaggle datasets for using HPA CellSegmentator without downloading files:
1. [Nuclei and cell weights for HPA CellSegmentator by RDizzl3](https://www.kaggle.com/rdizzl3/hpacellsegmentatormodelweights).
2. [Nuclei and cell for HPA CellSegmentator by daishuby](https://www.kaggle.com/daishu/hpacellsegmodel).  

In [None]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings 
import os,gc,cv2
import shutil
from tqdm.notebook import tqdm

%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# HPA single-cell image segmentation 
!pip install https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei

## Functions

In [None]:
# functions


# read the four channel images of one sample and stack them
def read_sample_image(filename, path):
    '''
    read individual images
    of different filters (R, G, B, Y)
    and stack them.
    ---------------------------------
    Arguments:
    filename -- sample image ID (file name without the "_<colour>.png" suffix)
    path -- dataset root directory; images are read from its "train" subfolder
    
    Returns:
    stacked_images -- stacked (RGBY) image, channels on the last axis
    
    Raises:
    FileNotFoundError -- if a channel file is missing or cannot be decoded
    '''
    channels = []
    for colour in ('red', 'green', 'blue', 'yellow'):
        # os.path.join works whether or not `path` ends with a separator
        image_path = os.path.join(path, 'train', filename + '_' + colour + '.png')
        if not os.path.isfile(image_path):
            raise FileNotFoundError('channel image not found: ' + image_path)
        # cv2.imread reports failure by returning None instead of raising
        channel = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
        if channel is None:
            raise FileNotFoundError('channel image could not be read: ' + image_path)
        channels.append(channel)
    # same layout as transposing the (4, rows, cols) array to (rows, cols, 4)
    stacked_images = np.stack(channels, axis=-1)
    return stacked_images


def plot_all(im, label):
    '''
    show the stacked image and
    each of its four channels
    side by side in one row.
    --------------------------
    Arguments:
    im -- stacked image, indexed as im[:, :, channel]
    label -- accepted from the caller but not used by the plot
    '''
    # (index into im, colormap, panel title) for the five panels, left to right
    panels = (
        (np.s_[:, :, :3], None, 'RGBY Image'),
        (np.s_[:, :, 0], 'Reds', 'Microtubule channels'),
        (np.s_[:, :, 1], 'Greens', 'Protein of Interest'),
        (np.s_[:, :, 2], 'Blues', 'Nucleus'),
        (np.s_[:, :, 3], 'Oranges', 'Endoplasmic Reticulum'),
    )
    plt.figure(figsize=(15, 15))
    for position, (selector, colormap, heading) in enumerate(panels, start=1):
        plt.subplot(1, 5, position)
        plt.imshow(im[selector], cmap=colormap)
        plt.title(heading)
        plt.axis('off')
    plt.show()

    
# build the channel file paths used for segmentation
def read_sample_image_seg(filename, path):
    '''
    build the file paths of the
    red, yellow and blue filter images
    of one sample. No file is opened here.
    ---------------------------------
    Arguments:
    filename -- sample image ID (file name without the "_<colour>.png" suffix)
    path -- dataset root; "train/" is appended directly to it
    
    Returns:
    stacked_images -- [[red], [yellow], [blue]] lists of image paths
    red, blue, yellow -- the three individual image paths, in that order
    '''
    prefix = path + 'train/' + filename
    red, blue, yellow = (prefix + '_' + colour + '.png'
                         for colour in ('red', 'blue', 'yellow'))
    return [[red], [yellow], [blue]], red, blue, yellow


# segment cell 
def segmentCell(image, segmentator):
    '''
    segment cells from the
    microtubules, endoplasmic reticulum,
    and nuclei filters.
    ------------------------------------
    Argument:
    image -- three lists of images; image[2] is also passed on its own
             to the nuclei model (the callers in this notebook pass
             [[red], [yellow], [blue]] lists of file paths)
    segmentator -- CellSegmentator class object
    
    Returns:
    cell_mask -- segmented cell mask of the first (index 0) image
    '''
    nuc_segmentations = segmentator.pred_nuclei(image[2])
    cell_segmentations = segmentator.pred_cells(image)
    # label_cell returns (nuclei mask, cell mask); only the cell mask is needed
    _, cell_mask = label_cell(nuc_segmentations[0], cell_segmentations[0])
    # drop the raw predictions first, then collect, so the collector can
    # actually reclaim them before the next image is processed
    del nuc_segmentations, cell_segmentations
    gc.collect()
    return cell_mask


# plot segmented cells mask, image
def plot_cell_segments(mask, red, blue, yellow):
    '''
    plot the composite image, the
    segmented cells mask, and the
    mask overlaid on the image
    ---------------------
    Arguments:
    mask -- cell mask
    red -- red filter image path
    blue -- blue filter image path
    yellow -- yellow filter image path
    '''
    # read from the arguments, not from notebook-level globals
    microtubule = plt.imread(red)
    nuclei = plt.imread(blue)
    endoplasmicrec = plt.imread(yellow)
    # stacking order (red, blue, yellow) is kept so the composite looks as before
    img = np.dstack((microtubule, nuclei, endoplasmicrec))

    plt.figure(figsize=(15, 15))
    plt.subplot(1, 3, 1)
    plt.imshow(img)
    plt.title('Image')
    plt.axis('off')
    plt.subplot(1, 3, 2)
    plt.imshow(mask)
    plt.title('Mask')
    plt.axis('off')
    plt.subplot(1, 3, 3)
    # draw the mask semi-transparently on top of the image
    plt.imshow(img)
    plt.imshow(mask, alpha=0.6)
    plt.title('Image + Mask')
    plt.axis('off')
    plt.show()

    
# plot single segmented cells mask, image
def plot_single_cell(mask, red, blue, yellow):
    '''
    plot the image and mask cropped
    to the bounding box of the largest
    connected region of the mask
    (touching cells count as one region)
    ---------------------
    Arguments:
    mask -- cell mask
    red -- red filter image path
    blue -- blue filter image path
    yellow -- yellow filter image path

    Raises:
    ValueError -- if the mask contains no region at all
    '''
    # read from the arguments, not from notebook-level globals
    microtubule = plt.imread(red)
    nuclei = plt.imread(blue)
    endoplasmicrec = plt.imread(yellow)
    # stacking order (red, blue, yellow) is kept so the composite looks as before
    img = np.dstack((microtubule, nuclei, endoplasmicrec))

    # findContours treats every non-zero pixel as foreground; binarise
    # explicitly so label values that are multiples of 256 do not wrap to 0
    foreground = (np.asarray(mask) != 0).astype('uint8')
    # [-2] is the contour list under both the 2-tuple and 3-tuple return conventions
    contours = cv2.findContours(foreground,
                                cv2.RETR_TREE,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        raise ValueError('mask contains no segmented region to plot')
    largest = max(contours, key=cv2.contourArea)
    x0, y0, width, height = cv2.boundingRect(largest)
    window = (slice(y0, y0 + height), slice(x0, x0 + width))

    plt.figure(figsize=(15, 15))
    plt.subplot(1, 3, 1)
    plt.imshow(img[window])
    plt.title('Cell Image')
    plt.axis('off')
    plt.subplot(1, 3, 2)
    plt.imshow(mask[window])
    plt.title('Cell Mask')
    plt.axis('off')
    plt.subplot(1, 3, 3)
    # draw the mask semi-transparently on top of the image
    plt.imshow(img[window])
    plt.imshow(mask[window], alpha=0.6)
    plt.title('Cell Image + Mask')
    plt.axis('off')
    plt.show()


## 1. EDA for labels and images


### 1.1  EDA for labels
Count label distribution across samples and number of labels per sample.

In [None]:
# data directory; keep the trailing slash, file paths are built from it
# by plain string concatenation
DIR = '../input/hpa-single-cell-image-classification/'
# list the entries of the data directory
os.listdir(DIR)

In [None]:
# csv files: load train.csv and sample_submission.csv from the data directory
train_df = pd.read_csv( DIR + 'train.csv' )
sample_submission = pd.read_csv( DIR + 'sample_submission.csv' )

In [None]:
# preview the first rows of the training table
train_df.head()

In [None]:
# preview the first rows of the sample submission table
sample_submission.head()

In [None]:
# plot class counts

# spliting label column on "|"; skipped when the column already holds
# lists, so the cell can be re-run
if train_df["Label"].map(lambda value: isinstance(value, str)).all():
    train_df["Label"] = train_df["Label"].str.split("|")

# class labels
class_labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']

# organelle name of each class label, in the same order as class_labels
class_names = ['Nucleoplasm', 'Nuclear membrane', 'Nucleoli', 'Nucleoli fibrillar center',
               'Nuclear speckles', 'Nuclear bodies', 'Endoplasmic reticulum', 'Golgi apparatus', 'Intermediate filaments',
               'Actin filaments', 'Microtubules', 'Mitotic spindle', 'Centrosome', 'Plasma membrane', 'Mitochondria',
               'Aggresome', 'Cytosol', 'Vesicles and punctate cytosolic patterns', 'Negative']

# binarizing each label/class: one 0/1 column per class, named after the
# organelle directly instead of renaming every column by position afterwards
for label, name in tqdm(zip(class_labels, class_names), total=len(class_labels)):
    train_df[name] = train_df['Label'].map(lambda result: 1 if label in result else 0)

# class counts; sum only the 0/1 columns so the ID strings and label lists
# are not reduced as well
class_counts = train_df[class_names].sum().sort_values(ascending=False)
print(class_counts, '\n')

# generate plot
plt.figure( figsize=(14,12) )
ax=sns.barplot(y=class_counts.index.values, x=class_counts.values, palette='tab10')
plt.suptitle("Label Distribution")
plt.title("HPA single-cell classification Kaggle competition 2021")
plt.show()

* The most frequent label is Nucleoplasm, and there is clearly a class imbalance.
* There are only 34 negative examples, which makes it the least frequent class.
* It might be useful to try data augmentation methods in order to generate a more balanced set.

In [None]:
# plot labels per sample/image
label_per_image = train_df.drop(['ID', 'Label'], axis=1).sum(axis=1)
n_images = len(label_per_image)
plt.figure(figsize=(16,10))
# pass the values as x=...: the first positional parameter of countplot
# is `data` in current seaborn releases
ax = sns.countplot(x=label_per_image, palette='Pastel1')
# annotate every bar with its share of all images
for p in ax.patches:
    height = p.get_height()
    ax.text( p.get_x()+p.get_width()/2.,
             height + 3,
             '{:1.2f}%'.format(height/n_images*100),
             ha="center", fontsize=12)
# the title is set once, independently of how many bars were drawn
plt.title("Label Per Sample/Image", fontsize=16)

* A large percentage of images (48.19%) contain only one label.
* The majority of images (88.88%) contain one or two labels.
* This implies that only 11.12% of the images contain three or more labels.

### 1.2 EDA for images
In this subsection we look at how each sample image is made up of four channels, one per cellular structure: Red - microtubules, Blue - nuclei, Yellow - endoplasmic reticulum (ER), and Green - protein of interest. The classification task is to predict "protein organelle localization labels for each cell in the image" [see Kaggle notebook: 'Single-cell Patterns'](https://www.kaggle.com/lnhtrang/single-cell-patterns). This is why an image can have multiple labels corresponding to these organelles. Only images with a single label (the largest group in the training set) are considered here. This means that each plot corresponds to one organelle label and shows the stacked image (RGBY) along with each individual channel. For example, the first image has the label 'Nucleoplasm', which means that, according to the annotation, the protein of interest is located only in this organelle.

In [None]:
# select images with single labels
train = train_df.loc[train_df['Label'].apply(len) == 1]

# plot one example image per organelle
for label in train_df.drop(['ID', 'Label'], axis=1):
    print(label)
    candidates = train.loc[train[label] == 1, 'ID']
    if candidates.empty:
        # no single-label image carries this class, so there is nothing to sample
        print('no single-label image available for this label')
        continue
    # .iloc[0] yields the ID itself; parsing to_string() output depends on
    # how pandas pads the index column
    sample_id = candidates.sample(1).iloc[0]
    im = read_sample_image(sample_id, path=DIR)
    plot_all(im, label)

### 1.3 Label-wise image cell segmentation 
Each image should be segmented into cells. The [HPA segmentation tool](https://github.com/CellProfiling/HPA-Cell-Segmentation) can be used to perform this segmentation task. Here we will show how to do it. Each plot shows a single-label RBY image, the segmented mask, and the image overlaid with the mask. There is an issue when trying to download the HPA CellSegmentator model weights, reported [here](https://www.kaggle.com/c/hpa-single-cell-image-classification/discussion/230090) and [here](https://www.kaggle.com/c/hpa-single-cell-image-classification/discussion/231036). These weights are available as a [dataset](https://www.kaggle.com/rdizzl3/hpacellsegmentatormodelweights) (by RDizzl3) or in this [other dataset](https://www.kaggle.com/daishu/hpacellsegmodel) (by daishuby).  

In [None]:
# Define CellSegmentator class
# [source: https://github.com/CellProfiling/HPA-Cell-Segmentation]
#-----------------------------------------------------------

# [1] path to the nuclei model weights (from dataset):
NUC_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_nuclei_v1.pth'

#-----------------------------------------------------------
# [2] path to the cell model weights (from dataset):
CELL_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_cell_3ch_v1.pth'

#-----------------------------------------------------------
# [3] scale_factor: determines how much the images should be 
# scaled before being fed to the models. For HPA Cell images, 
# a value of 0.25 (default) is good.
#-----------------------------------------------------------
# [4] device: tells Torch which device to put the model on. 
# Valid values are 'cpu', 'cuda', or a specific cuda device 
# such as 'cuda:0'. Defaults to cuda; set to "cpu" below.
#-----------------------------------------------------------
# [5] padding: If True, add some padding before feeding the 
# images to the neural networks. This is not required but 
# can make segmentations, especially cell segmentations,
# more accurate. Defaults to False. Note: If you have issues 
# running the segmentation due to image dimensions, setting 
# padding to True may help. Set to True below.
#-----------------------------------------------------------
# [6] multi_channel_model: If True, use the pretrained 
# three-channel version of the model. Having this set to 
# True gives you better cell segmentations but requires 
# you to give the model endoplasmic reticulum images as 
# part of the cell segmentation. Otherwise, the version 
# trained with only two channels, microtubules and nuclei, 
# will be used. Defaults to True.
#-----------------------------------------------------------
segmentator = cellsegmentator.CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    scale_factor=0.25,
    device="cpu",
    padding=True,
    multi_channel_model=True,
)

In [None]:
# segment and plot one single-label example image per organelle
for label in train_df.drop(['ID', 'Label'], axis=1):
    print(label)
    candidates = train.loc[train[label] == 1, 'ID']
    if candidates.empty:
        # no single-label image carries this class, so there is nothing to sample
        print('no single-label image available for this label')
        continue
    # .iloc[0] yields the ID itself; parsing to_string() output depends on
    # how pandas pads the index column
    sample_id = candidates.sample(1).iloc[0]
    im, r, b, y = read_sample_image_seg(sample_id, path=DIR)
    mask = segmentCell(im, segmentator)
    plot_cell_segments(mask, r, b, y)

### 1.4 Label-wise  single-cell segmentation

In [None]:
# segment one single-label example image per organelle and plot its
# largest segmented region
for label in train_df.drop(['ID', 'Label'], axis=1):
    print(label)
    candidates = train.loc[train[label] == 1, 'ID']
    if candidates.empty:
        # no single-label image carries this class, so there is nothing to sample
        print('no single-label image available for this label')
        continue
    # .iloc[0] yields the ID itself; parsing to_string() output depends on
    # how pandas pads the index column
    sample_id = candidates.sample(1).iloc[0]
    im, r, b, y = read_sample_image_seg(sample_id, path=DIR)
    mask = segmentCell(im, segmentator)
    plot_single_cell(mask, r, b, y)