# Downloading HPA public data

In [None]:
%%capture
# Install W&B
!pip install wandb --upgrade
# Install HPA Cell Segmentation tool.
!pip install https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip

In [None]:
# General imports.
import io
import re
import os
import cv2
import glob
import gzip
import imageio
import pathlib
import requests
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from skimage.transform import resize
from sklearn.model_selection import train_test_split

%matplotlib inline

# HPA Segmentation tool related imports
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


def tif_gzip_to_png(tif_path):
    '''Convert a gzip-compressed TIFF (``.tif.gz``) to a PNG in the same folder.

    Intended for a local workstation where the ``.tif.gz`` files are already
    on disk.

    Parameters
    ----------
    tif_path : str or os.PathLike
        Path of the ``.tif.gz`` file. The PNG is written to the same path with
        ``.tif.gz`` replaced by ``.png``.

    Returns
    -------
    pathlib.Path
        Path of the PNG that was written.

    Raises
    ------
    ValueError
        If ``tif_path`` does not end with ``.tif.gz``. Without this check the
        target path would equal the source path and the source file would be
        overwritten.
    '''
    tif_path = str(tif_path)
    if not tif_path.endswith('.tif.gz'):
        raise ValueError(f'expected a .tif.gz file, got: {tif_path}')

    png_path = pathlib.Path(tif_path.replace('.tif.gz', '.png'))
    # Use a context manager so the gzip handle is closed even if reading fails.
    with gzip.open(tif_path) as gz_file:
        tiff_bytes = gz_file.read()
    img = imageio.imread(tiff_bytes, 'tiff')
    imageio.imwrite(png_path, img)
    return png_path
    
def download_and_convert_tifgzip_to_png(url, target_path, timeout=60):
    '''Download a ``.tif.gz`` image and store it as a PNG at ``target_path``.

    Nothing but the final PNG is written to disk, which makes this suitable
    for e.g. a Kaggle notebook.

    Parameters
    ----------
    url : str
        URL of the gzip-compressed TIFF.
    target_path : str or os.PathLike
        Where the PNG is written.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole download loop.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (e.g. 404). Without this
        check the error page would be handed to gzip and fail with a
        misleading decompression error.
    '''
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    tiff_bytes = gzip.decompress(r.content)
    img = imageio.imread(tiff_bytes, 'tiff')
    imageio.imwrite(target_path, img)

In [None]:
# W&B import and (optional) login
import wandb

# Set to True to log the generated CSV files as W&B artifacts.
LOG_WANDB = False

# Only ask for the API key and log in when logging is enabled, so the notebook
# also runs where the `wandb_api` Kaggle secret is not configured.
if LOG_WANDB:
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    wandb_api = user_secrets.get_secret("wandb_api")

    wandb.login(key=wandb_api)

In [None]:
# All label names in the public HPA and their corresponding index.
# Position in the tuple below == label index; names that share a tuple entry
# are mapped to the same index.
_LOCATION_GROUPS = (
    ("Nucleoplasm",),
    ("Nuclear membrane",),
    ("Nucleoli",),
    ("Nucleoli fibrillar center",),
    ("Nuclear speckles",),
    ("Nuclear bodies",),
    ("Endoplasmic reticulum",),
    ("Golgi apparatus",),
    ("Intermediate filaments",),
    ("Actin filaments", "Focal adhesion sites"),
    ("Microtubules",),
    ("Mitotic spindle",),
    ("Centrosome", "Centriolar satellite"),
    ("Plasma membrane", "Cell Junctions"),
    ("Mitochondria",),
    ("Aggresome",),
    ("Cytosol",),
    ("Vesicles", "Peroxisomes", "Endosomes", "Lysosomes",
     "Lipid droplets", "Cytoplasmic bodies"),
    ("No staining",),
)

all_locations = {
    name: index
    for index, names in enumerate(_LOCATION_GROUPS)
    for name in names
}


def add_label_idx(df, all_locations, verbose=False):
    '''Add a ``Label_idx`` column with the label names converted to indices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Label`` column holding comma-separated label names.
        It is modified in place.
    all_locations : dict
        Maps a label name to its integer index.
    verbose : bool, optional
        Print ``Label`` and ``Label_idx`` for every row. Off by default
        because it floods the output on large frames.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. ``Label_idx`` is a ``"|"``-joined string of
        indices, or missing when none of the names is in ``all_locations``
        (or ``Label`` itself is missing).

    Notes
    -----
    * Whitespace around each name is ignored.
    * Names that map to the same index contribute that index only once, so
      ``"Vesicles,Lysosomes"`` gives ``"17"`` rather than ``"17|17"``.
    * Rows are processed by position, so a non-unique index is fine.
    '''
    def encode(label_str):
        # Missing labels (NaN / None) cannot be split.
        if not isinstance(label_str, str):
            return None
        indices = []
        for name in label_str.split(','):
            index = all_locations.get(name.strip())
            if index is not None and str(index) not in indices:
                indices.append(str(index))
        return "|".join(indices) if indices else None

    df["Label_idx"] = [encode(label_str) for label_str in df["Label"]]

    if verbose:
        for label_str, label_idx in zip(df["Label"], df["Label_idx"]):
            print(label_str, label_idx)
    return df
    

In [None]:
# NOTE(review): the file has a .tsv extension but is parsed with pandas'
# default comma separator — confirm this matches the file's actual format.
public_hpa_df = pd.read_csv('../input/publichpa-withcellline/kaggle_2021.tsv')
# Remove all images overlapping with Training set
public_hpa_df = public_hpa_df[public_hpa_df.in_trainset == False]

# Remove all images with only labels that are not in this competition.
# `.copy()` makes the result an independent frame, so columns added to it in
# later cells do not trigger SettingWithCopyWarning.
public_hpa_df = public_hpa_df[~public_hpa_df.Label_idx.isna()].copy()

# Channel suffixes of the image files and the cell lines of interest.
colors = ['blue', 'red', 'green', 'yellow']
celllines = ['A-431', 'A549', 'EFO-21', 'HAP1', 'HEK 293', 'HUVEC TERT2', 'HaCaT', 'HeLa', 'PC-3', 'RH-30', 'RPTEC TERT1', 'SH-SY5Y', 'SK-MEL-30', 'SiHa', 'U-2 OS', 'U-251 MG', 'hTCEpi']
public_hpa_df_17 = public_hpa_df[public_hpa_df.Cellline.isin(celllines)]
# Show the sizes explicitly; a bare expression in the middle of a cell is
# evaluated but never displayed.
print(f'{len(public_hpa_df)} public images, {len(public_hpa_df_17)} of them from the listed cell lines')

public_hpa_df.head(10)

In [None]:
# Optionally store the filtered metadata as a W&B dataset artifact.
if LOG_WANDB:
    public_hpa_df.to_csv('public_hpa.csv', index=False)

    # log as artifact
    run = wandb.init(project='hpa', job_type='hpa_public_dataset_creation')
    artifact = wandb.Artifact('hpa_public_data', type='dataset')
    artifact.add_file('public_hpa.csv')
    run.log_artifact(artifact)
    # `run.join()` is a deprecated alias; `finish()` is the supported way to
    # close the run and flush the upload.
    run.finish()

In [None]:
# Collect every individual label index that occurs in any row: glue the
# distinct "a|b|c" combinations together and split them apart again.
label_tokens = '|'.join(public_hpa_df.Label_idx.unique().tolist()).split('|')
distinct_tokens = set(label_tokens)
num_unique_labels = len(distinct_tokens)
# Sort numerically (the tokens are strings) before printing.
all_labels = ' '.join(sorted(distinct_tokens, key=int))
print(f'{num_unique_labels} unique labels, values: {all_labels}')

In [None]:
labels = [str(i) for i in range(19)]

# Number of images whose one and only label is `lbl`.
unique_counts = {lbl: int((public_hpa_df.Label_idx == lbl).sum()) for lbl in labels}

# Number of images that carry `lbl`, alone or together with other labels.
# Split every row once instead of once per label.
row_label_sets = [set(row_label.split('|')) for row_label in public_hpa_df['Label_idx']]
full_counts = {lbl: sum(lbl in row_labels for row_labels in row_label_sets) for lbl in labels}

# Build the frame straight from the tuples: going through `np.array` would
# coerce the mixed (str, int, int) tuples to strings and the counts would no
# longer be numeric when plotted.
counts = sorted(
    ((lbl, full_counts[lbl], unique_counts[lbl]) for lbl in labels),
    key=lambda x: -x[1],
)
counts = pd.DataFrame(counts, columns=['label', 'full_count', 'unique_count'])

sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(16, 12))

# Light bars: all images containing the label.
sns.set_color_codes("pastel")
sns.barplot(x="full_count", y="label", data=counts, order=counts.label.values,
            label="full count", color="b", orient='h', ax=ax)

# Darker bars on top: images where it is the only label.
sns.set_color_codes("muted")
sns.barplot(x="unique_count", y="label", data=counts, order=counts.label.values,
            label="unique count", color="b", orient='h', ax=ax)

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="",
       xlabel="Counts")
sns.despine(left=True, bottom=True)

# Get Single Labeled Images

In [None]:
# Add a column - num_classes
# A "a|b|c" string holds one more label than it has separators.
public_hpa_df['num_classes'] = public_hpa_df['Label_idx'].map(
    lambda label_str: label_str.count('|') + 1
)
print(f'Total number of images: {len(public_hpa_df)}')
public_hpa_df.head()

In [None]:
# interested in single labels only
# Distribution of "number of labels per image": printed, then drawn as bars.
num_classes_distribution = public_hpa_df['num_classes'].value_counts()
print(num_classes_distribution)
num_classes_distribution.plot.bar(
    title='Examples with multiple labels',
    xlabel='number of labels per example',
    ylabel='# train examples',
)
plt.show()

In [None]:
# Keep only images with exactly one image-level label. `.copy()` gives an
# independent frame, so the dtype conversion below is not an assignment on a
# slice of `public_hpa_df` (SettingWithCopyWarning).
df_one_label = public_hpa_df.loc[public_hpa_df['num_classes'] == 1].copy()
print(f'Number of images with one image-level labels: {len(df_one_label)}')
# With a single label per row the "|"-joined string is a plain integer.
df_one_label['Label_idx'] = df_one_label['Label_idx'].astype('int64')
df_one_label.head()

In [None]:
# Ref: https://www.kaggle.com/divyanshuusingh/eda-image-segmentation
# Display name of every label index.
label_names= {
0: "Nucleoplasm",
1: "Nuclear membrane",
2: "Nucleoli",
3: "Nucleoli fibrillar center",
4: "Nuclear speckles",
5: "Nuclear bodies",
6: "Endoplasmic reticulum",
7: "Golgi apparatus",
8: "Intermediate filaments",
9: "Actin filaments",
10: "Microtubules",
11: "Mitotic spindle",
12: "Centrosome",
13: "Plasma membrane",
14: "Mitochondria",
15: "Aggresome",
16: "Cytosol",
17: "Vesicles and punctate cytosolic patterns",
18: "Negative"
}

labels, counts = np.unique(df_one_label.Label_idx.values, return_counts=True)
print(f'The unique labels are: {labels} and there values are: {counts}')

plt.figure(figsize=(15,5))
plt.bar(labels, counts)

# The bars sit at x == label index, so the value annotations and the tick
# names must use the same positions. Using 0..len(labels)-1 together with all
# 19 names misplaces them (or raises) as soon as a label has no single-label
# image.
for label, value in zip(labels, counts):
    plt.text(label-0.25, value, str(value), fontdict=dict(fontsize=10))

plt.xticks(labels, labels=[label_names[int(label)] for label in labels], rotation=85)
plt.show()

! Warning: These are raw, full size images. Each channel is approximately 8MB. With ~~82495~~ 77668 images * 4 channels, everything amounts to around ~~2.6TB~~ 2.4TB of data.

The training and test sets contain 17 cell lines. If you want extra public data for training but downloading everything is too slow because of its size, consider:
- Downloading just 17 cell lines (67k images * 4 channels, instead of 77.6k images)
- Sampling according to label (eg. You have a lot of Nucleoplasm and Cytosol in the training set already so maybe you just need more rare labels)
- Using jpeg images. These were purposely created for visualization on the HPA website. They are much smaller in size (you will lose some information compared to the tif files, but that may be enough for your model. Your call). To download the jpeg, simply change `.tif.gz` to `.jpg` in the url. For example: https://images.proteinatlas.org/10005/921_B9_1_blue.jpg

In [None]:
# Label index to extract, and its display name turned into a lower-case
# identifier with underscores (used in file / artifact / folder names).
DESIRED_LABEL = 1
LABEL_NAME = label_names[DESIRED_LABEL].replace(' ', '_').lower()

# Single-label images of the chosen class.
has_desired_label = df_one_label.Label_idx == DESIRED_LABEL
tmp_df = df_one_label.loc[has_desired_label]
tmp_df.head()

In [None]:
# Optionally store the per-label metadata as a W&B dataset artifact.
if LOG_WANDB:
    # NOTE: the file name spelling ("pubic") is kept unchanged so that it
    # stays consistent with artifact versions that may already exist.
    label_csv = f'pubic_{LABEL_NAME}.csv'
    tmp_df.to_csv(label_csv)

    # log as artifact
    run = wandb.init(entity='ayush-thakur', project='hpa', job_type=f'public_{LABEL_NAME}')
    # Declares the full public CSV artifact as an input of this run; the
    # returned handle itself is not used further in this cell.
    artifact_csv = run.use_artifact('ayush-thakur/hpa/hpa_public_data:v0', type='dataset')

    artifact = wandb.Artifact(f'{LABEL_NAME}', type='dataset')
    artifact.add_file(label_csv)
    run.log_artifact(artifact)
    # `run.join()` is a deprecated alias; `finish()` is the supported way to
    # close the run and flush the upload.
    run.finish()

In [None]:
SAVE_DIR = f'/kaggle/tmp/hpa_public_cell_level_{LABEL_NAME}_128x128/'

os.makedirs(SAVE_DIR+'rgb', exist_ok=True)
os.makedirs(SAVE_DIR+'protein', exist_ok=True)

!ls {SAVE_DIR}

## Model

In [None]:
# Paths of the nuclei / cell segmentation model weights handed to hpacellseg.
# NOTE(review): presumably the library downloads the weights to these paths
# when the files are missing — confirm against the hpacellseg documentation.
NUC_MODEL = "./nuclei-model.pth"
CELL_MODEL = "./cell-model.pth"

# Build the segmentator once; it is reused for every image by
# `extract_cell_level_images`.
segmentator = cellsegmentator.CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    # NOTE(review): presumably the factor by which images are rescaled before
    # being fed to the models — confirm against the hpacellseg documentation.
    scale_factor=0.25,
    # Hard-coded: this cell needs a CUDA-capable runtime as written.
    device="cuda",
    padding=True,
    multi_channel_model=True,
)

In [None]:
# Size (in pixels) of the saved single-cell crops.
IMAGE_HEIGHT = 128
IMAGE_WIDTH = 128

# check if the contour for nuclei is touching the image boundary.
def is_border_nuclei(contour_points, image_shape=None):
    '''Tell whether a contour touches the border of the image.

    Parameters
    ----------
    contour_points : array-like
        Contour as returned by ``cv2.findContours``: integer ``(x, y)``
        coordinates, typically with shape ``(N, 1, 2)``.
    image_shape : tuple, optional
        ``(height, width[, ...])`` of the image the contour was found in.
        When given, the bottom and right edges are checked as well. Without
        it only a coordinate equal to 0 (top / left edge) is detected, which
        is the original behaviour.

    Returns
    -------
    bool
        True if any contour point lies on a checked image edge.
    '''
    points = np.asarray(contour_points)
    # Any coordinate equal to 0 means the contour touches the top or left edge.
    if (points == 0).any():
        return True
    if image_shape is not None:
        height, width = image_shape[:2]
        xy = points.reshape(-1, 2)
        if (xy[:, 0] == width - 1).any() or (xy[:, 1] == height - 1).any():
            return True
    return False

def _scale_to_uint8(array):
    '''Rescale ``array`` so that its maximum maps to 255 and cast to uint8.'''
    peak = np.max(array)
    if peak == 0:
        # A completely black image: avoid dividing by zero.
        return np.zeros(array.shape, dtype='uint8')
    return ((array / peak) * 255).astype('uint8')


def extract_cell_level_images(paths, image_id):
    '''Segment one image and save a crop for every cell found in it.

    For each segmented cell whose nucleus does not touch the image border
    (as judged by ``is_border_nuclei``) two ``IMAGE_WIDTH`` x ``IMAGE_HEIGHT``
    PNG files are written:

    * ``SAVE_DIR/rgb/<image_id>_<n>.png`` - red/yellow/blue channels masked
      with the cell mask.
    * ``SAVE_DIR/protein/<image_id>_<n>.png`` - green channel masked with the
      nucleus mask.

    Parameters
    ----------
    paths : iterable of str
        Absolute paths of the four channel images. The channel is recognised
        by the colour name (``red``, ``blue``, ``yellow``, ``green``) in the
        file name.
    image_id : str
        Prefix of the output file names.

    Returns
    -------
    int
        Number of cells that were saved. 0 if a channel file is missing, in
        which case the image is skipped with a message instead of failing.
    '''
    # LOCATE CHANNEL FILES
    # Match on the file name only, so colour names that happen to be part of
    # a directory name are not picked up.
    channel_order = ('red', 'blue', 'yellow', 'green')
    channel_paths = {}
    for path in paths:
        file_name = os.path.basename(path)
        for color in channel_order:
            if color in file_name:
                channel_paths[color] = path
                break

    missing = [color for color in channel_order if color not in channel_paths]
    if missing:
        print(f'skipping {image_id}: missing channel(s) {missing}')
        return 0

    mt = channel_paths['red']
    nu = channel_paths['blue']
    er = channel_paths['yellow']
    pr = channel_paths['green']

    # LOAD IMAGES
    microtubule = np.array(Image.open(mt))
    endoplasmicrec = np.array(Image.open(er))
    nuclei_img = np.array(Image.open(nu))
    protein = _scale_to_uint8(np.array(Image.open(pr)))

    # Stack the channels to form image. Using Red, Yellow and Blue.
    image = _scale_to_uint8(np.dstack((microtubule, endoplasmicrec, nuclei_img)))

    # PERFORM SEGMENTATION

    # For nuclei segmentation only blue channel is required.
    nuc_segmentation = segmentator.pred_nuclei([nu])
    # For full cells all the three reference(except green) channels are required.
    cell_segmentation = segmentator.pred_cells([[mt], [er], [nu]])
    # get cell mask
    nuclei_mask, cell_mask = label_cell(nuc_segmentation[0], cell_segmentation[0])

    # GET INDIVIDUAL CELLS

    # Ids present in each mask; the first (smallest) cell id is skipped below.
    # NOTE(review): presumably id 0 is the background — confirm.
    cell_ids = np.unique(cell_mask)
    nucleus_ids = np.unique(nuclei_mask)

    cell_count = 0
    for cell_index in cell_ids[1:]:
        # A cell without a nucleus of the same id is skipped.
        if cell_index not in nucleus_ids:
            continue

        # Get cell and nucleus mask for one cell.
        single_cell_mask = np.where(cell_mask == cell_index, 1, 0).astype('uint8')
        nucleus_mask = np.where(nuclei_mask == cell_index, 1, 0).astype('uint8')

        # get contour for cell and nucleus
        cell_cnts, _ = cv2.findContours(single_cell_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        nucleus_cnts, _ = cv2.findContours(nucleus_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        if not cell_cnts or not nucleus_cnts:
            continue

        # Skip cells whose nucleus is touching the boundary of the image.
        if is_border_nuclei(nucleus_cnts[0]):
            continue

        # Mask the cell to be cropped.
        cell = cv2.bitwise_and(image, image, mask=single_cell_mask)
        # Mask the protein to be cropped.
        # NOTE(review): the protein channel is masked with the *nucleus* mask
        # but cropped with the *cell* bounding box — confirm this is intended.
        cell_protein = cv2.bitwise_and(protein, protein, mask=nucleus_mask)

        # Get bounding box covering the contour.
        x, y, w, h = cv2.boundingRect(cell_cnts[0])

        # Crop and resize. `dsize` is (width, height) and the interpolation
        # must be passed by keyword: the third positional argument of
        # cv2.resize is `dst`, not `interpolation`.
        cell = cv2.resize(cell[y:y+h, x:x+w], (IMAGE_WIDTH, IMAGE_HEIGHT),
                          interpolation=cv2.INTER_AREA)
        cell_protein = cv2.resize(cell_protein[y:y+h, x:x+w], (IMAGE_WIDTH, IMAGE_HEIGHT),
                                  interpolation=cv2.INTER_AREA)

        # Save images
        cv2.imwrite(SAVE_DIR+'rgb/'+image_id+f'_{cell_count}.png', cell)
        cv2.imwrite(SAVE_DIR+'protein/'+image_id+f'_{cell_count}.png', cell_protein)

        cell_count += 1

    # One dot per processed image as a lightweight progress indicator.
    print('.', end='')
    return cell_count

In [None]:
# Folder that receives the downloaded channel images (as PNG).
save_dir = os.path.join(os.getcwd(),'publichpa1')
os.makedirs(save_dir, exist_ok=True)

for _, row in tqdm(tmp_df.iterrows(), total=len(tmp_df)):
    # Resolve the image URL prefix outside the `try`, so the failure message
    # below always refers to the current row.
    img = row.Image
    try:
        # Download Image: one file per colour channel.
        for color in colors:
            img_url = f'{img}_{color}.tif.gz'
            save_path = os.path.join(save_dir,  f'{os.path.basename(img)}_{color}.png')
            download_and_convert_tifgzip_to_png(img_url, save_path)
    # Best effort: report the failure and continue with the next image.
    # `Exception` (not a bare `except`) keeps Ctrl-C / kernel interrupt working.
    except Exception as err:
        print(f'failed to download: {img} ({err!r})')

In [None]:
# Every image is stored as four channel files, hence the division by 4.
downloaded_files = os.listdir(save_dir)
print(f'Number of images: {len(downloaded_files)//4}')

In [None]:
for _, row in tqdm(tmp_df.iterrows(), total=len(tmp_df)):
    img = row.Image.split('/')[-1]
    # Build the four channel paths directly from the naming scheme used when
    # downloading. A substring search over the folder listing would also pick
    # up files of other images (e.g. "..._1" is contained in "..._10_blue.png").
    channel_imgs = [os.path.join(save_dir, f'{img}_{color}.png') for color in colors]

    # Downloading is best effort, so some channels may be missing.
    if not all(os.path.exists(channel_img) for channel_img in channel_imgs):
        print(f'skipping {img}: not all channels were downloaded')
        continue

    extract_cell_level_images(channel_imgs, img)

In [None]:
# Copy Kaggle API token to ~/.kaggle
! mkdir -p /root/.kaggle/
! cp ../input/apitoken/kaggle.json /root/.kaggle/kaggle.json
# Initialize dataset creation
! kaggle datasets init -p {SAVE_DIR}

In [None]:
# List the output folder to check its contents before uploading.
!ls {SAVE_DIR}

In [None]:
# Write the Kaggle dataset metadata into SAVE_DIR.
# The path is derived from SAVE_DIR instead of being hard-coded, so the file
# lands in the folder that is uploaded even when DESIRED_LABEL changes.
# NOTE: "title" and "id" still name the Nuclear Membrane dataset — update
# them when extracting a different label.
import json

dataset_metadata = {
    "title": "HPA: Single Label Public Nuclear Membrane",
    "id": "ayuraj/SingleLabelPublicNuclearMembrane",
    "licenses": [
        {
            "name": "CC0-1.0"
        }
    ]
}

with open(os.path.join(SAVE_DIR, 'dataset-metadata.json'), 'w') as metadata_file:
    json.dump(dataset_metadata, metadata_file, indent=2)
    metadata_file.write('\n')

In [None]:
# Upload the folder as a Kaggle dataset (sub-folders are uploaded as tar archives).
# First upload ever: use `create` (kept here for reference):
# !kaggle datasets create -p {SAVE_DIR} -u --dir-mode tar
# Afterwards: publish a new version of the existing dataset.
!kaggle datasets version -p {SAVE_DIR} -m "add complete images"  --dir-mode tar