In [None]:
# Install mmdetection 2.8.0 and its dependencies from local paths under ../input/mmdetection-v280.
# Copy the mmdetection source tree to ../mmdetection (its configs/ and tools/ are used by the training cells).
!rsync -a ../input/mmdetection-v280/mmdetection ../
!pip install ../input/mmdetection-v280/src/mmdet-2.8.0/mmdet-2.8.0/
!pip install ../input/mmdetection-v280/src/mmpycocotools-12.0.3/mmpycocotools-12.0.3/
!pip install ../input/mmdetection-v280/src/addict-2.4.0-py3-none-any.whl
!pip install ../input/mmdetection-v280/src/yapf-0.30.0-py2.py3-none-any.whl
# NOTE: this wheel is tagged cp37 / manylinux1_x86_64, i.e. it targets CPython 3.7 on 64-bit Linux.
!pip install ../input/mmdetection-v280/src/mmcv_full-1.2.6-cp37-cp37m-manylinux1_x86_64.whl

In [None]:
# basic 
import pickle
import imageio
import warnings
import os, gc, cv2
import numpy as np
import pandas as pd
from glob import glob
from itertools import groupby

from tqdm import tqdm
from multiprocessing import Pool
import base64
import typing as t
import zlib
import random
random.seed(0)

# visualize
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Experiment configuration: names, input locations and the output image directory.
exp_name = "v3"
conf_name = "mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco"
# alternative config: mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco.py
cell_mask_dir = '../input/hpa-mask/hpa_cell_mask'
ROOT = '../input/hpa-single-cell-image-classification/'
train_or_test = 'train'

# Directory that receives the resized JPEGs written by mk_ann / the export loop.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
img_dir = f'../work/mmdet_{exp_name}_{train_or_test}'
os.makedirs(img_dir, exist_ok=True)

df = pd.read_csv(os.path.join(ROOT, 'train.csv'))

# this script takes more than 9hours for full data.
# debug = True
# if debug:
#     df = df[:4]

In [None]:
# # Output directories if we want to predict the masks ourselves
# NUCL_DIR = '/kaggle/working/hpa-mask/hpa-nucl-mask'
# CELL_DIR = '/kaggle/working/hpa-mask/hpa-cell-mask'
# if not os.path.exists(NUCL_DIR):
#     os.makedirs(NUCL_DIR)
# if not os.path.exists(CELL_DIR):
#     os.makedirs(CELL_DIR)

# Folders holding the pre-computed masks, read later as `<dir>/<image_id>.npz` with np.load.
# NOTE: PRE_LOADED_CELL_DIR is the same path as `cell_mask_dir` in the config cell.
PRE_LOADED_NUCL_DIR = '../input/hpa-mask/hpa_nuclei_mask'
PRE_LOADED_CELL_DIR = '../input/hpa-mask/hpa_cell_mask'

In [None]:
# Sanity check: list the entries of the input data root.
os.listdir(ROOT)

In [None]:
# List what is currently in the output image directory created by the config cell.
os.listdir(img_dir)

# Data exploration

In [None]:
# Read the label table of the selected split and preview its first rows.
labels_csv = os.path.join(ROOT, f'{train_or_test}.csv')
df_train = pd.read_csv(labels_csv)
df_train.head()

In [None]:
# Report the size of the label table.
n_rows, n_cols = df_train.shape
print(f'We have {n_rows} rows and {n_cols} columns in our df_train.csv.')

In [None]:
# Count the null entries of every column and print the per-column totals.
missing_per_column = df_train.isnull().sum()
print(f'Missing values in train_df.csv in each columns:\n{missing_per_column}')

In [None]:
# Collect every distinct class id that occurs in the '|'-separated Label strings.
label_ids = {
    label
    for row_label in df_train.Label.unique()
    for label in row_label.split('|')
}
num_unique_labels = len(label_ids)
# Sort numerically (ids are digit strings) and render as one space-separated string.
all_labels = ' '.join(sorted(label_ids, key=int))
print(f'{num_unique_labels} unique labels, values: {all_labels}')

In [None]:
# Number of classes per image = number of '|'-separated ids in its Label string.
df_train['num_classes'] = df_train['Label'].map(lambda label: label.count('|') + 1)

# Bar chart of how many images carry 1, 2, 3, ... labels.
labels_per_image = df_train['num_classes'].value_counts()
labels_per_image.plot.bar(
    title='Examples with multiple labels',
    xlabel='number of labels per example',
    ylabel='# train examples',
)
plt.show()

In [None]:
# Per-class frequencies: images that carry a class at all ("full") versus images
# whose Label is exactly that class ("unique").
labels = [str(i) for i in range(19)]

# Images whose Label string equals this class id.
unique_counts = {lbl: int((df_train.Label == lbl).sum()) for lbl in labels}

# Images whose '|'-separated Label contains this class id.
# set() keeps the original "one count per image" semantics; a single pass over the
# frame replaces one full pass per class.
full_counts = dict.fromkeys(labels, 0)
for row_label in df_train['Label']:
    for lbl in set(row_label.split('|')):
        if lbl in full_counts:
            full_counts[lbl] += 1

# One row per class, most frequent first (stable sort), built directly with integer
# columns instead of going through a string array and casting back.
counts = pd.DataFrame(
    sorted(
        ((int(lbl), full_counts[lbl], unique_counts[lbl]) for lbl in labels),
        key=lambda record: -record[1],
    ),
    columns=['label', 'full_count', 'unique_count'],
)

sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(16, 12))

# Pastel bars: images that contain the class.
sns.set_color_codes("pastel")
sns.barplot(x="full_count", y="label", data=counts, order=counts.label.values,
            label="full count", color="b", orient = 'h', ax=ax)

# Muted bars on the same axes: images where the class is the only label.
sns.set_color_codes("muted")
sns.barplot(x="unique_count", y="label", data=counts, order=counts.label.values,
            label="unique_count", color="b", orient = 'h', ax=ax)

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="",
       xlabel="Counts")
sns.despine(left=True, bottom=True)

# helper funcs

In [None]:
def build_image_names(image_id: str, folder: str = train_or_test) -> tuple:
    """Build the red / yellow / blue channel file paths for one image.

    Args:
        image_id: image identifier used as the file-name prefix.
        folder: sub-folder of ROOT to look in. The default is the value that
            `train_or_test` had when this function was defined.

    Returns:
        A 4-tuple ``([mt], [er], [nu], [[mt], [er], [nu]])``: each channel path
        wrapped in a one-element list, followed by the three lists grouped together.
    """
    # mt: red channel (read as `microtubule` in plot_cell_segments)
    mt = os.path.join(ROOT, folder, image_id + '_red.png')

    # er: yellow channel, the endoplasmic reticulum
    er = os.path.join(ROOT, folder, image_id + '_yellow.png')

    # nu: blue channel, the nuclei
    nu = os.path.join(ROOT, folder, image_id + '_blue.png')

    return [mt], [er], [nu], [[mt], [er], [nu]]

def segmentCell(image, segmentator):
    """Predict and label the nuclei and cell masks of one image.

    Args:
        image: indexable input; ``image[2]`` is passed to ``pred_nuclei`` and the
            whole object to ``pred_cells``. Presumably the 4th value returned by
            build_image_names -- confirm against the caller.
        segmentator: object exposing ``pred_nuclei`` and ``pred_cells``.

    Returns:
        ``(nuclei_mask, cell_mask)`` as returned by ``label_cell`` for the first
        prediction of each kind.
    """
    # For nuclei
    nuc_segmentations = segmentator.pred_nuclei(image[2])

    # For full cells
    cell_segmentations = segmentator.pred_cells(image)

    # post-processing
    nuclei_mask, cell_mask = label_cell(nuc_segmentations[0], cell_segmentations[0])

    # Drop the raw predictions first, then collect: collecting while the names are
    # still bound cannot release them.
    del nuc_segmentations, cell_segmentations
    gc.collect()

    return nuclei_mask, cell_mask

def plot_cell_segments(cell_mask, mt, er, nu):
    """Plot the channel composite, the cell mask, and the mask overlaid on the composite.

    Args:
        cell_mask: mask array to display.
        mt, er, nu: sequences of image paths; only the first path of each is read.
    """
    # Stack the three channels along the last axis in (mt, er, nu) order.
    channels = [plt.imread(paths[0]) for paths in (mt, er, nu)]
    img = np.dstack(channels)

    plt.figure(figsize=(15, 15))
    for position, title in enumerate(('Image', 'Mask', 'Image + Mask'), start=1):
        plt.subplot(1, 3, position)
        if title == 'Image':
            plt.imshow(img)
        elif title == 'Mask':
            plt.imshow(cell_mask)
        else:
            # Overlay: composite first, semi-transparent mask on top.
            plt.imshow(img)
            plt.imshow(cell_mask, alpha=0.6)
        plt.title(title)
        plt.axis('off')
    plt.show()
    

# convert segmentation mask image to run length encoding
MAX_GREEN = 64 # cells whose brightest green pixel is below this value are filtered out
def get_rles_from_mask(image_id, class_id, image_size=None):
    """Encode every cell of an image's label mask as an uncompressed RLE.

    Args:
        image_id: image identifier; the mask is read from
            ``{cell_mask_dir}/{image_id}.npz`` (array ``arr_0``).
        class_id: image-level label, either a '|'-separated string ('18', '5|0')
            or a list of class-id strings (['18']).
        image_size: optional ``(width, height)`` passed to ``cv2.resize``; when
            None the mask is used at its stored size.

    Returns:
        ``(rle_list, height, width)``: one ``coco_rle_encode`` result per kept
        cell, plus the shape of the (possibly resized) mask.
    """
    mask = np.load(f'{cell_mask_dir}/{image_id}.npz')['arr_0']
    if image_size is not None:
        # The mask holds one integer id per cell (0 is skipped below). Nearest-neighbour
        # keeps those ids intact; linear interpolation blends adjacent ids into values
        # that belong to no cell and then show up as spurious instances.
        mask = cv2.resize(mask, dsize=image_size, interpolation=cv2.INTER_NEAREST)

    # Normalise the label so the '18' check works for both the raw string and the
    # list form produced once the Label column has been split.
    labels = class_id.split('|') if isinstance(class_id, str) else list(class_id)
    # Images labelled only '18' skip the green-intensity filter
    # (presumably the "Negative" class -- confirm against the label map).
    filter_by_green = labels != ['18']
    green_img = read_img(image_id, 'green', 'train', image_size) if filter_by_green else None

    rle_list = []
    for val in np.unique(mask):
        if val == 0:
            continue
        binary_mask = mask == val
        if filter_by_green and (green_img * binary_mask).max() < MAX_GREEN:
            # Cell is too dark in the green channel: do not annotate it.
            continue
        rle_list.append(coco_rle_encode(binary_mask))
    return rle_list, mask.shape[0], mask.shape[1]

def coco_rle_encode(mask):
    """Run-length encode a mask in column-major order.

    Returns a dict with 'counts' (run lengths; a leading 0 is inserted when the
    first element equals 1, so the list always starts with a run of zeros) and
    'size' (the mask shape as a list).
    """
    column_major = mask.ravel(order='F')
    runs = [(value, sum(1 for _ in group)) for value, group in groupby(column_major)]
    counts = [length for _, length in runs]
    if runs and runs[0][0] == 1:
        counts.insert(0, 0)
    return {'counts': counts, 'size': list(mask.shape)}

# mmdet custom dataset generator
def mk_mmdet_custom_data(image_id, class_id, image_size=None):
    """Build one annotation record (filename, width, height, ann) for an image.

    'ann' is an empty dict when no cell survives get_rles_from_mask; otherwise it
    holds 'bboxes' (float32), placeholder 'labels' and the encoded 'masks'.

    NOTE: relies on a global `mutils`; in this notebook its import
    (`from pycocotools import mask as mutils`) only appears in a commented-out cell.
    """
    rles, height, width = get_rles_from_mask(image_id, class_id, image_size)
    record = {
        'filename': image_id+'.jpg',
        'width': width,
        'height': height,
        'ann': {},
    }
    if len(rles) == 0:
        return record

    encoded = mutils.frPyObjects(rles, height, width)
    boxes = mutils.toBbox(encoded)
    # Columns 2 and 3 become col0+col2 and col1+col3, i.e. width/height turn into
    # right/bottom corners if toBbox returns (x, y, w, h) -- see the pycocotools docs.
    boxes[:, 2] += boxes[:, 0]
    boxes[:, 3] += boxes[:, 1]
    record['ann'] = {
        'bboxes': np.array(boxes, dtype=np.float32),
        'labels': np.zeros(len(boxes)), # dummy data.(will be replaced later)
        'masks': encoded,
    }
    return record

# print utility from public notebook
def print_masked_img(image_id, mask):
    """Show an image, its mask, and the mask overlaid on the image side by side.

    The image is loaded with load_RGBY_image from the `train_or_test` split.
    """
    img = load_RGBY_image(image_id, train_or_test)

    # (title, layers) per panel; each layer is (array, imshow keyword arguments).
    panels = [
        ('Image', [(img, {})]),
        ('Mask', [(mask, {})]),
        ('Image + Mask', [(img, {}), (mask, {'alpha': 0.6})]),
    ]

    plt.figure(figsize=(15, 15))
    for slot, (title, layers) in enumerate(panels, start=1):
        plt.subplot(1, 3, slot)
        for layer, imshow_kwargs in layers:
            plt.imshow(layer, **imshow_kwargs)
        plt.title(title)
        plt.axis('off')
    plt.show()
    
# image loader, using rgb only here
def load_RGBY_image(image_id, train_or_test='train', image_size=None):
    """Load the red, green and blue channel images and stack them on the last axis.

    Despite the name, the yellow channel is not loaded.
    """
    channels = [
        read_img(image_id, color, train_or_test, image_size)
        for color in ("red", "green", "blue")
    ]
    return np.stack(channels, axis=-1)

# 
def read_img(image_id, color, train_or_test='train', image_size=None):
    """Read one channel image, optionally resize it, and reduce 16-bit data to uint8.

    Args:
        image_id: image identifier (file-name prefix).
        color: channel suffix, e.g. 'red', 'green', 'blue'.
        train_or_test: sub-folder of ROOT to read from.
        image_size: optional size passed to ``cv2.resize``.

    Returns:
        The image array; converted to uint8 when any pixel exceeds 255.

    Raises:
        AssertionError: if the file is missing or cannot be decoded.
    """
    filename = f'{ROOT}/{train_or_test}/{image_id}_{color}.png'
    assert os.path.exists(filename), f'not found {filename}'
    img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
    # cv2.imread signals a decode failure by returning None rather than raising.
    assert img is not None, f'failed to decode {filename}'
    if image_size is not None:
        img = cv2.resize(img, image_size)
    if img.max() > 255:
        # 16-bit data: keep the high byte. Dividing by 255 maps values >= 65280
        # to 256/257, which wrap around to 0/1 when cast to uint8.
        img = (img // 256).astype('uint8')
    return img

# make annotation helper called multi processes
def mk_ann(idx):
    """Create the annotation and the resized JPEG for row `idx` of the global `df`.

    Returns ``(anno, idx, image_id)`` so results can be matched back to their row.
    """
    row = df.iloc[idx]
    image_id, class_id = row.ID, row.Label
    image_size = (512,512)
    anno = mk_mmdet_custom_data(image_id, class_id, image_size)
    img = load_RGBY_image(image_id, train_or_test, image_size)
    # NOTE(review): the array is stacked red, green, blue while cv2.imwrite treats
    # 3-channel data as BGR -- confirm the channel order in the saved JPEG is intended.
    out_path = f'{img_dir}/{image_id}.jpg'
    cv2.imwrite(out_path, img)
    return anno, idx, image_id

In [None]:
# red = read_img('5c27f04c-bb99-11e8-b2b9-ac1f6b6435d0', "red", train_or_test)
# red.dtype

In [None]:
# img = cv2.imread('../input/hpa-single-cell-image-classification/test/0173029a-161d-40ef-af28-2342915b22fb_blue.png', cv2.IMREAD_UNCHANGED)
# img.dtype
# print(img.max()/255)
# if img.max() > 255:
#     img_max = img.max()
#     img = (img/255).astype('uint8')
# img.dtype

# checking segment mask
To extract the individual cells, [CellSegmentator](https://github.com/CellProfiling/HPA-Cell-Segmentation) can be used.
The extracted segmentation masks are stored in [this dataset](https://www.kaggle.com/its7171/hpa-mask).


# Keeping only the cell masks of single-label images

In [None]:
# Show the label table as it stands before the Label column is split into lists.
df_train

In [None]:
# Split each '|'-separated Label string into a list of class ids.
# The isinstance guard makes the cell idempotent: on a re-run the already-split
# lists are left as they are (a second `.str.split` would not reproduce them).
df_train["Label"] = df_train["Label"].apply(
    lambda value: value.split("|") if isinstance(value, str) else value
)
df_train.head()

In [None]:
# Keep only the images annotated with exactly one class.
# The label is counted the same way whether it is still the raw 'a|b' string or the
# already-split list, so a two-digit id such as '10' is not mistaken for two labels.
is_single_label = df_train['Label'].apply(
    lambda value: len(value.split('|') if isinstance(value, str) else value) == 1
)
# .copy() gives `df` its own data, so later edits to it do not touch df_train.
df = df_train.loc[is_single_label].copy()
df.head()

In [None]:
# (rows, columns) of the single-label subset.
df.shape

## Predicting segmentation masks for cells and nuclei

In [None]:
# !pip install -q "../input/hpapytorchzoozip/pytorch_zoo-master"
# !pip install -q "../input/hpacellsegmentatormaster/HPA-Cell-Segmentation-master"

In [None]:
# import hpacellseg.cellsegmentator as cellsegmentator
# from hpacellseg.utils import label_cell, label_nuclei

# NUC_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_nuclei_v1.pth'
# CELL_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_cell_3ch_v1.pth'

# segmentator = cellsegmentator.CellSegmentator(
#     NUC_MODEL,
#     CELL_MODEL,
#     scale_factor=0.25,
#     device='cuda',
#     padding=False,
#     multi_channel_model=True
# )

In [None]:
# Predicting the masks and saving them

# for image_id in tqdm(train.ID.values):
#     print('[INFO]: Dealing with {} ...'.format(image_id))
#     mt, er, nu, images = build_image_names(image_id)    
#     nucl_mask, cell_mask = segmentCell(images, segmentator)
#     # Saving the predicted nucl and cell masks 
#     np.savez_compressed(f'{CELL_DIR}/{image_id}', cell_mask)
#     np.savez_compressed(f'{NUCL_DIR}/{image_id}', nucl_mask)


## Loading segmentation mask

In [None]:
# # os.listdir(ROOT+ train_or_test)
# resized_cell_mask_dir = f'../work/mmdet_{exp_name}_cell_mask_resized'
# !mkdir -p {resized_cell_mask_dir}
# resized_nucl_mask_dir = f'../work/mmdet_{exp_name}_nucl_mask_resized'
# !mkdir -p {resized_nucl_mask_dir}

In [None]:
# List the contents of the ../work output folder.
os.listdir('../work')

In [None]:
# Load the stored cell mask of one sample image and display it.
sample_mask_path = f'{cell_mask_dir}/5e9afd56-bb99-11e8-b2b9-ac1f6b6435d0.npz'
mask = np.load(sample_mask_path)['arr_0']
plt.imshow(mask)
plt.show()

In [None]:
# Resize the label mask to 512x512.
# Nearest-neighbour is used because the mask stores one integer id per cell: linear
# interpolation would blend neighbouring ids into values that belong to no cell.
img_cv2 = cv2.resize(mask, dsize=(512,512), interpolation=cv2.INTER_NEAREST)
img_cv2.shape

In [None]:
# Load the pre-computed cell and nuclei masks of one sample image and plot the cell
# mask beside and on top of its red/yellow/blue composite.
image_id = '5e9afd56-bb99-11e8-b2b9-ac1f6b6435d0'
cell_mask = np.load(f'{PRE_LOADED_CELL_DIR}/{image_id}.npz')['arr_0']
# The nuclei mask is loaded but not passed to the plot below.
nucl_mask = np.load(f'{PRE_LOADED_NUCL_DIR}/{image_id}.npz')['arr_0']
# mt / er / nu are one-element path lists; `images` groups the three of them.
mt, er, nu, images = build_image_names(image_id)
plot_cell_segments(cell_mask, mt, er, nu)

In [None]:
cell_mask_dir = '../input/hpa-mask/hpa_cell_mask'
# Preview the first (up to) three selected images together with their cell masks.
# Loop-local names are used so the global `image_id` / `cell_mask` of the sample image
# are not overwritten: the following cell pairs `cell_mask` with that sample's
# mt/er/nu paths.
for idx in range(min(3, len(df))):
    preview_id = df.iloc[idx].ID
    preview_mask = np.load(f'{cell_mask_dir}/{preview_id}.npz')['arr_0']
    print_masked_img(preview_id, preview_mask)

In [None]:
# Display a cell mask on its own.
# `mt`, `er`, `nu` and `cell_mask` are globals left behind by earlier cells. The
# composite `img` is assembled here but not drawn (its imshow call is commented out).
i = 0
plt.figure(figsize=(15, 15))
microtubule = plt.imread(mt[i])
endoplasmicrec = plt.imread(er[i])
nuclei = plt.imread(nu[i])
mask = cell_mask
img = np.dstack((microtubule, endoplasmicrec, nuclei))
# plt.imshow(img)
plt.imshow(mask)
plt.show()

In [None]:
# (rows, columns) of the frame that the generation / export cells iterate over.
df.shape

# generate data for mmdetection training

In [None]:
# from pycocotools import mask as mutils

# # this part would take several hours, depends on your CPU power.
# MAX_THRE = 4 # set your avarable CPU count.
# p = Pool(processes=MAX_THRE)
# annos = []
# len_df = len(df)
# for anno, idx, image_id in p.imap(mk_ann, range(len_df)):
#     if len(anno['ann']) > 0:
#         annos.append(anno)
#     print(f'{idx+1}/{len_df}, {image_id}')
    
# lbl_cnt_dict = df.set_index('ID').to_dict()['Label']
# trn_annos = []
# val_annos = []
# val_len = int(len(annos)*0.01)
# for idx in range(len(annos)):
#     ann = annos[idx]
#     filename = ann['filename'].replace('.jpg','').replace('.png','')
#     label_ids = lbl_cnt_dict[filename]
#     len_ann = len(ann['ann']['bboxes'])
#     bboxes = ann['ann']['bboxes']
#     masks = ann['ann']['masks']
#     # asign image level labels to each cells
#     for cnt, label_id in enumerate(label_ids.split('|')):
#         label_id = int(label_id)
#         if cnt == 0:
#             ann['ann']['labels'] = np.full(len_ann, label_id)
#         else:
#             ann['ann']['bboxes'] = np.concatenate([ann['ann']['bboxes'],bboxes])
#             ann['ann']['labels'] = np.concatenate([ann['ann']['labels'],np.full(len_ann, label_id)])
#             ann['ann']['masks'] = ann['ann']['masks'] + masks    
#     if idx < val_len:
#         val_annos.append(ann)
#     else:
#         trn_annos.append(ann)
    
# with open(f'../work/mmdet_{exp_name}_full.pkl', 'wb') as f:
#     pickle.dump(annos, f)
# with open(f'../work/mmdet_{exp_name}_trn.pkl', 'wb') as f:
#     pickle.dump(trn_annos, f)
# with open(f'../work/mmdet_{exp_name}_val.pkl', 'wb') as f:
#     pickle.dump(val_annos, f)

In [None]:
# Persist the annotation lists built by the (currently commented-out) generation cell above.
# A file is opened only when its variable exists: opening with 'wb' truncates the file
# immediately, so dumping an undefined name would wipe a previously saved pickle
# before the NameError is raised.
for suffix, var_name in (('full', 'annos'), ('trn', 'trn_annos'), ('val', 'val_annos')):
    if var_name not in globals():
        print(f'skip mmdet_{exp_name}_{suffix}.pkl: `{var_name}` is not defined')
        continue
    with open(f'../work/mmdet_{exp_name}_{suffix}.pkl', 'wb') as f:
        pickle.dump(globals()[var_name], f)

#### I have already created the COCO-format labeled data, so we can load it at any time

In [None]:
def _load_annotations(split):
    """Unpickle ../work/mmdet_{exp_name}_{split}.pkl and return its content."""
    with open(f'../work/mmdet_{exp_name}_{split}.pkl', 'rb') as f:
        return pickle.load(f)

# Same load order as before: full, then train, then validation.
full_annos = _load_annotations('full')
trn_annos = _load_annotations('trn')
val_annos = _load_annotations('val')

In [None]:
# Number of records in the loaded training annotations.
len(trn_annos)

In [None]:
# Copy the previously generated train / val annotation pickles into ../work.
# NOTE(review): the load cell above reads ../work/mmdet_{exp_name}_*.pkl before this
# copy runs, and it also opens the `_full` pickle whose copy is commented out below --
# confirm the intended cell order.
# !cp '../input/loading-masks-creating-bboxs/mmdet_{exp_name}_full.pkl' ../work
!cp '../input/loading-masks-creating-bboxs/mmdet_{exp_name}_trn.pkl' ../work
!cp '../input/loading-masks-creating-bboxs/mmdet_{exp_name}_val.pkl' ../work

In [None]:
# Show the label table again (Label now holds lists, plus the num_classes column).
df_train

In [None]:
# Write every image of `df` into img_dir as a 512x512 JPEG.
image_size = (512,512)
for idx, image_id in enumerate(df['ID']):
    img = load_RGBY_image(image_id, train_or_test, image_size)
    out_path = f'{img_dir}/{image_id}.jpg'
    cv2.imwrite(out_path, img)

In [None]:
# Number of entries now present in the output image directory.
len(os.listdir(img_dir))

# training

In [None]:
# The config files below are based on the default mask_rcnn configs.
# The main changes are CustomDataset, num_classes, data paths, etc.
# Everything else is used as shipped with mmdetection.
# List the configs available for this experiment.
!ls -l ../mmdetection/configs/hpa_{exp_name}/

In [None]:
# Pick the config used for training. This overrides the s101 `conf_name` assigned
# in the configuration cell near the top of the notebook.
# conf_name = "mask_rcnn_s101_fpn_syncbn-backbone+head_mstrain_1x_coco"
conf_name = "mask_rcnn_s50_fpn_syncbn-backbone+head_mstrain_1x_coco"
print(conf_name)

In [None]:
config = f'configs/hpa_{exp_name}/{conf_name}.py'

# using --no-validate to avoid some errors for custom dataset metrics
additional_conf = '--no-validate --cfg-options'
additional_conf += f' work_dir=../working/work_dir'
additional_conf += f' optimizer.lr=0.0025'
cmd = f'bash -x tools/dist_train.sh {config} 1 {additional_conf}'
!cd ../mmdetection; {cmd}

In [None]:
# Recursively list the current directory in long format.
!ls -Rl .