# `data_process`

In [3]:
#default_exp data_process

In [20]:
#export
import os
from pathlib import Path
from itertools import groupby
import mlcrate
from multiprocessing import Pool
from pycocotools import mask as mutils
from pycocotools import _mask as coco_mask
import numpy as np
import pandas as pd
import cv2, PIL
import zlib
import base64

from kgl_humanprotein.config.config import *

run on collie.local


In [5]:
import holoviews as hv
hv.extension('bokeh')

In [6]:
# Base folders for data and model files, given as relative paths.
dir_data = Path('../data')
dir_models = Path('../models/')

# Competition data folder; cells below read images from its `train` sub-folder.
dir_hpa = dir_data/'hpa-single-cell-image-classification'

## Miscellaneous

In [7]:
#export
def imgids_from_directory(path):
    '''
    Collect the unique image ids present in a directory.

    The id of an entry is the part of its stem before the first
    underscore, e.g. `<imgid>_red.png` gives `<imgid>`.

    `path`: str or Path
      Directory to scan.

    Returns a sorted list of the unique ids. Sorting keeps the result
    reproducible between runs (the iteration order of a set of strings
    is not).
    '''
    path = Path(path)
    imgids = {entry.stem.split('_')[0] for entry in path.iterdir()}
    return sorted(imgids)

In [8]:
#export
# Small fixed set of image ids used by the test/demo cells of this notebook.
imgids_testing = [
    '000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0',
    '001838f8-bbca-11e8-b2bc-ac1f6b6435d0',
    '000c99ba-bba4-11e8-b2b9-ac1f6b6435d0',
    'a34d8680-bb99-11e8-b2b9-ac1f6b6435d0',
    '000a9596-bbc4-11e8-b2bc-ac1f6b6435d0']

## Loading image samples

In [9]:
#export

def read_img(
    dir_data, image_id, color,
    train_or_test='train', image_size=None, suffix='.png'):
    '''
    Read a single colour channel of an image from disk.

    The file read is
    `{dir_data}/{train_or_test}/{image_id}_{color}{suffix}`.

    `dir_data`: str or Path
      Folder that contains the `train_or_test` sub-folder.
    `image_id`: str
      Image id, i.e. the file name without colour and suffix.
    `color`: str
      Colour part of the file name, e.g. 'red'.
    `train_or_test`: str
      Name of the sub-folder to read from.
    `image_size`: int, optional
      If given, the image is resized to `(image_size, image_size)`.
    `suffix`: str
      File extension, including the dot.

    Returns the image array. If its maximum exceeds 255 it is divided
    by 255 and returned as `uint8`; otherwise it is returned as read.

    Raises `AssertionError` if the file is missing or cannot be decoded.
    '''
    filename = (f'{dir_data}/{train_or_test}/'
                f'{image_id}_{color}{suffix}')
    assert os.path.exists(filename), f'not found {filename}'

    img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
    # imread signals failure by returning None rather than raising
    assert img is not None, f'could not decode {filename}'

    if image_size is not None:
        img = cv2.resize(img, (image_size, image_size))

    if img.max() > 255:
        # Values of 65280 and above give a quotient >= 256, which would
        # wrap around when cast to uint8; clip so they saturate at 255.
        img = np.clip(img / 255, 0, 255).astype('uint8')
    return img

def load_RGBY_image(
    dir_data, image_id, rgb_only=False,
    train_or_test='train', image_size=None, suffix='.png'):
    '''
    Load the colour channels of one image and stack them channel-last.

    Channels are read with `read_img` in the order red, green, blue and,
    unless `rgb_only` is set, yellow. The remaining arguments are passed
    on to `read_img` unchanged.

    Returns an array of shape (height, width, n_channels), with
    n_channels being 3 when `rgb_only` else 4.
    '''
    colors = ['red', 'green', 'blue']
    if not rgb_only:
        colors.append('yellow')

    planes = [
        read_img(
            dir_data, image_id, color,
            train_or_test, image_size, suffix)
        for color in colors]

    # (channel, height, width) -> (height, width, channel)
    return np.array(planes).transpose(1, 2, 0)

## Cell segmentation

- https://www.kaggle.com/lnhtrang/hpa-public-data-download-and-hpacellseg#Using-HPA-segmentation-tool

- https://www.kaggle.com/its7171/mmdetection-for-segmentation-training

Install `hpacellseg`:

In [2]:
# !pip install https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip

Collecting https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip
  Downloading https://github.com/CellProfiling/HPA-Cell-Segmentation/archive/master.zip
[K     / 17 kB 593 kB/ss
Collecting pytorch_zoo@ https://github.com/haoxusci/pytorch_zoo/archive/master.zip
  Downloading https://github.com/haoxusci/pytorch_zoo/archive/master.zip
[K     / 131 kB 964 kB/s
Building wheels for collected packages: hpacellseg, pytorch-zoo
  Building wheel for hpacellseg (setup.py) ... [?25ldone
[?25h  Created wheel for hpacellseg: filename=hpacellseg-0.1.8-py3-none-any.whl size=14919 sha256=20d25f996e95ac2b34fcc9af3c0a5ac6aac37ebbdf202c6566dea787f4715a5c
  Stored in directory: /private/var/folders/j8/yjq07z717675nj9j6bk7zxc40000gn/T/pip-ephem-wheel-cache-k02orb4f/wheels/cd/d8/de/04ad08802d62537f8dffc89b6a7ce0a53c3d29ea6eae522ab1
  Building wheel for pytorch-zoo (setup.py) ... [?25ldone
[?25h  Created wheel for pytorch-zoo: filename=pytorch_zoo-0.0.0-py3-none-any.whl size=30138 sh

In [7]:
#export
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei
from tqdm import tqdm

class CellSegmentator(cellsegmentator.CellSegmentator):
    '''
    Wrapper around `hpacellseg`'s `CellSegmentator` that accepts `Path`
    objects and returns labelled nuclei/cell masks from a single call.
    '''
    def __init__(self, nuc_model, cell_model, *args, **kwargs):
        '''
        `nuc_model`, `cell_model`: str or Path
          Model locations; converted to `str` before being handed to the
          parent class. All other arguments are forwarded unchanged.
        '''
        nuc_model = str(nuc_model)
        cell_model = str(cell_model)
        super().__init__(nuc_model, cell_model, *args, **kwargs)
        
    def __call__(self, red, yellow, blue):
        '''
        `red`: list
          Red images' file paths.
        `yellow`: list
          Yellow images' file paths.
        `blue`: list
          Blue images' file paths.

        `Path` items are converted to `str`; any other item is passed
        through to the parent's prediction methods untouched.

        Returns a list with one `(mask_nucl, mask_cell)` tuple per
        image, or an empty list when no images are given.
        '''
        assert len(red) == len(yellow) == len(blue)

        if len(red) == 0:
            # nothing to segment
            return []

        # Convert every Path, not only when the first item happens to
        # be one, so mixed lists are handled too.
        red, yellow, blue = (
            [str(n) if isinstance(n, Path) else n for n in fns]
            for fns in (red, yellow, blue))

        print('Predicting nuclei and cells...', end='')
        segs_nucl = self.pred_nuclei(blue)
        segs_cell = self.pred_cells([red, yellow, blue])
        print(' done.')

        print('Labelling cells...', end='')
        masks = [
            tuple(label_cell(seg_nucl, seg_cell))
            for seg_nucl, seg_cell in zip(segs_nucl, segs_cell)]
        print(' done.')
            
        return masks
    
    
def get_cellmask(img, segmentator):
    '''
    Segment a single image and return its cell mask.

    `img`: array indexed channel-last; channel 0 is passed as red,
      channel 3 as yellow and channel 2 as blue.
    `segmentator`: callable taking `red`, `yellow` and `blue` lists and
      returning a list of (nuclei mask, cell mask) pairs.

    Returns the cell mask of the first (only) pair.
    '''
    channel_index = {'red': 0, 'yellow': 3, 'blue': 2}
    planes = {
        color: [img[..., index]]
        for color, index in channel_index.items()}

    labelled = segmentator(**planes)
    return labelled[0][1]

In [8]:
# Locations of the nuclei and cell model files used by the segmentator.
dir_cellseg_model = Path('../models/HPA_Cell_Segmentation/')
NUC_MODEL = dir_cellseg_model/'nuclei-model.pth'
CELL_MODEL = dir_cellseg_model/'cell-model.pth'

# Shared segmentator instance used by the cells below.
# `device="cuda"` is requested here; the output recorded for this cell
# reports "No GPU found, using CPU." on the machine it was run on.
segmentator = CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    scale_factor=0.25,
    device="cuda",
    padding=False,
    multi_channel_model=True)

No GPU found, using CPU.


please compile abn


In [187]:
def test_segment_given_filepaths():

    imgids = imgids_testing[:]

    fns_red, fns_yellow, fns_blue = (
        [dir_hpa/'train'/f'{imgid}_{color}.png' for imgid in imgids]
        for color in ('red', 'yellow', 'blue'))
    
    masks = segmentator(red=fns_red, yellow=fns_yellow, blue=fns_blue)

## Masks, RLE and bboxes

In [64]:
def encode_binary_mask(mask):
    """Converts a binary mask into OID challenge encoding ascii text.

    `mask`: boolean numpy array that is 2-d after `np.squeeze`.

    Returns the COCO RLE counts of the mask, zlib-compressed and
    base64-encoded, as a `str`.

    Raises `ValueError` if `mask` is not boolean or not 2-d.
    """

    # check input mask --
    # `np.bool_` instead of the `np.bool` alias, which is not available
    # in every NumPy release.
    if mask.dtype != np.bool_:
        raise ValueError(
        "encode_binary_mask expects a binary mask, received dtype == %s" %
        mask.dtype)

    mask = np.squeeze(mask)
    if len(mask.shape) != 2:
        # wrap the shape in a 1-tuple: `"%s" % (a, b, c)` would raise
        # TypeError instead of producing the message
        raise ValueError(
        "encode_binary_mask expects a 2d mask, received shape == %s" %
        (mask.shape,))

    # convert input mask to expected COCO API input --
    mask_to_encode = mask.reshape(mask.shape[0], mask.shape[1], 1)
    mask_to_encode = mask_to_encode.astype(np.uint8)
    mask_to_encode = np.asfortranarray(mask_to_encode)

    # RLE encode mask --
    encoded_mask = coco_mask.encode(mask_to_encode)[0]["counts"]

    # compress and base64 encoding --
    binary_str = zlib.compress(encoded_mask, zlib.Z_BEST_COMPRESSION)
    base64_str = base64.b64encode(binary_str)
    return base64_str.decode()



def coco_rle_encode(bmask):
    '''
    Run-length encode a mask in column-major (Fortran) order.

    Returns a dict with
      'counts': lengths of the consecutive runs of equal values, with a
        leading 0 when the first value equals 1;
      'size': the mask's shape as a list.
    '''
    run_lengths = []
    runs = groupby(bmask.ravel(order='F'))
    for position, (pixel, run) in enumerate(runs):
        starts_with_one = position == 0 and pixel == 1
        if starts_with_one:
            run_lengths.append(0)
        run_lengths.append(sum(1 for _ in run))

    return {'counts': run_lengths, 'size': list(bmask.shape)}


def get_rles(mask):
    '''
    Encode every labelled cell of a label mask as a COCO RLE.

    `mask`: 2-d integer array; 0 is treated as background and every
      other distinct value as one cell.

    Returns what `mutils.frPyObjects` produces for the per-cell
    run-length encodings, in ascending label order, or an empty list
    when the mask contains no cell.
    '''
    cell_ids = [cell_id for cell_id in np.unique(mask) if cell_id != 0]

    if not cell_ids:
        # Same empty result that `rles2bboxes` already special-cases.
        # NOTE(review): pycocotools' frPyObjects appears to inspect the
        # first list element and so fails on an empty list - confirm.
        return []

    rles = [
        coco_rle_encode(np.where(mask == cell_id, 1, 0))
        for cell_id in cell_ids]
        
    height, width = mask.shape
    return mutils.frPyObjects(rles, height, width)


def rles2bboxes(rles):
    '''
    Bounding boxes of RLE-encoded masks as corner coordinates.

    The boxes from `mutils.toBbox` have their first two columns added
    onto the third and fourth, in place.

    Returns the resulting array, or an empty list when `rles` is empty.
    '''
    if not len(rles):
        return []

    boxes = mutils.toBbox(rles)
    # columns 2 and 3 become columns 0 + 2 and 1 + 3
    boxes[:, 2:4] += boxes[:, 0:2]
    return boxes


def crop_image(img, bbox, rle=None):
    '''
    Cut the bounding-box window out of an image.

    `img`: array indexed (row, column, ...).
    `bbox`: numpy array `(x0, y0, x1, y1)`; cast to int16 before use.
    `rle`: optional RLE; when given it is decoded with `mutils.decode`
      and the same window of the decoded mask multiplies the crop.

    Returns `img[y0:y1, x0:x1]`, multiplied by the mask window if `rle`
    is given.
    '''
    left, top, right, bottom = bbox.astype(np.int16)
    window = (slice(top, bottom), slice(left, right))

    patch = img[window]
    if rle is None:
        return patch

    cell = mutils.decode(rle)[window]
    # trailing axis lets the 2-d mask broadcast over the channels
    return cell[..., None] * patch


def get_crops(img, bboxes, rles=None):
    '''
    Crop `img` once per bounding box using `crop_image`.

    `bboxes`: sequence of boxes.
    `rles`: optional sequence of the same length; `rles[i]` is passed
      along with `bboxes[i]`.

    Returns the list of crops in box order.
    '''
    if rles is None:
        return [crop_image(img, bbox, None) for bbox in bboxes]

    assert len(bboxes) == len(rles)
    return [
        crop_image(img, bbox, rles[position])
        for position, bbox in enumerate(bboxes)]


def remove_faint_greens(xs, crops, green_thres=64):
    '''
    Keep the items of `xs` whose matching crop has a bright enough
    channel 1 (green).

    `xs` and `crops` must have equal length; `xs[i]` is kept when the
    maximum of `crops[i][..., 1]` is strictly above `green_thres`.

    Returns the kept items as a list, in their original order.
    '''
    assert len(xs) == len(crops)
    return [
        item
        for item, crop in zip(xs, crops)
        if crop[..., 1].max() > green_thres]


def pad_to_square(img):
    '''
    Zero-pad a (height, width, channels) image to a square.

    The image is placed in the top-left corner of a zero-filled array
    whose side is the larger of height and width; dtype is preserved.
    '''
    rows, cols, channels = img.shape
    side = rows if rows > cols else cols

    canvas = np.zeros((side, side, channels), dtype=img.dtype)
    canvas[:rows, :cols] = img
    return canvas

In [94]:
def test_imgid_to_crops():
    '''
    Visual check of the image -> cell crops pipeline on one test image.

    Loads `imgids_testing[3]`, segments it, crops every cell, drops the
    crops with faint green, pads the rest to squares and returns a
    holoviews layout showing the full image followed by the crops.
    '''
    image = load_RGBY_image(dir_hpa, imgids_testing[3])

    cell_mask = get_cellmask(image, segmentator)
    rles = get_rles(cell_mask)
    bboxes = rles2bboxes(rles)
    crops = get_crops(image, bboxes, rles=rles)

    # all three filters use the unfiltered crops, so `crops` goes last
    rles = remove_faint_greens(rles, crops, green_thres=64)
    bboxes = remove_faint_greens(bboxes, crops, green_thres=64)
    crops = remove_faint_greens(crops, crops, green_thres=64)

    squares = [pad_to_square(crop) for crop in crops]

    # full image first, then one panel per crop; RGB channels only
    panels = [hv.RGB(rgby[..., :3]) for rgby in [image, *squares]]
    layout = hv.Layout(panels).cols(3)

    side = 150
    rgb_opts = hv.opts.RGB(
        height=side, yaxis=None,
        width=side, xaxis=None)
    image_opts = hv.opts.Image(
        height=side, yaxis=None,
        width=side, xaxis=None,
        alpha=.2)
    layout.opts(rgb_opts, image_opts)

    return layout

### Generate dataset of cell crops from `train`

In [59]:
def image_to_crops(
    segmentator, dir_hpa, imgid, dir_out=Path('./'),
    train_or_test='train', green_thres=64):
    '''
    Segment one image into cells and save each cell as a square crop.

    `segmentator`: segmentator passed to `get_cellmask`.
    `dir_hpa`: str or Path
      Data folder containing the `train_or_test` sub-folder.
    `imgid`: str
      Id of the image to process.
    `dir_out`: str or Path
      Folder the crops are written to; created if missing.
    `train_or_test`: str
      Sub-folder of `dir_hpa` to read the image from.
    `green_thres`: number
      Cells whose crop has no green value above this are dropped.

    For every kept cell `i`, four files `{imgid}_{i}_{color}.jpg`
    (red, green, blue, yellow) are written.

    Returns a DataFrame with columns `Id` (crop id), `rle` (cell mask
    encoded with `encode_binary_mask`) and `Image_Id`.

    Raises `IOError` if a crop cannot be written.
    '''
    dir_out = Path(dir_out)
    # cv2.imwrite does not create folders, so make sure it exists
    dir_out.mkdir(parents=True, exist_ok=True)

    img = load_RGBY_image(
        dir_hpa, imgid, train_or_test=train_or_test)

    mask = get_cellmask(img, segmentator)
    rles = get_rles(mask)
    bboxes = rles2bboxes(rles)

    crops = get_crops(img, bboxes, rles=rles)

    # the generator is consumed before `crops` is rebound, so all three
    # lists are filtered against the same unfiltered crops
    rles, bboxes, crops = (
        remove_faint_greens(xs, crops, green_thres=green_thres) 
        for xs in [rles, bboxes, crops])

    crops = [pad_to_square(crop) for crop in crops]
    
    cids = [f'{imgid}_{i}' for i in range(len(crops))]
    
    colors = ['red', 'green', 'blue', 'yellow']
    for cid, crop in zip(cids, crops):
        for channel, color in enumerate(colors):
            fn_out = str(dir_out/f'{cid}_{color}.jpg')
            # imwrite reports failure through its return value only
            if not cv2.imwrite(fn_out, crop[...,channel]):
                raise IOError(f'could not write {fn_out}')
        
    rles = [encode_binary_mask(mutils.decode(rle).astype(bool)) 
            for rle in rles]
    df = pd.DataFrame({'Id': cids, 'rle': rles})
    df['Image_Id'] = imgid
    return df

In [10]:
# Generate crops dataset

train_or_test = 'train'
green_thres = 64

dir_cell_crops = dir_data/'cell_crops'
dir_out = dir_cell_crops/train_or_test
# parents=True: `dir_cell_crops` itself may not exist yet
dir_out.mkdir(parents=True, exist_ok=True)


idxs = range(len(imgids_testing))
def generate_crops(i):
    '''
    Write the cell crops of `imgids_testing[i]` into `dir_out` and
    return the DataFrame produced by `image_to_crops`.
    '''
    imgid = imgids_testing[i]
    # `dir_out` is the folder created above; the name used here before
    # (`dir_cells_train`) is not defined anywhere in this notebook
    df = image_to_crops(
        segmentator, dir_hpa, imgid, dir_out=dir_out, 
        train_or_test=train_or_test, green_thres=green_thres)
    return df


# MAX_THRE = 2
# p = Pool(processes=MAX_THRE)

# dfs = []
# for df_img in map(generate_crops, idxs):
#     dfs.append(df_img)

# df = pd.concat(dfs, axis=0)

# df.to_csv(dir_cell_crops/'train.csv')

In [88]:
# Display one generated crop.
# NOTE(review): `df` is only assigned in the commented-out generation
# code of the previous cell, so this cell fails on a fresh kernel
# unless `df` is created some other way.
idx = 3

cid = df.Id.iloc[idx]
rle = df.rle.iloc[idx]

# NOTE(review): the crop loaded here is row 6, not row `idx`; `cid` and
# `rle` are not used below - confirm which row is intended.
img = load_RGBY_image(dir_cell_crops, df.Id.iloc[6], suffix='.jpg')

hv.RGB(img[...,:3])

### Generate square cell crops of the same size

In [38]:
def resize_to_same_size(src, dst, size=512):
    '''
    Resize every image in `src` to `size` x `size` and save it as PNG.

    `src`: str or Path
      Folder with the images to resize.
    `dst`: str or Path
      Parent folder of the output; images are written to
      `dst/images_{size}`, which is created if missing.
    `size`: int
      Side length of the output images.

    Entries of `src` that cannot be decoded as images are skipped.

    Raises `IOError` if a resized image cannot be written.
    '''
    src = Path(src)
    dst = Path(dst)/f'images_{size}'
    dst.mkdir(parents=True, exist_ok=True)

    # one directory listing, so source and destination names cannot get
    # out of step
    for fn_src in sorted(src.iterdir()):
        img = cv2.imread(str(fn_src), cv2.IMREAD_UNCHANGED)
        if img is None:
            # not an image (or unreadable); imread returns None for those
            continue

        img = cv2.resize(
            img, (size, size), interpolation=cv2.INTER_LINEAR)

        fn_dst = dst/f'{fn_src.stem}.png'
        if not cv2.imwrite(str(fn_dst), img):
            raise IOError(f'could not write {fn_dst}')
    
# Resize everything in ../data/cell_crops/train/ to 768x768 PNGs under
# ../data/protein/test/images_768.
# NOTE(review): the source is the `train` crops folder while the
# destination is under `protein/test` - confirm this is intended.
resize_to_same_size(
    Path('../data/cell_crops/train/'), 
    Path(f'../data/protein/test'), size=768)