# Creating a prototyping dataset with individual cells (test set)

My full solution is described here: https://www.kaggle.com/c/hpa-single-cell-image-classification/discussion/221550

As input to the classification model, I need images of individual cells. For experimentation I don't need all the images; instead, I create a sample from the train set. An additional benefit is that my sample is more balanced than the full train set. I use the RGB channels only, which proved to work well in the previous HPA challenge. I save the extracted cells as RGB JPEG images so that I can feed them easily into my classifier.

## This notebook creates the public test dataset, processed in the same way as the training sample dataset.

### Kind people upvote useful notebooks and datasets :) 


Acknowledgements - this uses the dataset and some code by @its7171 (please upvote!):
- https://www.kaggle.com/its7171/hpa-mask
- https://www.kaggle.com/its7171/mmdetection-for-segmentation-training/

In [None]:
# Install pycocotools from a wheel file under ../input (a local path, not a package index).
# The wheel's file name is tagged cp37 / linux_x86_64.
!pip install -q "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"

In [None]:
from fastai.vision.all import *
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Directory holding one `<image_id>.npz` cell-mask file per image.
cell_dir = '../input/hpa-cell-masks-test-dataset/work/cell_masks'

# Competition data directory.
path = Path('../input/hpa-single-cell-image-classification')

# Sample submission table; its ID column lists the images processed below.
df = pd.read_csv(path / 'sample_submission.csv')

In [None]:
# Sub-directory of ROOT that the channel images are read from.
train_or_test = 'test'

# Base directory of the competition data (used by read_img to build file names).
ROOT = '../input/hpa-single-cell-image-classification/'

In [None]:
# Preview the first rows of the sample submission table.
df.head()

In [None]:
def get_cropped_cell(img, msk):
    """Cut one cell out of an image and blank everything outside its mask.

    The result is the tight bounding box of the mask, with every pixel that
    lies outside the mask set to zero.

    Args:
        img: image array indexed as (row, column[, channel]).
        msk: 2-D mask with the same height/width as ``img``; any non-zero
            entry is treated as "inside the cell".

    Returns:
        The cropped, masked image. The dtype of ``img`` is preserved (a
        boolean mask does not upcast it).

    Raises:
        ValueError: if the mask contains no set pixel.
    """
    mask = np.asarray(msk).astype(bool)

    # Rows / columns that contain at least one mask pixel give the bounding box.
    rows = np.flatnonzero(mask.any(axis=1))
    cols = np.flatnonzero(mask.any(axis=0))
    if rows.size == 0:
        raise ValueError('get_cropped_cell received an empty mask')

    top, bottom = rows[0], rows[-1] + 1
    left, right = cols[0], cols[-1] + 1

    # Crop first, then mask: only the bounding box is multiplied instead of
    # the whole image, which matters when this is called once per cell.
    window = mask[top:bottom, left:right]
    if img.ndim == 3:
        # Add a channel axis so the mask broadcasts over all channels.
        window = window[..., None]
    return img[top:bottom, left:right] * window

In [None]:
def get_stats(cropped_cell):
    """Return per-channel first and second moments of a 3-channel image.

    Values are divided by 255.0 before averaging, and the average runs over
    every pixel of the array.

    Returns:
        A pair ``(mean, mean_of_squares)``, each with one entry per channel.
    """
    pixels = (cropped_cell / 255.0).reshape(-1, 3)
    first_moment = pixels.mean(axis=0)
    second_moment = (pixels ** 2).mean(axis=0)
    return first_moment, second_moment

In [None]:
def read_img(image_id, color, train_or_test='test', image_size=None):
    """Load a single channel image of one sample.

    The file read is ``{ROOT}/{train_or_test}/{image_id}_{color}.png``.

    Args:
        image_id: image identifier used as the file-name prefix.
        color: channel suffix of the file name (e.g. "red", "green", "blue").
        train_or_test: sub-directory of ROOT to read from.
        image_size: if given, the image is resized to
            ``(image_size, image_size)``.

    Returns:
        The image array. If its maximum exceeds 255 it is rescaled and
        returned as uint8; otherwise it is returned as loaded.

    Raises:
        FileNotFoundError: if the file does not exist.
        OSError: if the file exists but OpenCV cannot decode it.
    """
    filename = f'{ROOT}/{train_or_test}/{image_id}_{color}.png'
    # Raise explicitly rather than assert: asserts are stripped under -O.
    if not os.path.exists(filename):
        raise FileNotFoundError(f'not found {filename}')

    img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
    if img is None:
        # cv2.imread signals a decode failure by returning None.
        raise OSError(f'could not decode {filename}')

    if image_size is not None:
        img = cv2.resize(img, (image_size, image_size))

    if img.max() > 255:
        # Same /255 scaling as before, but saturated at 255: without the clip,
        # values >= 65280 give a quotient of 256 or 257, which overflows the
        # uint8 cast and turns the brightest pixels nearly black.
        img = np.clip(img / 255, 0, 255).astype('uint8')
    return img

In [None]:
import base64
import numpy as np
from pycocotools import _mask as coco_mask
import typing as t
import zlib


def encode_binary_mask(mask: np.ndarray) -> t.Text:
  """Converts a binary mask into OID challenge encoding ascii text.

  Args:
    mask: boolean array that is 2-dimensional once singleton axes are squeezed.

  Returns:
    ASCII text: the base64 encoding of the zlib-compressed COCO RLE counts.

  Raises:
    ValueError: if the dtype is not boolean or the squeezed mask is not 2-d.
  """

  # check input mask --
  # Compare against the builtin `bool`; the `np.bool` alias no longer exists
  # in current NumPy releases and would raise AttributeError here.
  if mask.dtype != bool:
    raise ValueError(
        "encode_binary_mask expects a binary mask, received dtype == %s" %
        mask.dtype)

  mask = np.squeeze(mask)
  if len(mask.shape) != 2:
    # The shape is a tuple: wrap it so `%` formats it as one value instead of
    # unpacking it (which raised TypeError rather than this ValueError).
    raise ValueError(
        "encode_binary_mask expects a 2d mask, received shape == %s" %
        (mask.shape,))

  # convert input mask to expected COCO API input --
  mask_to_encode = mask.reshape(mask.shape[0], mask.shape[1], 1)
  mask_to_encode = mask_to_encode.astype(np.uint8)
  mask_to_encode = np.asfortranarray(mask_to_encode)

  # RLE encode mask --
  encoded_mask = coco_mask.encode(mask_to_encode)[0]["counts"]

  # compress and base64 encoding --
  binary_str = zlib.compress(encoded_mask, zlib.Z_BEST_COMPRESSION)
  base64_str = base64.b64encode(binary_str)
  return base64_str.decode('ascii')

In [None]:
# Imported explicitly so this cell does not depend on a wildcard import
# happening to provide the module.
import zipfile

# Per-cell first / second moments (one entry per extracted cell).
x_tot,x2_tot = [],[]
lbls = []
num_files = len(df)
# One metadata record per extracted cell; becomes cell_df.csv.
all_cells = []
cell_mask_dir = '../input/hpa-cell-masks-test-dataset/work/cell_masks'

with zipfile.ZipFile('cells.zip', 'w') as img_out:

    for image_id in tqdm(df.ID, total=num_files):
        # Close the .npz archive as soon as the mask array has been read.
        with np.load(f'{cell_mask_dir}/{image_id}.npz') as mask_file:
            cell_mask = mask_file['arr_0']
        red = read_img(image_id, "red", train_or_test, None)
        green = read_img(image_id, "green", train_or_test, None)
        blue = read_img(image_id, "blue", train_or_test, None)
        # Channels are stacked in B, G, R order, which is the order
        # cv2.imencode expects; the yellow channel is not used.
        stacked_image = np.stack([blue, green, red], axis=-1)

        # Cells are labelled 1..max in the mask; 0 is background.
        for j in range(1, int(cell_mask.max()) + 1):
            bmask = (cell_mask == j)
            if not bmask.any():
                # Label value absent from this mask: nothing to extract.
                continue
            enc = encode_binary_mask(bmask)
            cropped_cell = get_cropped_cell(stacked_image, bmask)
            fname = f'{image_id}_{j}.jpg'
            ok, im = cv2.imencode('.jpg', cropped_cell)
            if not ok:
                raise RuntimeError(f'could not JPEG-encode {fname}')
            img_out.writestr(fname, im.tobytes())
            # Stats follow the channel order of stacked_image: B, G, R.
            x, x2 = get_stats(cropped_cell)
            x_tot.append(x)
            x2_tot.append(x2)
            all_cells.append({
                'image_id': image_id,
                'fname': fname,
                # Index 2 is red and index 0 is blue (B, G, R order);
                # these two columns were previously swapped.
                'r_mean': x[2],
                'g_mean': x[1],
                'b_mean': x[0],
                'cell_id': j,
                'size1': cropped_cell.shape[0],
                'size2': cropped_cell.shape[1],
                'enc': enc,
            })

#image stats
# Averages of the per-cell moments (each cell counts equally, regardless of
# its size); std uses Var = E[x^2] - E[x]^2.
# NOTE: both arrays are in B, G, R channel order.
img_avr =  np.array(x_tot).mean(0)
img_std =  np.sqrt(np.array(x2_tot).mean(0) - img_avr**2)
cell_df = pd.DataFrame(all_cells)
cell_df.to_csv('cell_df.csv', index=False)
print('mean:',img_avr, ', std:', img_std)

In [None]:
# Preview the first rows of the per-cell metadata table.
cell_df.head()

In [None]:
# List the files in the current working directory with sizes shown in megabytes.
!ls -l --block-size=M

In [None]:
# Histogram (100 bins) of the g_mean column across all extracted cells.
cell_df.g_mean.hist(bins=100);

In [None]:
# Histogram (100 bins) of the r_mean column across all extracted cells.
cell_df.r_mean.hist(bins=100);

In [None]:
# Histogram (100 bins) of the b_mean column across all extracted cells.
cell_df.b_mean.hist(bins=100);