# Creating a prototyping dataset with individual cells

My full solution is described here: https://www.kaggle.com/c/hpa-single-cell-image-classification/discussion/221550

As input to the classification model I need images of individual cells. For experimentation I don't need all the images; instead, I create a sample from the train set. An additional benefit is that my sample is more balanced than the full train set. I use the RGB channels only, which proved to work well in the previous HPA challenge. I save the extracted cells as RGB jpg images so that I can feed them easily into my classifier.

Acknowledgements - this uses the dataset and some code by @its7171 (please upvote!):
- https://www.kaggle.com/its7171/hpa-mask
- https://www.kaggle.com/its7171/mmdetection-for-segmentation-training/

In [None]:
from fastai.vision.all import *
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**#FILTER**

In [None]:
# Table of duplicate train images; its 'image2' column is used below to drop rows from train.
dfd = pd.read_csv('../input/hpa-duplicate-images-in-train/hpa_duplicates.csv')

In [None]:
# Preview the first rows of the duplicates table.
dfd.head()

In [None]:
# Competition data folder; `Path` is expected to be provided by the fastai star import.
path = Path('../input/hpa-single-cell-image-classification')
# Image-level training table; the 'ID' and 'Label' columns are used below.
df = pd.read_csv(path/'train.csv')
# Folder with cell masks. NOTE(review): not referenced again in the visible cells;
# the extraction loop defines its own `cell_mask_dir` with the same value.
cell_dir = '../input/hpa-mask/hpa_cell_mask'

In [None]:
# Remove every train row whose ID appears in the 'image2' column of the
# duplicates table. A single vectorised membership test replaces the former
# per-row loop, which rescanned and copied `df` once per duplicate and relied
# on `dfd` having default integer index labels. Remaining rows keep their
# original index labels, exactly as `drop` left them.
df = df[~df['ID'].isin(dfd['image2'])]

In [None]:
# Base folder that `read_img` prefixes to every channel image path.
ROOT = '../input/hpa-single-cell-image-classification/'
# Sub-folder of ROOT to read images from; passed to `read_img` in the extraction loop.
train_or_test = 'train'

In [None]:
# Class ids '0'..'18' as strings, matching the tokens of the '|'-separated Label column.
labels = list(map(str, range(19)))


def _has_label(label_string, wanted):
    """Return 1 if `wanted` is one of the '|'-separated tokens of `label_string`, else 0."""
    return int(wanted in label_string.split('|'))


# Add one 0/1 indicator column per class, named after the class id.
for x in labels:
    df[x] = df['Label'].apply(lambda r: _has_label(r, x))

In [None]:
# Display the train table with the per-class indicator columns added above.
df

In [None]:
def _draw(selection, **sample_kwargs):
    """Sample the rows of `df` picked by the boolean `selection` and renumber them from 0.

    The seed is fixed (random_state=42) so the prototype set is reproducible.
    `sample_kwargs` is forwarded to `DataFrame.sample` (`n=...` or `frac=...`);
    with `n=...` pandas raises ValueError if fewer than `n` rows are selected.
    """
    return df[selection].sample(random_state=42, **sample_kwargs).reset_index(drop=True)


def _exactly(lbl):
    """Boolean mask of rows whose Label string is exactly `lbl` (single-label images)."""
    return df['Label'] == lbl


def _contains(lbl):
    """Boolean mask of rows whose indicator column `lbl` is 1 (class present, maybe with others)."""
    return df[lbl] == 1


# Per-class draws. `frac=1` keeps every matching row (shuffled); `n=...` caps the class.
dfs_0 = _draw(_exactly('0'), n=300)
dfs_1 = _draw(_contains('1'), n=400)
dfs_1u = _draw(_exactly('1'), frac=1)
dfs_2 = _draw(_exactly('2'), frac=1)
dfs_3 = _draw(_exactly('3'), n=500)
dfs_4 = _draw(_exactly('4'), n=500)
dfs_5 = _draw(_exactly('5'), n=500)
dfs_6 = _draw(_contains('6'), n=600)
dfs_7 = _draw(_exactly('7'), n=500)
dfs_8 = _draw(_exactly('8'), n=500)
dfs_9 = _draw(_contains('9'), n=400)
dfs_9u = _draw(_exactly('9'), n=200)
dfs_10 = _draw(_contains('10'), n=400)
dfs_10u = _draw(_exactly('10'), n=200)
# Previously drawn with replace=True (plus a redundant second reset_index).
# The concatenated frame is de-duplicated afterwards, so sampling with
# replacement only threw away a random part of this class; take every row once.
dfs_11 = _draw(_contains('11'), frac=1)
dfs_12 = _draw(_exactly('12'), frac=1)
dfs_13 = _draw(_exactly('13'), n=400)
dfs_14 = _draw(_exactly('14'), n=500)
# Class 15 is taken whole, in its original row order (no sampling).
dfs_15 = df[_contains('15')].reset_index(drop=True)
dfs_16 = _draw(_exactly('16'), n=350)
dfs_17 = _draw(_contains('17'), frac=1)
dfs_18 = _draw(_contains('18'), frac=1)

# All per-class samples, in the order they are concatenated later.
dfs_ = [dfs_0, dfs_1, dfs_1u, dfs_2, dfs_3, dfs_4, dfs_5, dfs_6, dfs_7, dfs_8, dfs_9, dfs_9u, dfs_10, dfs_10u,
        dfs_11, dfs_12, dfs_13, dfs_14, dfs_15, dfs_16, dfs_17, dfs_18]

In [None]:
# Stack the per-class samples into one frame, then keep a single copy of any
# image that was drawn by more than one class; the index is renumbered 0..n-1.
dfs = pd.concat(dfs_, ignore_index=True)
dfs = dfs.drop_duplicates(ignore_index=True)
# Number of distinct images in the prototype set.
len(dfs)

In [None]:
# Per-class image counts in the sampled set `dfs`.

# unique_count: images whose Label string is exactly that one class.
unique_counts = {lbl: int((dfs['Label'] == lbl).sum()) for lbl in labels}

# full_count: images whose '|'-separated Label contains the class.
# The split is done once instead of once per class.
split_labels = [row_label.split('|') for row_label in dfs['Label']]
full_counts = {lbl: sum(lbl in parts for parts in split_labels) for lbl in labels}

# Sort by full_count, largest first (stable, so ties keep class order).
# The DataFrame is built straight from the tuples: routing them through
# np.array would coerce the integer counts to strings.
counts = sorted(
    ((lbl, full_counts[lbl], unique_counts[lbl]) for lbl in labels),
    key=lambda row: -row[1],
)
counts = pd.DataFrame(counts, columns=['label', 'full_count', 'unique_count'])
# Display with one column per class.
counts.set_index('label').T

In [None]:
def get_cropped_cell(img, msk):
    """Cut one cell out of an image and zero everything outside its mask.

    Args:
        img: array indexed as (row, column, channel).
        msk: 2-D array with the same rows/columns as `img`; non-zero entries
            mark the cell.

    Returns:
        The tight bounding box of the non-zero mask entries, taken from `img`
        and multiplied by the integer mask, so pixels outside the cell are 0.
        The result has the dtype of `img * msk.astype(int)`.

    Raises:
        ValueError: if `msk` has no non-zero entry.
    """
    mask = np.asarray(msk).astype(int)
    rows = np.flatnonzero(mask.any(axis=1))
    cols = np.flatnonzero(mask.any(axis=0))
    if rows.size == 0:
        raise ValueError('cell mask is empty; nothing to crop')
    top, bottom = rows[0], rows[-1] + 1
    left, right = cols[0], cols[-1] + 1
    # Crop first, then mask: multiplying the full image by the mask allocated
    # a full-resolution integer temporary for every single cell.
    return img[top:bottom, left:right] * mask[top:bottom, left:right, None]

In [None]:
def get_stats(cropped_cell):
    """Return per-channel first and second moments of a 3-channel cell crop.

    The array is divided by 255.0 and flattened to rows of 3 values; the
    averages run over every pixel of the array, including zeroed ones.

    Returns:
        (mean, mean_of_squares), each an array of 3 values in the channel
        order of `cropped_cell`.
    """
    scaled = cropped_cell / 255.0
    first_moment = scaled.reshape(-1, 3).mean(axis=0)
    second_moment = (scaled ** 2).reshape(-1, 3).mean(axis=0)
    return first_moment, second_moment

In [None]:
def read_img(image_id, color, train_or_test='train', image_size=None):
    """Load one channel image of a sample as an 8-bit array.

    Args:
        image_id: sample ID; the file read is
            `{ROOT}/{train_or_test}/{image_id}_{color}.png`.
        color: channel suffix in the file name (e.g. "red", "green", "blue").
        train_or_test: sub-folder of ROOT to read from.
        image_size: if given, the image is resized to
            (image_size, image_size); otherwise its native size is kept.

    Returns:
        The image as loaded by OpenCV; if any value exceeds 255 it is
        rescaled to the 0..255 range and returned as uint8.

    Raises:
        AssertionError: if the file does not exist.
        ValueError: if OpenCV cannot decode the file.
    """
    filename = f'{ROOT}/{train_or_test}/{image_id}_{color}.png'
    assert os.path.exists(filename), f'not found {filename}'
    img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
    # cv2.imread returns None instead of raising; this also covers a missing
    # file when assertions are disabled (python -O).
    if img is None:
        raise ValueError(f'could not decode {filename}')
    if image_size is not None:
        img = cv2.resize(img, (image_size, image_size))
    if img.max() > 255:
        # 16-bit data: 65535 // 257 == 255. The former `img/255` produced
        # values up to 257, which overflow uint8 and turned the brightest
        # pixels into 0 or 1.
        img = (img // 257).astype('uint8')
    return img

In [None]:
# List the distinct Label strings present in the sampled set.
dfs.Label.unique()

In [None]:
# Create the output folder; exist_ok lets this cell be re-run without FileExistsError.
os.makedirs('/kaggle/working/cells', exist_ok=True)

In [None]:
# Extract every cell of every sampled image, store it as a JPEG inside
# cells.zip, and collect per-cell metadata plus running channel statistics.
import zipfile  # explicit import instead of relying on the fastai star import

x_tot, x2_tot = [], []
lbls = []
num_files = len(dfs)
all_cells = []
cell_mask_dir = '../input/hpa-mask/hpa_cell_mask'

with zipfile.ZipFile('cells.zip', 'w') as img_out:

    # `image_labels` (not `labels`) so the module-level list of class ids is
    # not overwritten by a single Label string.
    for image_id, image_labels in tqdm(zip(dfs.ID, dfs.Label), total=num_files):
        # Close the .npz archive as soon as the mask array has been read.
        with np.load(f'{cell_mask_dir}/{image_id}.npz') as mask_file:
            cell_mask = mask_file['arr_0']
        red = read_img(image_id, "red", train_or_test, None)
        green = read_img(image_id, "green", train_or_test, None)
        blue = read_img(image_id, "blue", train_or_test, None)
        # Channels are stacked blue, green, red: the order cv2.imencode
        # expects, so the saved JPEG decodes as a normal RGB picture.
        stacked_image = np.transpose(np.array([blue, green, red]), (1, 2, 0))

        # Iterate over the ids actually present in the mask. The former
        # range(1, max) never reached the highest id, silently dropping one
        # cell per image. Id 0 is skipped as before (presumably background).
        cell_ids = np.unique(cell_mask)
        for cell in cell_ids[cell_ids > 0]:
            cell = int(cell)
            bmask = cell_mask == cell
            cropped_cell = get_cropped_cell(stacked_image, bmask)
            fname = f'{image_id}_{cell}.jpg'
            # Encode explicitly as 8-bit; masking may have widened the dtype.
            ok, im = cv2.imencode('.jpg', cropped_cell.astype(np.uint8))
            if not ok:
                raise RuntimeError(f'JPEG encoding failed for {fname}')
            img_out.writestr(fname, im.tobytes())
            x, x2 = get_stats(cropped_cell)
            # get_stats follows the array's B,G,R channel order; flip to
            # R,G,B so that r_mean/b_mean below and the summary statistics
            # match the channel order of the saved images.
            x, x2 = x[::-1], x2[::-1]
            x_tot.append(x)
            x2_tot.append(x2)
            all_cells.append({
                'image_id': image_id,
                'r_mean': x[0],
                'g_mean': x[1],
                'b_mean': x[2],
                'cell_id': cell,
                'image_labels': image_labels,
                'size1': cropped_cell.shape[0],
                'size2': cropped_cell.shape[1],
            })

# Image stats in R,G,B order: average of the per-cell means, and the standard
# deviation derived from the averaged second moments.
img_avr = np.array(x_tot).mean(0)
img_std = np.sqrt(np.array(x2_tot).mean(0) - img_avr**2)
cell_df = pd.DataFrame(all_cells)
cell_df.to_csv('cell_df.csv', index=False)
print('mean:',img_avr, ', std:', img_std)

In [None]:
# Preview the first rows of the per-cell metadata table.
cell_df.head()

In [None]:
!ls -l --block-size=M

In [None]:
# Histogram (100 bins) of the per-cell values stored in the g_mean column.
cell_df.g_mean.hist(bins=100)

In [None]:
# Histogram (100 bins) of the per-cell values stored in the r_mean column.
cell_df.r_mean.hist(bins=100)

In [None]:
# Histogram (100 bins) of the per-cell values stored in the b_mean column.
cell_df.b_mean.hist(bins=100)