In [None]:
# Star import stays first so the explicit imports below win on any name clash.
from fastai.vision.all import *

import os
import zipfile
from shutil import copyfile

import cv2
import numpy as np
import pandas as pd
from scipy import ndimage
from tqdm import tqdm

In [None]:
# Root folder of the competition dataset, as a Path object.
path = Path('../input/hpa-single-cell-image-classification')

# Image-level training table; the cells below read its ID and Label columns.
df = pd.read_csv(path.joinpath('train.csv'))

In [None]:
# Dataset root as a plain string, used for f-string path building in read_img.
# It already ends in '/', and read_img adds another '/', so the built paths
# contain a double slash.
ROOT = '../input/hpa-single-cell-image-classification/'
# Name of the sub-folder under ROOT that images are read from.
train_or_test = 'train'

In [None]:
# Draw a reproducible 1500-row subset (fixed seed, no replacement) and
# renumber it 0..1499 so positional slicing below is unambiguous.
dfs = (
    df.sample(n=1500, replace=False, random_state=58)
      .reset_index(drop=True)
)

# First 1000 sampled rows are the training split, the remaining 500 validation.
dfs_train = dfs.iloc[:1000]
dfs_val = dfs.iloc[1000:].reset_index(drop=True)

In [None]:
# Preview the first rows of the sampled table as a sanity check.
dfs.head()

In [None]:
def read_img(image_id, color, train_or_test='train', image_size=None):
    """Load one colour channel of a sample as an 8-bit image.

    Parameters
    ----------
    image_id : str
        Sample identifier; the file read is
        ``{ROOT}/{train_or_test}/{image_id}_{color}.png``.
    color : str
        Channel suffix in the file name (the notebook uses "red", "green",
        "blue" and "yellow").
    train_or_test : str, default 'train'
        Sub-folder of ``ROOT`` to read from.
    image_size : int or None, default None
        When given, the image is resized to ``image_size x image_size``.

    Returns
    -------
    numpy.ndarray
        The image as stored on disk, or rescaled to ``uint8`` when its
        maximum exceeds 255.

    Raises
    ------
    AssertionError
        If the file does not exist.
    IOError
        If the file exists but OpenCV cannot decode it.
    """
    filename = f'{ROOT}/{train_or_test}/{image_id}_{color}.png'
    assert os.path.exists(filename), f'not found {filename}'
    img = cv2.imread(filename, cv2.IMREAD_UNCHANGED)
    if img is None:
        # cv2.imread signals a decode failure by returning None, not by raising.
        raise IOError(f'could not decode {filename}')
    if image_size is not None:
        img = cv2.resize(img, (image_size, image_size))
    if img.max() > 255:
        # Values above 255 are treated as 16-bit data (assumes a 0..65535
        # range). 65535 / 257 == 255 exactly, so the full range maps onto
        # 0..255. Dividing by 255 instead yields 256/257 for the brightest
        # pixels, which overflows uint8 and turns them nearly black.
        img = (img / 257).astype('uint8')
    return img

In [None]:
# Export the sampled subset: copy each sample's cell-mask file into ./masks,
# store every channel image as a JPEG inside hpa_sample.zip, and write the
# train/validation tables as CSV.
num_files = len(dfs)
cell_mask_dir = '../input/hpa-mask/hpa_cell_mask'

# exist_ok keeps the cell re-runnable without a separate existence check.
os.makedirs('masks', exist_ok=True)

channels = ["red", "green", "blue", "yellow"]
with zipfile.ZipFile('hpa_sample.zip', 'w') as img_out:

    for image_id in tqdm(dfs['ID'], total=num_files):
        fname_mask = f'masks/{image_id}.npz'
        copyfile(f'{cell_mask_dir}/{image_id}.npz', fname_mask)

        for chan in channels:
            curr_img = read_img(image_id, chan, train_or_test, None)
            fname = f'{image_id}_{chan}.jpg'

            # imencode returns (success, buffer); a failed encode must not be
            # written into the archive as an empty or partial entry.
            encoded_ok, im = cv2.imencode('.jpg', curr_img)
            if not encoded_ok:
                raise RuntimeError(f'JPEG encoding failed for {fname}')
            # writestr expects bytes (or str), so hand it the raw buffer
            # instead of the numpy array itself.
            img_out.writestr(fname, im.tobytes())

dfs_train.to_csv('train_sample.csv', index=False)
dfs_val.to_csv('val_sample.csv', index=False)


In [None]:
# sourced from https://stackoverflow.com/questions/1855095/how-to-create-a-zip-archive-of-a-directory-in-python
def zipdir(path, ziph):
    """Add every file found under ``path`` (recursively) to an open archive.

    Archive names are computed relative to the parent of ``path``, so the
    final component of ``path`` is kept as the top-level folder in the zip.

    Parameters
    ----------
    path : str
        Directory to walk.
    ziph : zipfile.ZipFile
        Archive handle opened for writing; it is not closed here.
    """
    parent_dir = os.path.join(path, '..')
    for current_dir, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            full_path = os.path.join(current_dir, filename)
            ziph.write(full_path, os.path.relpath(full_path, parent_dir))
            
# Pack the copied mask files into one deflate-compressed archive. The context
# manager closes the archive even if zipdir raises part-way through.
with zipfile.ZipFile('hpa_sample_masks.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipdir('masks/', zipf)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
ROOT = '../input/hpa-single-cell-image-classification/'
OUT = '../output/kaggle/working/'
train_or_test = 'train'
label_colors = []
n_labels = 19
threshold = 10
cats_per_col = np.ceil((1+n_labels)**(1/3))
rgb_val_per = np.floor(255/(cats_per_col-1))
for ix in range(1,1+n_labels):
    r = np.floor(ix/(cats_per_col**2))
    g = np.floor((ix / cats_per_col))%cats_per_col
    b = np.floor((ix % cats_per_col) % cats_per_col)
    label_colors.append([r*rgb_val_per,g*rgb_val_per,b*rgb_val_per])
for ix in range(1):
    image_id = df.iloc[ix].ID
    labels = df.iloc[ix].Label.split("|")
    img = []
    for color in ['blue','red','green','yellow']:
        filename = f'{ROOT}/{train_or_test}/{image_id}_{color}.png'
        if os.path.exists(filename):
            img.append(cv2.imread(filename, cv2.IMREAD_UNCHANGED))
    img = np.array(img).max(axis=0)
    img = ndimage.median_filter(img,5)
    img[img < threshold] = 0
    img[img >= threshold] = 1
    img = np.expand_dims(img, axis=2)
    for label in labels:
        label_color = np.expand_dims(np.array(label_colors[int(label)]), axis=0)
        mask_im = np.matmul(img, label_color)
        cv2.imwrite(f'{OUT}/{image_id}.png', img)
    