In [None]:
import os
import cv2
import skimage.io
from tqdm.notebook import tqdm
import zipfile
import numpy as np
import pandas as pd

In [None]:
# Input folders: slide images are read as <id>.tiff, label masks as <id>_mask.tiff.
TRAIN = '../input/prostate-cancer-grade-assessment/train_images/'
MASKS = '../input/prostate-cancer-grade-assessment/train_label_masks/'
# Output archives that receive the PNG-encoded image tiles and mask tiles.
OUT_TRAIN = 'train.zip'
OUT_MASKS = 'masks.zip'
# Side length, in pixels, of each square tile.
sz = 224
# Number of tiles kept per slide.
N = 24

Some notes on the function below. The bullets follow the function's steps in order, starting with the padding computation at the top of the function.
* finds the values we should pad h & w by so that our dims are multiples of our sz
* pads our images and masks with the chosen constant values (255 for images, 0 for masks)
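* reshapes our images into an array of 7 x 128 x 8 x 128 x 3 if img was 896x1024x3 (this example uses a tile size of 128; the notebook itself sets sz = 224)
* swaps the order so that, in our example, the dims become 7x8x128x128x3, and reshapes so that the example dim becomes 56x128x128x3. In our example the 56 tiles are ordered row by row: all 8 tiles of the first tile-row, then all 8 of the second tile-row, and so on.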
* same process for our masks
* if our img or mask has less than N tiles we pad
* three things occur in this one line: we reshape our img so that in our example it would have a dim of 56 x (128x128x3); we sum along the last dimension, giving 56 values; and we sort those values in ascending order and keep the indices of the N smallest. Darker tiles have smaller sums, so these are taken to be the tiles with the most tissue. In the subsequent lines we select those N tiles (N = 24 in this notebook).
* The function returns a list of N dictionaries; each dictionary contains an array describing an image tile, the corresponding mask array, and the index of the tile in the ordering of tiles by tissue content (most tissue first).

In [None]:
def tile(img, mask, tile_size=None, n_tiles=None):
    """Cut an image and its mask into square tiles and keep the darkest ones.

    Both arrays are padded so their height and width become multiples of the
    tile size (image with 255, mask with 0), split into non-overlapping tiles
    ordered row by row, and the ``n_tiles`` tiles whose image pixels have the
    smallest sum are returned. If fewer than ``n_tiles`` tiles exist, extra
    tiles filled with 255 (image) and 0 (mask) are appended first.

    Args:
        img: array of shape (height, width, channels).
        mask: array of shape (height, width, channels). It is padded with the
            amounts computed from ``img``, so it is expected to share the
            image's height and width.
        tile_size: side length of each tile; defaults to the module-level ``sz``.
        n_tiles: number of tiles to return; defaults to the module-level ``N``.

    Returns:
        A list of ``n_tiles`` dicts with keys ``'img'`` (image tile), ``'mask'``
        (matching mask tile) and ``'idx'`` (rank of the tile, 0 being the tile
        with the smallest pixel sum).
    """
    tile_size = sz if tile_size is None else tile_size
    n_tiles = N if n_tiles is None else n_tiles

    # Padding needed to reach the next multiple of tile_size, split between
    # the two sides of each spatial axis; the channel axis is left untouched.
    height, width = img.shape[0], img.shape[1]
    pad0 = (tile_size - height % tile_size) % tile_size
    pad1 = (tile_size - width % tile_size) % tile_size
    spatial_pad = [[pad0 // 2, pad0 - pad0 // 2], [pad1 // 2, pad1 - pad1 // 2], [0, 0]]
    img = np.pad(img, spatial_pad, constant_values=255)
    mask = np.pad(mask, spatial_pad, constant_values=0)

    img = _split_into_tiles(img, tile_size)
    mask = _split_into_tiles(mask, tile_size)

    # Top up with blank tiles when the slide yields fewer tiles than requested.
    n_missing = n_tiles - len(img)
    if n_missing > 0:
        blank_pad = [[0, n_missing], [0, 0], [0, 0], [0, 0]]
        mask = np.pad(mask, blank_pad, constant_values=0)
        img = np.pad(img, blank_pad, constant_values=255)

    # Rank tiles by total pixel intensity (ascending) and keep the first n_tiles.
    keep = np.argsort(img.reshape(img.shape[0], -1).sum(-1))[:n_tiles]
    img = img[keep]
    mask = mask[keep]
    return [{'img': img[i], 'mask': mask[i], 'idx': i} for i in range(len(img))]


def _split_into_tiles(arr, tile_size):
    """Reshape a (H, W, C) array, H and W multiples of tile_size, into (n, tile_size, tile_size, C)."""
    rows = arr.shape[0] // tile_size
    cols = arr.shape[1] // tile_size
    channels = arr.shape[2]
    grid = arr.reshape(rows, tile_size, cols, tile_size, channels)
    return grid.transpose(0, 2, 1, 3, 4).reshape(-1, tile_size, tile_size, channels)

In [None]:
import matplotlib.pyplot as plt
# Image ids are the mask file names with the '_mask.tiff' suffix removed.
# Entries without that suffix are skipped, and the ids are sorted because
# os.listdir returns entries in arbitrary order.
MASK_SUFFIX = '_mask.tiff'
names = sorted(
    fname[:-len(MASK_SUFFIX)]
    for fname in os.listdir(MASKS)
    if fname.endswith(MASK_SUFFIX)
)
# Small subset used for the smoke test of the tiling function.
test_names = names[0:10]
def img_load(name):
    """Open the slide <name>.tiff under TRAIN and return its second-to-last MultiImage entry.

    NOTE(review): the second-to-last entry is presumably a lower-resolution
    level of the slide -- confirm against the dataset.
    """
    tiff_path = os.path.join(TRAIN, name + '.tiff')
    levels = skimage.io.MultiImage(tiff_path)
    return levels[-2]
def mask_load(name):
    """Open the label mask <name>_mask.tiff under MASKS and return its second-to-last MultiImage entry.

    NOTE(review): the second-to-last entry is presumably a lower-resolution
    level of the mask -- confirm against the dataset.
    """
    tiff_path = os.path.join(MASKS, name + '_mask.tiff')
    levels = skimage.io.MultiImage(tiff_path)
    return levels[-2]

# Smoke test: load the sample slides and their masks, then tile the first pair.
images = list(map(img_load, test_names))
masks = list(map(mask_load, test_names))
img, mask = images[0], masks[0]
tiled_test = tile(img, mask)
# tiled_test is a list of N dictionaries; each dictionary has an 'img' array, a 'mask' array and an 'idx'.
# Leftover scratch snippet: num_ones = (msk == 1).sum()
# We are going to need the train CSV to load masks in different ways according to data_provider.
# We can make use of all Radboudumc examples, as they have labels by Gleason score; we can also make
# use of any Karolinska example with only one Gleason score, e.g. 3+3, 4+4, etc.


Radboudumc: Prostate glands are individually labelled. Valid values are:

0: background (non tissue) or unknown
1: stroma (connective tissue, non-epithelium tissue)
2: healthy (benign) epithelium
3: cancerous epithelium (Gleason 3)
4: cancerous epithelium (Gleason 4)
5: cancerous epithelium (Gleason 5)

Karolinska: Regions are labelled. Valid values:
0: background (non tissue) or unknown
1: benign tissue (stroma and epithelium combined)
2: cancerous tissue (stroma and epithelium combined)

The label masks of Radboudumc were semi-automatically generated by several deep learning algorithms, contain noise, and can be considered as weakly-supervised labels. The label masks of Karolinska were semi-automatically generated based on annotations by a pathologist.

Notes for the below cell in similar format to above.
* we first instantiate our lists of tile information.
* we access image id's by ignoring the last 10 characters of mask file names
* set the distinct zip files we will write to
* Use tqdm to create a progress bar
* load our image and masks as array using skimage
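* apply our tile function above and obtain our list of dictionaries, each of which describes a tile and its corresponding mask.
* for each tile we calculate the per-channel mean of the pixel values and of the squared pixel values, and append them to our lists of tile info; the dataset mean and std are derived from these afterwards.
* we encode each tile as PNG with OpenCV, converting RGB to BGR first because OpenCV expects BGR input; the saved file then reads back as RGB with PIL.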
* create same file name for tile by id and idx for both imgs and masks

In [None]:
# Per-tile channel means of the pixel values and of their squares (both scaled
# to [0, 1]); the stats cell turns these into dataset-wide mean and std.
x_tot,x2_tot = [],[]
with zipfile.ZipFile(OUT_TRAIN, 'w') as img_out,\
 zipfile.ZipFile(OUT_MASKS, 'w') as mask_out:
    for name in tqdm(names):
        img = skimage.io.MultiImage(os.path.join(TRAIN,name+'.tiff'))[-2]
        mask = skimage.io.MultiImage(os.path.join(MASKS,name+'_mask.tiff'))[-2]
        for t in tile(img,mask):
            tile_img,tile_mask,idx = t['img'],t['mask'],t['idx']
            scaled = tile_img/255.0
            x_tot.append(scaled.reshape(-1,3).mean(0))
            x2_tot.append((scaled**2).reshape(-1,3).mean(0))
            # Tiles are stored as raw pixel values. Mean/std normalization
            # yields fractional and negative numbers that neither the integer
            # tile array nor an 8-bit PNG can hold, so writing it back into the
            # tile before encoding corrupts the image. Normalize when the tiles
            # are loaded for training, using the statistics collected above.
            # cv2.imencode expects BGR channel order, so convert from RGB first;
            # the stored PNG then reads back as RGB with PIL.
            img_png = cv2.imencode('.png',cv2.cvtColor(tile_img, cv2.COLOR_RGB2BGR))[1]
            img_out.writestr(f'{name}_{idx}.png', img_png)
            # Only the first mask channel is written.
            mask_png = cv2.imencode('.png',tile_mask[:,:,0])[1]
            mask_out.writestr(f'{name}_{idx}.png', mask_png)

In [None]:
# Image stats: dataset-wide per-channel mean and standard deviation computed
# from the per-tile accumulators, using std = sqrt(E[x^2] - E[x]^2).
img_avr =  np.array(x_tot).mean(0)
img_std =  np.sqrt(np.array(x2_tot).mean(0) - img_avr**2)
# img_std is already a standard deviation; taking another square root here
# would report the wrong value.
print('mean:',img_avr, ', std:', img_std)