# Introduction

This notebook is intended to demonstrate the use of PyTables for organizing and preparing PyTorch datasets. 

Useful Links:
- [http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html](http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html)
- [https://github.com/choosehappy/PytorchDigitalPathology/blob/master/classification_lymphoma_densenet/make_hdf5.ipynb](https://github.com/choosehappy/PytorchDigitalPathology/blob/master/classification_lymphoma_densenet/make_hdf5.ipynb)
- [http://www.andrewjanowczyk.com/digital-pathology-classification-using-pytorch-densenet/](http://www.andrewjanowczyk.com/digital-pathology-classification-using-pytorch-densenet/)



For the purposes of this demonstration, we will walk through an example of a semantic segmentation dataset consisting of **images** and **binary masks**.

# Imports and Workspace Preparation

We must first define the structure of the dataset that we want to work with. In this case, we have a dataset for nuclei segmentation in which each sample consists of an H&E image and its associated nuclei label map (mask).

In [None]:
import torch
import tables

import os
import sys
import glob

import PIL
from PIL import Image
import numpy as np

import matplotlib.pyplot as plt

from sklearn import model_selection
import sklearn.feature_extraction.image
import random

In [None]:
# Name of the database; used as the prefix of the saved .pytable files
# (one file per phase, e.g. "<dataset_name>_train.pytable").
dataset_name = "nuclei"

# Tiles are pulled from the ROI images; this is the edge length (in pixels) of
# the square tiles that are extracted and saved in the database.
# Must be >= the patch size used later for training.
patch_size = 1000

# Distance (in pixels) to skip between the origins of consecutive tiles.
# 1 = pixel-wise extraction
# patch_size = non-overlapping tiles
stride_size = 250 

# Number of pixels to pad *after* resize onto each side of the image by mirroring.
# This ensures that the edges of the image are covered by tiles as well.
# ---Note---
# The padding is added on both sides of each spatial axis and tiles are taken
# every stride_size pixels, so to avoid losing pixels one should likely make sure that
#   (nrow + 2 * mirror_pad_size - patch_size) mod stride_size == 0,
# where nrow is the number of rows after resizing (same for the columns);
# any remainder is ignored. NOTE(review): confirm against your image sizes.
mirror_pad_size = 250

# Fraction of the dataset (0..1) that should be used as a held-out validation/testing set
test_set_size = 0.1

# Ratio to resize input images
# 1: No resizing
# 0.5: Reduce size by half
# 2: Make the image 2x the size
resize = 1

# Class labels, as recorded on the mask PNGs (?)
# These are the pixel values that are counted per class during extraction.
# TODO: Edit this for annotation formatting (RGB or Index)
classes = [0,255] 

In [None]:
# Draw a random seed so that we can reproducibly do the cross validation setup.
# The seed is limited to 32 bits because that is the largest value NumPy's
# legacy seeding (and therefore scikit-learn's `random_state`) accepts.
# To reproduce an earlier run, replace this line with the seed that was printed.
seed = random.randrange(2**32)

# Seed every random number generator used in this notebook, not only Python's:
# scikit-learn splitters draw from NumPy's global RNG when no random_state is given.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

print(f"random seed (note down for reproducibility): {seed}")

# Define Data Sources

Here we create pointers to data sources (images and masks).

In [None]:
# File paths
img_dir = os.path.join('data', 'nuclei_segmentation', 'images')
img_ext = '.jpg'
mask_dir = os.path.join('data', 'nuclei_segmentation', 'masks')
mask_ext = '.png'

# Create a list of the files. We are only interested in images which have
# masks (so that we can use supervised learning), hence the *mask* files are
# listed here and the matching image path is derived from each mask name later.
# glob returns files in arbitrary, filesystem-dependent order; sort them so that
# the index-based train/val split refers to the same files on every run.
img_files = sorted(glob.glob(os.path.join(mask_dir, '*' + mask_ext)))

print(f"Found {len(img_files)} mask files.")

In [None]:
# Create training and validation phases and split the file *indices* between them.
# The seed is passed explicitly (folded into the 32-bit range scikit-learn accepts)
# so that the split can be reproduced from the seed printed above.
phases = {}
splitter = model_selection.ShuffleSplit(n_splits=1,
                                        test_size=test_set_size,
                                        random_state=seed % (2**32))
phases["train"], phases["val"] = next(splitter.split(img_files))

# Specify that we'll be saving 2 different image types to the database:
# an image and its associated mask
imgtypes = ["img", "mask"]

print(f"Training set size: {len(phases['train'])}")
print(phases['train'])
print()
print(f"Validation set size: {len(phases['val'])}")
print(phases['val'])


# Define PyTables Data

Here, we define the PyTables atom types (i.e., the element data types) of the arrays that will be stored in the file: the image/mask pixels and the source filename.

In [None]:
# Element type used for the pixel arrays (both "img" and "mask"):
# unsigned 8-bit integers, i.e., values in [0,255]
img_dtype = tables.UInt8Atom()

# Element type used for the "filename" array: a fixed-width string of at most
# 255 bytes holding the path of the source file each tile was extracted from
img_filename = tables.StringAtom(itemsize=255)

In [None]:
# Maps an array name ("filename", "img", "mask") to its PyTables array handle;
# it is (re)populated for every phase when the files are written.
storage = {}

# Shape of a single stored tile per image type: images are saved with
# 3 channels (rows x cols x 3), masks as a single channel (rows x cols).
block_shape = {
    "img": np.array((patch_size, patch_size, 3)),
    "mask": np.array((patch_size, patch_size)),
}

# Compression settings applied to the pixel arrays (zlib, level 6)
filters = tables.Filters(complib='zlib', complevel=6)

# Create PyTables Records

In [None]:
def _scaled_size(pil_image):
    """Return the (width, height) of `pil_image` after applying the global `resize` ratio."""
    return (max(1, int(round(pil_image.width * resize))),
            max(1, int(round(pil_image.height * resize))))


def _extract_tiles(arr, tile_shape, step):
    """Cut `arr` into (possibly overlapping) tiles of `tile_shape`, taken every `step` pixels.

    The result has one leading index axis per axis of `arr`, followed by the
    axes of `tile_shape`; any remainder that does not fill a whole tile is ignored.
    """
    if hasattr(np.lib.stride_tricks, "sliding_window_view"):
        windows = np.lib.stride_tricks.sliding_window_view(arr, tile_shape)
        return windows[tuple(slice(None, None, step) for _ in tile_shape)]
    # Older NumPy: fall back to the scikit-learn helper, which newer
    # scikit-learn releases no longer expose publicly.
    return sklearn.feature_extraction.image.extract_patches(arr, tile_shape, step)


# Create separate records for each phase (training and validation)
for phase in phases.keys():
    print(f"Processing data from {phase} phase.")

    # Keep per-class pixel counts, since this information can later be used
    # to create class weights. Row 0 holds the class labels, row 1 the counts.
    totals = np.zeros((2, len(classes)))
    totals[0, :] = classes

    # Open the respective pytable relative to the current working directory.
    # The context manager guarantees the file is closed even if a file fails to process.
    pytable_path = os.path.join('data', 'nuclei_segmentation', f"{dataset_name}_{phase}.pytable")
    with tables.open_file(pytable_path, mode='w') as hdf5_file:

        # Extendable array holding the source filename of every stored tile
        storage["filename"] = hdf5_file.create_earray(hdf5_file.root, 'filename', img_filename, (0,))

        # For each of the image types, in this case mask and image, create the
        # associated extendable array (one tile per chunk, compressed)
        for imgtype in imgtypes:
            storage[imgtype] = hdf5_file.create_earray(hdf5_file.root, imgtype, img_dtype,
                                                       shape=np.append([0], block_shape[imgtype]),
                                                       chunkshape=np.append([1], block_shape[imgtype]),
                                                       filters=filters)

        # Now for each of the files (phases[phase] holds indices into img_files)
        for fileidx in phases[phase]:
            fname = img_files[fileidx]
            print(fname)

            for imgtype in imgtypes:
                if imgtype == "img":
                    # The image path is derived from the mask name: "<name>_mask<mask_ext>" -> "<name><img_ext>"
                    img_path = os.path.join(img_dir, os.path.basename(fname).replace("_mask" + mask_ext, img_ext))
                    with Image.open(img_path) as pil_img:
                        # Images must be 3 channel to match block_shape["img"]
                        pil_img = pil_img.convert("RGB")
                        if resize != 1:
                            pil_img = pil_img.resize(_scaled_size(pil_img), Image.BICUBIC)
                        io = np.array(pil_img)

                    # Apply the mirror padding specified in the parameters (spatial axes only)
                    io = np.pad(io, [(mirror_pad_size, mirror_pad_size), (mirror_pad_size, mirror_pad_size), (0, 0)], mode="reflect")
                    patch_extraction_size = (patch_size, patch_size, 3)
                else:
                    # Masks only need a single channel; they are stored with the
                    # pixel values found in the file (see `classes`).
                    with Image.open(fname) as pil_mask:
                        io = np.array(pil_mask)

                        # Count the pixels of each class. This is done pre-resize, but the
                        # proportions don't change, which is really what we're after.
                        for i, key in enumerate(classes):
                            totals[1, i] += np.count_nonzero(io == key)

                        if resize != 1:
                            # Want to use nearest! Otherwise resizing may cause non-existing
                            # classes to be produced via interpolation (e.g., ".25")
                            io = np.array(pil_mask.resize(_scaled_size(pil_mask), Image.NEAREST))

                    # Apply the mirror padding specified in the parameters
                    io = np.pad(io, [(mirror_pad_size, mirror_pad_size), (mirror_pad_size, mirror_pad_size)], mode="reflect")
                    patch_extraction_size = (patch_size, patch_size)

                # Convert the input into overlapping tiles; for images the size is
                # ntile_rows x ntile_cols x 1 x patch_size x patch_size x 3
                io_arr_out = _extract_tiles(io, patch_extraction_size, stride_size)

                # Flatten the tile grid into ntile x patch_size x patch_size (x 3).
                # The leading tile axis is kept even when there is only a single tile,
                # because the extendable arrays expect it.
                io_arr_out = io_arr_out.reshape((-1,) + patch_extraction_size)

                # Save the tiles to the table
                storage[imgtype].append(io_arr_out)

            # Add the filename to the storage array, once per stored tile
            storage["filename"].append([fname] * io_arr_out.shape[0])

        # Lastly, store the per-class pixel counts alongside the data
        npixels = hdf5_file.create_carray(hdf5_file.root, 'numpixels', tables.Atom.from_dtype(totals.dtype), totals.shape)
        npixels[:] = totals

In [None]:
# Safety net: release the most recently opened PyTables file handle, e.g. in
# case the cell above was interrupted before the file was closed.
hdf5_file.close()