# HuBMAP Quick Reference
So finally we have the new data and annotations. I decided to put together a "quick reference" with the images, and some basic statistics about the glomeruli.  
First we will install [pyvips](https://pypi.org/project/pyvips/) for handling the large images. It is a lengthy install process (6-7 min).

In [None]:
%%capture
# Download and unpack the libvips 8.10.5 source release, then delete the archive.
!wget -O vips-8.10.5.tar.gz https://github.com/libvips/libvips/releases/download/v8.10.5/vips-8.10.5.tar.gz
!sleep .5
!tar xf ./vips-8.10.5.tar.gz
!rm -fr ./vips-8.10.5.tar.gz
# Build and install from inside the unpacked source directory.
%cd vips-8.10.5
!./configure
!make
!make install
# Refresh the dynamic linker cache after installing the new shared library.
!ldconfig
# Return to the working directory and install the Python package imported below.
%cd /kaggle/working
!pip install --user pyvips

A few helper functions below to plot images.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage import io
import json
import cv2
import pyvips
%matplotlib inline  

# A few helper functions below plot the images and create the masks.
# SCALE is the resize factor applied to every image and to the mask polygons.
SCALE = 0.1
# Directories holding the TIFF images; the train directory is also where the
# matching .json annotation files are read from.
TRAIN_PATH = '../input/hubmap-kidney-segmentation/train/'
TEST_PATH = '../input/hubmap-kidney-segmentation/test/'

def get_h_w(poly):
    """Return the size of a polygon's upright bounding box as ``(height, width)``.

    Args:
        poly: numpy array of polygon vertices; it is cast to int32 before
            being handed to ``cv2.boundingRect``.

    Returns:
        Tuple ``(height, width)`` of the bounding rectangle, in the same
        units as the vertices.
    """
    # cv2.boundingRect returns (x, y, width, height): index 2 is the width and
    # index 3 the height. Compute the rectangle once and name the parts
    # accordingly; the returned order (height first) is unchanged.
    _, _, box_w, box_h = cv2.boundingRect(poly.astype('int32'))
    return box_h, box_w

def read_tif_file(fname):
    """Read an image file with skimage and return it as a squeezed array.

    Length-1 axes are dropped. If the first remaining axis has length 3, it
    is moved behind the next two axes by two successive axis swaps.

    Args:
        fname: path of the image file to read.

    Returns:
        The image as a numpy array.
    """
    image = np.squeeze(io.imread(fname))
    if image.shape[0] != 3:
        return image
    # (3, a, b) -> (a, 3, b) -> (a, b, 3)
    return image.swapaxes(0, 1).swapaxes(1, 2)

def read_mask_file(fname, mshape):
    """Build a boolean glomerulus mask and size statistics from an annotation file.

    The JSON file is expected to contain a list of features. Only features
    whose ``properties.classification.name`` is ``'glomerulus'`` and whose
    geometry has exactly one coordinate ring are used; all others are skipped.

    Args:
        fname: path of the JSON annotation file.
        mshape: shape of the mask to create (rows, columns) at the scaled size.

    Returns:
        Tuple ``(mask, gcnt, pmin, pmax)``:
        mask is a boolean array of shape ``mshape`` with the polygons
        (scaled by ``SCALE``) filled in; gcnt is the number of polygons
        used; pmin and pmax are ``[height, width]`` of the bounding boxes
        with the smallest and largest area, measured on the unscaled
        coordinates. Both are ``[0, 0]`` when no polygon was used.
    """
    # Only the parsing needs the file handle; close it before rasterising.
    with open(fname, encoding='utf-8') as f:
        features = json.load(f)

    polys = []
    pmin = [0, 0]
    pmax = [0, 0]
    min_area = None  # None until the first polygon is seen (no float sentinel)
    max_area = 0
    for feature in features:
        if feature['properties']['classification']['name'] != 'glomerulus':
            continue
        rings = feature['geometry']['coordinates']
        # Check the ring count on the plain list so that geometries with
        # several rings of different lengths never reach np.array.
        if len(rings) != 1:
            continue
        ring = np.array(rings[0])
        polys.append((ring * SCALE).astype('int32'))
        # Sizes are reported at original resolution, hence the unscaled ring.
        h, w = get_h_w(ring)
        area = h * w
        if area > max_area:
            max_area = area
            pmax = [h, w]
        if min_area is None or area < min_area:
            min_area = area
            pmin = [h, w]

    mask = np.zeros(mshape, dtype=np.int8)
    if polys:
        cv2.fillPoly(mask, polys, 1)
    mask = mask.astype(bool, copy=False)
    return mask, len(polys), pmin, pmax

stats = []
        
def _scaled_array(vimg):
    """Resize a pyvips image by SCALE and return its pixels as a (height, width, bands) array."""
    vimg = vimg.resize(SCALE)
    # NOTE(review): the buffer is interpreted as 8-bit samples, as before;
    # confirm that all input images are 8-bit.
    return np.ndarray(buffer=vimg.write_to_memory(),
                      dtype=np.uint8,
                      shape=[vimg.height, vimg.width, vimg.bands])


def show_file(path, file, train=True):
    """Plot a down-scaled image (with its glomerulus mask for train images) and record its stats.

    A row describing the image is appended to the module-level ``stats`` list.

    Args:
        path: directory prefix of the image; it is concatenated directly
            with ``file``, so it must end with a path separator.
        file: file name of the ``.tiff`` image.
        train: when True, the ``.json`` file with the same stem is read,
            the mask is overlaid on the plot and glomerulus statistics are
            recorded; when False only the image is shown.
    """
    fname = path + file
    header = pyvips.Image.tiffload(fname, access='sequential')
    org_dims = np.array((header.width, header.height))

    if header.get('n-pages') == 3:  # multi-page file: one page per colour plane
        planes = [
            _scaled_array(pyvips.Image.tiffload(fname, page=page, access='sequential'))
            for page in range(3)
        ]
        img = np.concatenate(planes, axis=2)
    else:
        # Uses the image's own band count instead of assuming three bands,
        # so the buffer is never reinterpreted with the wrong shape.
        img = _scaled_array(header)
    dims = np.array(img.shape[:2])  # (height, width) after scaling

    if train:
        mask, gcnt, pmin, pmax = read_mask_file(path+file.replace('.tiff','.json'), dims)
        stats.append(['train', file, org_dims[0], org_dims[1], gcnt, '{}x{}'.format(pmin[0], pmin[1]), '{}x{}'.format(pmax[0], pmax[1])])
    else:
        stats.append(['test', file, org_dims[0], org_dims[1], 0, 'N/A', 'N/A'])

    plt.figure(figsize=(20,20))
    plt.imshow(img)
    if train:
        plt.title('Glomeruli:{}   Smallest:{}x{}   Largest:{}x{}'.format(gcnt, pmin[0], pmin[1], pmax[0], pmax[1]), fontdict={'fontsize': 20})
        plt.imshow(mask, alpha=0.25)

The size of the individual glomeruli is determined using cv2.boundingRect on the polyline. Example here:

In [None]:
# Example polygon (x, y vertices). int32 matches the dtype the helper
# functions cast to before calling OpenCV.
poly = np.array([[2787, 7396],[2725, 7426],[2706, 7458],[2704, 7468],[2700, 7478],
                [2699, 7510],[2698, 7521],[2699, 7522],[2699, 7525],[2735, 7566],
                [2744, 7577],[2745, 7577],[2756, 7590],[2844, 7616],[2882, 7589],
                [2908, 7559],[2927, 7536],[2934, 7515],[2934, 7507],[2889, 7445],
                [2876, 7424],[2872, 7421],[2870, 7420],[2849, 7405],[2838, 7403],
                [2826, 7397],[2792, 7396],[2787, 7396]], dtype=np.int32)

# get_h_w returns (height, width), which is also the (rows, columns) order
# numpy expects for the mask shape.
height, width = get_h_w(poly)
# Shift the polygon so that its bounding box starts at the origin.
x0, y0 = cv2.boundingRect(poly)[:2]
poly[:,0] -= x0
poly[:,1] -= y0
mask = np.zeros([height, width], dtype=np.int8)
cv2.fillPoly(mask, [poly], (255,255,255))
plt.imshow(mask);

Note that the images are scaled down by a factor of 0.1, which means that the x- and y-axis ticks are also 1/10th of the original size. The reported glomeruli sizes, however, are measured on the original (unscaled) coordinates.

# Train: 0486052bb.tiff

In [None]:
show_file(TRAIN_PATH, '0486052bb.tiff')

# Train: 095bf7a1f.tiff

In [None]:
show_file(TRAIN_PATH, '095bf7a1f.tiff')

# Train: 1e2425f28.tiff

In [None]:
show_file(TRAIN_PATH, '1e2425f28.tiff')

# Train: 26dc41664.tiff

In [None]:
show_file(TRAIN_PATH, '26dc41664.tiff')

# Train: 2f6ecfcdf.tiff

In [None]:
show_file(TRAIN_PATH, '2f6ecfcdf.tiff')

# Train: 4ef6695ce.tiff

In [None]:
show_file(TRAIN_PATH, '4ef6695ce.tiff')

# Train: 54f2eec69.tiff

In [None]:
show_file(TRAIN_PATH, '54f2eec69.tiff')

# Train: 8242609fa.tiff

In [None]:
show_file(TRAIN_PATH, '8242609fa.tiff')

# Train: aaa6a05cc.tiff

In [None]:
show_file(TRAIN_PATH, 'aaa6a05cc.tiff')

# Train: afa5e8098.tiff

In [None]:
show_file(TRAIN_PATH, 'afa5e8098.tiff')

# Train: b2dc8411c.tiff

In [None]:
show_file(TRAIN_PATH, 'b2dc8411c.tiff')

# Train: b9a3865fc.tiff

In [None]:
show_file(TRAIN_PATH, 'b9a3865fc.tiff')

# Train: c68fe75ea.tiff

In [None]:
show_file(TRAIN_PATH, 'c68fe75ea.tiff')

# Train: cb2d976f4.tiff

In [None]:
show_file(TRAIN_PATH, 'cb2d976f4.tiff')

# Train: e79de561c.tiff

In [None]:
show_file(TRAIN_PATH, 'e79de561c.tiff')

# Test: 2ec3f1bb9.tiff

In [None]:
show_file(TEST_PATH, '2ec3f1bb9.tiff', train=False)

# Test: 3589adb90.tiff

In [None]:
show_file(TEST_PATH, '3589adb90.tiff', train=False)

# Test: 57512b7f1.tiff

In [None]:
show_file(TEST_PATH, '57512b7f1.tiff', train=False)

# Test: aa05346ff.tiff

In [None]:
show_file(TEST_PATH, 'aa05346ff.tiff', train=False)

# Test: d488c759a.tiff

In [None]:
show_file(TEST_PATH, 'd488c759a.tiff', train=False)

# Summary

In [None]:
# Turn the rows collected by show_file into a table, save it as a pickle
# and display it as the cell output.
df = pd.DataFrame(
    data=stats,
    columns=[
        'dataset',
        'file',
        'x-size',
        'y-size',
        'glomeruli',
        'smallest',
        'largest',
    ],
)
df.to_pickle('./image_stats.pkl')
df