# Masks - Quick EDA - New data

What do we learn here:

- From tiny masks to large masks
- 3 groups based on mask area
- Masks are in Cortex
- Some outlier masks outside tissue or in Medulla
- Mask footprint compared to a 1024x1024 tile

In [None]:
import os, sys, random, gc, math, glob, time, pathlib
import numpy as np
import pandas as pd
import io, timeit, os, gc, pickle, psutil
import warnings
import cv2
import gdal
import osgeo
import json
import rasterio
from rasterio.windows import Window

import seaborn as sns
sns.set()
sns.set_context("paper", font_scale=1.2) 

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as cols

warnings.filterwarnings('ignore')

In [None]:
# Report the interpreter and library versions used by this notebook.
version_report = (
    ('Python        : ', sys.version.split('\n')[0]),
    ('Numpy         : ', np.__version__),
    ('Pandas        : ', pd.__version__),
    ('Rasterio      : ', rasterio.__version__),
    ('GDal          : ', osgeo.gdal.__version__),
    ('OpenCV        : ', cv2.__version__),
)
for label, version in version_report:
    print(label + version)

In [None]:
# Working directory of the notebook.
HOME = "./"

# Dataset root and its train/test sub-folders (each keeps its trailing slash
# so file names can be appended by plain string concatenation).
DATA_HOME = "../input/hubmap-kidney-segmentation/"
TRAIN_HOME = f"{DATA_HOME}train/"
TEST_HOME = f"{DATA_HOME}test/"

# Name of the column / index that identifies an image in the stats table.
IMAGE_ID = "image_id"

In [None]:
# Split the JSON files of the train folder into anatomical-structure
# annotations and mask annotations, keeping only the base file names.
# The "anatomical" test is applied to the path as returned by glob.
anatomical_files = []
masks_files = []
for json_path in glob.glob(TRAIN_HOME + "*.json"):
    destination = anatomical_files if "anatomical" in json_path else masks_files
    destination.append(os.path.basename(json_path))
masks_files

In [None]:
# Map each image id (file name up to the first ".") to the list of its
# 'glomerulus' polygons, each stored as a numpy array of coordinates.
mask_polys = {}
for mask_file in masks_files:
    with open(TRAIN_HOME + mask_file) as jsonfile:
        features = json.load(jsonfile)
    image_key = mask_file.split(".")[0]
    mask_polys[image_key] = [
        np.array(feature['geometry']['coordinates'])
        for feature in features
        if feature['properties']['classification']['name'] == 'glomerulus'
    ]

Compute statistics on masks: Area and points

In [None]:
# Per-image mask statistics: polygon areas and number of polygon points.
mask_areas = {}
mask_points = {}
stats = []
for item, polys in mask_polys.items():
    # Drop the leading singleton axis of each polygon array
    # (presumably leaving an (n_points, 2) outline - TODO confirm).
    outlines = [poly.squeeze(axis=0) for poly in polys]
    areas = [cv2.contourArea(outline.astype(np.float32)) for outline in outlines]
    points = [len(outline) for outline in outlines]

    mask_areas[item] = areas
    mask_points[item] = points
    stats.append((
        item,
        len(polys),
        np.min(areas),
        np.max(areas),
        np.mean(areas),
        np.median(areas),
        np.std(areas),
        np.min(points),
        np.mean(points),
        np.max(points),
    ))

# One row per image, ordered by median mask area and indexed by image id.
stat_columns = [IMAGE_ID, "total", "min", "max", "mean", "median", "std", "min_pts", "mean_pts", "max_pts"]
poly_pd = (
    pd.DataFrame(stats, columns=stat_columns)
    .sort_values(["median"])
    .reset_index(drop=True)
    .set_index(IMAGE_ID)
)
poly_pd

In [None]:
# Overlay, for every image, the distribution of mask point counts (left)
# and of mask areas (right).
fig, ax = plt.subplots(1, 2, figsize=(22, 5))
panels = (
    (mask_points, ax[0], "Mask points distribution"),
    (mask_areas, ax[1], "Mask area distribution"),
)
for item in poly_pd.index:
    for values_by_image, axis, title in panels:
        plot = sns.distplot(pd.DataFrame(values_by_image[item]), ax=axis, label=item)
        plot.set_title(title)
plt.legend()
plt.show()

In [None]:
def read_cortex_medulla(file):
    """Read the Cortex and Medulla polygons from an annotation JSON file.

    Args:
        file: Path of a JSON file containing a sequence of features. Every
            feature must expose ['properties']['classification']['name'];
            'Cortex' and 'Medulla' features must also expose
            ['geometry']['coordinates'].

    Returns:
        A tuple (cortex_polys, medulla_polys) of lists of numpy arrays,
        one array per matching feature, in file order.
    """
    with open(file) as jsonfile:
        features = json.load(jsonfile)

    cortex_polys = []
    medulla_polys = []
    for feature in features:
        name = feature['properties']['classification']['name']
        if name == 'Cortex':
            target = cortex_polys
        elif name == 'Medulla':
            target = medulla_polys
        else:
            # Any other structure is ignored.
            continue
        target.append(np.array(feature['geometry']['coordinates']))
    return cortex_polys, medulla_polys

In [None]:
# Shape is given as (height, width).
def _tile_starts(length, window, min_overlap):
    """Return the start offsets of the tiles covering an axis of `length` pixels."""
    count = length // (window - min_overlap) + 1
    starts = np.linspace(0, length, num=count, endpoint=False, dtype=np.int64)
    # Pin the last tile to the far edge of the axis. Clamp at 0 so that an
    # axis shorter than `window` does not yield a negative start offset.
    starts[-1] = max(length - window, 0)
    return starts


def make_grid(shape, window=1024, min_overlap=0):
    """Split an image into a grid of square tiles.

    Args:
        shape: (height, width) of the image.
        window: Side length of a tile, in pixels.
        min_overlap: Minimum overlap between neighbouring tiles, in pixels;
            the number of tiles per axis is derived from
            `window - min_overlap`.

    Returns:
        Array of shape (N, 4), where N is the number of tiles and each row
        holds the slice bounds x1, x2, y1, y2 of one tile. Rows are ordered
        by x first, then by y. Bounds are clipped to the image, so an axis
        shorter than `window` produces a single tile spanning the whole axis.
    """
    height, width = shape
    x1 = _tile_starts(width, window, min_overlap)
    y1 = _tile_starts(height, window, min_overlap)
    x2 = (x1 + window).clip(0, width)
    y2 = (y1 + window).clip(0, height)

    nx, ny = len(x1), len(y1)
    slices = np.zeros((nx, ny, 4), dtype=np.int64)
    # Broadcast the per-axis bounds over the (nx, ny) grid.
    slices[:, :, 0] = x1[:, np.newaxis]
    slices[:, :, 1] = x2[:, np.newaxis]
    slices[:, :, 2] = y1[np.newaxis, :]
    slices[:, :, 3] = y2[np.newaxis, :]
    return slices.reshape(nx * ny, 4)

Display the cortex (blue), the medulla (green) and the masks (red) for each train image.
Also display a 1024x1024 grid (white) to show the mask footprint compared to the tile size.

In [None]:
# For every training image (in the order of the stats table), draw the
# anatomical structures, the masks and a 1024x1024 tile grid on top of the
# slide, then show a 4x-downscaled preview.
for image_id in poly_pd.index:
    cortex_polys, medulla_polys = read_cortex_medulla(
        TRAIN_HOME + image_id + "-anatomical-structure.json"
    )

    # Load the slide with the channel axis last.
    # NOTE(review): `image` is left unassigned when the file has neither
    # 3 bands nor any subdataset - confirm this cannot happen with this data.
    with rasterio.open(TRAIN_HOME + image_id + ".tiff") as tiff:
        if tiff.count == 3:
            image = tiff.read([1, 2, 3]).transpose(1, 2, 0).copy()
        else:
            layer_paths = tiff.subdatasets
            if len(layer_paths) > 0:
                # One subdataset per channel: read band 1 of each into its slot.
                image = np.zeros((tiff.height, tiff.width, len(layer_paths)), dtype=np.uint8)
                for channel, layer_path in enumerate(layer_paths):
                    with rasterio.open(layer_path) as layer:
                        image[:, :, channel] = layer.read(1)

    # Green = Medulla
    for medulla_poly in medulla_polys:
        image = cv2.polylines(image, medulla_poly.astype(np.int32), True, (0,255,0), thickness=30)

    # Blue = Cortex
    for cortex_poly in cortex_polys:
        if len(cortex_poly) > 1:
            # Several parts: draw the first entry of each part on its own.
            for cortex_pts in cortex_poly:
                outline = np.array(cortex_pts[0]).astype(np.int32)
                image = cv2.polylines(image, np.expand_dims(outline, axis=0), True, (0,0,255), thickness=30)
        else:
            image = cv2.polylines(image, cortex_poly.astype(np.int32), True, (0,0,255), thickness=30)

    # Red = Mask
    for mask_poly in mask_polys[image_id]:
        image = cv2.polylines(image, mask_poly.astype(np.int32), True, (255,0,0), thickness=30)

    # White = tile grid, to compare the mask footprint with the tile size.
    # The full-resolution shape is kept for the title before downscaling.
    size = image.shape
    for x1, x2, y1, y2 in make_grid((size[0], size[1]), window=1024):
        image = cv2.rectangle(image, (x1, y1), (x2, y2), color=(255,255,255), thickness=8)

    image = cv2.resize(image, (image.shape[1]//4, image.shape[0]//4))

    fig, ax = plt.subplots(1, 1, figsize=(32, 30))
    ax.imshow(image)
    ax.set_title("%s, %dx%d, masks=%d, median area=%.1f, blue=Cortex, green=Medulla" % (image_id, size[0], size[1], len(mask_polys[image_id]), poly_pd.loc[image_id]["median"]))
    ax.grid(None)
    ax.axis('off')
    plt.show()

    # Drop the reference to the large image array before loading the next one.
    del image

    # break  # uncomment to preview only the first image