# CPTAC-PDA Pancreas masks and probability density function

> Proof of concept and generalization of HuBMAP Efficient Sampling II

## Implementation (masks and probability density function)

> - Create segmentation masks from JSON
> - Create the probability density function (PDF) for efficient sampling from the mask and the anatomical structure


**Inputs**
- https://www.kaggle.com/matjes/cptacpda dataset comprising three whole slide images \[[1](https://pathology.cancerimagingarchive.net/pathdata/cptac_camicroscope/osdCamicroscope.php?tissueId=C3L-03371-25), [2](https://pathology.cancerimagingarchive.net/pathdata/cptac_camicroscope/osdCamicroscope.php?tissueId=C3L-01158-25), [3](https://pathology.cancerimagingarchive.net/pathdata/cptac_camicroscope/osdCamicroscope.php?tissueId=C3L-03350-24)] from [cancerimagingarchive](https://www.cancerimagingarchive.net/).
    

**Settings**

- Sampling probability for pancreas regions (`pancreas_p`): 0.9 
- Sampling probability for background regions (`bg_p`): 0.1      


In [None]:
# Install zarr and load packages
!pip install -qq zarr
import cv2, zarr, json, rasterio
import matplotlib.pyplot as plt, numpy as np, pandas as pd
from shapely.geometry import shape, GeometryCollection
from rasterio import features
from pathlib import Path

In [None]:
def read_polys(file, classification):
    """Collect the geometries of all annotations with a given classification.

    Parameters
    ----------
    file : str or path-like
        JSON annotation file. Iterating the parsed document must yield one
        feature mapping per annotation.
    classification : str
        Only features whose ``properties -> classification -> name`` entry
        equals this value are kept.

    Returns
    -------
    shapely.geometry.GeometryCollection
        The geometries of all matching features (empty if none match).

    Notes
    -----
    Features without a readable classification name are printed and skipped.
    Errors raised while converting a *matching* feature's geometry are not
    swallowed, so a broken annotation cannot silently vanish from the mask.
    """
    with open(file, encoding='utf-8') as jsonfile:
        data = json.load(jsonfile)

    geom_list = []
    # The loop variable is deliberately not called `features`, so it cannot be
    # confused with the `rasterio.features` module used elsewhere in this file.
    for feature in data:
        try:
            name = feature['properties']['classification']['name']
        except (KeyError, TypeError):
            # Missing key, or a non-mapping value somewhere along the path.
            print('Classification Missing')
            print(feature)
            continue
        if name == classification:
            geom_list.append(shape(feature["geometry"]))
    return GeometryCollection(geom_list)

Settings

In [None]:
class CONFIG:
    """Notebook-wide settings for mask and PDF generation."""

    # Location of the input dataset
    path = Path('../input/cptacpda')

    # Sampling weights shared by pancreas regions and by the background
    pancreas_p = 0.9
    bg_p = 0.1

    # Factor by which the final mask is downscaled
    scale = 2
    # Height (in pixels) of the downscaled CDF, kept small so it loads
    # memory-efficiently during training
    cdf_size = 512


cfg = CONFIG()

**Load inputs and define outputs**

In [None]:
# Input: per-image metadata table, indexed by its 'image_file' column
info_csv = cfg.path / "dataset_information.csv"
df_info = pd.read_csv(info_csv, index_col='image_file')

# Output: zarr store with one group each for masks ('labels'), PDFs and CDFs
out_dir = f'/kaggle/working/masks_scale{cfg.scale}'
root = zarr.group(out_dir)
# NOTE(review): an earlier comment stated that the CDF is saved in 'pdfs' to
# follow deepflash2's naming conventions, but a dedicated 'cdfs' group is
# created here - confirm which group the training code actually reads.
g_msk, g_pdf, g_cdf = root.create_groups('labels', 'pdfs', 'cdfs', overwrite=True)

**Loop over files to create...**
1. The segmentation mask (.zarr)
1. A list of pancreatic islets (connected components of the mask) with their statistics
1. The probability density function for *region sampling*

In [None]:
# For every image listed in df_info: rasterize the annotated polygons into a
# segmentation mask and a region map, turn the region map into a sampling PDF,
# collect connected-component statistics of the mask, and store mask, PDF and a
# downscaled CDF in the zarr groups.

# Identity transform: polygon coordinates are used directly as pixel coordinates.
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)
df_list = []
for idx, row in df_info.iterrows():

    print(f'##### {idx} ####')
    f = cfg.path/f'{idx}.json'
    # NOTE(review): rasterio documents `out_shape` as (rows, cols), i.e.
    # (height, width), while (width_pixels, height_pixels) is passed here -
    # confirm what these columns hold in dataset_information.csv.
    out_shape = (row.width_pixels, row.height_pixels)
    geoms_msk = read_polys(f, classification='done')
    # invert=True -> pixels inside the polygons become 1, everything else 0
    msk = features.geometry_mask(geoms_msk, out_shape, identity, invert=True).astype('uint8')

    # Plot
    fig, ax = plt.subplots(ncols=2, figsize=(15,15))
    # Preview width that keeps the mask's aspect ratio at height cfg.cdf_size
    resize_w = int((msk.shape[1]/msk.shape[0])*cfg.cdf_size)
    ax[0].imshow(cv2.resize(msk, dsize=(resize_w, cfg.cdf_size)))
    ax[0].set_title('Mask')
    ax[0].set_axis_off()

    # Region map: 1 inside 'pancreas' polygons, 0 elsewhere
    geoms_regions = read_polys(f, classification='pancreas')
    pdf = features.geometry_mask(geoms_regions, out_shape, identity, invert=True).astype('uint8')

    if cfg.scale!=1:
        # cv2.resize expects (width, height)
        new_size = (msk.shape[1] // cfg.scale, msk.shape[0] // cfg.scale)
        print('Scaling to', new_size)
        msk = cv2.resize(msk, new_size)
        pdf = cv2.resize(pdf, new_size)

    # Build the PDF. The order of the three assignments matters:
    #   1. background pixels share bg_p uniformly,
    #   2. pixels covered by the mask get probability 0,
    #   3. the remaining pancreas pixels share pancreas_p uniformly.
    pdf = pdf.astype('float32')
    pdf[pdf==0] = cfg.bg_p/np.sum(pdf==0)
    pdf[msk>0] = 0
    pdf[pdf==1] = cfg.pancreas_p/np.sum(pdf==1)

    print('Getting pancreatic islets stats')
    nb_components, output, stats, centroids = cv2.connectedComponentsWithStats(msk, connectivity=4)
    # The returned label count includes the background (label 0), so it is
    # subtracted to report only the actual structures.
    print(f'Found {nb_components - 1} pancreas')
    # Row 0 belongs to the background label and is dropped.
    # NOTE(review): OpenCV returns centroids as (x, y) but the columns are
    # labelled ['cy', 'cx']; left unchanged because consumers of roi_stats.csv
    # may depend on this labelling - confirm.
    df_centroids = pd.DataFrame(centroids[1:], columns=['cy', 'cx'])
    df_centroids = df_centroids.join(pd.DataFrame(stats[1:], columns=['left', 'top', 'width', 'height', 'area']))
    df_centroids['idx'] = idx
    df_centroids.reset_index(inplace=True)
    df_centroids.set_index(['idx', 'index'], inplace=True)
    df_list.append(df_centroids)

    # Saving
    g_msk[idx] = msk
    g_pdf[idx] = pdf

    # Saving cdf: cumulative sum over the flattened, renormalized, downscaled PDF
    pdf = cv2.resize(pdf, dsize=(resize_w, cfg.cdf_size))
    g_cdf[idx] = np.cumsum(pdf/np.sum(pdf))

    ax[1].imshow(pdf)
    ax[1].set_title('Probability density function for sampling')
    ax[1].set_axis_off()
    plt.show()


# One table with the component statistics of all images, indexed by (idx, index)
df_stats = pd.concat(df_list)
df_stats.to_csv(f'/kaggle/working/masks_scale{cfg.scale}/roi_stats.csv')
df_stats