## For patches (h5) from tumoral annotations, remove background by using tissue mask generated by CLAM
### For h5 in folder patches_tumor, check if each coordinate pair is in the same h5 in folder patches. If not exist, remove the coordinates and image. Save to folder patches_tumor_masked.

In [22]:
import os
import h5py
import numpy as np
from tqdm.notebook import tqdm
import time

In [24]:
path = '/media/visiopharm5/WDGold/deeplearning/MIL/CLAM/results'
in_path = '/media/visiopharm5/WDRed(backup)/clam_extension/results/patches_test'
ref_path = os.path.join(path, 'patches')
out_path = '/media/visiopharm5/WDRed(backup)/clam_extension/results/patches_tumor_masked'

In [25]:
# for root, dirs, files in os.walk(os.path.join(path, 'patches_tumor')):
for root, dirs, files in os.walk(in_path):
    fnames = files
print(len(fnames))
print(fnames[:3])

1
['TCGA-2Y-A9GV-01Z-00-DX1.524FA6DD-7C84-425C-A2B5-17690DF50A28.h5']


In [26]:
for fname in tqdm(fnames):
    if os.path.isfile(os.path.join(out_path, fname)):
        print("File already exists: " + fname)
    else:      
        print("Processing: " + fname)
        
    # file pointer
    f = h5py.File(os.path.join(in_path, fname), 'r')
    #print(list(f.keys())) # ['coords', 'imgs']
    # patch number
    print(f['coords'].shape[0])

    # copy coordinates
    coords = np.empty((f['coords'].shape[0], 2), dtype=np.int32)
    f['coords'].read_direct(coords, np.s_[:, :])

    # copy images
    imgs = np.empty((f['coords'].shape[0], 256, 256, 3), dtype=np.uint8)
    f['imgs'].read_direct(imgs, np.s_[:, :])

    # attributes
    print("Attrbutes of coords:") 
    print(list(f['coords'].attrs.keys()))
    print("Attrbutes of imgs:") 
    print(list(f['imgs'].attrs.keys()))

    # load attributes
    patch_level = f['imgs'].attrs['patch_level']
    wsi_name = f['imgs'].attrs['wsi_name']
    downsample = f['imgs'].attrs['downsample']
    level_dim = f['imgs'].attrs['level_dim']
    downsampled_level_dim = f['imgs'].attrs['downsampled_level_dim']

    #del f['imgs']
    #del f['coords']
    f.close()
    
    # load the coordinates in ref (patches)
    f_ref = h5py.File(os.path.join(ref_path, fname), 'r')
    print(f_ref['coords'].shape[0])
    coords_ref = np.empty((f_ref['coords'].shape[0], 2), dtype=np.int32)
    f_ref['coords'].read_direct(coords_ref, np.s_[:, :])
    f_ref.close()

    start = time.time()
    counter_patch = 0
    for i in tqdm(range(coords.shape[0])):
        if not coords[i-counter_patch] in coords_ref:
            imgs = np.delete(imgs, i-counter_patch, 0)
            coords = np.delete(coords, i-counter_patch, 0)
            counter_patch = counter_patch + 1
   
    print(coords.shape[0])
    end = time.time()
    print(end - start)

    f = h5py.File(os.path.join(out_path, fname), 'a')

    coord_dset = f.create_dataset('coords', shape=coords.shape, dtype=np.int32, maxshape=(None, 2), chunks=(1, 2))
    dset = f.create_dataset('imgs', shape=imgs.shape, dtype=np.uint8, maxshape=(None, 256, 256, 3), chunks=(1, 256, 256, 3))

    coord_dset[:] = coords

    dset[:] = imgs
    dset.attrs['patch_level'] =  patch_level
    dset.attrs['wsi_name'] = wsi_name
    dset.attrs['downsample'] = downsample
    dset.attrs['level_dim'] = level_dim
    dset.attrs['downsampled_level_dim'] = downsampled_level_dim

    print(f['imgs'])
    print(f['coords'])

    f.close()



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

Processing: TCGA-2Y-A9GV-01Z-00-DX1.524FA6DD-7C84-425C-A2B5-17690DF50A28.h5
12241
Attrbutes of coords:
[]
Attrbutes of imgs:
['downsample', 'downsampled_level_dim', 'level_dim', 'patch_level', 'wsi_name']
12966


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12241.0), HTML(value='')))





KeyboardInterrupt: 