In [15]:
import os
import numpy as np
import pandas as pd
import tifffile as tiff
import matplotlib.pyplot as plt
import cv2
import json
from PIL import Image
import pickle


In [4]:
# Root folder that every input file below is resolved against.
base_path = './'

# Metadata tables shipped alongside the tiles.
wsi_df = pd.read_csv(f'{base_path}/wsi_meta.csv')
tile_df = pd.read_csv(f'{base_path}/tile_meta.csv')

# polygons.jsonl is decoded line by line: the raw lines are kept in
# `json_list` and each one is parsed into a dict in `tiles_dicts`.
with open(f'{base_path}/polygons.jsonl', 'r') as polygons_file:
    json_list = polygons_file.readlines()
tiles_dicts = [json.loads(raw_line) for raw_line in json_list]

# Peek at the first record to see the annotation layout.
print(tiles_dicts[0])

{'id': '0006ff2aa7cd', 'annotations': [{'type': 'glomerulus', 'coordinates': [[[167, 249], [166, 249], [165, 249], [164, 249], [163, 249], [162, 249], [161, 249], [160, 249], [159, 249], [158, 249], [157, 249], [156, 249], [155, 249], [154, 249], [153, 249], [152, 249], [151, 249], [150, 249], [149, 249], [148, 249], [147, 249], [146, 249], [145, 249], [144, 249], [143, 249], [142, 249], [141, 249], [140, 249], [139, 249], [138, 249], [137, 249], [136, 249], [135, 249], [134, 249], [133, 249], [132, 249], [131, 249], [130, 249], [129, 249], [128, 249], [127, 249], [126, 249], [125, 249], [124, 249], [123, 249], [122, 249], [122, 248], [121, 248], [120, 248], [119, 248], [118, 248], [117, 248], [117, 247], [116, 247], [115, 247], [115, 246], [114, 246], [113, 246], [112, 246], [111, 246], [110, 246], [110, 245], [109, 245], [108, 245], [107, 245], [106, 245], [106, 244], [105, 244], [105, 243], [104, 243], [103, 243], [103, 242], [102, 242], [101, 242], [100, 242], [99, 242], [99, 241],

In [7]:
def visualize(**images):
    """Plot the given images side by side in a single row.

    Every keyword argument becomes one panel. The keyword itself, with
    underscores turned into spaces and title-cased, is used as the panel
    title; the value is handed to ``plt.imshow``. Axis ticks are hidden.
    """
    panel_count = len(images)
    plt.figure(figsize=(16, 5))
    for position, (label, picture) in enumerate(images.items(), start=1):
        plt.subplot(1, panel_count, position)
        plt.xticks([])
        plt.yticks([])
        plt.title(label.replace('_', ' ').title())
        plt.imshow(picture)
    plt.show()

def save_imgs_and_masks(img_tiles_dicts, output_dir, output_mask_dir):
    """Export each tile as a PNG together with a binary blood-vessel mask PNG.

    For every entry of ``img_tiles_dicts`` the tile
    ``{base_path}/train/{id}.tif`` is read and written to
    ``{output_dir}/{id}.png``. A single-channel uint8 mask with the tile's
    height and width (1 inside every ``'blood_vessel'`` polygon, 0 elsewhere)
    is written to ``{output_mask_dir}/{id}.png``. Annotations of any other
    type are ignored.

    Args:
        img_tiles_dicts: iterable of dicts carrying an ``'id'`` and an
            ``'annotations'`` list; each annotation has a ``'type'`` and a
            ``'coordinates'`` list whose first element is the polygon outline
            as ``[x, y]`` pairs.
        output_dir: directory receiving the tile PNGs (must already exist).
        output_mask_dir: directory receiving the mask PNGs (must already
            exist).

    Raises:
        FileNotFoundError: if a tile image cannot be read.
    """
    for img_tiles_dict in img_tiles_dicts:
        img_id = img_tiles_dict['id']
        image_path = f'{base_path}/train/{img_id}.tif'
        base_image = cv2.imread(image_path)
        if base_image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # rather than raising, so fail here with a clear message.
            raise FileNotFoundError(f'Could not read tile image: {image_path}')

        # Size the mask from the tile itself instead of assuming 512x512, and
        # allocate it as uint8 directly so no float round-trip is needed.
        blood_vessel_masked_image = np.zeros(base_image.shape[:2], dtype=np.uint8)
        for annotation in img_tiles_dict['annotations']:
            if annotation['type'] != 'blood_vessel':
                continue
            # fillPoly expects 32-bit integer points.
            coords = np.array(annotation['coordinates'][0], dtype=np.int32)
            cv2.fillPoly(blood_vessel_masked_image, pts=[coords], color=1)

        # NOTE(review): cv2.imread yields BGR channel order while PIL treats
        # the array as RGB, so the saved PNG has red and blue swapped relative
        # to the source tile. Left as-is because consumers of these PNGs may
        # already rely on it -- confirm before adding a colour conversion.
        Image.fromarray(base_image.astype(np.uint8)).save(f'{output_dir}/{img_id}.png')
        Image.fromarray(blood_vessel_masked_image).save(f'{output_mask_dir}/{img_id}.png')

In [8]:
from PIL import Image
import shutil

# Output folders for this export, in the order they are reset.
dataset_dirs = (
    './all_train_imgs',
    './all_valid_imgs',
    './all_train_masks',
    './all_valid_masks',
)

# Wipe whatever a previous run left behind ...
for stale_dir in dataset_dirs:
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

# ... then recreate every folder empty.
for fresh_dir in dataset_dirs:
    os.mkdir(fresh_dir)

# Every annotated tile is exported as training data here.
all_train_dicts = tiles_dicts

print(f'Processing a total of {len(all_train_dicts)} training images')
save_imgs_and_masks(all_train_dicts, './all_train_imgs', './all_train_masks')

Processing a total of 1633 training images and 0 validation images


In [13]:
# Folder that receives the per-tile annotation files.
output_annotations_path = './all_train_annotations'

# Annotation types of interest; order matters because a type's position in
# this list is what `.index()` lookups on it will return.
tile_categories = [
    'blood_vessel',
    'glomerulus',
    'unsure',
]

In [16]:
def calculate_area(coordinates):
    """Return the area enclosed by a polygon given as ordered (x, y) vertices.

    The shoelace formula is applied over consecutive vertex pairs, with the
    last vertex joined back to the first. The absolute value is returned, so
    the winding direction does not matter. Inputs with fewer than three
    vertices yield 0.
    """
    vertex_count = len(coordinates)
    if vertex_count < 3:
        return 0

    twice_signed_area = 0
    for idx in range(vertex_count):
        x_cur, y_cur = coordinates[idx]
        # Wrap around so the final edge closes the outline.
        x_next, y_next = coordinates[(idx + 1) % vertex_count]
        twice_signed_area += (x_cur * y_next) - (x_next * y_cur)

    return abs(twice_signed_area / 2)

# The destination folder is not created anywhere else, so create it here;
# exist_ok keeps re-runs of this cell working.
os.makedirs(output_annotations_path, exist_ok=True)

# Write one pickle per tile holding a list of COCO-style annotation dicts
# (segmentation, area, bbox, category_id) for the types in `tile_categories`.
for tgt_tile_dict in all_train_dicts:
    img_id = tgt_tile_dict['id']
    cur_tile_coco_annots = []
    for annot in tgt_tile_dict['annotations']:
        if annot['type'] not in tile_categories:
            continue
        # Only the first ring of the polygon is used.
        coords = annot['coordinates'][0]
        xs, ys = zip(*coords)
        min_x, max_x = min(xs), max(xs)
        min_y, max_y = min(ys), max(ys)
        cur_tile_coco_annots.append({
            # Flattened outline: [x0, y0, x1, y1, ...] wrapped in a list.
            'segmentation': [[pt for pair in coords for pt in pair]],
            'area': calculate_area(coords),
            # Bounding box as [x, y, width, height].
            'bbox': [min_x, min_y, max_x - min_x, max_y - min_y],
            # Category id is the type's position in `tile_categories`.
            'category_id': tile_categories.index(annot['type']),
        })
    with open(f'{output_annotations_path}/{img_id}.pkl', 'wb') as f:
        pickle.dump(cur_tile_coco_annots, f, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
# Sanity check: print every id stored in train_img_ids_fold_0.pkl that has no
# matching file (name up to the first '.') in the fold-0 training image folder.
inimgsorig = [
    fname.split('.')[0]
    for fname in os.listdir('/home/ec2-user/hubmap-hacking-the-human-vasculature/all_dataset_files/all_dataset_imgs_train_0')
]
with open('/home/ec2-user/hubmap-hacking-the-human-vasculature/tmp/train_img_ids_fold_0.pkl', 'rb') as f:
    inimgs = pickle.load(f)
print([img_id for img_id in inimgs if img_id not in inimgsorig])

[]


In [None]:
# Sanity check: print every id stored in the loaded pickle that has no
# matching file (name up to the first '.') in the fold-0 validation folder.
# NOTE(review): the folder listed is the *validation* one but the pickle that
# is loaded is train_img_ids_fold_0.pkl -- confirm this is intended; a
# validation id list may have been meant.
inimgsorig = [
    fname.split('.')[0]
    for fname in os.listdir('/home/ec2-user/hubmap-hacking-the-human-vasculature/all_dataset_files/all_dataset_imgs_validation_0')
]
with open('/home/ec2-user/hubmap-hacking-the-human-vasculature/tmp/train_img_ids_fold_0.pkl', 'rb') as f:
    inimgs = pickle.load(f)
print([img_id for img_id in inimgs if img_id not in inimgsorig])

[]
