In [1]:

import os

# os.add_dll_directory is only available on Windows builds of Python; on any
# other platform a plain import is all that is needed.
if not hasattr(os, 'add_dll_directory'):
    import openslide
else:
    # Windows: make the bundled OpenSlide binaries (resolved relative to the
    # current working directory) discoverable while the import runs.
    OPENSLIDE_PATH = os.path.join(
        os.path.abspath(os.getcwd()),
        "libs/openslide-bin-4.0.0.3-windows-x64/bin",
    )
    with os.add_dll_directory(OPENSLIDE_PATH):
        import openslide

In [2]:
import json
import os
import xml.etree.ElementTree as ET
from pathlib import Path

import cv2
import numpy as np
import openslide

# For every annotation XML in `input_dir`:
#   1. write a JSON file (same stem) listing the bounding box and vertices of
#      each annotated region, and
#   2. crop each region from the same-named .svs slide at pyramid level 0 and
#      save it as a PNG under data/rois/<stem>/.
input_dir = "data/whole-slides/gut"


def _region_to_bounding_box(region_tag):
    """Build the bounding-box record for one <Region> element.

    Returns a dict with keys ``x_min``, ``y_min``, ``width``, ``height`` and
    ``points`` (list of ``(x, y)`` integer tuples), or ``None`` when the region
    has no vertices or collapses to a single point.
    """
    # int(float(...)) path: plain int() raises ValueError on fractional
    # coordinate strings such as "10.5"; integer strings parse exactly as before.
    points = [
        (int(round(float(vertex_tag.attrib['X']))), int(round(float(vertex_tag.attrib['Y']))))
        for vertex_tag in region_tag.findall('Vertices/Vertex')
    ]
    if not points:
        # min()/max() on an empty sequence would raise; nothing to extract anyway.
        return None
    xs, ys = zip(*points)
    x_min, y_min = min(xs), min(ys)
    width = max(xs) - x_min
    height = max(ys) - y_min
    if width == 0 and height == 0:
        return None
    return {"x_min": x_min, "y_min": y_min, "width": width, "height": height, "points": points}


for filename in os.listdir(input_dir):
    if Path(filename).suffix != ".xml":
        continue
    stem = Path(filename).stem

    regions_tag = ET.parse(f'{input_dir}/{filename}').getroot().find('Annotation/Regions')
    # A file without an Annotation/Regions element is treated as having no regions.
    region_tags = regions_tag.findall('Region') if regions_tag is not None else []
    bounding_boxes = [
        box for box in map(_region_to_bounding_box, region_tags) if box is not None
    ]
    print()
    print(bounding_boxes)
    with open(f"{input_dir}/{stem}.json", 'w') as f:
        json.dump(bounding_boxes, f)

    slide_filename = f"{stem}.svs"
    output_dir = f"data/rois/{stem}/"
    os.makedirs(output_dir, exist_ok=True)
    # Context manager closes the slide handle even if a crop fails midway.
    with openslide.OpenSlide(f"{input_dir}/{slide_filename}") as slide:
        for i, bounding_box in enumerate(bounding_boxes):
            x, y, w, h = bounding_box["x_min"], bounding_box["y_min"], bounding_box["width"], bounding_box["height"]
            if w == 0 or h == 0:
                # A zero-area crop cannot be encoded as an image. The index `i`
                # is still consumed so file names stay aligned with the JSON.
                continue
            roi = np.array(slide.read_region((x, y), 0, (w, h)))
            # read_region yields RGBA, while cv2.imwrite interprets channels as
            # BGR(A); without this conversion red and blue are swapped on disk.
            roi_bgra = cv2.cvtColor(roi, cv2.COLOR_RGBA2BGRA)
            out_path = os.path.join(output_dir, f"{i}_{x},{y}_{w},{h}.png")
            if not cv2.imwrite(out_path, roi_bgra):
                # imwrite signals failure through its return value only.
                print(f"warning: could not write {out_path}")


[{'x_min': 108200, 'y_min': 23667, 'width': 292, 'height': 323, 'points': [(108384, 23700), (108363, 23721), (108339, 23744), (108329, 23754), (108318, 23766), (108309, 23776), (108299, 23786), (108292, 23796), (108284, 23805), (108279, 23814), (108273, 23821), (108268, 23828), (108263, 23837), (108258, 23842), (108255, 23850), (108252, 23853), (108248, 23857), (108242, 23866), (108241, 23869), (108238, 23873), (108235, 23879), (108234, 23883), (108231, 23886), (108229, 23890), (108226, 23896), (108222, 23902), (108221, 23906), (108215, 23915), (108213, 23919), (108210, 23925), (108209, 23928), (108208, 23931), (108206, 23934), (108205, 23938), (108205, 23941), (108203, 23944), (108202, 23945), (108202, 23948), (108202, 23951), (108202, 23954), (108202, 23958), (108200, 23964), (108200, 23969), (108200, 23971), (108200, 23974), (108200, 23977), (108200, 23980), (108200, 23983), (108200, 23985), (108200, 23986), (108200, 23987), (108202, 23989), (108203, 23989), (108206, 23990), (10820

In [3]:
import json
import os

# Print the number of annotation entries in each JSON file of the directory,
# then the grand total across all files.
annotations_dir = 'data/whole-slides/gut'

total = 0
# os.listdir returns names in arbitrary order; sorting keeps the report
# stable across runs and platforms.
for file_name in sorted(os.listdir(annotations_dir)):
    if not file_name.endswith('.json'):
        continue
    with open(os.path.join(annotations_dir, file_name), 'r') as f:
        annotations = json.load(f)
    count = len(annotations)
    total += count
    print(f"{file_name}: {count:03d}")
print("total", total)


522021.json: 001
522934.json: 031
593433.json: 001
593434.json: 014
593435.json: 060
593436.json: 094
593437.json: 050
593438.json: 031
593439.json: 024
593440.json: 030
593441.json: 004
593443.json: 004
593444.json: 027
593445.json: 084
593446.json: 008
593447.json: 018
593448.json: 013
593449.json: 129
593450.json: 048
593451.json: 001
593452.json: 126
593453.json: 015
593454.json: 018
total 831
