In [1]:

import os
import shutil
from pathlib import Path

import cv2
from tqdm import tqdm

if hasattr(os, 'add_dll_directory'):
    # Windows
    OPENSLIDE_PATH = os.path.join(os.path.abspath(os.getcwd()),
                                  "libs/openslide-bin-4.0.0.3-windows-x64/bin")
    with os.add_dll_directory(OPENSLIDE_PATH):
        import openslide
else:
    import openslide
import numpy as np
import math
import json
import glob

In [3]:

def calculate_bbox_overlap_percentage(bbox1, bbox2):
    """Return the share of ``bbox1``'s area that is covered by ``bbox2``.

    Both boxes are ``(x, y, width, height)`` tuples, where ``(x, y)`` is the
    top-left corner. The result is expressed on a 0-100 scale (a percentage,
    not a fraction) and is relative to the area of ``bbox1`` only, so the
    function is not symmetric in its arguments.

    Boxes that merely touch along an edge, or do not meet at all, yield ``0.0``.
    """
    left1, top1, width1, height1 = bbox1
    left2, top2, width2, height2 = bbox2

    # Edges of the intersection rectangle (if there is one).
    overlap_left = max(left1, left2)
    overlap_right = min(left1 + width1, left2 + width2)
    overlap_top = max(top1, top2)
    overlap_bottom = min(top1 + height1, top2 + height2)

    # An empty or inverted extent on either axis means the boxes do not overlap.
    if overlap_right <= overlap_left or overlap_bottom <= overlap_top:
        return 0.0

    overlap_area = (overlap_right - overlap_left) * (overlap_bottom - overlap_top)
    return (overlap_area / (width1 * height1)) * 100


def is_bbox_1_center_in_bbox_2(bbox1, bbox2):
    """Tell whether the centre point of ``bbox1`` falls inside ``bbox2``.

    Both boxes are ``(x, y, width, height)`` tuples. The edges of ``bbox2``
    are inclusive, so a centre lying exactly on its border counts as inside.
    """
    left1, top1, width1, height1 = bbox1
    left2, top2, width2, height2 = bbox2

    mid_x = left1 + width1 / 2
    mid_y = top1 + height1 / 2

    # Reject as soon as the centre is outside on one axis.
    if not (left2 <= mid_x <= left2 + width2):
        return False
    if not (top2 <= mid_y <= top2 + height2):
        return False
    return True


def grid_segment_slides(input_dir, root_output_dir, filter=None, cell_size=256, level=0):
    """Cut every ``.svs`` slide in ``input_dir`` into square cells and save them as PNGs.

    Cells are written to
    ``{root_output_dir}/{slide stem}/{cell_size}x{cell_size}/{i},{j}_{x}_{y}.png``
    where ``i``/``j`` are the column/row indices of the cell and ``x``/``y`` are
    its top-left pixel coordinates in the coordinate frame of ``level``.

    WARNING: ``root_output_dir`` is deleted recursively before anything is written.

    Args:
        input_dir: Directory that is scanned (non-recursively) for ``.svs`` files.
        root_output_dir: Root of the output tree; wiped at the start of the call.
        filter: Optional predicate ``filter(cell, x, y, slide_filename)``. ``cell``
            is the NumPy array built from the image ``read_region`` returned
            (RGBA channel order per the OpenSlide documentation). A cell is saved
            only when the predicate returns a truthy value; ``None`` saves all cells.
        cell_size: Side length of each square cell, in pixels of ``level``.
        level: Pyramid level of the slide to read from.
    """
    # Start from a clean tree so cells from a previous run cannot linger.
    if os.path.exists(root_output_dir):
        shutil.rmtree(root_output_dir)
    for slide_filename in os.listdir(input_dir):
        if Path(slide_filename).suffix != ".svs":
            continue
        print(slide_filename)
        output_dir = f"{root_output_dir}/{Path(slide_filename).stem}/{cell_size}x{cell_size}"
        os.makedirs(output_dir, exist_ok=True)

        # The context manager closes the slide handle even when reading, the
        # filter, or writing raises part-way through.
        with openslide.OpenSlide(f"{input_dir}/{slide_filename}") as slide:
            slide_width, slide_height = slide.level_dimensions[level]
            # read_region() expects its location in level-0 pixels, whereas the
            # grid below is laid out in pixels of `level`; this factor converts
            # between the two (it is 1.0 for level 0).
            downsample = slide.level_downsamples[level]

            # The ranges below also visit the partial cells on the right and
            # bottom edges, so the total must be rounded up, not down.
            cells_count_x = math.ceil(slide_width / cell_size)
            cells_count_y = math.ceil(slide_height / cell_size)
            with tqdm(total=cells_count_x * cells_count_y, desc="Progress") as pbar:
                for i, x in enumerate(range(0, slide_width, cell_size)):
                    for j, y in enumerate(range(0, slide_height, cell_size)):
                        location = (int(round(x * downsample)), int(round(y * downsample)))
                        cell = np.array(slide.read_region(location, level, (cell_size, cell_size)))
                        if filter is None or filter(cell, x, y, slide_filename):
                            cell_file_path = f"{output_dir}/{i},{j}_{x}_{y}.png"
                            # OpenSlide delivers RGBA while cv2.imwrite() interprets
                            # 4-channel data as BGRA; convert so that red and blue
                            # are not swapped in the saved file. The filter above
                            # still receives the unconverted array.
                            cv2.imwrite(cell_file_path, cv2.cvtColor(cell, cv2.COLOR_RGBA2BGRA))
                        pbar.update(1)


def is_not_mostly_blank(cell, non_blank_percentage=0.5, blank_threshold=240):
    """Decide whether enough of ``cell`` is darker than the blank threshold.

    The image is converted to grayscale with ``cv2.COLOR_BGR2GRAY`` (i.e. its
    channels are interpreted in BGR order). A pixel counts as non-blank when its
    gray value is strictly below ``blank_threshold``.

    Args:
        cell: Image array accepted by ``cv2.cvtColor``.
        non_blank_percentage: Minimum fraction (0-1, despite the name) of
            non-blank pixels; the comparison is strict.
        blank_threshold: Gray level at or above which a pixel is treated as blank.

    Returns:
        Truthy when the fraction of non-blank pixels exceeds ``non_blank_percentage``.
    """
    grayscale = cv2.cvtColor(cell, cv2.COLOR_BGR2GRAY)
    dark_mask = grayscale < blank_threshold
    dark_fraction = np.sum(dark_mask) / grayscale.size
    return dark_fraction > non_blank_percentage


# Load every ROI annotation file that sits next to the slides, keyed by the
# file name without its extension (which is how slides are matched to them).
rois_by_file = {}
for filepath in glob.glob("data/whole-slides/gut/*.json"):
    annotation_path = Path(filepath)
    rois_by_file[annotation_path.stem] = json.loads(annotation_path.read_text())

print(rois_by_file)


def is_in_any_roi(cell, cell_x, cell_y, slide_filename, overlap_threshold=0.5):
    """Report whether the cell captures any ROI annotated for its slide.

    The cell is treated as a square whose side is ``cell.shape[0]`` and whose
    top-left corner is ``(cell_x, cell_y)``. An ROI matches when both hold:

    * the percentage of the ROI's area covered by the cell is strictly greater
      than ``overlap_threshold``, and
    * the centre of the ROI lies inside the cell.

    ROIs are looked up in the module-level ``rois_by_file`` under the slide's
    file name without extension; a slide with no entry raises ``KeyError``.

    NOTE(review): ``calculate_bbox_overlap_percentage`` returns a value on a
    0-100 scale, so the default ``overlap_threshold=0.5`` means 0.5 %, not
    50 % - confirm that this is intended.
    """
    slide_rois = rois_by_file[Path(slide_filename).stem]
    side = cell.shape[0]
    cell_bbox = (cell_x, cell_y, side, side)

    def as_bbox(roi):
        # Annotation dict -> (x, y, width, height) tuple.
        return roi["x_min"], roi["y_min"], roi["width"], roi["height"]

    return any(
        calculate_bbox_overlap_percentage(as_bbox(roi), cell_bbox) > overlap_threshold
        and is_bbox_1_center_in_bbox_2(as_bbox(roi), cell_bbox)
        for roi in slide_rois
    )


def _keep_cell(cell, x, y, slide_filename):
    """Keep cells that are not mostly blank and that do not match any ROI of their slide."""
    if not is_not_mostly_blank(cell, non_blank_percentage=0.5):
        return False
    return not is_in_any_roi(cell, x, y, slide_filename)


# Segment all gut slides, saving only the cells accepted by `_keep_cell`.
grid_segment_slides(
    input_dir="data/whole-slides/gut",
    root_output_dir="output/temp",
    filter=_keep_cell,
)

{'024048134068;0;A;1;HE;H;GB_522021': [{'x_min': 108200, 'y_min': 23667, 'width': 292, 'height': 323}], '522934': [{'x_min': 162664, 'y_min': 29288, 'width': 242, 'height': 236}, {'x_min': 163485, 'y_min': 29469, 'width': 253, 'height': 244}, {'x_min': 162129, 'y_min': 29096, 'width': 173, 'height': 164}, {'x_min': 167794, 'y_min': 20703, 'width': 486, 'height': 196}, {'x_min': 150454, 'y_min': 35879, 'width': 157, 'height': 129}, {'x_min': 147624, 'y_min': 32247, 'width': 178, 'height': 140}, {'x_min': 155621, 'y_min': 25969, 'width': 341, 'height': 218}, {'x_min': 168227, 'y_min': 22700, 'width': 335, 'height': 178}, {'x_min': 149590, 'y_min': 16378, 'width': 168, 'height': 330}, {'x_min': 134934, 'y_min': 39834, 'width': 178, 'height': 144}, {'x_min': 132153, 'y_min': 38294, 'width': 178, 'height': 234}, {'x_min': 127993, 'y_min': 39672, 'width': 199, 'height': 225}, {'x_min': 104503, 'y_min': 39517, 'width': 373, 'height': 403}, {'x_min': 103250, 'y_min': 38803, 'width': 307, 'heig

Progress: 135936it [05:44, 394.48it/s]                            


522934.svs


Progress: 165432it [06:50, 402.89it/s]                            
