In [None]:
#! rm -r WC
#! rm -r NC
#! mkdir WC
#! mkdir NC

# Notebook shell commands: install the OpenSlide tools package via apt and
# the openslide-python package via pip.
!apt-get install openslide-tools
!pip install openslide-python

from google.colab import drive
# Mount Google Drive at /content/drive
drive.mount('/content/drive')

import os
import openslide
import xml.etree.cElementTree as ET
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv

In [33]:
# `factor` is a divisor applied to every annotation coordinate; the default
# of 1 keeps the coordinates as stored in the file (truncated to int).
def parse_xml(anno_path, factor=1):
    """
    Read the ROI outline from an XML annotation file.

    Every child element of every `Vertices` element found in the document is
    read in document order. Its "X" and "Y" attributes are parsed as floats,
    divided by `factor` and truncated to int.

    Args:
    - anno_path: path (or file object) of the XML annotation file.
    - factor (number): divisor applied to each coordinate. Presumably used to
      map annotations onto a downsampled level - confirm with the caller.

    Returns:
    - list of (x, y) int tuples, one per vertex.
    """

    def scaled(vertex, axis):
        # float() first so values such as "10.9" parse; int() truncates.
        return int(float(vertex.attrib.get(axis)) / factor)

    root = ET.ElementTree(file=anno_path).getroot()
    return [
        (scaled(vertex, "X"), scaled(vertex, "Y"))
        for vertices in root.iter("Vertices")
        for vertex in vertices
    ]


# Location of the XML annotation file describing the ROI.
# NOTE(review): `[...]` looks like a redacted placeholder; parse_xml hands this
# value to ElementTree as the file to read - confirm the real path.
roi_path = [...]
# ROI outline as a list of (x, y) vertices, parsed with the default factor of 1.
roi_coords = parse_xml(roi_path)

In [34]:
def is_not_white_or_gray(patch, threshold=0.95, gray_threshold=200):
    """
    Check if the patch is not predominantly white or light gray.

    The patch is converted to grayscale and every pixel brighter than
    `gray_threshold` is counted as background. The patch is kept when the
    background fraction is below `threshold`.

    Args:
    - patch: image as a NumPy array, or any object `np.array` can convert
      (e.g. a PIL image). Must have at least two dimensions (rows, columns).
    - threshold (float): background fraction at or above which the patch is
      rejected.
    - gray_threshold (int): grayscale intensity above which a pixel counts as
      background.

    Returns:
    - bool: True if the background fraction is below `threshold`, False
      otherwise (including for a patch with zero pixels).
    """
    # Convert once and use the array for both the size and the OpenCV calls,
    # so inputs without a `.shape` attribute (PIL images) work as well.
    pixels = np.array(patch)
    total_pixels = pixels.shape[0] * pixels.shape[1]
    if total_pixels == 0:
        # Nothing to inspect; avoid dividing by zero below.
        return False

    gray = cv.cvtColor(pixels, cv.COLOR_RGB2GRAY)
    # THRESH_BINARY marks pixels strictly brighter than gray_threshold.
    _, binary = cv.threshold(gray, gray_threshold, 255, cv.THRESH_BINARY)
    background_ratio = cv.countNonZero(binary) / total_pixels
    return background_ratio < threshold


def point_inside_polygon(x, y, poly):
    """
    Check if a point (x, y) is inside a polygon defined by vertices in `poly`.
    The algorithm used is Ray Casting Method.

    The polygon is treated as closed: the last vertex is joined back to the
    first. Points lying exactly on the boundary are not classified
    consistently (e.g. for an axis-aligned square a point on the right edge
    is reported inside, one on the left edge outside).

    Args:
    - x (float): x-coordinate of the point.
    - y (float): y-coordinate of the point.
    - poly (list of tuples): List of (x, y) coordinates defining the polygon vertices.

    Returns:
    - bool: True if the point is inside the polygon, False otherwise.
      An empty `poly` always yields False.
    """
    n = len(poly)
    if n == 0:
        # No vertices means no area; avoids indexing into an empty sequence.
        return False

    inside = False
    for i in range(n):
        p1x, p1y = poly[i]
        p2x, p2y = poly[(i + 1) % n]

        # The edge must straddle the horizontal line through y (half-open in
        # y, so horizontal edges never qualify and p2y - p1y is never zero
        # below), and the point must not lie right of the whole edge.
        if not (min(p1y, p2y) < y <= max(p1y, p2y)):
            continue
        if x > max(p1x, p2x):
            continue

        # x-coordinate where the edge meets the horizontal line through y.
        xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
        if p1x == p2x or x <= xinters:
            inside = not inside
    return inside

In [None]:
def patch_inside_roi(patch_vertices, roi_vertices):
    """
    Tell whether a patch touches the region of interest (ROI).

    Only the patch vertices are tested, one by one, with
    `point_inside_polygon`; the search stops at the first vertex found inside
    the ROI.

    Args:
    - patch_vertices (list of tuples): (x, y) coordinates of the patch vertices.
    - roi_vertices (list of tuples): (x, y) coordinates of the ROI vertices.

    Returns:
    - bool: True if at least one vertex of the patch is inside the ROI, False otherwise.
    """
    return any(
        point_inside_polygon(vx, vy, roi_vertices) for vx, vy in patch_vertices
    )


def get_patch_vertices(x, y, patch_size):
    """
    Build the four corners of a square patch.

    Args:
    - x: x-coordinate of the patch's top-left corner.
    - y: y-coordinate of the patch's top-left corner.
    - patch_size: side length of the patch.

    Returns:
    - list of (x, y) tuples in the order top-left, top-right, bottom-left,
      bottom-right (note: this is not a perimeter walk).
    """
    right = x + patch_size
    bottom = y + patch_size
    # The left edge is x itself; deriving it as (x + patch_size) - patch_size
    # can drift from x when the inputs are floats.
    return [(x, y), (right, y), (x, bottom), (right, bottom)]


# Function to extract patches from the WSI
# TODO: Let this return (patch, label) and handle saving outside of this function
def extract_patches(
    image_path, not_roi_path, in_roi_path, roi_coords, patch_size=512, mag_level=0
):
    """
    Extract patches from WSI image at a certain mag level, patches inside roi_coords are saved in a different folder.

    The level is tiled with non-overlapping `patch_size` x `patch_size`
    squares; partial tiles at the right/bottom border are skipped. A patch
    with at least one corner inside the ROI is always saved to `in_roi_path`.
    Any other patch is saved to `not_roi_path` only if it passes
    `is_not_white_or_gray`.

    Args:
    - image_path (str): the original WSI image path
    - not_roi_path (str): path to folder for patches that are outside of roi_coords
    - in_roi_path (str): path to folder for patches that are inside of roi_coords
    - roi_coords (list of tuples): (x, y) vertices of the ROI polygon. They are
      compared against patch corners expressed in pixel coordinates of
      `mag_level`, so they must be in that same frame.
    - patch_size (int): side length of each patch, in pixels of `mag_level`
    - mag_level (int): slide level to read from

    Returns:
    - None. It saves patches that are inside the ROI into in_roi_path folder, others into not_roi_path
      as `{x}_{y}_mag{mag_level}.png`. Both folders are created if missing.
    """
    os.makedirs(in_roi_path, exist_ok=True)
    os.makedirs(not_roi_path, exist_ok=True)

    slide = openslide.open_slide(image_path)
    try:
        width, height = slide.level_dimensions[mag_level]
        # Constant for the whole level, so look it up once.
        downsampling_factor = slide.level_downsamples[mag_level]

        # "+ 1" keeps the last full tile when the dimension is an exact
        # multiple of patch_size.
        for y in range(0, height - patch_size + 1, patch_size):
            for x in range(0, width - patch_size + 1, patch_size):
                # read_region takes the location in level-0 coordinates and
                # the size in coordinates of the requested level.
                patch = slide.read_region(
                    (int(x * downsampling_factor), int(y * downsampling_factor)),
                    mag_level,
                    (patch_size, patch_size),
                )

                patch_vertices = get_patch_vertices(x, y, patch_size)

                if patch_inside_roi(patch_vertices, roi_coords):
                    patch.save(f"{in_roi_path}/{x}_{y}_mag{mag_level}.png")
                elif is_not_white_or_gray(np.array(patch)):
                    patch.save(f"{not_roi_path}/{x}_{y}_mag{mag_level}.png")
    finally:
        # Release the slide handle even if reading or saving fails.
        slide.close()


# Inputs for the extraction run.
# NOTE(review): the `[...]` values look like redacted placeholders; the call
# below passes them as the slide path and the two output folders - confirm the
# real values.
image_path = [...]
nc_path = [...]
wc_path = [...]
# Patches with a corner inside the ROI go to wc_path, the remaining
# non-background patches to nc_path (default patch_size=512, mag_level=0).
extract_patches(image_path, not_roi_path=nc_path, in_roi_path=wc_path, roi_coords=roi_coords)
