In [1]:
import json
import os
import re
import sys
from collections import defaultdict

import xml.etree.ElementTree as ET
from xml.dom import minidom

import numpy as np
import pandas as pd

import openslide
from PIL import Image
from PIL import ImageDraw

# Put the project's python source directory first on sys.path so that the
# digipath_toolkit imports below can be resolved from it.
# NOTE: the path is relative, so it depends on the kernel's working directory.
py_src_code_dir = '../src/python'
sys.path.insert(0, py_src_code_dir)
from digipath_toolkit import get_sample_selection_mask, get_strided_fence_array
from digipath_toolkit import get_patch_location_array_for_image_level

# Input locations (relative paths, resolved against the working directory).
# NOTE(review): zip_tank and im_dir repeat the data_dir literal instead of
# being derived from data_dir - keep them in sync if the data root moves.
data_dir = '../../DigiPath_MLTK_data'
zip_tank = '../../DigiPath_MLTK_data/zipTank/wsi_annotation_sample/'
# annotation xml file; its base name matches the image file named below
xml_name = os.path.join(zip_tank, 'e39a8d60a56844d695e9579bce8f0335.xml')
# class label / id csv file (presumably the label-ID-Priority table - TODO confirm)
c_lab_id_fn = os.path.join(zip_tank, 'class_label_id.csv')

# Image file that shares its base name with the annotation xml file above.
im_dir = '../../DigiPath_MLTK_data/RegistrationDevData'
im_file = 'e39a8d60a56844d695e9579bce8f0335.tiff'
image_file_name = os.path.join(im_dir, im_file)

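# Safely Parse XML - *get_label_coords_dict(xml_file_name)*:
[Python docs: xml module security warning](https://docs.python.org/3/library/xml.html#defused-packages) <br>
[defusedxml - overview of the risks](https://pypi.org/project/defusedxml/) <br>
****
Python's standard `xml` library is not secure against maliciously constructed data, so the annotation file is read here as plain text and scanned line by line; if the file cannot be read the function fails safely (it returns `None`) without ever invoking an XML parser.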

In [2]:
"""
        Note to self: consider a .csv file with the columns: label, ID, Priority
        needed form is  Id: {'Priority': p, 'label': label_name}
"""

# Attributes copied from each <Region ...> tag into that region's dictionary.
_REGION_KEYS = ('Id', 'Text')

# One name="value" attribute. Group 2 keeps the surrounding double quotes so the
# stored values (and the label keys) stay in the form this function has always
# returned, e.g. '"offset"'.
_XML_ATTRIBUTE_RE = re.compile(r'([A-Za-z_][\w.\-]*)\s*=\s*("[^"]*")')


def _parse_region_attributes(region_line):
    """ Return {name: quoted value} for the _REGION_KEYS attributes on a <Region ...> line. """
    return {name: value
            for name, value in _XML_ATTRIBUTE_RE.findall(region_line)
            if name in _REGION_KEYS}


def _parse_vertex_xy(vertex_line):
    """ Return [x, y] as floats from a <Vertex .../> line, or None if X or Y is absent. """
    attributes = dict(_XML_ATTRIBUTE_RE.findall(vertex_line))
    if 'X' not in attributes or 'Y' not in attributes:
        return None
    return [float(attributes['X'].strip('"')), float(attributes['Y'].strip('"'))]


def get_label_coords_dict(xml_file_name):
    """ label_coords_dict = get_label_coords_dict(xml_name)
    scan an annotation xml file as plain text (no xml parser is used) for the
    key fields needed for annotation selection of images

    The file is expected to hold one tag per line: a '<Region ...>' line, later
    a '<Vertices>' line, one '<Vertex X="..." Y="..." .../>' line per vertex and
    a closing '</Vertices>' line.

    Args:
        xml_file_name:      path of the annotation xml file

    Returns:
        label_coords_dict:  None if the file could not be read or is empty (a
                            message is printed), otherwise a defaultdict(dict) s.t.
                                keys are the region Text values (labels),
                                values are python dicts with:
                                    Id:     region Id number
                                    Text:   label repeated
                                    coords: vertices as numpy (n x 2) array [[x, y], [x, y],...]

                            Id, Text and the keys are stored exactly as written in the
                            file, including the double quotes (e.g. '"offset"').
                            A region is kept only if it has a Text attribute and at
                            least two vertices; if two regions share a label the
                            later one replaces the earlier one.

    Raises:
        ValueError:         if a vertex X or Y value is not a number
    """
    # guard file opening / reading - any failure is reported below and gives None
    try:
        with open(xml_file_name, 'r') as fh:
            lines = fh.readlines()
    except (OSError, TypeError, ValueError):
        # missing / unreadable file, bad file name type, or undecodable text
        lines = []

    if len(lines) < 1:
        print('Failed to Open xml file:\n\t', xml_file_name)
        return None

    # initialize region-coords loop cycle variables
    in_region = False
    in_vertices = False
    vertex_list = []
    region_dict = {}

    # initialize return variable
    label_dict = defaultdict(dict)

    for raw_line in lines:
        line = raw_line.strip()

        if line.startswith('<Region '):
            # Every <Region ...> tag starts a fresh region, so a previous region
            # that never closed a vertex list cannot claim this region's vertices.
            region_dict = _parse_region_attributes(line)
            vertex_list = []
            in_region = True
            in_vertices = False

        elif not in_region:
            # nothing outside of a region is of interest
            continue

        elif in_vertices and line == '</Vertices>':
            # end of region - add to label_dict if a label and coords were found
            if 'Text' in region_dict and len(vertex_list) > 1:
                region_dict['coords'] = np.array(vertex_list)
                label_dict[region_dict['Text']] = region_dict

            # reset all region-coords loop cycle variables
            in_region = False
            in_vertices = False
            vertex_list = []
            region_dict = {}

        elif in_vertices and line.startswith('<Vertex'):
            # add every vertex that has both an X and a Y to the list of coords
            xy = _parse_vertex_xy(line)
            if xy is not None:
                vertex_list.append(xy)

        elif '<Vertices' in line:
            in_vertices = True

    return label_dict


In [3]:
"""
            test: label_coords_dict = get_label_coords_dict(xml_name)
"""
# Parse the sample annotation file (xml_name is set in the first cell).
label_coords_dict = get_label_coords_dict(xml_name)

# Print each label key, the type of its entry and the shape of its vertex array.
# NOTE: get_label_coords_dict returns None when the file cannot be read, in
# which case this loop raises AttributeError.
for k, v in label_coords_dict.items():
    print(k, type(v), v['coords'].shape)

"offset" <class 'dict'> (3, 2)
"Region" <class 'dict'> (4, 2)
"malignant" <class 'dict'> (133, 2)
"normal" <class 'dict'> (4, 2)
"fat" <class 'dict'> (4, 2)
"null" <class 'dict'> (110, 2)
"lymph" <class 'dict'> (152, 2)
"ink" <class 'dict'> (619, 2)


In [4]:
"""
            View the xml file in the raw
"""
# Read the whole annotation file into memory as text.
with open(xml_name, 'r') as fh:
    lines = fh.readlines()

# Echo the file: lines of at most 100 characters (after stripping) are printed
# as they are; longer lines are split on whitespace, one token per output line,
# with tokens that contain '<' flush left and all other tokens tab-indented.
for line in lines:
    stripped = line.strip()
    if len(stripped) <= 100:
        print(stripped)
        continue

    for token in stripped.split():
        if '<' in token:
            print(token)
        else:
            print('\t', token)

<Annotations MicronsPerPixel="0.252100">
<Annotation
	 Id="1"
	 Name=""
	 ReadOnly="0"
	 NameReadOnly="0"
	 LineColorReadOnly="0"
	 Incremental="0"
	 Type="4"
	 LineColor="65280"
	 Visible="1"
	 Selected="1"
	 MarkupImagePath=""
	 MacroName="">
<Attributes>
<Attribute Name="Description" Id="0" Value=""/>
</Attributes>
<Regions>
<RegionAttributeHeaders>
<AttributeHeader Id="9999" Name="Region" ColumnWidth="-1"/>
<AttributeHeader Id="9997" Name="Length" ColumnWidth="-1"/>
<AttributeHeader Id="9996" Name="Area" ColumnWidth="-1"/>
<AttributeHeader Id="9998" Name="Text" ColumnWidth="-1"/>
<AttributeHeader Id="1" Name="Description" ColumnWidth="-1"/>
</RegionAttributeHeaders>
<Region
	 Id="1"
	 Type="5"
	 Text="offset"
	 GeoShape="Points"
	 Zoom="0.042148"
	 Selected="0"
	 ImageLocation=""
	 ImageFocus="0"
	 Length="74565.8"
	 Area="213363186.2"
	 LengthMicrons="18798.0"
	 AreaMicrons="13560170.4"
	 NegativeROA="0"
	 InputRegionId="0"
	 Analyze="1"
	 DisplayId="1">
<Attributes/>
<Vertices>
<