# Safely Parse XML - *get_label_coords_dict(xml_file_name)*:
[python docs: xml module warning ](https://docs.python.org/3/library/xml.html#defused-packages) <br>
[defusedxml - description of the XML parsing risks](https://pypi.org/project/defusedxml/) <br>
****
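The Python standard-library xml modules have known vulnerabilities when parsing untrusted data, so here the file is parsed as plain text instead - a file that does not match the expected layout simply fails to parse (with no risk).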

In [None]:
import json
import os
import re
import sys
from collections import defaultdict, OrderedDict

import xml.etree.ElementTree as ET
from xml.dom import minidom

import numpy as np
import pandas as pd

import openslide
from PIL import Image
from PIL import ImageDraw

py_src_code_dir = '../src/python'
sys.path.insert(0, py_src_code_dir)
from digipath_toolkit import get_sample_selection_mask, get_strided_fence_array
from digipath_toolkit import get_patch_location_array_for_image_level

# Relative paths to the sample data used by the cells below.
# NOTE(review): these presumably resolve against the notebook's working directory - confirm before running.

# top-level data directory
data_dir = '../../DigiPath_MLTK_data'
# directory holding the sample annotation .xml and the class label .csv
zip_tank = '../../DigiPath_MLTK_data/zipTank/wsi_annotation_sample/'
# annotation file for the sample image (same base name as im_file below)
xml_name = os.path.join(zip_tank, 'e39a8d60a56844d695e9579bce8f0335.xml')
# label / ID / priority table
c_lab_id_fn = os.path.join(zip_tank, 'class_label_id.csv')

# directory and file name of the image that the annotation file belongs to
im_dir = '../../DigiPath_MLTK_data/RegistrationDevData'
im_file = 'e39a8d60a56844d695e9579bce8f0335.tiff'
image_file_name = os.path.join(im_dir, im_file)

In [None]:

def get_priority_ordered_labels(label_id_priority_fname):
    """ ordered_priority_dict = get_priority_ordered_labels(label_id_priority_fname)

    Read a comma-separated label table and return its rows keyed by priority.

    Args:
        label_id_priority_fname:    .csv file name with Header = Label, ID, Priority

    Returns:
        ordered_priority_dict:      OrderedDict {priority_number: {'label': label_str, 'label_ID': str_number}}
                                    sorted with largest priority number first.
                                    Empty if the file cannot be read or has no data rows.
                                    If two rows share a priority the later row wins.

    Raises:
        ValueError:                 if a data row has a Priority field that is not an integer
    """
    # read the file - an unreadable file is reported and treated as empty
    try:
        with open(label_id_priority_fname, 'r') as fh:
            lines = fh.readlines()
    except (OSError, UnicodeDecodeError):
        print('failed opening: ', label_id_priority_fname)
        lines = []

    # read the .csv lines into (priority, row_dict) tuples   Header = Label, ID, Priority
    priority_tuples_list = []
    for line in lines:
        ln_list = [field.strip() for field in line.split(',')]
        # skip blank / short lines and the header row
        if len(ln_list) < 3 or ln_list[0] == 'Label':
            continue
        #                   Fix name clash: .xml "Id" vs .csv "ID" - Renaming csv-ID as label_ID
        row_dict = {'label': ln_list[0], 'label_ID': ln_list[1]}
        priority_tuples_list.append((int(ln_list[2]), row_dict))

    # sort on the priority number only - comparing whole tuples would fall through
    # to comparing the dicts (TypeError) whenever two rows share a priority
    priority_tuples_list.sort(key=lambda priority_row: priority_row[0], reverse=True)

    return OrderedDict(priority_tuples_list)
    
    
# matches one   name="value"   attribute pair on a line of xml text (double-quoted values only)
_XML_ATTR_RE = re.compile(r'([\w:.-]+)\s*=\s*"([^"]*)"')


def _get_xml_line_attributes(line):
    """ attr_dict = _get_xml_line_attributes(line)  -  {name: value_str} for each name="value" on the line """
    return dict(_XML_ATTR_RE.findall(line))


def get_ordered_priority_label_coords_dict(xml_file_name, label_id_priority_fname):
    """
    priority_dict = get_ordered_priority_label_coords_dict(xml_file_name, label_id_priority_fname)

    parse an xml file (as plain text, one element per line) for the key fields needed
    for annotation selection of images

    Args:
        xml_file_name:              annotation xml file with <Region Id=".." Text=".."> elements,
                                    each holding a <Vertices> list of <Vertex X=".." Y=".."/> lines
        label_id_priority_fname:    .csv file with columns Label, ID, Priority

    Returns:
        priority_dict:              python dict of dicts - with priority numbers as ordered keys
                                    (largest first), values are python dicts with:
                                        label:      label from the .csv file
                                        label_ID:   ID from the .csv file
                                    and, for every label that has a region in the xml file:
                                        Text:       region Text (equal to label)
                                        coords:     vertices as numpy (n x 2) array [[x, y], [x, y],...]
                                        Id:         region Id number - deprecated - defined by label
                                    Labels without a region keep only label and label_ID.
                                    If several regions share a Text, the last one in the file wins.
                                    Regions whose Text is not in the .csv file are reported and skipped.

    Raises:
        OSError:                    if xml_file_name cannot be opened
        ValueError:                 if a region Id or a vertex X / Y value is not numeric
    """
    # define which region keys to include, and how to convert their text values
    REGION_KEYS = {'Id': int, 'Text': str}

    # module call:          get the {priority: {label: l, label_ID: n}}
    ordered_priority_dict = get_priority_ordered_labels(label_id_priority_fname)

    # read the file into text
    with open(xml_file_name, 'r') as fh:
        lines = fh.readlines()

    if not lines:
        print('\n\n\t\t\tThrow_A_Pythonic_Conniption_Fit')
        print('\t\t\tFail to read: ',xml_file_name, '\n\n')
        return ordered_priority_dict

    if not ordered_priority_dict:
        # without the label table no region can be assigned a priority
        print('\n\n\t\t\tThrow_A_Pythonic_Conniption_Fit')
        print('\t\t\tFail to read: ',label_id_priority_fname, '\n\n')
        return ordered_priority_dict

    # create the reverse dict {label: priority}
    label_to_priority = {v['label']: k for k, v in ordered_priority_dict.items()}

    # initialize region-coords loop cycle variables
    in_region = False
    in_vertices = False
    vertex_list = []
    region_dict = {}

    for line in lines:
        stripped = line.strip()

        if stripped.startswith('<Region '):
            # begin parsing a new region - always start clean, so that a previous region
            # that never closed its vertices cannot claim this region's coords
            in_region = True
            in_vertices = False
            vertex_list = []
            attributes = _get_xml_line_attributes(stripped)
            region_dict = {k: convert(attributes[k])
                           for k, convert in REGION_KEYS.items() if k in attributes}

        elif not in_region:
            # nothing outside of a region is of interest
            continue

        elif in_vertices and stripped == '</Vertices>':
            # end of region - add region_dict to ordered_priority_dict if coords found
            if 'Id' in region_dict and 'Text' in region_dict and len(vertex_list) > 1:
                # priority 0 is valid - test for None, not for truth
                priority = label_to_priority.get(region_dict['Text'])
                if priority is None:
                    print('\n\n\t\t\tThrow_A_Pythonic_Conniption_Fit')
                    print('region label not in label file:', region_dict, '\n\n')
                else:
                    priority_entry = ordered_priority_dict[priority]
                    priority_entry['Text'] = region_dict['Text']
                    priority_entry['Id'] = region_dict['Id']
                    priority_entry['coords'] = np.array(vertex_list)
            else:
                print('\n\n\t\t\tThrow_A_Pythonic_Conniption_Fit')
                print('region_dict', region_dict, 'len(vertex_list)', len(vertex_list), '\n\n')

            # reset all region-coords loop cycle variables
            in_region = False
            in_vertices = False
            vertex_list = []
            region_dict = {}

        elif in_vertices and stripped.startswith('<Vertex'):
            # add every vertex's coords to the list of coords
            attributes = _get_xml_line_attributes(stripped)
            if 'X' in attributes and 'Y' in attributes:
                vertex_list.append([float(attributes['X']), float(attributes['Y'])])

        elif '<Vertices' in stripped:
            in_vertices = True

        elif stripped.startswith('</Region'):
            # region closed without a complete vertices list - drop it
            in_region = False
            in_vertices = False
            vertex_list = []
            region_dict = {}

    return ordered_priority_dict

# Demonstration: build the priority dict for the sample annotation and print one summary line per label.
xml_file_name = os.path.join(zip_tank, 'e39a8d60a56844d695e9579bce8f0335.xml')
label_id_priority_fname = os.path.join(zip_tank, 'class_label_id.csv')

priority_dict = get_ordered_priority_label_coords_dict(xml_file_name, label_id_priority_fname)
print('\npriority_dict\n')
if len(priority_dict) > 0:
    for k, v in priority_dict.items():
        # a label from the .csv that has no region in the .xml carries no Text / coords / Id
        if 'coords' not in v or 'Text' not in v or 'Id' not in v:
            print('%i %20s\t'%(k, v['label']), 'no region found', '\tlabel_ID %s'%v['label_ID'])
            continue
        print('%i %20s\t'%(k, v['label']), v['label'] == v['Text'], 
              '\t%4i coords'%len(v['coords']), '\tlabel_ID %s'%v['label_ID'], '\tId %s'%v['Id'])
else:
    print('de Nada')

# Input files summary
****
#### priority_dict = get_ordered_priority_label_coords_dict(xml_file_name, label_id_priority_fname): function output print
```text
priority_dict

7               normal	 True 	   4 coords 	label_ID 7 	Id 4
6                  ink	 True 	 619 coords 	label_ID 6 	Id 8
5               offset	 True 	   3 coords 	label_ID 5 	Id 1
4            malignant	 True 	 133 coords 	label_ID 4 	Id 3
3               Region	 True 	   4 coords 	label_ID 3 	Id 2
2                lymph	 True 	 152 coords 	label_ID 2 	Id 7
1                  fat	 True 	   4 coords 	label_ID 1 	Id 5
0                 null	 True 	 110 coords 	label_ID 0 	Id 6
```
##### input file: class_label_id.csv
```
Label	   ID	Priority
null       0     0
fat        1     1
lymph      2     2
Region     3     3
malignant  4     4
offset     5     5
ink        6     6
normal     7     7	
```
##### input file: e39a8d60a56844d695e9579bce8f0335.xml
```
Id = 1 	Text =     offset 	Type = 5
Id = 2 	Text =     Region 	Type = 3
Id = 3 	Text =  malignant 	Type = 4
Id = 4 	Text =     normal 	Type = 7
Id = 5 	Text =        fat 	Type = 1
Id = 6 	Text =       null 	Type = 0
Id = 7 	Text =      lymph 	Type = 2
Id = 8 	Text =        ink 	Type = 6
```