XML parsing of nodules >3 mm and <3 mm using the python XML package. The LIDC-IDRI dataset includes a folder that contains just the xml files. This code is used to parse all the XML files corresponding to all the 1018 patients to extract nodule information. 
1. A list called SERIES stores the names of all the unique SERIES IDs. 
2. A dictionary called OVERALL_DICT stores all the SERIES IDs as the keys, and the corresponding nodule co-ordinates are the values. Since the same nodule may be marked by more than one radiologist (4 radiologists were involved in the annotation process), duplicates are avoided by checking whether the slice z-position (imageZposition) of an outline was already logged for that XML file; only the first outline recorded for each slice is kept. 

In [1]:
import xml.etree.ElementTree as ET
import dicom
import matplotlib.pyplot as plt
import glob
import pickle
import numpy as np

In [63]:
# Folders (relative to the working directory) that hold the annotation XML files.
ANNOTATION_FOLDERS = ("157", "185", "186", "187", "188", "189")

# Fully qualified tag names. Two XML namespaces are handled:
# "http://www.nih.gov" and "http://www.nih.gov/idri".
_SESSION_TAGS = ('{http://www.nih.gov}readingSession',
                 '{http://www.nih.gov/idri}CXRreadingSession')
# taking only nodules >3mm and <3mm
_NODULE_TAGS = ('{http://www.nih.gov}unblindedReadNodule',
                '{http://www.nih.gov/idri}unblindedRead')
_ROI_TAGS = ('{http://www.nih.gov}roi',
             '{http://www.nih.gov/idri}roi')
_Z_TAGS = ('{http://www.nih.gov}imageZposition',
           '{http://www.nih.gov/idri}imageZposition')
_SOP_TAGS = ('{http://www.nih.gov}imageSOP_UID',
             '{http://www.nih.gov/idri}imageSOP_UID')
_EDGE_TAGS = ('{http://www.nih.gov}edgeMap',
              '{http://www.nih.gov/idri}edgeMap')


def _series_uid(root):
    """Return the series UID stored in the header (first child) of `root`.

    The header field is looked up by name (namespace and letter case
    ignored). If no field with that name exists, the ninth header field is
    used, which is the position this script has always read.
    """
    header = root[0]
    for field in header:
        if field.tag.rsplit('}', 1)[-1].lower() == 'seriesinstanceuid':
            return field.text
    return header[8].text


def _read_roi(roi, seen_z):
    """Read one <roi> element.

    `seen_z` is the set of z-position strings already logged for the current
    file; a z-position not yet in it is added.

    Returns `(sop_uid, contour)` where `contour` is a list of `(x, y)` integer
    tuples taken from the roi's edgeMap children. Returns `(None, [])` when
    the roi's z-position was already logged, so that only the first outline
    per slice is kept.
    """
    sop_uid = None
    contour = []
    for field in roi:
        if field.tag in _Z_TAGS:
            z = field.text  # compared as text, exactly as written in the XML
            if z in seen_z:
                return None, []
            seen_z.add(z)
        elif field.tag in _SOP_TAGS:
            sop_uid = field.text
        elif field.tag in _EDGE_TAGS:
            # The first two children of an edgeMap are read as x and y.
            contour.append((int(field[0].text), int(field[1].text)))
    return sop_uid, contour


def _parse_annotation_file(path):
    """Parse one annotation XML file.

    Returns `(series_uid, contours_by_image)`, where `contours_by_image` maps
    an image SOP UID to a list of contours (each a list of `(x, y)` tuples).
    """
    root = ET.parse(path).getroot()
    contours_by_image = {}
    seen_z = set()
    for session in root:
        if session.tag not in _SESSION_TAGS:
            continue
        for nodule in session:
            if nodule.tag not in _NODULE_TAGS:
                continue
            for roi in nodule:
                if roi.tag not in _ROI_TAGS:
                    continue
                sop_uid, contour = _read_roi(roi, seen_z)
                # The SOP UID and its contour are stored together, so an roi
                # without edge points cannot shift later contours onto the
                # wrong image.
                if sop_uid is not None and contour:
                    contours_by_image.setdefault(sop_uid, []).append(contour)
    return _series_uid(root), contours_by_image


def _build_index(paths):
    """Parse every file in `paths`.

    Returns `(series_ids, contours_by_series)`: the list of unique series UIDs
    in first-seen order, and a dict mapping each series UID to its
    `{image SOP UID: [contour, ...]}` dict. When several files belong to the
    same series their dicts are merged; for an image present in more than one
    file, the entry from the later file wins.
    """
    series_ids = []
    contours_by_series = {}
    for path in paths:
        series_uid, contours_by_image = _parse_annotation_file(path)
        if series_uid in contours_by_series:
            contours_by_series[series_uid].update(contours_by_image)
        else:
            series_ids.append(series_uid)
            contours_by_series[series_uid] = contours_by_image
    return series_ids, contours_by_series


# glob returns matches in arbitrary order; sorting keeps the same-series merge
# in _build_index reproducible from run to run.
xml_files = []
for folder in ANNOTATION_FOLDERS:
    xml_files.extend(sorted(glob.glob(folder + "/*.xml")))

SERIES, OVERALL_DICT = _build_index(xml_files)

In [65]:
# Write both structures to disk as pickle files, OVERALL_DICT first, then SERIES.
for pickle_path, payload in (('OVERALL_DICT.pkl', OVERALL_DICT),
                             ('SERIES.pkl', SERIES)):
    with open(pickle_path, 'wb') as sink:
        pickle.dump(payload, sink)