# Introduction

This notebook provides code to parse the XML files from the 2012 i2b2 temporal relation dataset.  Part of this code was borrowed from [*Tao C. and Li H.*](https://github.com/chentao1999/MedicalRelationExtraction) and customized for this specific project.

# General Installs


In [None]:
%%capture
!pip install tqdm

from tqdm import tqdm

# Character Level - parsing

This code parses the text one character at a time and attempts to add markers for all entities to each example.


In [None]:
import xml.etree.ElementTree as ET
import random
import os
import re

# Dataset locations.  The paths look like a Google Drive mounted in Colab
# (presumably via drive.mount -- the mount call is not in this notebook).
# Each path ends with "/" so that a file name can be appended directly.
# Directory with the training XML annotations.
TRAINNING_DATA_DIR = "/content/drive/MyDrive/w266/Project/2012-07-15.original-annotation.release/"
# Directory with the test (ground-truth) XML annotations.
TEST_DATA_DIR = "/content/drive/MyDrive/w266/Project/ground_truth/merged_xml/"
# Directory where the generated tab-separated corpus files are written.
SAVE_DIR = "/content/drive/MyDrive/w266/Project/corpus/i2b2/"

def file_name(file_dir):
    '''
    List the XML files found anywhere below a directory.

    Input:      file_dir - directory to walk (sub-directories included)
    Output:     list of bare file names (no directory part) whose
                extension is exactly '.xml'
    '''
    return [
        entry
        for _dirpath, _subdirs, entries in os.walk(file_dir)
        for entry in entries
        # splitext -> (stem, extension); the comparison is case-sensitive
        if os.path.splitext(entry)[1] == '.xml'
    ]


def _first_id_by_offset(elements):
    '''Map every start/end offset to the id of the first element that uses it.'''
    owner = {}
    for element in elements:
        entity_id = element.attrib['id']
        # setdefault keeps the first element seen for an offset, which is what
        # list.index() on the flattened offset list used to return.
        owner.setdefault(int(element.attrib['start']), entity_id)
        owner.setdefault(int(element.attrib['end']), entity_id)
    return owner


def _char_level_marker(entity_id, closing):
    '''Build the " [E12] " / " [/E12] " marker; short ids are padded with "a".'''
    # Ids shorter than 3 characters (e.g. "E1") get an "a" suffix so that they
    # have the same width as two-digit ids.
    padded = entity_id + "a" if len(entity_id) < 3 else entity_id
    return " [" + ("/" if closing else "") + padded + "] "


def data_process(inDIR, outFile):
    '''
    Character-level parser: marks every EVENT/TIMEX3 in the text and writes
    one tab-separated line per TLINK.

    Input:      inDIR   - directory containing the .xml files
                outFile - path of the text file to create (overwritten)
    Output:     None (the examples are written to outFile, progress is printed)

    Each line written has the columns (separated by a tab):
      id, label, fromText, toText, context, a_idx, b_idx, 'end'

      id       = <file name without .xml>_<TLINK id>, e.g. 413_TL1
      label    = upper-cased TLINK type, e.g. OVERLAP
      context  = whole document on one line with every entity wrapped in
                 generic [E00] ... [/E00] or [T00] ... [/T00] markers
      a_idx    = offset derived from the first occurrence of fromID
      b_idx    = offset derived from the first occurrence of toID

    TLINKs whose type is empty are skipped.  A KeyError is raised when a tag
    lacks one of the attributes read here.
    '''
    fileList = file_name(inDIR)
    print('Total Number of Files: ', len(fileList))

    # Rename all the E and T markers in the text to be the same.  The patterns
    # do not depend on the file, so they are compiled once.
    rename_rules = (
        (re.compile(r"E\d{1}[a-z]\]+"), 'E00]'),
        (re.compile(r"E\d{2}\]+"), 'E00]'),
        (re.compile(r"T\d{1}[a-z]\]+"), 'T00]'),
        (re.compile(r"T\d{2}\]+"), 'T00]'),
    )

    # "with" guarantees the output is flushed and closed, even on an error.
    with open(outFile, "w") as out:
        for f in fileList:
            print(f, end=' ')
            linkNO = 0

            # Replaces all & (which is not compatible with XML) with Z.
            # os.path.join also works when inDIR has no trailing slash.
            with open(os.path.join(inDIR, f), "r") as inFile:
                xmlString = inFile.read().replace("&", "Z")

            # Initiate parser
            parser = ET.XMLParser(encoding="utf-8")
            root = ET.fromstring(xmlString, parser=parser)

            text = root.find("TEXT").text
            tags = root.find("TAGS")

            # Offsets at which an event / timex marker has to be inserted
            event_at = _first_id_by_offset(tags.findall("EVENT"))
            timex_at = _first_id_by_offset(tags.findall("TIMEX3"))

            # Create new text with special tokens.  The counters alternate
            # between opening (even) and closing (odd) markers; an offset that
            # belongs to both an event and a timex is treated as an event.
            pieces = []
            E = 0
            T = 0
            for i, char in enumerate(text):
                if i in event_at:
                    pieces.append(_char_level_marker(event_at[i], closing=E % 2 != 0))
                    E += 1
                elif i in timex_at:
                    pieces.append(_char_level_marker(timex_at[i], closing=T % 2 != 0))
                    T += 1
                pieces.append(char)

            # Replace all line breaks so the document is one line of text
            new_text = ''.join(pieces).replace("\n", " ").strip()
            print(new_text)

            final_text = new_text
            for pattern, generic in rename_rules:
                final_text = pattern.sub(generic, final_text)

            # Create tab separated file
            for tlink in tags.findall("TLINK"):
                # new_text contains [E##] while final_text is only [E00]
                # NOTE(review): str.find returns the first substring match, so
                # "E1" can also hit "E10" or plain text, and the +4 / +5
                # offsets differ without explanation -- confirm the intent.
                a_idx = new_text.find(tlink.attrib['fromID']) + 4
                b_idx = new_text.find(tlink.attrib['toID']) + 5

                guid = f[:-4] + "_" + str(tlink.attrib['id'])
                target1 = tlink.attrib['fromText']
                target2 = tlink.attrib['toText']
                label = tlink.attrib['type'].upper()
                if label == '':
                    continue

                # Write to output
                out.write("\t".join([guid, label, target1, target2, final_text,
                                     str(a_idx), str(b_idx), 'end']) + "\n")

                # Count number of links per file
                linkNO += 1
            print("linkNO = " + str(linkNO))
    print("*"*80)

# Word Level - parsing

This code parses the text one word at a time and attempts to add markers for all entities at once to each example.

In [None]:
import xml.etree.ElementTree as ET
import random
import os
import re

# Dataset locations.  The paths look like a Google Drive mounted in Colab
# (presumably via drive.mount -- the mount call is not in this notebook).
# Each path ends with "/" so that a file name can be appended directly.
# Directory with the training XML annotations.
TRAINNING_DATA_DIR = "/content/drive/MyDrive/w266/Project/2012-07-15.original-annotation.release/"
# Directory with the test (ground-truth) XML annotations.
TEST_DATA_DIR = "/content/drive/MyDrive/w266/Project/ground_truth/merged_xml/"
# Directory where the generated tab-separated corpus files are written.
SAVE_DIR = "/content/drive/MyDrive/w266/Project/corpus/i2b2/"

def file_name(file_dir):
    '''
    Walk a directory tree and gather the names of its XML files.

    Input:      file_dir - root directory of the walk
    Output:     list of file names (without their directory) that end in
                the extension '.xml'
    '''
    xml_names = []
    for _current, _children, names in os.walk(file_dir):
        # Keep only the entries whose extension is exactly '.xml'
        xml_names.extend(
            name for name in names if os.path.splitext(name)[1] == '.xml'
        )
    return xml_names


def _first_id_by_offset(elements):
    '''Map every start/end offset to the id of the first element that uses it.'''
    owner = {}
    for element in elements:
        entity_id = element.attrib['id']
        # setdefault keeps the first element seen for an offset, which is what
        # list.index() on the flattened offset list used to return.
        owner.setdefault(int(element.attrib['start']), entity_id)
        owner.setdefault(int(element.attrib['end']), entity_id)
    return owner


def _entity_kind(marker, link_id):
    '''Return 'event' or 'timex' for a "[E12]" / "[T3]" style marker.'''
    if 'E' in marker:
        return 'event'
    if 'T' in marker:
        return 'timex'
    # Previously the type silently kept the value of the preceding TLINK
    # (or was undefined for the first one).
    raise ValueError("TLINK " + link_id + ": cannot tell whether " + marker +
                     " is an event or a timex")


def data_process(inDIR, outFile):
    '''
    Word-level parser: marks every EVENT/TIMEX3 in the text and writes one
    tab-separated line per TLINK.

    Input:      inDIR   - directory containing the .xml files
                outFile - path of the text file to create (overwritten)
    Output:     None (the examples are written to outFile, progress is printed)

    Each line written has the columns (separated by a tab):
      id, label, fromText, toText, context, a_idx, b_idx, atype, btype, 'end'

      id       = <file name without .xml>_<TLINK id>, e.g. 413_TL1
      label    = upper-cased TLINK type, e.g. OVERLAP
      context  = whole document on one line with every entity wrapped in
                 generic [E] ... [/E] or [T] ... [/T] markers
      a_idx    = word position of the opening marker of fromID
      b_idx    = word position of the opening marker of toID
      atype    = 'event' or 'timex' (same for btype)

    TLINKs whose type is empty are skipped.  A ValueError is raised when the
    opening marker of a linked entity is not a word of its own in the marked
    text, or when an id is neither an event nor a timex id.  A KeyError is
    raised when a tag lacks one of the attributes read here.
    '''
    fileList = file_name(inDIR)
    print('Total Number of Files: ', len(fileList))

    # "with" guarantees the output is flushed and closed, even on an error.
    with open(outFile, "w") as out:
        for f in fileList:
            print(f, end=' ')
            linkNO = 0

            # Replaces all & (which is not compatible with XML) with Z.
            # os.path.join also works when inDIR has no trailing slash.
            with open(os.path.join(inDIR, f), "r") as inFile:
                xmlString = inFile.read().replace("&", "Z")

            # Initiate parser
            parser = ET.XMLParser(encoding="utf-8")
            root = ET.fromstring(xmlString, parser=parser)

            text = root.find("TEXT").text
            tags = root.find("TAGS")

            # Offsets at which an event / timex marker has to be inserted
            event_at = _first_id_by_offset(tags.findall("EVENT"))
            timex_at = _first_id_by_offset(tags.findall("TIMEX3"))

            # Create new text with special tokens.  The counters alternate
            # between opening (even) and closing (odd) markers; an offset that
            # belongs to both an event and a timex is treated as an event.
            pieces = []
            E = 0
            T = 0
            for i, char in enumerate(text):
                if i in event_at:
                    entity_id = event_at[i]
                    pieces.append("[" + entity_id + "] " if E % 2 == 0
                                  else " [/" + entity_id + "]")
                    E += 1
                elif i in timex_at:
                    entity_id = timex_at[i]
                    pieces.append("[" + entity_id + "] " if T % 2 == 0
                                  else " [/" + entity_id + "]")
                    T += 1
                pieces.append(char)

            # One line of text, split into words on single spaces
            new_text = ''.join(pieces).replace("\n", " ").strip().split(' ')

            # Rename all the E and T markers in the text to be the same.  The
            # whole word is replaced, including any characters glued to it.
            generic_words = []
            for word in new_text:
                if '[E' in word:
                    generic_words.append('[E]')
                elif '[/E' in word:
                    generic_words.append('[/E]')
                elif '[T' in word:
                    generic_words.append('[T]')
                elif '[/T' in word:
                    generic_words.append('[/T]')
                else:
                    generic_words.append(word)
            final_text = ' '.join(generic_words)

            # Create tab separated file
            for tlink in tags.findall("TLINK"):
                a = '[' + tlink.attrib['fromID'] + ']'
                b = '[' + tlink.attrib['toID'] + ']'

                # Word positions are taken from the text that still carries
                # the specific ids; list.index raises ValueError if absent.
                a_idx = new_text.index(a)
                b_idx = new_text.index(b)

                link_id = str(tlink.attrib['id'])
                guid = f[:-4] + "_" + link_id
                target1 = tlink.attrib['fromText']
                target2 = tlink.attrib['toText']
                label = tlink.attrib['type'].upper()
                if label == '':
                    continue

                # Determine if a and b are event or timex
                atype = _entity_kind(a, link_id)
                btype = _entity_kind(b, link_id)

                # Write to output
                out.write("\t".join([guid, label, target1, target2, final_text,
                                     str(a_idx), str(b_idx), atype, btype,
                                     'end']) + "\n")

                # Count number of links per file
                linkNO += 1

            print("linkNO = " + str(linkNO))
    print("*"*80)

# Specific Entity Marker - parsing

This code parses the XML files and marks only one entity pair in each example.  This parser was used for the project.

In [None]:
import xml.etree.ElementTree as ET
import random
import os
import re

# Dataset locations.  The paths look like a Google Drive mounted in Colab
# (presumably via drive.mount -- the mount call is not in this notebook).
# Each path ends with "/" so that a file name can be appended directly.
# Directory with the training XML annotations.
TRAINNING_DATA_DIR = "/content/drive/MyDrive/w266/Project/2012-07-15.original-annotation.release/"
# Directory with the test (ground-truth) XML annotations.
TEST_DATA_DIR = "/content/drive/MyDrive/w266/Project/ground_truth/merged_xml/"
# Directory where the generated tab-separated corpus files are written.
SAVE_DIR = "/content/drive/MyDrive/w266/Project/corpus/i2b2/"

def file_name(file_dir):
    '''
    Return the names of all XML files located under a directory.

    Input:      file_dir - directory that is searched recursively
    Output:     list with the file name (directory part dropped) of every
                file whose extension equals '.xml'
    '''
    def has_xml_extension(candidate):
        # splitext -> (stem, extension); only an exact '.xml' qualifies
        _stem, extension = os.path.splitext(candidate)
        return extension == '.xml'

    found = []
    for _folder, _subfolders, candidates in os.walk(file_dir):
        found += [c for c in candidates if has_xml_extension(c)]
    return found


def _entity_span(entity_id, event_dict, timex_dict, link_id):
    '''Return the (start, end) offsets of an event or timex id.'''
    if 'E' in entity_id:
        return event_dict[entity_id]
    if 'T' in entity_id:
        return timex_dict[entity_id]
    # Previously the offsets silently kept the values of the preceding TLINK
    # (or were undefined for the first one).
    raise ValueError("TLINK " + link_id + ": cannot tell whether " + entity_id +
                     " is an event or a timex")


def _pair_marker(i, a_start, a_end, b_start, b_end):
    '''Return the marker inserted before offset i (i is one of the four offsets).'''
    # The order of the tests decides which marker wins when offsets coincide.
    if i == a_start and i == b_start:
        return "[L] [R] "
    if i == a_end and i == b_end:
        return " [L] [R]"
    if i == a_start and i == b_end:
        return "[L] [R]"
    if i == a_end and i == b_start:
        return " [L] [R] "
    if i == a_start:
        return "[L] "
    if i == a_end:
        return " [L]"
    if i == b_start:
        return "[R] "
    return " [R]"  # only b_end is left


def _mark_entity_pair(text, a_span, b_span):
    '''Wrap the "from" span in [L] and the "to" span in [R]; return one line.'''
    a_start, a_end = a_span
    b_start, b_end = b_span
    pieces = []
    cursor = 0
    # Only the (at most four) marker offsets are visited instead of every
    # character; the text in between is copied in slices.
    for pos in sorted({a_start, a_end, b_start, b_end}):
        if not 0 <= pos < len(text):
            # An offset without a character at it never produced a marker.
            continue
        pieces.append(text[cursor:pos])
        pieces.append(_pair_marker(pos, a_start, a_end, b_start, b_end))
        cursor = pos
    pieces.append(text[cursor:])
    # Remove line breaks so that the example fits on one line
    return ''.join(pieces).replace("\n", " ").strip()


def data_process(inDIR, outFile):
    '''
    Entity-pair parser: writes one tab-separated line per TLINK in which only
    the two linked entities are marked.

    Input:      inDIR   - directory containing the .xml files
                outFile - path of the text file to create (overwritten)
    Output:     None (the examples are written to outFile, progress is printed)

    Each line written has the columns (separated by a tab):
      id, label, fromText, toText, context, 'end'

      id       = <file name without .xml>_<TLINK id>, e.g. 413_TL1
      label    = upper-cased TLINK type, e.g. OVERLAP
      context  = whole document on one line; the fromID entity is wrapped in
                 [L] ... [L] and the toID entity in [R] ... [R]

    TLINKs whose type is empty are skipped.  A KeyError is raised when a
    linked id has no EVENT/TIMEX3 tag or a tag lacks an attribute read here;
    a ValueError is raised when an id is neither an event nor a timex id.
    '''
    fileList = file_name(inDIR)
    print('Total Number of Files: ', len(fileList))

    # "with" guarantees the output is flushed and closed, even on an error.
    with open(outFile, "w") as out:
        for f in fileList:
            print(f, end=' ')
            linkNO = 0

            # Replaces all & (which is not compatible with XML) with Z.
            # os.path.join also works when inDIR has no trailing slash.
            with open(os.path.join(inDIR, f), "r") as inFile:
                xmlString = inFile.read().replace("&", "Z")

            # Initiate parser
            parser = ET.XMLParser(encoding="utf-8")
            root = ET.fromstring(xmlString, parser=parser)

            text = root.find("TEXT").text
            tags = root.find("TAGS")

            # Get all event and timex offsets into dictionaries keyed by id
            event_dict = {
                event.attrib['id']: (int(event.attrib['start']), int(event.attrib['end']))
                for event in tags.findall("EVENT")
            }
            timex_dict = {
                timex.attrib['id']: (int(timex.attrib['start']), int(timex.attrib['end']))
                for timex in tags.findall("TIMEX3")
            }

            # Create tab separated file
            for tlink in tags.findall("TLINK"):
                # Parse targets
                link_id = str(tlink.attrib['id'])
                guid = f[:-4] + "_" + link_id
                target1 = tlink.attrib['fromText']
                target2 = tlink.attrib['toText']
                label = tlink.attrib['type'].upper()
                if label == '':
                    continue

                # Check whether event or timex and get the offsets
                a_span = _entity_span(tlink.attrib['fromID'], event_dict, timex_dict, link_id)
                b_span = _entity_span(tlink.attrib['toID'], event_dict, timex_dict, link_id)

                # Stitch together the new text
                new_text = _mark_entity_pair(text, a_span, b_span)

                # Write to output
                out.write("\t".join([guid, label, target1, target2, new_text,
                                     'end']) + "\n")

                # Count number of links per file
                linkNO += 1

            print("linkNO = " + str(linkNO))
    print("*"*80)

# Choose parser to generate data file

In [None]:
# Trial run on the "play" directory: both path constants are re-pointed at it
# and the result is written to play5.txt in that same directory.
# NOTE(review): data_process is whichever definition was executed last in the
# notebook session -- confirm the intended parser cell was run beforehand.
TRAINNING_DATA_DIR = '/content/drive/MyDrive/w266/Project/play/'
SAVE_DIR = '/content/drive/MyDrive/w266/Project/play/'
data_process(TRAINNING_DATA_DIR , SAVE_DIR + "play5.txt")

Total Number of Files:  1
178.xml [L] Admission [L] Date : [R] 2012-01-20 [R] Discharge Date : 2012-01-23 Service : UROLOGY HISTORY OF PRESENT ILLNESS : The patient is a 64 year old gentleman with a history of elevated PSA , prostate cancer . He was diagnosed with needle biopsy on either side of the prostate with 3+3 equals 6 on the right side and 3+4 equals 7 on the left side . HOSPITAL COURSE : The patient underwent radical retropubic prostatectomy on 2012-01-20 . Dr. Marie Anderson performed the surgery . The patient tolerated the procedure well . On postoperative day number one , he was administered 1 mg Coumadin per protocol as well as on the operative day 1 mg of Coumadin . The patient was advanced to regular diet on postoperative day one . His laboratories were checked and his hematocrit was noted to be stable . He was made to ambulate on postoperative day one . His pain was well controlled with oral medication at this time . On postoperative day two , he continued to convalesce

In [None]:
# Generate the training file: parse the training annotations and write them
# to train_1.txt in the corpus directory.
# NOTE(review): data_process is whichever definition was executed last in the
# notebook session -- confirm the intended parser cell was run beforehand.
TRAINNING_DATA_DIR = "/content/drive/MyDrive/w266/Project/2012-07-15.original-annotation.release/"
SAVE_DIR = '/content/drive/MyDrive/w266/Project/corpus/i2b2/'
data_process(TRAINNING_DATA_DIR , SAVE_DIR + "train_1.txt")

Total Number of Files:  190
92.xml linkNO = 75
87.xml linkNO = 353
93.xml linkNO = 164
98.xml linkNO = 133
797.xml linkNO = 184
8.xml linkNO = 96
787.xml linkNO = 101
807.xml linkNO = 231
791.xml linkNO = 319
801.xml linkNO = 52
86.xml linkNO = 79
81.xml linkNO = 97
786.xml linkNO = 214
751.xml linkNO = 133
736.xml linkNO = 145
747.xml linkNO = 267
757.xml linkNO = 187
777.xml linkNO = 128
776.xml linkNO = 336
756.xml linkNO = 195
701.xml linkNO = 167
726.xml linkNO = 182
72.xml linkNO = 228
711.xml linkNO = 81
707.xml linkNO = 103
717.xml linkNO = 331
722.xml linkNO = 265
721.xml linkNO = 72
697.xml linkNO = 220
68.xml linkNO = 403
682.xml linkNO = 215
676.xml linkNO = 54
692.xml linkNO = 148
666.xml linkNO = 95
681.xml linkNO = 145
647.xml linkNO = 217
656.xml linkNO = 102
641.xml linkNO = 117
631.xml linkNO = 431
626.xml linkNO = 150
637.xml linkNO = 161
642.xml linkNO = 100
636.xml linkNO = 169
602.xml linkNO = 247
622.xml linkNO = 140
596.xml linkNO = 97
6.xml linkNO = 99
591.xml 

In [None]:
# Generate the test file: parse the ground-truth annotations and write them
# to test_1.txt in the corpus directory.
# NOTE(review): data_process is whichever definition was executed last in the
# notebook session -- confirm the intended parser cell was run beforehand.
TEST_DATA_DIR = "/content/drive/MyDrive/w266/Project/ground_truth/merged_xml/"
SAVE_DIR = '/content/drive/MyDrive/w266/Project/corpus/i2b2/'
data_process(TEST_DATA_DIR , SAVE_DIR + "test_1.txt")

Total Number of Files:  120
516.xml linkNO = 374
243.xml linkNO = 99
78.xml linkNO = 459
103.xml linkNO = 72
276.xml linkNO = 485
436.xml linkNO = 104
277.xml linkNO = 240
202.xml linkNO = 631
592.xml linkNO = 239
132.xml linkNO = 146
227.xml linkNO = 631
312.xml linkNO = 314
138.xml linkNO = 71
368.xml linkNO = 185
258.xml linkNO = 207
361.xml linkNO = 409
397.xml linkNO = 297
88.xml linkNO = 158
296.xml linkNO = 130
33.xml linkNO = 265
142.xml linkNO = 294
298.xml linkNO = 222
383.xml linkNO = 322
447.xml linkNO = 229
303.xml linkNO = 71
392.xml linkNO = 122
562.xml linkNO = 150
326.xml linkNO = 253
487.xml linkNO = 207
317.xml linkNO = 186
561.xml linkNO = 129
196.xml linkNO = 25
53.xml linkNO = 233
283.xml linkNO = 69
358.xml linkNO = 220
398.xml linkNO = 104
552.xml linkNO = 213
83.xml linkNO = 385
261.xml linkNO = 93
263.xml linkNO = 253
137.xml linkNO = 223
431.xml linkNO = 70
441.xml linkNO = 201
391.xml linkNO = 52
617.xml linkNO = 387
101.xml linkNO = 375
71.xml linkNO = 136
