https://bionlp.nlm.nih.gov/tac2017adversereactions/

Task 1: Extract AdverseReactions and related mentions (Severity, Factor, DrugClass, Negation, Animal). This is similar to many NLP Named Entity Recognition (NER) evaluations.

In [1]:
import untangle
import glob
import pandas as pd
from collections import Counter
import re
import string
import csv

In [2]:
# Directory holding the training label XML files. It must end with '/' because
# file names and the '*.xml' glob pattern are appended by plain string concatenation.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = '/Users/jzhu/git/nlp_adversedrug/data/train_xml/'

In [3]:
def parse_xml(filename):
    """
    Parse one drug-label XML file into its section texts and annotated mentions.

    @input a filename string (path of the XML file, handed to untangle.parse)
    @return: a tuple (X, Y)
    1. X: a dictionary mapping each section id to the text (cdata) of that section
    2. Y: a list of dictionaries, one per <Mention> element, with keys
        id (not for task 1), section, type, start, len, text.
        Every value is the raw attribute string; 'text' holds the 'str' attribute.
        Y is an empty list when the file carries no mentions (e.g. unannotated test data).
    """
    X = {}
    Y = []

    obj = untangle.parse(filename)
    for text in obj.Label.Text.Section:
        X[text['id']] = text.cdata

    # untangle raises AttributeError when a child element is absent, so look the
    # mentions up defensively: a file without <Mention> children yields an empty Y
    # instead of crashing.
    mentions = getattr(getattr(obj.Label, 'Mentions', None), 'Mention', None)
    if mentions:
        for mention in mentions:
            Y.append({
                'id': mention['id'],
                'section': mention['section'],
                'type': mention['type'],
                'start': mention['start'],
                'len': mention['len'],
                'text': mention['str'],
            })

    return X, Y

# test

In [4]:
# Smoke-test the parser on a single label file.
filename = path + 'ADCETRIS.xml'
X, Y = parse_xml(filename)

In [5]:
# Section ids found in the parsed file.
X.keys()

[u'S3', u'S2', u'S1']

In [6]:
# Full text of section S1.
X['S1']



In [7]:
# Slice with start=236 and len=11 (the values of mention M2 in Y) to check that
# mention offsets index directly into the raw section text.
X['S1'][236:(236+11)]

u'Anaphylaxis'

In [8]:
# First two parsed mentions; note that 'start' and 'len' are kept as strings.
Y[:2]

[{'id': u'M1',
  'len': u'21',
  'section': u'S1',
  'start': u'156',
  'text': u'Peripheral Neuropathy',
  'type': u'AdverseReaction'},
 {'id': u'M2',
  'len': u'11',
  'section': u'S1',
  'start': u'236',
  'text': u'Anaphylaxis',
  'type': u'AdverseReaction'}]

### For NER_DL: extract only the entities and tags from the training folder (then use the pre-trained word2vec vectors from spaCy)

In [9]:
# Parse one label file again and list its section ids before processing the whole folder.
f = path + 'ADCETRIS.xml'
X, Y = parse_xml(f)
X.keys()

[u'S3', u'S2', u'S1']

In [37]:
# Build the [word, tag] training rows for every section of every label file.
# ct_good / ct_bad count the mention spans that do / do not line up exactly with
# word boundaries of the tokenized section text.
train = []
ct_good = 0
ct_bad = 0

for f in glob.glob(path + '*.xml'):
    X, Y = parse_xml(f)

    for section in X.keys():
        doc = X[section]
        print('%s %s' % (f, section))

        # split the words in a doc, remembering the character offsets of each word
        matches = list(re.finditer(r'\w+', doc))
        tokens = [m.group(0) for m in matches]  # original words, never modified
        words = list(tokens)                    # output text per word (mentions are merged here)
        types = ['O'] * len(tokens)             # output tag per word, 'O' when no mention applies
        start_dict = dict((m.start(), k) for k, m in enumerate(matches))
        end_dict = dict((m.end(), k) for k, m in enumerate(matches))

        # parse the names in the same doc; 'start' and 'len' may hold several
        # comma-separated values, each pair being handled as its own span
        spans = []
        for e in Y:
            if e['section'] != section:
                continue
            span_starts = e['start'].split(',')
            span_lens = e['len'].split(',')
            for k, span_start in enumerate(span_starts):
                begin = int(span_start)
                spans.append((begin, begin + int(span_lens[k]), e['type']))

        # label the names in the list of total words
        for begin, end, span_type in spans:
            if begin in start_dict and end in end_dict:
                ct_good += 1
                ind_start = start_dict[begin]
                ind_end = end_dict[end]

                # Join from the untouched tokens: joining from `words` would re-join
                # text that an earlier overlapping span already merged
                # (e.g. "a b" + "b" + "c" -> "a b b c").
                words[ind_start] = ' '.join(tokens[ind_start:ind_end + 1])
                types[ind_start] = span_type
                for j in range(ind_start + 1, ind_end + 1):
                    types[j] = 'rm'  # label to remove later
                # NOTE(review): a later span that starts inside an earlier one re-tags a
                # word marked 'rm', so that word is emitted again as its own row --
                # confirm whether nested mentions should be handled differently.
            else:
                # span does not start/end on a word boundary; it stays unlabeled
                ct_bad += 1

        # remove those words tagged with 'rm' (i.e. combine names with more than one word)
        train += [[word, tag] for word, tag in zip(words, types) if tag != 'rm']
        print(len(train))

        # insert a new line to separate from the next section
        train.append([''])
        print(len(train))

/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml S3
1021
1022
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml S2
1080
1081
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml S1
3307
3308
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADREVIEW.xml S2
4130
4131
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADREVIEW.xml S1
4430
4431
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AFINITOR.xml S2
6102
6103
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AFINITOR.xml S1
10335
10336
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMPYRA.xml S2
10853
10854
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMPYRA.xml S1
11324
11325
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMYVID.xml S2
11543
11544
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMYVID.xml S1
11810
11811
/Users/jzhu/git/nlp_adversedrug/data/train_xml/APTIOM.xml S2
13836
13837
/Users/jzhu/git/nlp_adversedrug/data/train_xml/APTIOM.xml S1
14721
14722
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AR

96973
/Users/jzhu/git/nlp_adversedrug/data/train_xml/JARDIANCE.xml S1
98730
98731
/Users/jzhu/git/nlp_adversedrug/data/train_xml/JEVTANA.xml S3
99818
99819
/Users/jzhu/git/nlp_adversedrug/data/train_xml/JEVTANA.xml S2
100032
100033
/Users/jzhu/git/nlp_adversedrug/data/train_xml/JEVTANA.xml S1
101132
101133
/Users/jzhu/git/nlp_adversedrug/data/train_xml/JUBLIA.xml S1
101332
101333
/Users/jzhu/git/nlp_adversedrug/data/train_xml/KALBITOR.xml S3
101541
101542
/Users/jzhu/git/nlp_adversedrug/data/train_xml/KALBITOR.xml S2
101718
101719
/Users/jzhu/git/nlp_adversedrug/data/train_xml/KALBITOR.xml S1
102458
102459
/Users/jzhu/git/nlp_adversedrug/data/train_xml/KALYDECO.xml S2
102863
102864
/Users/jzhu/git/nlp_adversedrug/data/train_xml/KALYDECO.xml S1
103825
103826
/Users/jzhu/git/nlp_adversedrug/data/train_xml/KYPROLIS.xml S2
105204
105205
/Users/jzhu/git/nlp_adversedrug/data/train_xml/KYPROLIS.xml S1
107230
107231
/Users/jzhu/git/nlp_adversedrug/data/train_xml/LUMIZYME.xml S3
108160
108161
/

/Users/jzhu/git/nlp_adversedrug/data/train_xml/YERVOY.xml S3
204492
204493
/Users/jzhu/git/nlp_adversedrug/data/train_xml/YERVOY.xml S2
204750
204751
/Users/jzhu/git/nlp_adversedrug/data/train_xml/YERVOY.xml S1
205572
205573
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZERBAXA.xml S2
206068
206069
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZERBAXA.xml S1
206809
206810
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZYDELIG.xml S3
207530
207531
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZYDELIG.xml S2
207831
207832
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZYDELIG.xml S1
208885
208886
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZYKADIA.xml S2
210227
210228
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZYKADIA.xml S1
210882
210883
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZYTIGA.xml S2
211648
211649
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ZYTIGA.xml S1
213036
213037


In [38]:
# Matched spans, unmatched spans, and the ratio unmatched / matched.
print('%s %s %s' % (ct_good, ct_bad, ct_bad * 1.0 / ct_good))

16728 128 0.00765184122429


In [39]:
# Total number of rows: the word rows plus one [''] separator row per section.
len(train)

213037

In [40]:
# Peek at the rows around the first section boundary (the [''] separator row).
for row in train[1020:1030]:
    print(row)

[u'1', 'O']
['']
[u'BOXED', 'O']
[u'PROGRESSIVE MULTIFOCAL LEUKOENCEPHALOPATHY', u'AdverseReaction']
[u'PML', u'AdverseReaction']
[u'PROGRESSIVE MULTIFOCAL LEUKOENCEPHALOPATHY', u'AdverseReaction']
[u'PML', u'AdverseReaction']


In [41]:
# Every section appends exactly one [''] separator row, so counting those rows gives
# the num of sections of all xml files without hard-coding row totals from earlier outputs.
train.count([''])

239

In [42]:
# Dump every entry of train as one CSV row.
with open("train.csv", "wb") as csv_file:
    csv.writer(csv_file).writerows(train)

Note: 
* Since the mention offsets in the XML are based on the original text, we cannot label the words one by one after splitting the text.
* Thus I first split each section into words, one word per line, keeping each word's character index alongside it.
* Then I use the mention offsets from the XML to relabel the words that belong to a mention with that mention's type.
* Only about 0.8% of the mentions (128 unmatched vs. 16728 matched) are not found in the processed train.csv, so I am fine with that. Most of them are due to words that are hard to separate, such as "decreasedSOD".