https://bionlp.nlm.nih.gov/tac2017adversereactions/

Task 1: Extract AdverseReactions and related mentions (Severity, Factor, DrugClass, Negation, Animal). This is similar to many NLP Named Entity Recognition (NER) evaluations.

In [2]:
import untangle
import glob
import pandas as pd
from collections import Counter
import re
import string

In [3]:
path = '/Users/jzhu/git/nlp_adversedrug/data/train_xml/'

In [4]:
def parse_xml(filename):
    """
    @input a filename string
    @return:
    1. For training data: both a dictionary (key is the section) for X (text strings) and a list of dictionary for y 
        (keys: id (not for task 1), section, type, start, len)
    2. For test data: only a list of X
    """
    X = {}
    Y = []
    
    obj = untangle.parse(filename)
    for text in obj.Label.Text.Section:
        X[text['id']] = text.cdata
        
    if obj.Label.Mentions.Mention:
        for mention in obj.Label.Mentions.Mention:
            entity = {}
            entity['id'] = mention['id']
            entity['section'] = mention['section']
            entity['type'] = mention['type']
            entity['start'] = mention['start']
            entity['len'] = mention['len']
            entity['text'] = mention['str']
            Y.append(entity)
            
    return X, Y

# test

In [70]:
filename = path + 'ADCETRIS.xml'
X, Y = parse_xml(filename)

In [71]:
X.keys()

[u'S3', u'S2', u'S1']

In [72]:
X['S1']



In [73]:
X['S1'][236:(236+11)]

u'Anaphylaxis'

In [41]:
Y[:2]

[{'id': u'M1',
  'len': u'21',
  'section': u'S1',
  'start': u'156',
  'text': u'Peripheral Neuropathy',
  'type': u'AdverseReaction'},
 {'id': u'M2',
  'len': u'11',
  'section': u'S1',
  'start': u'236',
  'text': u'Anaphylaxis',
  'type': u'AdverseReaction'}]

### For NER_DL: extract only the entities and tags in training folder (then use the pre-trained word2vec from spacy)

In [21]:
f = path + 'ADCETRIS.xml'
X, Y = parse_xml(f)
X.keys()

[u'S3', u'S2', u'S1']

In [39]:
doc = X['S1']
words = [[m.group(0), m.start(), m.end(), 'O'] for m in re.finditer(r'\w+', doc)
        if m.group(0) ]
words

[[u'6', 4, 5, 'O'],
 [u'ADVERSE', 6, 13, 'O'],
 [u'REACTIONS', 14, 23, 'O'],
 [u'The', 27, 30, 'O'],
 [u'following', 31, 40, 'O'],
 [u'serious', 41, 48, 'O'],
 [u'adverse', 49, 56, 'O'],
 [u'reactions', 57, 66, 'O'],
 [u'are', 67, 70, 'O'],
 [u'discussed', 71, 80, 'O'],
 [u'in', 81, 83, 'O'],
 [u'greater', 84, 91, 'O'],
 [u'detail', 92, 98, 'O'],
 [u'in', 99, 101, 'O'],
 [u'other', 102, 107, 'O'],
 [u'sections', 108, 116, 'O'],
 [u'of', 117, 119, 'O'],
 [u'the', 120, 123, 'O'],
 [u'prescribing', 124, 135, 'O'],
 [u'information', 136, 147, 'O'],
 [u'Peripheral', 156, 166, 'O'],
 [u'Neuropathy', 167, 177, 'O'],
 [u'see', 179, 182, 'O'],
 [u'and', 193, 196, 'O'],
 [u'Precautions', 197, 208, 'O'],
 [u'5', 216, 217, 'O'],
 [u'1', 218, 219, 'O'],
 [u'Anaphylaxis', 236, 247, 'O'],
 [u'and', 248, 251, 'O'],
 [u'Infusion', 252, 260, 'O'],
 [u'Reactions', 261, 270, 'O'],
 [u'see', 272, 275, 'O'],
 [u'and', 286, 289, 'O'],
 [u'Precautions', 290, 301, 'O'],
 [u'5', 309, 310, 'O'],
 [u'2', 311, 312

In [31]:
for e in Y:
#         print e, e['start']
    start = int(e['start'])
    end = start + int(e['len'])
    out.append([e['text'], e['type']])

u'of'

In [41]:
Y[:10]

[{'id': u'M1',
  'len': u'21',
  'section': u'S1',
  'start': u'156',
  'text': u'Peripheral Neuropathy',
  'type': u'AdverseReaction'},
 {'id': u'M2',
  'len': u'11',
  'section': u'S1',
  'start': u'236',
  'text': u'Anaphylaxis',
  'type': u'AdverseReaction'},
 {'id': u'M3',
  'len': u'18',
  'section': u'S1',
  'start': u'252',
  'text': u'Infusion Reactions',
  'type': u'AdverseReaction'},
 {'id': u'M4',
  'len': u'22',
  'section': u'S1',
  'start': u'329',
  'text': u'Hematologic Toxicities',
  'type': u'AdverseReaction'},
 {'id': u'M5',
  'len': u'7',
  'section': u'S1',
  'start': u'410',
  'text': u'Serious',
  'type': u'Severity'},
 {'id': u'M6',
  'len': u'10',
  'section': u'S1',
  'start': u'418',
  'text': u'Infections',
  'type': u'AdverseReaction'},
 {'id': u'M7',
  'len': u'24',
  'section': u'S1',
  'start': u'433',
  'text': u'Opportunistic Infections',
  'type': u'AdverseReaction'},
 {'id': u'M8',
  'len': u'20',
  'section': u'S1',
  'start': u'516',
  'text': u'T

# try

In [20]:
# split into sentences and words (with the original index)
out = []
ct = 0
for f in glob.glob(path+'*.xml'):
    ct += 1
    print f
    
    X, Y = parse_xml(f)
    sent = [s for s in X['S1'].split('\n') if s]
    print len(sent)
#     print sent[:5]
#     for s in sent:
#         if s: print s
#     for text in X:
        
#     for e in Y:
# #         print e, e['start']
#         try:
#             start = int(e['start'])
#             end = start + int(e['len'])
#             out.append([e['text'], e['type']])
#         except:
#             pass
        
    if ct > 0: break
        
# out

/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml
157


In [26]:
a = X['S1']
a = a.sub()
matches = [(m.group(0), m.start(), m.end(), ) for m in re.finditer(r'\S+', a)]
matches

AttributeError: 'unicode' object has no attribute 'sub'

In [43]:
# cut the sentences and words; write into a txt file
out = []
for f in glob.glob(path+'*.xml'):
    print f
    X, Y = parse_xml(f)
    for e in Y:
#         print e, e['start']
        try:
            start = int(e['start'])
            end = start + int(e['len'])
            out.append([e['text'], e['type']])
        except:
            pass

/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADREVIEW.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AFINITOR.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMPYRA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMYVID.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/APTIOM.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ARCAPTA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BELEODAQ.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BENLYSTA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BEPREVE.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BESIVANCE.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BLINCYTO.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BOSULIF.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BREO.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/CARBAGLU.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/CERDELGA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/CHOLI

In [44]:
len(out)

14644

In [45]:
types = [e[1] for e in out]
Counter(types)

Counter({u'AdverseReaction': 12792,
         u'Animal': 44,
         u'DrugClass': 248,
         u'Factor': 602,
         u'Negation': 95,
         u'Severity': 863})

In [46]:
# pd.DataFrame(out).to_csv('ner_train_1.txt', index=False, header=False, sep=' ')

Note: not finished
as the word index is based on original text. We cannot do this one word by one word.     
one way is to first write all words in sentences one in a line, with the index on the same line     
then use the name index from xml to relabel those types of names

Alternative: change the modeling code, instead of input