https://bionlp.nlm.nih.gov/tac2017adversereactions/

Task 1: Extract AdverseReactions and related mentions (Severity, Factor, DrugClass, Negation, Animal). This is similar to many NLP Named Entity Recognition (NER) evaluations.

In [76]:
import untangle
import glob
import pandas as pd
from collections import Counter
import re
import string

In [2]:
# Directory holding the training label XML files. The trailing slash matters:
# other cells build file names and glob patterns by plain string concatenation.
path = '/Users/jzhu/git/nlp_adversedrug/data/train_xml/'

In [36]:
def parse_xml(filename):
    """
    Parse one drug-label XML file into section texts and annotated mentions.

    @input a filename string
    @return: a tuple (X, Y)
    1. X: a dictionary mapping each section id to that section's text
    2. Y: a list of dictionaries, one per mention, with keys
        id (not for task 1), section, type, start, len, text.
        Every value is the raw XML attribute string; start/len are NOT
        converted to integers here.
       For data without mention annotations (e.g. test data) Y is an
       empty list.
    """
    X = {}
    Y = []

    obj = untangle.parse(filename)
    for text in obj.Label.Text.Section:
        X[text['id']] = text.cdata

    # untangle raises AttributeError when a child element is missing, so the
    # optional <Mentions>/<Mention> nodes are looked up with a default rather
    # than assumed to exist.
    mentions = getattr(getattr(obj.Label, 'Mentions', None), 'Mention', None)
    if mentions:
        for mention in mentions:
            Y.append({
                'id': mention['id'],
                'section': mention['section'],
                'type': mention['type'],
                'start': mention['start'],
                'len': mention['len'],
                # the XML attribute is called "str"; it is exposed as "text"
                'text': mention['str'],
            })

    return X, Y

# test

In [70]:
# Smoke-test the parser on a single training label.
filename = path + 'ADCETRIS.xml'
X, Y = parse_xml(filename)

In [71]:
# Section ids found in the parsed label.
X.keys()

[u'S3', u'S2', u'S1']

In [72]:
# Display the full text of section S1.
X['S1']



In [73]:
# Slice section S1 with mention M2's offsets (start=236, len=11) to check that
# start/len index directly into the section text.
X['S1'][236:(236+11)]

u'Anaphylaxis'

In [41]:
# Show the first two parsed mentions.
Y[:2]

[{'id': u'M1',
  'len': u'21',
  'section': u'S1',
  'start': u'156',
  'text': u'Peripheral Neuropathy',
  'type': u'AdverseReaction'},
 {'id': u'M2',
  'len': u'11',
  'section': u'S1',
  'start': u'236',
  'text': u'Anaphylaxis',
  'type': u'AdverseReaction'}]

### For NER_DL: extract only the entities and their tags from the training folder (then use the pre-trained word2vec vectors from spaCy)

In [66]:
# Collect [mention text, mention type] pairs from every training XML file.
# (No sentence or word splitting happens in this cell.)
out = []
for f in glob.glob(path + '*.xml'):
    print(f)  # progress trace; the parenthesised form parses on Python 2 and 3
    X, Y = parse_xml(f)
    for e in Y:
        try:
            # start/end are not used further in this cell; the int()
            # conversions act as a filter for usable offsets.
            start = int(e['start'])
            end = start + int(e['len'])
        except (TypeError, ValueError):
            # Skip mentions whose start/len is missing or not a single integer.
            # NOTE(review): presumably these are discontinuous mentions with
            # comma-separated offsets -- confirm against the data.
            continue
        out.append([e['text'], e['type']])

In [86]:
# Split section S1 on whitespace, recording for every token its text, its
# start offset and its end offset (exclusive, as returned by Match.end()).
a = X['S1']
# NOTE(review): the former `a = a.sub()` line was dropped -- strings have no
# `sub` method, so it raised AttributeError before any token was produced.
# If text clean-up was intended, call re.sub(pattern, repl, a) explicitly and
# keep in mind that changing the string's length shifts every offset.
matches = [(m.group(0), m.start(), m.end()) for m in re.finditer(r'\S+', a)]
matches

[(u'6', (4, 4)),
 (u'ADVERSE', (6, 12)),
 (u'REACTIONS', (14, 22)),
 (u'The', (27, 29)),
 (u'following', (31, 39)),
 (u'serious', (41, 47)),
 (u'adverse', (49, 55)),
 (u'reactions', (57, 65)),
 (u'are', (67, 69)),
 (u'discussed', (71, 79)),
 (u'in', (81, 82)),
 (u'greater', (84, 90)),
 (u'detail', (92, 97)),
 (u'in', (99, 100)),
 (u'other', (102, 106)),
 (u'sections', (108, 115)),
 (u'of', (117, 118)),
 (u'the', (120, 122)),
 (u'prescribing', (124, 134)),
 (u'information:', (136, 147)),
 (u'*', (153, 153)),
 (u'Peripheral', (156, 165)),
 (u'Neuropathy', (167, 176)),
 (u'[see', (178, 181)),
 (u'and', (193, 195)),
 (u'Precautions', (197, 207)),
 (u'(', (209, 209)),
 (u'5.1', (216, 218)),
 (u')', (225, 225)),
 (u']', (228, 228)),
 (u'*', (233, 233)),
 (u'Anaphylaxis', (236, 246)),
 (u'and', (248, 250)),
 (u'Infusion', (252, 259)),
 (u'Reactions', (261, 269)),
 (u'[see', (271, 274)),
 (u'and', (286, 288)),
 (u'Precautions', (290, 300)),
 (u'(', (302, 302)),
 (u'5.2', (309, 311)),
 (u')', (

In [43]:
# Collect [mention text, mention type] pairs from every training XML file.
# (This cell only fills `out`; nothing is written to disk here.)
out = []
for f in glob.glob(path + '*.xml'):
    print(f)  # progress trace; the parenthesised form parses on Python 2 and 3
    X, Y = parse_xml(f)
    for e in Y:
        try:
            # start/end are not used further in this cell; the int()
            # conversions act as a filter for usable offsets.
            start = int(e['start'])
            end = start + int(e['len'])
        except (TypeError, ValueError):
            # Skip mentions whose start/len is missing or not a single integer.
            # NOTE(review): presumably these are discontinuous mentions with
            # comma-separated offsets -- confirm against the data.
            continue
        out.append([e['text'], e['type']])

/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADREVIEW.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AFINITOR.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMPYRA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMYVID.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/APTIOM.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ARCAPTA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BELEODAQ.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BENLYSTA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BEPREVE.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BESIVANCE.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BLINCYTO.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BOSULIF.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/BREO.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/CARBAGLU.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/CERDELGA.xml
/Users/jzhu/git/nlp_adversedrug/data/train_xml/CHOLI

In [44]:
# Number of [text, type] pairs collected.
len(out)

14644

In [45]:
# Tally the collected mentions by entity type (second field of each pair).
types = [mention_type for _mention_text, mention_type in out]
Counter(types)

Counter({u'AdverseReaction': 12792,
         u'Animal': 44,
         u'DrugClass': 248,
         u'Factor': 602,
         u'Negation': 95,
         u'Severity': 863})

In [46]:
# Dump the pairs as a headerless, index-free, space-separated text file:
# one "<mention text> <mention type>" row per collected mention.
ner_frame = pd.DataFrame(out)
ner_frame.to_csv('ner_train_1.txt', sep=' ', header=False, index=False)