https://bionlp.nlm.nih.gov/tac2017adversereactions/

Task 1: Extract AdverseReactions and related mentions (Severity, Factor, DrugClass, Negation, Animal). This is similar to many NLP Named Entity Recognition (NER) evaluations.

In [1]:
import untangle
import glob
import pandas as pd
from collections import Counter
import re
import string
import csv

In [2]:
# Directory holding the annotated training XML files. The trailing slash is
# required: later cells build file paths by plain string concatenation.
path = '/Users/jzhu/git/nlp_adversedrug/data/train_xml/'

In [57]:
def parse_xml(filename):
    """
    Parse one annotated label XML file into section texts and mentions.

    @input filename: path of the XML file, handed to untangle.parse
    @return: a tuple (X, Y)
        X: dictionary mapping each section id to the text (cdata) of that
           section
        Y: list of dictionaries, one per Mention element, with the keys
           id (not needed for task 1), section, type, start, len and text.
           All values are the raw XML attribute values ('text' is read from
           the 'str' attribute); they are not converted to numbers here.
           Y is empty when the file carries no mentions.
    """
    X = {}
    Y = []

    obj = untangle.parse(filename)
    for text in obj.Label.Text.Section:
        X[text['id']] = text.cdata

    # Asking an untangle node for a child element that does not exist raises
    # (AttributeError in current releases) instead of returning something
    # falsy, so a file without mentions must be probed with a default.
    mentions_node = getattr(obj.Label, 'Mentions', None)
    mentions = getattr(mentions_node, 'Mention', None) or []

    for mention in mentions:
        Y.append({
            'id': mention['id'],
            'section': mention['section'],
            'type': mention['type'],
            'start': mention['start'],
            'len': mention['len'],
            'text': mention['str'],
        })

    return X, Y

def parse_unannotated_xml(filename):
    """
    Read the section texts of a label XML file that has no annotations.

    @input filename: path of the XML file, handed to untangle.parse
    @return: dictionary mapping each section id to the text (cdata) of that
        section; no mention information is returned
    """
    import untangle

    label = untangle.parse(filename).Label
    return dict((section['id'], section.cdata) for section in label.Text.Section)

# test

In [4]:
# Smoke-test the parser on a single training label file.
filename = path + 'ADCETRIS.xml'
X, Y = parse_xml(filename)

In [5]:
# Section ids found in the parsed file.
X.keys()

[u'S3', u'S2', u'S1']

In [6]:
# Full text of section S1.
X['S1']



In [7]:
# Check that a mention's start (236) and len (11) index directly into the
# section text: the slice should reproduce the mention string.
X['S1'][236:(236+11)]

u'Anaphylaxis'

In [8]:
# First two parsed mentions.
Y[:2]

[{'id': u'M1',
  'len': u'21',
  'section': u'S1',
  'start': u'156',
  'text': u'Peripheral Neuropathy',
  'type': u'AdverseReaction'},
 {'id': u'M2',
  'len': u'11',
  'section': u'S1',
  'start': u'236',
  'text': u'Anaphylaxis',
  'type': u'AdverseReaction'}]

### For NER_DL: extract only the entities and tags from the training folder (then use the pre-trained word2vec vectors from spaCy)

In [85]:
# Re-parse one sample file and list its section ids.
f = path + 'ADCETRIS.xml'
X, Y = parse_xml(f)
X.keys()

[u'S3', u'S2', u'S1']

In [86]:
def _label_section(doc, mentions):
    """
    Tokenize one section and tag the tokens covered by annotated mentions.

    @input doc: the section text
    @input mentions: mention dictionaries (as built by parse_xml) that belong
        to this section; 'start' and 'len' may each hold several
        comma-separated values, one pair per fragment of the mention
    @return: a tuple (rows, n_good, n_bad)
        rows: [word, tag] pairs in text order. Tokens covered by a matched
            fragment are merged into a single space-joined row tagged with
            the mention type; every other token is tagged 'O'.
        n_good: number of fragments whose start and end offsets both fall on
            token boundaries
        n_bad: number of fragments that do not, and are left unlabeled
    """
    matches = list(re.finditer(r'\w+', doc))
    # Pristine lower-cased tokens. Merged mention text is always built from
    # this list, never from `words`, so an overlapping or nested mention
    # cannot pick up text that an earlier merge already wrote.
    tokens = [m.group(0).lower() for m in matches]
    # Character offset -> token index, for exact boundary lookups.
    start_dict = dict((m.start(), i) for i, m in enumerate(matches))
    end_dict = dict((m.end(), i) for i, m in enumerate(matches))

    words = list(tokens)
    types = ['O'] * len(tokens)
    n_good = 0
    n_bad = 0

    for e in mentions:
        frag_starts = e['start'].split(',')
        frag_lens = e['len'].split(',')
        for frag_start, frag_len in zip(frag_starts, frag_lens):
            char_start = int(frag_start)
            char_end = char_start + int(frag_len)

            if (char_start in start_dict) and (char_end in end_dict):
                n_good += 1
                ind_start = start_dict[char_start]
                ind_end = end_dict[char_end]

                # Collapse the fragment into its first token ...
                words[ind_start] = ' '.join(tokens[ind_start:(ind_end + 1)])
                types[ind_start] = e['type']
                # ... and mark the remaining tokens for removal.
                for j in range(ind_start + 1, ind_end + 1):
                    types[j] = 'rm'
            else:
                # Offsets fall inside a token (e.g. words glued together in
                # the text), so the fragment cannot be mapped to whole tokens.
                n_bad += 1

    rows = [[w, t] for w, t in zip(words, types) if t != 'rm']
    return rows, n_good, n_bad


# Build the training rows: one [word, tag] row per token or merged mention,
# with a [''] row after every section as a separator.
train = []
ct_good = 0
ct_bad = 0

for f in glob.glob(path + '*.xml'):
    X, Y = parse_xml(f)

    for section in X.keys():
        print('%s %s' % (f, section))

        section_mentions = [e for e in Y if e['section'] == section]
        rows, n_good, n_bad = _label_section(X[section], section_mentions)
        ct_good += n_good
        ct_bad += n_bad

        train += rows
        print(len(train))

        # insert a new line to separate from the next section
        train.append([''])
        print(len(train))

/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml S3
1021
1022
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml S2
1080
1081
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADCETRIS.xml S1
3307
3308
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADREVIEW.xml S2
4130
4131
/Users/jzhu/git/nlp_adversedrug/data/train_xml/ADREVIEW.xml S1
4430
4431
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AFINITOR.xml S2
6102
6103
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AFINITOR.xml S1
10335
10336
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMPYRA.xml S2
10853
10854
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMPYRA.xml S1
11324
11325
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMYVID.xml S2
11543
11544
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AMYVID.xml S1
11810
11811
/Users/jzhu/git/nlp_adversedrug/data/train_xml/APTIOM.xml S2
13836
13837
/Users/jzhu/git/nlp_adversedrug/data/train_xml/APTIOM.xml S1
14721
14722
/Users/jzhu/git/nlp_adversedrug/data/train_xml/AR

/Users/jzhu/git/nlp_adversedrug/data/train_xml/LUMIZYME.xml S2
108458
108459
/Users/jzhu/git/nlp_adversedrug/data/train_xml/LUMIZYME.xml S1
110410
110411
/Users/jzhu/git/nlp_adversedrug/data/train_xml/MULTAQ.xml S3
111207
111208
/Users/jzhu/git/nlp_adversedrug/data/train_xml/MULTAQ.xml S2
111430
111431
/Users/jzhu/git/nlp_adversedrug/data/train_xml/MULTAQ.xml S1
111986
111987
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NATAZIA.xml S3
113403
113404
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NATAZIA.xml S2
113512
113513
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NATAZIA.xml S1
114005
114006
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NESINA.xml S2
114723
114724
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NESINA.xml S1
115790
115791
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NEURACEQ.xml S2
116017
116018
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NEURACEQ.xml S1
116241
116242
/Users/jzhu/git/nlp_adversedrug/data/train_xml/NORTHERA.xml S3
116526
116527
/Users/jzhu/

In [87]:
# Matched fragments, unmatched fragments, and the unmatched/matched ratio
# (note: relative to the matched count, not to the total).
print ct_good, ct_bad, ct_bad * 1.0 /ct_good

16728 128 0.00765184122429


In [88]:
# Total number of rows, including the [''] separator rows.
len(train)

213037

In [89]:
# Peek at the rows around the first section boundary.
for i in train[1020:1030]:
    print i

[u'1', 'O']
['']
[u'boxed', 'O']
[u'progressive multifocal leukoencephalopathy', u'AdverseReaction']
[u'pml', u'AdverseReaction']
[u'progressive multifocal leukoencephalopathy', u'AdverseReaction']
[u'pml', u'AdverseReaction']


In [90]:
# 213037 is len(train), which includes one [''] separator row per section.
# NOTE(review): 212798 is presumably the number of non-separator rows; the
# notebook does not show where it was obtained - confirm.
213037 - 212798 # num of sections of all xml files

239

In [91]:
# Dump every training row to train.csv, one row per line.
with open("train.csv", "wb") as out_file:
    csv.writer(out_file).writerows(train)

Note: 
* Since the mention offsets refer to the original text, we cannot label the text one word at a time after tokenizing. 
* Thus I first split every section into words, one word per row, keeping each word's start and end offsets in the original text, 
* and then used the mention offsets from the xml to relabel the words that belong to annotated mentions.
* Only about 0.8% of the mentions (128 of 16,856) cannot be matched to word boundaries and are therefore missing from the processed train.csv, so I am fine with that. Most of them are caused by words that are hard to separate, such as "decreasedSOD".

# Process unannotated dataset

In [80]:
# Directory holding the unannotated XML files. The trailing slash is
# required because the glob pattern is appended by string concatenation.
testpath = '/Users/jzhu/git/nlp_adversedrug/data/unannotated_xml/'

In [81]:
test = []
ct = 1

for f in glob.glob(testpath + '*.xml'):
    filename = f.split('/')[-1].split('.')[0]
    print "Processing:", filename
    X = parse_unannotated_xml(f)

    for section in X.keys():
        doc = X[section]
    
        # split the words in a doc
        word_ind = [[filename, section, m.group(0).lower(), m.start(), m.end()] 
                    for m in re.finditer(r'\w+', doc) if m.group(0)]
        test.extend(word_ind)

Processing: 8MOP
Processing: ABELCET
Processing: ABILIFY0
Processing: ABILIFY1
Processing: ABILIFY2
Processing: ABIRATERONE
Processing: ABRAXANE
Processing: ABSORICA
Processing: ABSTRAL
Processing: ACANYA
Processing: ACCURETIC
Processing: ACEON
Processing: ACETADOTE
Processing: ACIPHEX0
Processing: ACIPHEX1
Processing: ACTEMRA
Processing: ACTHIB
Processing: ACTHREL
Processing: ACTICLATE
Processing: ACTIVELLA
Processing: ACTOPLUS0
Processing: ACTOPLUS1
Processing: ACULAR0
Processing: ACULAR1
Processing: ACUVAIL
Processing: ACZONE0
Processing: ACZONE1
Processing: ADACEL
Processing: ADAGEN
Processing: ADAPALENE
Processing: ADASUVE
Processing: ADCETRIS
Processing: ADCIRCA
Processing: ADDYI
Processing: ADEMPAS
Processing: ADENOCARD
Processing: ADENOSCAN
Processing: ADENOVIRUS
Processing: ADLYXIN
Processing: ADRENALIN
Processing: ADREVIEW
Processing: ADULT0
Processing: ADULT1
Processing: ADVAIR0
Processing: ADVAIR1
Processing: ADVATE
Processing: ADYNOVATE
Processing: ADZENYS
Processing: AERO

Processing: COUMADIN
Processing: COZAAR
Processing: CREON
Processing: CRESEMBA
Processing: CRESTOR
Processing: CRINONE
Processing: CRIXIVAN
Processing: CROFAB
Processing: CUBICIN
Processing: CUPRIMINE
Processing: CUROSURF
Processing: CUTIVATE0
Processing: CUTIVATE1
Processing: CUVITRU
Processing: CUVPOSA
Processing: CYANOKIT
Processing: CYCLESSA
Processing: CYCLOPHOSPHAMIDE
Processing: CYCLOSET
Processing: CYKLOKAPRON
Processing: CYMBALTA
Processing: CYRAMZA
Processing: CYSTADANE
Processing: CYSTAGON
Processing: CYSTARAN
Processing: CYSTOGRAFIN
Processing: CYSVIEW
Processing: CYTOGAM
Processing: CYTOMEL
Processing: CYTOTEC
Processing: CYTOVENE
Processing: DACOGEN
Processing: DAKLINZA
Processing: DALIRESP
Processing: DALVANCE
Processing: DANTRIUM
Processing: DANTROLENE
Processing: DAPTACEL
Processing: DARANIDE
Processing: DARAPRIM
Processing: DARIFENACIN
Processing: DATSCAN
Processing: DAYPRO
Processing: DAYTRANA
Processing: DDAVP0
Processing: DDAVP1
Processing: DDAVP2
Processing: DECIT

Processing: HYDROCORTISONE0
Processing: HYDROCORTISONE1
Processing: HYDROCORTISONE2
Processing: HYDROCORTISONE3
Processing: HYDROCORTISONE4
Processing: HYDROCORTISONE5
Processing: HYDROMORPHONE0
Processing: HYDROMORPHONE1
Processing: HYDROXYCHLOROQUINE
Processing: HYDROXYETHYL
Processing: HYLENEX
Processing: HYPERHEP
Processing: HYPERRAB
Processing: HYPERRHO
Processing: HYPERTET
Processing: HYQVIA
Processing: HYSINGLA
Processing: HYZAAR
Processing: IBRANCE
Processing: IBUPROFEN
Processing: ICGREEN
Processing: ICLUSIG
Processing: IDAMYCIN
Processing: IDELVION
Processing: IDKIT
Processing: ILEVRO
Processing: ILUVIEN
Processing: IMBRUVICA
Processing: IMIPRAMINE
Processing: IMITREX0
Processing: IMITREX1
Processing: IMLYGIC
Processing: IMOGAM
Processing: IMOVAX
Processing: IMPAVIDO
Processing: IMPLANON
Processing: IMURAN
Processing: INCRELEX
Processing: INCRUSE
Processing: INDICLOR
Processing: INDIUM
Processing: INDOCIN
Processing: INDOMETHACIN
Processing: INFANRIX
Processing: INFASURF
Proc

Processing: NUVARING
Processing: NUVESSA
Processing: NUVIGIL
Processing: NUWIQ
Processing: NYMALIZE
Processing: OBIZUR
Processing: OBREDON
Processing: OCALIVA
Processing: OCELLA
Processing: OCTAGAM
Processing: OCTREOSCAN
Processing: OCUFLOX
Processing: ODEFSEY
Processing: ODOMZO
Processing: OFEV
Processing: OFIRMEV
Processing: OLUXE
Processing: OLYSIO
Processing: OMECLAMOXPAK
Processing: OMEPRAZOLE
Processing: OMEPRAZOLESODIUM
Processing: OMIDRIA
Processing: OMNARIS
Processing: OMNIPAQUE
Processing: OMNISCAN0
Processing: OMNISCAN1
Processing: OMTRYG
Processing: ONFI
Processing: ONGLYZA
Processing: ONIVYDE
Processing: ONMEL
Processing: ONZETRA
Processing: OPANA
Processing: OPANA0
Processing: OPANA1
Processing: OPDIVO
Processing: OPSUMIT
Processing: OPTIMARK0
Processing: OPTIMARK1
Processing: OPTIMARK2
Processing: OPTIRAY0
Processing: OPTIRAY1
Processing: OPTISON
Processing: ORALAIR
Processing: ORAP
Processing: ORAPRED
Processing: ORAQIX
Processing: ORAVERSE
Processing: ORAVIG
Processing

Processing: STELARA
Processing: STENDRA
Processing: STERILE
Processing: STERILE0
Processing: STERILE1
Processing: STERILE2
Processing: STERILE3
Processing: STERILE4
Processing: STERILE5
Processing: STIMATE
Processing: STIOLTO
Processing: STIVARGA
Processing: STRATTERA
Processing: STRIANT
Processing: STRIBILD
Processing: STRIVERDI
Processing: STROMECTOL
Processing: SUBOXONE0
Processing: SUBOXONE1
Processing: SUBSYS
Processing: SUCRAID
Processing: SUCRALFATE
Processing: SUFENTANIL
Processing: SULFAMYLON0
Processing: SULFAMYLON1
Processing: SULFASALAZINE
Processing: SUMATRIPTAN
Processing: SUMAVEL
Processing: SUPPRELIN
Processing: SUPRANE
Processing: SUPRAX
Processing: SUPREP
Processing: SURFAXIN
Processing: SURMONTIL
Processing: SURVANTA
Processing: SUSTIVA0
Processing: SUSTIVA1
Processing: SUSTOL
Processing: SUTENT
Processing: SYLVANT
Processing: SYMBICORT
Processing: SYMBYAX
Processing: SYMLINPEN
Processing: SYNALAR0
Processing: SYNALAR1
Processing: SYNALAR2
Processing: SYNALGOS
Proces

Processing: ZYDELIG
Processing: ZYFLO0
Processing: ZYFLO1
Processing: ZYKADIA
Processing: ZYLET
Processing: ZYLOPRIM
Processing: ZYMAXID
Processing: ZYPREXA0
Processing: ZYPREXA1
Processing: ZYTIGA
Processing: ZYVOX0
Processing: ZYVOX1


In [82]:
# Total number of token rows collected from the unannotated files.
len(test)

3594380

In [83]:
# First row: [file label, section id, token, start offset, end offset].
test[0]

['8MOP', u'S2', u'boxed', 6, 11]

In [84]:
# Dump every token row of the unannotated set to test.csv.
with open("test.csv", "wb") as out_file:
    csv.writer(out_file).writerows(test)