In [1]:
import sys
import random
random.seed(12)

import spacy
from spacy.tokenizer import Tokenizer
sys.path.insert(1, '../../utils_paper/')

from bratUtils import myCorpus_brat2conll
from conllUtils import describe_entities
from myDocClass import ncbi_doc

from collections import Counter
%reload_ext autoreload
%autoreload 2

# Training and dev corpus construction for NCBI-disease corpus

We build a train and dev set from the [NCBI-disease corpus](https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/). This corpus contains 592 documents that we convert to conll files (suitable input format for the yaset tool).

In [2]:
path2train = '../data/src/NCBItrainset_corpus.txt'
path2train_write = '../data/ncbi_train'
path2dev = '../data/src/NCBIdevelopset_corpus.txt'
path2dev_write = '../data/ncbi_dev'
path2test = '../data/src/NCBItestset_corpus.txt'
path2test_write = '../data/ncbi_test'

In [3]:
with open(path2train, 'r') as f:
    train_data = f.read().splitlines()
with open(path2dev, 'r') as f:
    dev_data = f.read().splitlines()
with open(path2test, 'r') as f:
    test_data = f.read().splitlines()

In [4]:
# build document objects from the brat-like format of the orginal data
train_docs = []
point = 0
entities = []
for l in train_data[1:]:
    if l == '':
        train_docs.append(ncbi_doc(title, text, entities))
        point = 0
        entities = []
    else:
        if point == 0:
            title = l
        elif point == 1:
            text = l
        else:
            entities.append(l)
        point += 1
len(train_docs)

592

In [5]:
dev_docs = []
point = 0
entities = []
for l in dev_data[1:]:
    if l == '':
        dev_docs.append(ncbi_doc(title, text, entities))
        point = 0
        entities = []
    else:
        if point == 0:
            title = l
        elif point == 1:
            text = l
        else:
            entities.append(l)
        point += 1
len(dev_docs)

99

In [6]:
test_docs = []
point = 0
entities = []
for l in test_data[1:]:
    if l == '':
        test_docs.append(ncbi_doc(title, text, entities))
        point = 0
        entities = []
    else:
        if point == 0:
            title = l
        elif point == 1:
            text = l
        else:
            entities.append(l)
        point += 1
len(test_docs)

99

## Using spacy tokenizer

In [7]:
nlp = spacy.load('en')

In [8]:
# specifying nlp tokenizer to be consistent 
def create_custom_tokenizer(nlp):
    
    my_prefix = [r'[0-9]\.']
    
    all_prefixes_re = spacy.util.compile_prefix_regex(tuple(list(nlp.Defaults.prefixes) + my_prefix))
    
    # Handle ( that doesn't have proper spacing around it
    custom_infixes = ['\.\.\.+', '(?<=[0-9])-(?=[0-9])','(\w\w*)-(\w\w*)', '[!&:,()]']
    infix_re = spacy.util.compile_infix_regex(tuple(list(nlp.Defaults.infixes) + custom_infixes))
    
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)   
    
    return Tokenizer(nlp.vocab, nlp.Defaults.tokenizer_exceptions,
                     prefix_search = all_prefixes_re.search, 
                     infix_finditer = infix_re.finditer, suffix_search = suffix_re.search,
                     token_match=None)
nlp.tokenizer = create_custom_tokenizer(nlp)
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7fe46b9552e8>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7fe46b8faa40>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7fe46b8faa98>)]

In [9]:
print('ratio #dev/(#dev + #train) = {}/{} = {:.3}'.format\
      (len(dev_docs), len(dev_docs) + len(train_docs), len(dev_docs)/(len(dev_docs) + len(train_docs))))

ratio #dev/(#dev + #train) = 99/691 = 0.143


### Write to conll

In [10]:
_ = myCorpus_brat2conll(train_docs, nlp, path2train_write)
_ = myCorpus_brat2conll(dev_docs, nlp, path2dev_write)

  0%|          | 0/592 [00:00<?, ?it/s]

Found 592 documents. Processing conversion...


100%|██████████| 592/592 [00:39<00:00, 14.86it/s]
  2%|▏         | 2/99 [00:00<00:05, 17.02it/s]

Found 99 documents. Processing conversion...


100%|██████████| 99/99 [00:06<00:00, 14.61it/s]


In [11]:
_ = myCorpus_brat2conll(test_docs, nlp, path2test_write)

  1%|          | 1/99 [00:00<00:13,  7.00it/s]

Found 99 documents. Processing conversion...


100%|██████████| 99/99 [00:06<00:00, 15.21it/s]


# Verifying that there are no missing entities after conversion from brat-like to conll

In [15]:
brat_ents = []
for doc in train_docs:
    for ent in doc.entities:
        for w in ent.word.split(' '):
            brat_ents.append(ent.label)
Counter(brat_ents)

Counter({'DiseaseClass': 1708,
         'SpecificDisease': 5756,
         'CompositeMention': 558,
         'Modifier': 1755})

In [12]:
brat_ents = []
for doc in dev_docs:
    for ent in doc.entities:
        for w in ent.word.split(' '):
            brat_ents.append(ent.label)
Counter(brat_ents)

Counter({'DiseaseClass': 247,
         'SpecificDisease': 841,
         'Modifier': 328,
         'CompositeMention': 160})

In [13]:
brat_ents = []
for doc in test_docs:
    for ent in doc.entities:
        for w in ent.word.split(' '):
            brat_ents.append(ent.label)
Counter(brat_ents)

Counter({'Modifier': 360,
         'SpecificDisease': 1072,
         'DiseaseClass': 244,
         'CompositeMention': 78})

## Describe datasets

In [14]:
path2test = '../data/ncbi_test.conll'
path2dev = '../data/ncbi_dev.conll'
path2conll = '../data/ncbi_train.conll'

print(describe_entities(path2conll), '\n')
print(describe_entities(path2dev), '\n')
print(describe_entities(path2test))
tot_tagged = 9775 + 1575
print(tot_tagged/151000)
print(tot_tagged)

Corpus containing 128557 tokens in 6233 sentences with 9775 non-O tags
Counter({'O': 118782, 'SpecificDisease': 5755, 'Modifier': 1755, 'DiseaseClass': 1708, 'CompositeMention': 557}) 

Corpus containing 22508 tokens in 1046 sentences with 1575 non-O tags
Counter({'O': 20933, 'SpecificDisease': 840, 'Modifier': 328, 'DiseaseClass': 247, 'CompositeMention': 160}) 

Corpus containing 22956 tokens in 1058 sentences with 1754 non-O tags
Counter({'O': 21202, 'SpecificDisease': 1072, 'Modifier': 360, 'DiseaseClass': 244, 'CompositeMention': 78})
0.07516556291390729
11350


In [8]:
tot_tagged = 9775 + 1575
nb_tokens = 128557 + 22508
nb_sents = 6233 + 1046
print('nb_tags= {}, prop= {}'.format(tot_tagged, tot_tagged/nb_tokens))
print('nb_tokens= {}, nb_sent {}'.format(nb_tokens, nb_sents))

nb_tags= 11350, prop= 0.07513322079899382
nb_tokens= 151065, nb_sent 7279
