In [1]:
import sys
import random
random.seed(12)

import spacy
from spacy.tokenizer import Tokenizer
sys.path.insert(1, '../../utils/')

from bratUtils import myCorpus_brat2conll
from conllUtils import describe_entities
from myDocClass import ncbi_doc

from collections import Counter
%reload_ext autoreload
%autoreload 2

# Training and dev corpus construction for NCBI-disease corpus

We build a train and dev set from the [NCBI-disease corpus](https://www.ncbi.nlm.nih.gov/CBBresearch/Dogan/DISEASE/). This corpus contains 592 documents that we convert to conll files (suitable input format for the yaset tool).

In [2]:
path2train = '../data/src/NCBItrainset_corpus.txt'
path2train_write = '../data/ncbi_train'
path2dev = '../data/src/NCBIdevelopset_corpus.txt'
path2dev_write = '../data/ncbi_dev'

In [4]:
with open(path2train, 'r') as f:
    train_data = f.read().splitlines()
with open(path2dev, 'r') as f:
    dev_data = f.read().splitlines()

In [5]:
train_docs = []
point = 0
entities = []
for l in train_data[1:]:
    if l == '':
        train_docs.append(ncbi_doc(title, text, entities))
        point = 0
        entities = []
    else:
        if point == 0:
            title = l
        elif point == 1:
            text = l
        else:
            entities.append(l)
        point += 1
len(train_docs)

592

In [6]:
dev_docs = []
point = 0
entities = []
for l in dev_data[1:]:
    if l == '':
        dev_docs.append(ncbi_doc(title, text, entities))
        point = 0
        entities = []
    else:
        if point == 0:
            title = l
        elif point == 1:
            text = l
        else:
            entities.append(l)
        point += 1
len(dev_docs)

99

## Using spacy tokenizer

In [7]:
nlp = spacy.load('en')

In [8]:
# specifying nlp tokenizer to be consistent 
def create_custom_tokenizer(nlp):
    
    my_prefix = [r'[0-9]\.']
    
    all_prefixes_re = spacy.util.compile_prefix_regex(tuple(list(nlp.Defaults.prefixes) + my_prefix))
    
    # Handle ( that doesn't have proper spacing around it
    custom_infixes = ['\.\.\.+', '(?<=[0-9])-(?=[0-9])','(\w\w*)-(\w\w*)', '[!&:,()]']
    infix_re = spacy.util.compile_infix_regex(tuple(list(nlp.Defaults.infixes) + custom_infixes))
    
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)   
    
    return Tokenizer(nlp.vocab, nlp.Defaults.tokenizer_exceptions,
                     prefix_search = all_prefixes_re.search, 
                     infix_finditer = infix_re.finditer, suffix_search = suffix_re.search,
                     token_match=None)
nlp.tokenizer = create_custom_tokenizer(nlp)
nlp.pipeline

[('tagger', <spacy.pipeline.Tagger at 0x7f0b714f9080>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7f0b71531f68>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7f0b71531fc0>)]

In [9]:
print('ratio #dev/(#dev + #train) = {}/{} = {:.3}'.format\
      (len(dev_docs), len(dev_docs) + len(train_docs), len(dev_docs)/(len(dev_docs) + len(train_docs))))

ratio #dev/(#dev + #train) = 99/691 = 0.143


### Write to conll

In [10]:
myCorpus_brat2conll(train_docs, nlp, path2train_write)
myCorpus_brat2conll(dev_docs, nlp, path2dev_write)

  0%|          | 0/592 [00:00<?, ?it/s]

Found 592 documents. Processing conversion...


100%|██████████| 592/592 [00:49<00:00, 11.86it/s]
  2%|▏         | 2/99 [00:00<00:06, 14.00it/s]

Found 99 documents. Processing conversion...


100%|██████████| 99/99 [00:07<00:00, 12.97it/s]


['Somatic-cell\tO\n',
 'selection\tO\n',
 'is\tO\n',
 'a\tO\n',
 'major\tO\n',
 'determinant\tO\n',
 'of\tO\n',
 'the\tO\n',
 'blood-cell\tO\n',
 'phenotype\tO\n',
 'in\tO\n',
 'heterozygotes\tO\n',
 'for\tO\n',
 'glucose-6-phosphate\tO\n',
 'dehydrogenase\tO\n',
 'mutations\tO\n',
 'causing\tO\n',
 'severe\tO\n',
 'enzyme\tB-DiseaseClass\n',
 'deficiency\tI-DiseaseClass\n',
 '.\tO\n',
 '\n',
 'X-chromosome\tO\n',
 'inactivation\tO\n',
 'in\tO\n',
 'mammals\tO\n',
 'is\tO\n',
 'regarded\tO\n',
 'as\tO\n',
 'an\tO\n',
 'essentially\tO\n',
 'random\tO\n',
 'process\tO\n',
 ',\tO\n',
 'but\tO\n',
 'the\tO\n',
 'resulting\tO\n',
 'somatic-cell\tO\n',
 'mosaicism\tO\n',
 'creates\tO\n',
 'the\tO\n',
 'opportunity\tO\n',
 'for\tO\n',
 'cell\tO\n',
 'selection\tO\n',
 '.\tO\n',
 '\n',
 'In\tO\n',
 'most\tO\n',
 'people\tO\n',
 'with\tO\n',
 'red-blood\tO\n',
 '-\tO\n',
 'cell\tO\n',
 'glucose-6-phosphate\tB-SpecificDisease\n',
 'dehydrogenase\tI-SpecificDisease\n',
 '(G6PD)\tI-SpecificDisease

In [11]:
train_sum = describe_entities('../data/ncbi_train.conll')
dev_sum = describe_entities('../data/ncbi_dev.conll')

all_ann = 0
for ent in train_sum.keys():
    print('{} : {}'.format(ent, train_sum[ent] + dev_sum[ent]))
    if ent != 'O':
        all_ann += train_sum[ent] + dev_sum[ent]
print('# of annotations {}'.format(all_ann))

Corpus containing 128557 tokens in 6233 sentences
Corpus containing 22508 tokens in 1046 sentences
O : 139715
DiseaseClass : 1955
SpecificDisease : 6595
CompositeMention : 717
Modifier : 2083
# of annotations 11350


# Verifying that there are no missing entities after conversion

In [4]:
path2conll = '../data/ncbi_train.conll'

In [13]:
brat_ents = []
for doc in train_docs:
    for ent in doc.entities:
        for w in ent.word.split(' '):
            brat_ents.append(ent.label)
Counter(brat_ents)

Counter({'DiseaseClass': 1708,
         'SpecificDisease': 5756,
         'CompositeMention': 558,
         'Modifier': 1755})

In [5]:
describe_entities(path2conll)

Corpus containing 128557 tokens in 6233 sentences with 9775 non-O tags


Counter({'O': 118782,
         'DiseaseClass': 1708,
         'SpecificDisease': 5755,
         'CompositeMention': 557,
         'Modifier': 1755})

In [15]:
brat_ents = []
for doc in dev_docs:
    for ent in doc.entities:
        for w in ent.word.split(' '):
            brat_ents.append(ent.label)
Counter(brat_ents)

Counter({'DiseaseClass': 247,
         'SpecificDisease': 841,
         'Modifier': 328,
         'CompositeMention': 160})

In [8]:
path2dev = '../data/ncbi_dev.conll'
print(describe_entities(path2dev))
tot_tagged = 9775 + 1575
print(tot_tagged/151000)
print(tot_tagged)

Corpus containing 22508 tokens in 1046 sentences with 1575 non-O tags
Counter({'O': 20933, 'SpecificDisease': 840, 'Modifier': 328, 'DiseaseClass': 247, 'CompositeMention': 160})
0.07516556291390729
11350
