In [1]:
import gzip
from genia_tokenizer import tokenize as genia_tokenize
from os.path import join
import glob
import re
import os
import spacy
from spacy.gold import biluo_tags_from_offsets

In [2]:
# Load the spaCy pipeline registered under the name 'en'; it is called further
# down as `nlp(abst)` to turn the reconstructed abstract text into a Doc.
nlp = spacy.load('en')

In [3]:
# Splits a file path into (directory, numeric document ID, extension).
DIGITS_RE = re.compile(r'(.*)\/(\d+)\.(.*)')

# Directory holding the BioNLP'09 files. The location can be overridden with
# the BIONLP09_DIR environment variable so the notebook is not tied to a
# single machine; the original hard-coded location remains the default.
bionlp_dir = os.environ.get(
    'BIONLP09_DIR',
    '/home/rohit/Documents/Spring_2018/Team_Project/data/bionlp09/')

# Captures (start offset, end offset, mention) from a line containing
# "Protein <start> <end> <mention>" with single spaces between the fields.
ANN_RE = re.compile(r'.*Protein (\d+) (\d+) (.*)$')

In [4]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort every
# listing so that "first file", "second file", ... mean the same thing on each
# run and on each machine.
total_files = sorted(glob.glob(join(bionlp_dir, '*')))
stoken_files = sorted(glob.glob(join(bionlp_dir, '*stoken')))
a1_files = sorted(glob.glob(join(bionlp_dir, '*a1')))
txt_files = sorted(glob.glob(join(bionlp_dir, '*txt')))
print(len(total_files), len(a1_files), len(txt_files), len(stoken_files))

3203 800 800 800


In [5]:
def biluo_tags_from_offsets(text, entities, tokens=None, missing='O'):
    """Encode labelled character spans into per-token tags, using the
    Begin/In/Last/Unit/Out scheme (BILUO).

    NOTE: this definition shadows the `biluo_tags_from_offsets` imported from
    `spacy.gold` at the top of the notebook.

    text: When `tokens` is omitted, the token sequence itself (for example a
        spaCy `Doc`). When `tokens` is given, the raw text the offsets refer
        to; it is accepted for context only and is not inspected.
    entities (iterable): A sequence of `(start, end, label)` triples. `start`
        and `end` should be character-offset integers denoting the slice into
        the original string. May be a one-shot iterator.
    tokens (iterable, optional): Token objects in document order. Each token
        must expose `.idx` (character offset of its first character) and
        support `len()` (its length in characters).
    missing (str): Tag assigned to tokens that share no character with any
        entity. Defaults to "O".
    RETURNS (list): One tag per token. Each tag is "-", the `missing` value,
        or "{action}-{label}", where action is one of "B", "I", "L", "U".
        "-" marks tokens that overlap an entity whose offsets do not align
        with the token boundaries. "B" denotes the beginning of a multi-token
        entity, "I" the inside of an entity of three or more tokens, "L" the
        end of an entity of two or more tokens, and "U" a single-token entity.
    RAISES (TypeError): If `text` is a plain string and no `tokens` are given,
        since there are then no token boundaries to align against.
    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = biluo_tags_from_offsets(doc, entities)
        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
    """
    if tokens is None:
        if isinstance(text, str):
            raise TypeError(
                "biluo_tags_from_offsets needs token boundaries: pass a "
                "tokenized document as `text` or supply `tokens`.")
        tokens = text
    doc = list(tokens)
    # `entities` is walked twice below, so materialise one-shot iterators.
    entities = list(entities)

    # Map character offsets of token starts / ends to token positions.
    starts = {token.idx: i for i, token in enumerate(doc)}
    ends = {token.idx + len(token): i for i, token in enumerate(doc)}
    biluo = ['-' for _ in doc]
    # Handle entity cases
    for start_char, end_char, label in entities:
        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = 'U-%s' % label
            else:
                biluo[start_token] = 'B-%s' % label
                for i in range(start_token + 1, end_token):
                    biluo[i] = 'I-%s' % label
                biluo[end_token] = 'L-%s' % label
    # Now distinguish the O cases from ones where we miss the tokenization:
    # a token that shares no character with any entity is a true non-entity.
    entity_chars = set()
    for start_char, end_char, label in entities:
        entity_chars.update(range(start_char, end_char))
    for i, token in enumerate(doc):
        token_chars = range(token.idx, token.idx + len(token))
        if not any(char in entity_chars for char in token_chars):
            biluo[i] = missing
    return biluo

## Convert 7-digit IDs to 8-digit IDs

In [6]:
# Left-pad 7-digit numeric file IDs with a zero, renaming the files on disk,
# and tally how many IDs had 7 digits, 8 digits, or some other length.
# NOTE(review): the file lists collected before this cell are not refreshed
# after a rename, so they may hold stale paths -- confirm before reuse.
sevens, eights, outsiders = 0, 0, 0
for filename in total_files:
    m = DIGITS_RE.match(filename)
    if m is None:
        # Path has no "<digits>.<ext>" file name; it is not counted.
        continue

    directory, file_id, extension = m.groups()
    id_length = len(file_id)
    if id_length == 8:
        eights += 1
    elif id_length == 7:
        sevens += 1
        new_id = '0' + file_id
        new_filename = directory + '/' + new_id + '.' + extension
        os.rename(filename, new_filename)
    else:
        outsiders += 1

print(sevens, eights, outsiders)

0 3200 0


## Text to annotation file maps

In [7]:
# Index the annotation files by numeric document ID once, instead of scanning
# the whole list for every stoken file. setdefault keeps the first file seen
# for an ID, matching the earlier first-match behaviour.
ann_fname_by_pid = {}
for a1_file in a1_files:
    m = DIGITS_RE.match(a1_file)
    if m:
        ann_fname_by_pid.setdefault(m.group(2), a1_file)

# Pair each stoken file with the annotation file sharing its document ID.
# Files without a counterpart are left out of the map (and reported) rather
# than being silently paired with an unrelated annotation file.
stoken2ann_fnames = {}
unmatched_stoken_files = []
for stoken_file in stoken_files:
    m = DIGITS_RE.match(stoken_file)
    a1_file = ann_fname_by_pid.get(m.group(2)) if m else None
    if a1_file is None:
        unmatched_stoken_files.append(stoken_file)
        continue
    stoken2ann_fnames[stoken_file] = a1_file

if unmatched_stoken_files:
    print('No matching annotation file for:', unmatched_stoken_files)

In [None]:
# Convert one (stoken, a1) pair to BILUO tags as a smoke test: the first pair
# is skipped and the loop stops after the pair that follows it.
for f_idx, (stoken_file, a1_file) in enumerate(stoken2ann_fnames.items()):
    if f_idx == 0:
        continue
    print(stoken_file, a1_file)

    # Rebuild the text by joining the stripped lines with single spaces; a
    # newline is additionally appended after the first line.
    # NOTE(review): presumably this mirrors the layout the annotation offsets
    # were computed against -- confirm against the corresponding .txt file.
    abst = ''
    with open(stoken_file, 'r') as f_stoken:
        for line_idx, line in enumerate(f_stoken):
            line = line.strip()
            abst += line + ' '
            if line_idx == 0:
                abst += '\n'
            tokens = genia_tokenize(line)
            print(tokens)
    doc = nlp(abst)

    # Each annotation line is "<id>\t<type> <start> <end>\t<mention>".
    anns = []
    with open(a1_file, 'r') as f_a1:
        for line in f_a1:
            line = line.strip()
            if not line:
                # Blank lines carry no annotation and would break unpacking.
                continue
            _, ann, mention = line.split('\t')
            _, start, end = ann.split()
            anns.append((int(start), int(end), 'Protein'))
    print(anns, biluo_tags_from_offsets(doc, anns))
    break