In [1]:
import os
import re
import json
import random
random.seed(12)

import numpy as np
import spacy
from collections import Counter
from tqdm import tqdm

import sys
sys.path.insert(1, '../../utils_paper/')
from conllUtils import describe_entities, group_conll


%reload_ext autoreload
%autoreload 2

# Training and dev corpus construction for medpost corpus

We build a train and a dev set from the [MedPost corpus](http://biocreative.sourceforge.net/bio_corpora_links.html) ([download link](ftp://ftp.ncbi.nlm.nih.gov/pub/lsmith/MedTag/medtag.tar.gz)). The MedPost corpus consists of 6,700 sentences annotated with parts of speech and gerund arguments. It is based on MEDLINE abstracts; the original paper describing the construction of the corpus can be found [here](https://academic.oup.com/bioinformatics/article/20/14/2320/213968).

In [2]:
# Directory holding the raw MedPost `.ioc` files.
path2corpora = '../data/medtag/medpost/'
# os.listdir returns entries in arbitrary order; sort them so that the files are
# always processed (and therefore concatenated) in the same order on every run.
path2texts = [os.path.join(path2corpora, f) for f in sorted(os.listdir(path2corpora)) if os.path.splitext(f)[1] == '.ioc']
# The `tag_mb.ioc` file is held out as the dev set; every other `.ioc` file is used for training.
path2original_dev = [os.path.join(path2corpora, 'tag_mb.ioc')]
path2original_train = [f for f in path2texts if f not in path2original_dev]

# Conversion to conll

In [6]:
# Specific functions to convert medpost format to conll format with bio tagging-scheme
def medpost_format2conll(path2file):
    """Convert one MedPost `.ioc` file into a list of CoNLL lines with B-/I- prefixed tags.

    Every line containing a sentence identifier (pattern ``P<digits>A<digits>``)
    is turned into a blank separator line ``'\\n'``. Every other line is split on
    whitespace into ``word_TAG`` tokens, each emitted as ``'word\\tB-TAG\\n'`` or
    ``'word\\tI-TAG\\n'``: a token gets ``I-`` when the previous token of the same
    sentence carries the same tag, ``B-`` otherwise.

    Args:
        path2file: path to the MedPost file to convert.

    Returns:
        list of str: the CoNLL lines, each terminated by a newline.

    Raises:
        ValueError: if a token has no ``_`` separating the word from its tag.
    """
    # Raw string: a backslash-d inside a normal string literal is an invalid escape sequence.
    new_line = re.compile(r'P(\d\d*)A(\d\d*)')
    with open(path2file, 'r') as f:
        data = f.read().splitlines()
    conll = []
    prec_label = ''
    for l in data:
        if new_line.search(l) is not None:
            conll.append('\n')
            # A new sentence starts: its first token must never continue (I-)
            # a tag run coming from the previous sentence.
            prec_label = ''
        else:
            # split() without argument ignores repeated/trailing spaces and blank lines,
            # which would otherwise produce empty tokens.
            for token in l.split():
                # The tag is whatever follows the LAST underscore, so words that
                # themselves contain '_' are kept intact.
                string, sep, label = token.rpartition('_')
                if not sep:
                    raise ValueError(
                        "Token {!r} in {} has no '_' separating word and tag".format(token, path2file))
                if prec_label == label:
                    tagged = 'I-' + label
                else:
                    tagged = 'B-' + label
                prec_label = label
                conll.append(string + '\t' + tagged + '\n')
    return conll

# Wrapper for the whole corpus
def medpost2conll(path2texts, save_path = None):
    """Convert several MedPost files and concatenate them into one CoNLL corpus.

    Args:
        path2texts: iterable of paths, each passed to ``medpost_format2conll``.
        save_path: optional path; when given, the resulting lines are written there.

    Returns:
        list of str: the concatenated CoNLL lines, without a leading blank
        separator line.
    """
    corpora_conll = []
    for f in path2texts:
        corpora_conll += medpost_format2conll(f)
    # The converted corpus normally starts with the blank separator emitted for the
    # first sentence identifier. Drop it only when it is actually there, so that a
    # real token is never discarded (and an empty corpus stays empty).
    if corpora_conll and corpora_conll[0] == '\n':
        corpora_conll = corpora_conll[1:]
    if save_path is not None:
        with open(save_path,'w') as f:
            f.writelines(corpora_conll)
    return corpora_conll

In [7]:
# Output locations of the converted train / dev CoNLL files.
path2train, path2dev = '../data/train.conll', '../data/dev.conll'

In [8]:
# Convert each split and write it to disk; the second positional argument is `save_path`.
train_data = medpost2conll(path2original_train, path2train)
dev_data = medpost2conll(path2original_dev, path2dev)

In [9]:
# Share of the dev split, measured in CoNLL lines (the lists also hold the blank separator lines).
n_dev_lines = len(dev_data)
n_all_lines = n_dev_lines + len(train_data)
print('ratio #dev/(#dev + #train) = {}/{} = {:.3}'.format(n_dev_lines, n_all_lines, n_dev_lines/n_all_lines))

ratio #dev/(#dev + #train) = 28766/189020 = 0.152


## Fused classes
We merge some unexplained classes (the `+`-suffixed tags) into their base class, and all punctuation tags into a single punctuation class.

In [10]:
# For the evaluation, we want to reduce the number of classes.
# NOTE(review): these are the very same files as path2train / path2dev, so the
# grouped corpus presumably replaces the ungrouped one on disk - confirm this is intended.
path2fused_train, path2fused_dev = '../data/train.conll', '../data/dev.conll'

In [11]:
# Reduce the number of classes: every punctuation tag goes to PUNCT, and each
# '+'-suffixed tag is grouped under its base tag.
punct = [',', '.', ';', ':', ')', '(', '``', "''"]
grouping = {
    'PUNCT': punct,
    **{base_tag: [base_tag + '+', base_tag] for base_tag in ('CC', 'II', 'RR', 'JJ', 'CS', 'NN')},
}
fused_train = group_conll(path2train, path2fused_train, grouping)
fused_dev = group_conll(path2dev, path2fused_dev, grouping)

['fundamental\tB-JJ', 'services\tB-NNS', 'that\tB-PNR']
['fundamental\tB-JJ\n', 'services\tB-NNS\n', 'that\tB-PNR\n']
['of\tB-II', 'human\tB-NN', 'renal\tB-JJ']
['of\tB-II\n', 'human\tB-NN\n', 'renal\tB-JJ\n']


## Label exploration

In [12]:
# explicit labels extracted from the medpost paper
# The label descriptions contain non-ASCII characters (e.g. a typographic apostrophe),
# so decode the file explicitly as UTF-8 instead of relying on the platform default encoding.
with open('../data/medpost_pos.json', encoding='utf-8') as f:
    pos2label = json.load(f)

In [15]:
# Tag counts of the train split, followed by an empty line, then those of the dev split.
train_tag_counts = describe_entities(path2train)
print(train_tag_counts, '\n')
dev_tag_counts = describe_entities(path2dev)
print(dev_tag_counts)
#pos2label

Corpus containing 154552 tokens in 5702 sentences with 154552 non-O tags
Counter({'NN': 39903, 'II': 19042, 'PUNCT': 18649, 'JJ': 12247, 'NNS': 11272, 'DD': 11076, 'CC': 6171, 'MC': 4476, 'VVN': 4092, 'RR': 3927, 'VVNJ': 2122, 'VBD': 2067, 'VVD': 1698, 'VVB': 1656, 'SYM': 1572, 'TO': 1042, 'VBZ': 1035, 'CST': 1025, 'VVGN': 1013, 'VVZ': 970, 'VVI': 892, 'PN': 877, 'VVG': 869, 'VBB': 839, 'VVGJ': 818, 'PNR': 762, 'VM': 670, 'CS': 652, 'PNG': 424, 'PND': 388, 'VHB': 317, 'JJR': 260, 'CSN': 213, 'VBN': 202, 'NNP': 191, 'VHZ': 189, 'VBI': 171, 'VHD': 156, 'EX': 126, 'GE': 106, 'VDD': 105, 'RRT': 81, 'JJT': 42, 'VDZ': 41, 'VDB': 30, 'DB': 28, 'VHI': 18, 'VBG': 14, 'RRR': 8, 'VHG': 4, 'VDN': 4}) 

Corpus containing 27767 tokens in 999 sentences with 27767 non-O tags
Counter({'NN': 7339, 'II': 3453, 'PUNCT': 3148, 'DD': 2178, 'JJ': 2142, 'NNS': 1948, 'CC': 1029, 'MC': 832, 'VVN': 827, 'RR': 692, 'VBD': 405, 'VVNJ': 364, 'VVB': 309, 'VVD': 294, 'SYM': 249, 'CST': 188, 'VBZ': 179, 'TO': 167, 'VV

In [14]:
# More human readable but heavier: replace each tag by its description from
# pos2label when one exists, and keep the raw tag otherwise.
train_ents = describe_entities(path2train)
readable_labels = {pos2label.get(tag, tag): count for tag, count in train_ents.items()}
readable_labels

Corpus containing 154552 tokens in 5702 sentences with 154552 non-O tags


{'noun': 39903,
 'PUNCT': 18649,
 'adjective': 12247,
 'coordinating conjunction': 6171,
 'base be, am, are': 839,
 'plural noun': 11272,
 'relative pronoun': 762,
 'base form lexical verb': 1656,
 'genitive marker ’s': 106,
 'infinitive marker to': 1042,
 'infinitive lexical verb': 892,
 'preposition': 19042,
 'genitive pronoun': 424,
 'number or numeric': 4476,
 'past was, were': 2067,
 'past part.': 4092,
 'subordinating conjunction': 652,
 'adverb': 3927,
 'prenominal past part.': 2122,
 'present part.': 869,
 'complementizer (that)': 1025,
 '3rd pers. sing. is': 1035,
 'determiner': 11076,
 'prenominal present part.': 818,
 'comparative conjunction (than)': 213,
 'base have': 317,
 'nominal gerund': 1013,
 'past tense': 1698,
 'comparative adjective': 260,
 'existential there': 126,
 '3rd pers. sing. has': 189,
 'participle been': 202,
 '3rd pers. sing.': 970,
 'symbol': 1572,
 'pronoun': 877,
 'modal': 670,
 'superlative adverb': 81,
 'infinitive be': 171,
 'proper noun': 191,
 '

In [7]:
# Sizes copied from the corpus reports printed above: (train, dev).
n_train_tokens, n_dev_tokens = 154552, 27767
n_train_sents, n_dev_sents = 5702, 999

# Every token carries a tag, so both totals use the same token counts.
tot_tagged = n_dev_tokens + n_train_tokens
nb_tokens = n_dev_tokens + n_train_tokens
nb_sents = n_train_sents + n_dev_sents
print('nb_tags= {}, prop= {}'.format(tot_tagged, tot_tagged/nb_tokens))
print('nb_tokens= {}, nb_sent {}'.format(nb_tokens, nb_sents))

nb_tags= 182319, prop= 1.0
nb_tokens= 182319, nb_sent 6701


In [18]:
# Number of distinct classes in the fused train corpus.
fused_train_classes = describe_entities(path2fused_train, iob=False)
print(len(fused_train_classes))

Corpus containing 154552 tokens in 5702 sentences
51


In [19]:
# Number of distinct classes in the fused dev corpus.
fused_dev_classes = describe_entities(path2fused_dev, iob=False)
print(len(fused_dev_classes))

Corpus containing 27767 tokens in 999 sentences
50
