In [1]:
import re 
import os
import random
random.seed(12)
import spacy
import numpy as np

from collections import Counter
from tqdm import tqdm

import sys
sys.path.insert(1, '../../utils_paper/')
from conllUtils import write_trainfiles, describe_entities, group_conll, describe_entities

%reload_ext autoreload
%autoreload 2

# Training and dev corpus construction for conll2003 corpus

We build a train and dev set from the [conll2003 corpus](https://www.clips.uantwerpen.be/conll2003/ner/) in a format compliant with the neural tagger [YASET](http://yaset.readthedocs.io/en/stable/). This corpus contains 592 documents that we convert to conll files (a suitable input format for the YASET tool). We obtained it from [Lample](https://github.com/glample/tagger/tree/master/dataset) (see [Lample et al., 2016](https://arxiv.org/abs/1603.01360)).

In [2]:
# Source files: the original splits, whose columns are separated by spaces
iob_train, iob_dev, iob_test = (
    '../data/iob_data/eng.train',
    '../data/iob_data/eng.testa',
    '../data/iob_data/eng.testb',
)

# Destination files: the converted train / dev / test splits
path2train, path2dev, path2test = (
    '../data/train.conll',
    '../data/dev.conll',
    '../data/test.conll',
)

# Conversion

In [3]:
### Replace the spaces between columns with tabulations to be compliant with the YASET format
def _spaces_to_tabs(src_path, dst_path):
    """Rewrite a space-separated file as a tab-separated one.

    Reads `src_path`, replaces every space character with a tab on each
    line, and writes the result to `dst_path` with one newline after
    each line (empty lines are kept as empty lines).

    Parameters
    ----------
    src_path : str
        Path of the file to read.
    dst_path : str
        Path of the file to write (overwritten if it exists).

    Returns
    -------
    list of str
        The converted lines, each one terminated by a newline.
    """
    with open(src_path, 'r') as f:
        lines = f.read().splitlines()
    # A literal single-character substitution needs no regular expression.
    converted = [line.replace(' ', '\t') + '\n' for line in lines]
    with open(dst_path, 'w') as f:
        f.writelines(converted)
    return converted


# The converted lines stay available under the same names as before.
train_data = _spaces_to_tabs(iob_train, path2train)
dev_data = _spaces_to_tabs(iob_dev, path2dev)
test_data = _spaces_to_tabs(iob_test, path2test)

## From iob to bio

In [5]:
## The tagging scheme in the original dataset is not exactly BIO, so we have to convert it
# Each entity type is mapped to a group containing only itself.
grouping = {label: [label] for label in ('PER', 'ORG', 'LOC', 'MISC')}
for conll_path in (path2train, path2dev, path2test):
    # The same path is passed twice -- presumably input and output, i.e. an
    # in-place conversion (TODO confirm against conllUtils.group_conll).
    _ = group_conll(conll_path, conll_path, grouping)

['Peter\tNNP\tI-NP\tB-PER', 'Blackburn\tNNP\tI-NP\tI-PER', '']
['Peter\tNNP\tI-NP\tB-PER\n', 'Blackburn\tNNP\tI-NP\tI-PER\n', '\n']
['.\t.\tO\tO', '', 'LONDON\tNNP\tI-NP\tB-LOC']
['.\t.\tO\tO\n', '\n', 'LONDON\tNNP\tI-NP\tB-LOC\n']
['DEFEAT\tNN\tI-NP\tO', '.\t.\tO\tO', '']
['DEFEAT\tNN\tI-NP\tO\n', '.\t.\tO\tO\n', '\n']


## Brief statistics

In [7]:
# Print the entity statistics of each split, in train / dev / test order
for header, conll_path in (
    ('Train set: ', path2train),
    ('Dev set: ', path2dev),
    ('Test set: ', path2test),
):
    print(header)
    print(describe_entities(conll_path, verbose=True), '\n')

Train set: 
Corpus containing 204567 tokens in 14986 sentences with 34043 non-O tags
Counter({'O': 170524, 'PER': 11128, 'ORG': 10025, 'LOC': 8297, 'MISC': 4593}) 

Dev set: 
Corpus containing 51578 tokens in 3465 sentences with 8603 non-O tags
Counter({'O': 42975, 'PER': 3149, 'LOC': 2094, 'ORG': 2092, 'MISC': 1268}) 

Test set: 
Corpus containing 46666 tokens in 3683 sentences with 8112 non-O tags
Counter({'O': 38554, 'PER': 2773, 'ORG': 2496, 'LOC': 1925, 'MISC': 918}) 



In [4]:
# Pooled train + dev totals. The per-split figures are hard-coded here:
# (non-O tags, tokens, sentences)
train_counts = (34043, 204567, 14986)
dev_counts = (8603, 51578, 3465)

tot_tagged, nb_tokens, nb_sents = (
    train_n + dev_n for train_n, dev_n in zip(train_counts, dev_counts)
)
# Proportion of tokens carrying a non-O tag
tagged_ratio = tot_tagged / nb_tokens

print('nb_tags= {}, prop= {}'.format(tot_tagged, tagged_ratio))
print('nb_tokens= {}, nb_sent {}'.format(nb_tokens, nb_sents))

nb_tags= 42646, prop= 0.16649163559702512
nb_tokens= 256145, nb_sent 18451
