In [1]:
import sys
import re 
import os
import random
random.seed(12)

import numpy as np
import spacy
sys.path.insert(1, '../../utils/')

from conllUtils import write_trainfiles, describe_entities, group_conll, describe_entities

from collections import Counter
from tqdm import tqdm
%reload_ext autoreload
%autoreload 2

# Training and dev corpus construction for conll2003 corpus

We build train, dev and test sets from the [CoNLL-2003 corpus](https://www.clips.uantwerpen.be/conll2003/ner/). This corpus contains 592 documents, which we convert to CoNLL files (the input format expected by the yaset tool). We obtained the data from [Lample's tagger repository](https://github.com/glample/tagger/tree/master/dataset) (see [Lample et al., 2016](https://arxiv.org/abs/1603.01360)).

In [3]:
# Input splits obtained from Lample's repository (train / dev / test).
(path2lample_train,
 path2lample_dev,
 path2lample_test) = (
    '../data/lample_conll2003_datasets/eng.train',
    '../data/lample_conll2003_datasets/eng.testa',
    '../data/lample_conll2003_datasets/eng.testb',
)

# Destinations of the converted files written by this notebook.
(path2train,
 path2dev,
 path2test) = (
    '../data/train.conll',
    '../data/dev.conll',
    '../data/test.conll',
)

# Conversion

In [3]:
### Replace the spaces between columns with tabs to be compliant with the yaset format
def _spaces_to_tabs(src_path, dst_path):
    """Copy ``src_path`` to ``dst_path`` with every space replaced by a tab.

    Parameters
    ----------
    src_path : str
        Path of the file to read.
    dst_path : str
        Path of the file to write (overwritten if it already exists).

    Returns
    -------
    list of str
        The converted lines, each ending with a newline, exactly as written
        to ``dst_path``.
    """
    with open(src_path, 'r') as f:
        lines = f.read().splitlines()
    # A plain str.replace is enough: the pattern is a single literal space.
    converted = [line.replace(' ', '\t') + '\n' for line in lines]
    with open(dst_path, 'w') as f:
        f.writelines(converted)
    return converted


# The converted lines are kept in memory under the same names as before.
train_data = _spaces_to_tabs(path2lample_train, path2train)
dev_data = _spaces_to_tabs(path2lample_dev, path2dev)
test_data = _spaces_to_tabs(path2lample_test, path2test)

## From IOB to BIO

In [4]:
# Each entity type is mapped onto itself only, i.e. no two labels share a group.
grouping = {label: [label] for label in ('PER', 'ORG', 'LOC', 'MISC')}

# NOTE(review): per the section title and the sample output, group_conll
# presumably rewrites the IOB tags as BIO into the target file -- confirm in conllUtils.
for _src, _dst in (
    (path2train, '../data/bio_train.conll'),
    (path2dev, '../data/bio_dev.conll'),
    (path2test, '../data/bio_test.conll'),
):
    _ = group_conll(_src, _dst, grouping)

['Peter\tNNP\tI-NP\tI-PER', 'Blackburn\tNNP\tI-NP\tI-PER', '']
['Peter\tNNP\tI-NP\tB-PER\n', 'Blackburn\tNNP\tI-NP\tI-PER\n', '\n']
['.\t.\tO\tO', '', 'LONDON\tNNP\tI-NP\tI-LOC']
['.\t.\tO\tO\n', '\n', 'LONDON\tNNP\tI-NP\tB-LOC\n']
['DEFEAT\tNN\tI-NP\tO', '.\t.\tO\tO', '']
['DEFEAT\tNN\tI-NP\tO\n', '.\t.\tO\tO\n', '\n']


## Brief statistics

In [4]:
# Print what describe_entities returns for each converted split,
# in train / dev / test order.
for _split_path in (path2train, path2dev, path2test):
    print(describe_entities(_split_path, verbose = True))

Corpus containing 204567 tokens in 14986 sentences with 34043 non-O tags
Counter({'O': 170524, 'PER': 11128, 'ORG': 10025, 'LOC': 8297, 'MISC': 4593})
Corpus containing 51578 tokens in 3465 sentences with 8603 non-O tags
Counter({'O': 42975, 'PER': 3149, 'LOC': 2094, 'ORG': 2092, 'MISC': 1268})
Corpus containing 46666 tokens in 3683 sentences with 8112 non-O tags
Counter({'O': 38554, 'PER': 2773, 'ORG': 2496, 'LOC': 1925, 'MISC': 918})


In [8]:
# Non-O tag counts reported by the statistics output for train (34043) and dev (8603).
tot_tagged = sum((34043, 8603))
# NOTE(review): 256000 looks like a rounded train+dev token total
# (204567 + 51578 = 256145 in the statistics output) -- confirm it is intentional.
tagged_share = tot_tagged/256000
print(tagged_share)
print(tot_tagged)

0.1665859375
42646
