In [1]:
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
def read_sentences_labels(sentence_file_path, label_file_path, encoding='utf-8'):
    """Read a sentence file and its line-aligned label file.

    Every line of both files is stripped and split on single spaces. Line i
    of the sentence file must yield as many items as line i of the label file.

    Args:
        sentence_file_path: path of the text file with one sentence per line.
        label_file_path: path of the text file with one label sequence per line.
        encoding: text encoding used for both files. Defaults to 'utf-8'
            (assumed, not verified against the data files) so the result
            does not depend on the platform's locale.

    Returns:
        A tuple ``(sentences, labels)`` of two equally long lists; each
        element is the list of strings found on the corresponding line.

    Raises:
        AssertionError: if the files have a different number of lines, or a
            pair of lines has a different number of items.
    """
    with open(sentence_file_path, 'r', encoding=encoding) as sf, \
            open(label_file_path, 'r', encoding=encoding) as lf:
        sentence_lines = sf.readlines()
        label_lines = lf.readlines()

    assert len(sentence_lines) == len(label_lines), (
        'line count mismatch: {} sentence lines vs {} label lines'.format(
            len(sentence_lines), len(label_lines)))

    sentences = []
    labels = []
    for line_no, (sentence, label) in enumerate(zip(sentence_lines, label_lines), start=1):
        # Split on a single space (not arbitrary whitespace) so that tokens
        # and labels stay aligned exactly as they are written in the files.
        sentence_split = sentence.strip().split(' ')
        label_split = label.strip().split(' ')
        assert len(sentence_split) == len(label_split), (
            'line {}: {} tokens vs {} labels'.format(
                line_no, len(sentence_split), len(label_split)))
        sentences.append(sentence_split)
        labels.append(label_split)

    return sentences, labels

In [3]:
# Put the directory four levels up on the import path. The membership check
# keeps the cell idempotent: re-running it does not add duplicate entries.
if '../../../../' not in sys.path:
    sys.path.append('../../../../')

In [4]:
# Directory containing the 'train', 'val' and 'test' folders that are read below.
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): presumably CoNLL-2003 with IOBES tags, judging by the folder name - confirm.
data_dir = '../../../../src/ner/data/conll03_iobes_id/'

In [5]:
# One directory per split, each located directly under data_dir.
train, val, test = (
    os.path.join(data_dir, split_name) for split_name in ('train', 'val', 'test')
)

In [6]:
# Split directories, in the order in which they are loaded further down.
splits = [train, val, test]

In [7]:
# Display the resolved split directories.
splits

['../../../../src/ner/data/conll03_iobes_id/train',
 '../../../../src/ner/data/conll03_iobes_id/val',
 '../../../../src/ner/data/conll03_iobes_id/test']

In [8]:
# Load every split into data[<split name>], where the name is the last
# component of the split directory ('train', 'val', 'test').
data = {}
for split in splits:
    split_name = split.rsplit('/', 1)[-1]
    sentences, labels = read_sentences_labels(
        os.path.join(split, 'sentences.txt'),
        os.path.join(split, 'labels.txt'),
    )
    data[split_name] = {
        'sentences': sentences,
        'labels': labels,
        'size': len(sentences),
    }

In [9]:
# Concatenate the sentences and the label sequences of all splits, in the
# order in which the splits were inserted into `data`.
all_data = {
    'sentences': [sentence for part in data.values() for sentence in part['sentences']],
    'labels': [label_seq for part in data.values() for label_seq in part['labels']],
}

In [10]:
# Distinct entity types: gather every tag that occurs in the dataset, then
# keep only the part after the last '-' of each tag.
unique_entities_iobes = list({tag for label_seq in all_data['labels'] for tag in label_seq})
unique_entities = {tag.split('-')[-1] for tag in unique_entities_iobes}
unique_entities

{'LOC', 'MISC', 'O', 'ORG', 'PER'}

In [11]:
# Distribution over the whole dataset: how often each entity type occurs,
# where the entity type is the part of a tag after its last '-'.
# (Counter is imported in the import cell at the top of the notebook.)
all_entities_iobes = [tag for label_seq in all_data['labels'] for tag in label_seq]
all_entities = [tag.split('-')[-1] for tag in all_entities_iobes]
counter = Counter(all_entities)
counter

Counter({'LOC': 12316, 'MISC': 6779, 'O': 250660, 'ORG': 14613, 'PER': 17050})

In [12]:
# Size of the whole dataset: number of sentences summed over all splits.
num_sentences = sum(part['size'] for part in data.values())
num_sentences

20744

In [13]:
# Number of distinct token strings over all splits.
all_tokens = []
for sentence in all_data['sentences']:
    all_tokens.extend(sentence)
unique_tokens = set(all_tokens)
len(unique_tokens)

30289

In [14]:
# Total number of tokens (repeats included) over all splits.
all_tokens = [token for tokens in all_data['sentences'] for token in tokens]
len(all_tokens)

301418

In [15]:
# Total number of annotations, i.e. tags other than 'O'.
# The tag is compared by value (`!=`). An identity test (`is not`) against a
# string literal only works when the interpreter happens to reuse the same
# string object, and it emits a SyntaxWarning on Python 3.8+.
all_annotations = [
    tag
    for label_seq in all_data['labels']
    for tag in label_seq
    if tag != 'O'
]
len(all_annotations)

50758

In [16]:
# Number of distinct token strings that carry a tag other than 'O'.
unique_annotations = []
for sentence, label in zip(all_data['sentences'], all_data['labels']):
    # Tokens and tags of one sentence are position-aligned, so pair them up.
    unique_annotations.extend(token for token, tag in zip(sentence, label) if tag != 'O')
len(set(unique_annotations))

10943

In [17]:
# Train-Val-Test split: number of sentences in each split.
for split, part in data.items():
    print('{}:{}'.format(split, part['size']))

train:14041
val:3250
test:3453


In [18]:
# Train-Val-Test split: number of distinct token strings in each split.
for split, part in data.items():
    vocabulary = {token for sentence in part['sentences'] for token in sentence}
    print('{}:{}'.format(split, len(vocabulary)))

train:23623
val:9966
test:9488


In [78]:
# Train-Val-Test split: entity-type distribution of each split, stored under
# data[split]['entity_distribution'].
# (Counter is imported in the import cell at the top of the notebook.)
# NOTE: this loop rebinds `counter`, which an earlier cell uses for the
# whole-dataset distribution; afterwards it holds the counts of the last split.
for split in data:
    split_entities_iobes = [tag for label_seq in data[split]['labels'] for tag in label_seq]
    # Entity type = part of the tag after its last '-'.
    split_entities = [tag.split('-')[-1] for tag in split_entities_iobes]
    counter = Counter(split_entities)
    data[split]['entity_distribution'] = counter