In [36]:
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd

In [37]:
def read_sentences_labels(sentence_file_path, label_file_path):
    """Read a sentence file and its parallel label file into two aligned lists.

    Line ``i`` of the sentence file is paired with line ``i`` of the label
    file. Each sentence line is stripped and split on single spaces; each
    label line is stripped and must be exactly one character long.

    Args:
        sentence_file_path: Path to a UTF-8 text file with one sentence per line.
        label_file_path: Path to a UTF-8 text file with one label per line.

    Returns:
        A tuple ``(sentences, labels)``: ``sentences`` is a list of token
        lists, ``labels`` is a list of one-character strings. Both are in
        file order and have the same length.

    Raises:
        AssertionError: If the two files have a different number of lines, or
            if a label is not exactly one character long after stripping.
    """
    # Decode both files as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(sentence_file_path, 'r', encoding='utf-8') as sf, \
            open(label_file_path, 'r', encoding='utf-8') as lf:
        sent = sf.readlines()
        lab = lf.readlines()

    assert len(sent) == len(lab), (
        'line count mismatch: {} sentences vs {} labels'.format(len(sent), len(lab))
    )

    sentences = []
    labels = []
    for line_no, (sentence, label) in enumerate(zip(sent, lab), start=1):
        label_split = label.strip()
        assert len(label_split) == 1, (
            'label on line {} is not a single character: {!r}'.format(line_no, label_split)
        )
        # split(' ') (not split()) is intentional: runs of spaces produce
        # empty-string tokens, exactly as before.
        sentences.append(sentence.strip().split(' '))
        labels.append(label_split)

    return sentences, labels

In [38]:
# Put the directory four levels above the working directory on the import path.
# The membership check keeps a re-run of this cell from appending the same
# entry to sys.path a second time.
if '../../../../' not in sys.path:
    sys.path.append('../../../../')

In [39]:
# Directory (relative to the working directory) that the train/val/test
# sub-directories are joined onto in the following cell.
data_dir = '../../../../src/tc/data/organic/'

In [40]:
# One sub-directory per split, each directly below data_dir.
train, val, test = (os.path.join(data_dir, name) for name in ('train', 'val', 'test'))

In [41]:
# Split directories in the order they are processed: train, val, test.
splits = [train, val, test]

In [42]:
# Display the resolved split directories as the cell output.
splits

['../../../../src/tc/data/organic/train',
 '../../../../src/tc/data/organic/val',
 '../../../../src/tc/data/organic/test']

In [43]:
# Load every split into ``data``, keyed by the last component of the split's
# directory path. Each entry holds the tokenised sentences, their labels and
# the number of sentences.
data = {}
for split in splits:
    # Derive the key once; basename is the path-aware form of split('/')[-1].
    split_name = os.path.basename(split)
    sentences, labels = read_sentences_labels(
        os.path.join(split, 'sentences.txt'),
        os.path.join(split, 'labels.txt'),
    )
    # The entry is created only after the files were read successfully, so a
    # failed read never leaves a half-filled split behind.
    data[split_name] = {
        'sentences': sentences,
        'labels': labels,
        'size': len(sentences),
    }

In [44]:
# Concatenate the per-split lists (in the iteration order of ``data``) so that
# statistics can be computed over the whole data set.
all_data = {
    'sentences': [
        sentence
        for split_name in data
        for sentence in data[split_name]['sentences']
    ],
    'labels': [
        label
        for split_name in data
        for label in data[split_name]['labels']
    ],
}

In [45]:
# Distinct label values over all splits.
# Each label is a string, so the inner loop walks over its characters.
unique_entities_iobes = list({char for label in all_data['labels'] for char in label})
# Keep the part after the last '-' (the whole value when it contains no '-').
unique_entities = {tag.split('-')[-1] for tag in unique_entities_iobes}
unique_entities

{'0', '1', '2'}

In [46]:
# Label frequencies over the whole data set (all splits combined).
# Counter is imported in the import cell at the top of the notebook.
all_entities_iobes = [char for label in all_data['labels'] for char in label]
# Keep the part after the last '-' (the whole value when it contains no '-').
all_entities = [tag.split('-')[-1] for tag in all_entities_iobes]
counter = Counter(all_entities)
counter

Counter({'0': 1394, '1': 2044, '2': 1554})

In [47]:
# Total number of sentences over all splits. sum() over the stored sizes
# replaces the manual accumulator and does not rebind the notebook-level
# ``split`` variable.
num_sentences = sum(split_data['size'] for split_data in data.values())
num_sentences

4992

In [48]:
# Number of distinct tokens over all splits.
all_tokens = [token for sentence in all_data['sentences'] for token in sentence]
unique_tokens = set(all_tokens)
len(unique_tokens)

9424

In [49]:
# Total number of tokens over all splits (duplicates included).
all_tokens = [token for sentence in all_data['sentences'] for token in sentence]
len(all_tokens)

113204

In [51]:
# Number of sentences in each split, printed as "<split>:<count>".
for split in data:
    print(split, data[split]['size'], sep=':')

train:4293
val:333
test:366


In [16]:
# Number of distinct tokens in each split, printed as "<split>:<count>".
for split in data:
    print(
        split,
        len({token for sentence in data[split]['sentences'] for token in sentence}),
        sep=':',
    )

train:7424
test:3340


In [33]:
# Per-split label frequencies, stored under data[split]['entity_distribution'].
# Counter is imported in the import cell at the top of the notebook, so the
# duplicate cell-level import is gone.
for split in data:
    split_entities_iobes = [char for label in data[split]['labels'] for char in label]
    # Keep the part after the last '-' (the whole value when it contains no '-').
    split_entities = [tag.split('-')[-1] for tag in split_entities_iobes]
    # NOTE: this rebinds ``counter``, replacing the whole-data-set Counter
    # computed in the earlier cell.
    counter = Counter(split_entities)
    data[split]['entity_distribution'] = counter