In [55]:
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd

In [56]:
def read_sentences_labels(sentence_file_path, label_file_path, encoding='utf-8'):
    """Read a sentence file and its line-aligned label file.

    Line i of the label file is taken as the label of line i of the
    sentence file. Each sentence line is stripped and split on single
    spaces, so consecutive spaces yield empty-string tokens.

    Args:
        sentence_file_path: Path to the text file with one sentence per line.
        label_file_path: Path to the text file with one label per line.
        encoding: Text encoding used to decode both files. Passed explicitly
            so the result does not depend on the platform's locale default.

    Returns:
        A tuple ``(sentences, labels)`` where ``sentences`` is a list of
        token lists and ``labels`` is a list of one-character strings, both
        of the same length.

    Raises:
        AssertionError: If the two files have a different number of lines,
            or if a stripped label is not exactly one character long.
    """
    with open(sentence_file_path, 'r', encoding=encoding) as sf, \
            open(label_file_path, 'r', encoding=encoding) as lf:
        sent = sf.readlines()
        lab = lf.readlines()

    assert len(sent) == len(lab), (
        'line count mismatch: {} sentences vs {} labels'.format(len(sent), len(lab))
    )

    sentences = []
    labels = []
    for line_no, (sentence, label) in enumerate(zip(sent, lab), start=1):
        label_split = label.strip()
        # Labels are expected to be a single character (e.g. '0' or '1').
        assert len(label_split) == 1, (
            'label on line {} is not a single character: {!r}'.format(line_no, label_split)
        )
        sentences.append(sentence.strip().split(' '))
        labels.append(label_split)

    return sentences, labels

In [57]:
# Add the directory four levels above the kernel's working directory to the
# import search path.
# NOTE(review): no import in the visible cells relies on this entry — confirm
# it is still needed.
sys.path.append('../../../../')

In [58]:
# Root directory of the dataset; the 'train', 'val' and 'test' sub-directories
# are joined onto it below. The path is relative, so it resolves against the
# kernel's working directory.
data_dir = '../../../../src/tc/data/kitchen_housewares/'

In [59]:
# Build the path of each split directory underneath data_dir.
train, val, test = (
    os.path.join(data_dir, split_dir_name)
    for split_dir_name in ('train', 'val', 'test')
)

In [60]:
# Gather the three split directories in one list so they can be iterated over.
splits = [train, val, test]

In [61]:
# Echo the split paths as the cell output.
splits

['../../../../src/tc/data/kitchen_housewares/train',
 '../../../../src/tc/data/kitchen_housewares/val',
 '../../../../src/tc/data/kitchen_housewares/test']

In [62]:
# Load every split into `data`, keyed by the split's directory name.
# Each entry holds the tokenised sentences, their labels and the sentence count.
data = {}
for split in splits:
    # Derive the key once, using os.path so that a trailing separator or a
    # platform-specific separator does not produce a wrong or empty key.
    split_name = os.path.basename(os.path.normpath(split))
    sentences, labels = read_sentences_labels(
        os.path.join(split, 'sentences.txt'),
        os.path.join(split, 'labels.txt'),
    )
    data[split_name] = {
        'sentences': sentences,
        'labels': labels,
        'size': len(sentences),
    }

In [63]:
# Concatenate the sentences and the labels of all splits, in split order.
all_data = {
    'sentences': [
        sentence
        for split_name in data
        for sentence in data[split_name]['sentences']
    ],
    'labels': [
        label
        for split_name in data
        for label in data[split_name]['labels']
    ],
}

In [64]:
# Distinct label values over the whole dataset.
# Each label is a string and is iterated character by character here; the
# part after the last '-' of each character is then kept.
unique_entities_iobes = list(
    {char for label in all_data['labels'] for char in label}
)
unique_entities = {tag.split('-')[-1] for tag in unique_entities_iobes}
unique_entities

{'0', '1'}

In [65]:
# Label distribution over the whole dataset.
# Each label string is iterated character by character, and the part after the
# last '-' of each character is what gets counted.
all_entities_iobes = [char for label in all_data['labels'] for char in label]
all_entities = [tag.split('-')[-1] for tag in all_entities_iobes]
counter = Counter(all_entities)
counter

Counter({'0': 1000, '1': 1000})

In [66]:
# Total number of sentences: the sum of the stored size of every split.
num_sentences = sum(split_info['size'] for split_info in data.values())
num_sentences

2000

In [67]:
# Vocabulary size: number of distinct tokens over all sentences.
all_tokens = [
    token
    for sentence in all_data['sentences']
    for token in sentence
]
unique_tokens = set(all_tokens)
len(unique_tokens)

10728

In [68]:
# Total token count: every token occurrence over all sentences.
all_tokens = [
    token
    for sentence in all_data['sentences']
    for token in sentence
]
len(all_tokens)

197188

In [69]:
# Sentence count per split, printed as one `name:count` line each.
for split, split_info in data.items():
    print('{}:{}'.format(split, split_info['size']))

train:1440
val:160
test:400


In [70]:
# Number of distinct tokens per split, printed as one `name:count` line each.
for split, split_info in data.items():
    split_vocab = {
        token
        for sentence in split_info['sentences']
        for token in sentence
    }
    print('{}:{}'.format(split, len(split_vocab)))

train:9120
val:2562
test:4556


In [None]:
# Label distribution per split, stored under data[split]['entity_distribution'].
# Each label string is iterated character by character, and the part after the
# last '-' of each character is what gets counted.
# Note: this rebinds the top-level `counter` on every iteration.
for split in data:
    split_entities_iobes = [char for label in data[split]['labels'] for char in label]
    split_entities = [tag.split('-')[-1] for tag in split_entities_iobes]
    counter = Counter(split_entities)
    data[split]['entity_distribution'] = counter