In [1]:
# Load IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell execution.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [12]:
import sys

# Relative path (from the notebook's working directory) to the directory that
# contains the `src` package imported by the following cells.
root_path = '../../../'
# Guarded append: re-running this cell must not stack duplicate entries on sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

In [13]:
"""Build vocabularies of words and tags from datasets"""
import argparse
from collections import Counter
import json
import os
from src.ner.encoder import WordEncoder, CharEncoder, ClassEncoder
from src.ner.data import SLIterator
from src.ner import utils

In [14]:
def save_vocab_to_txt_file(vocab, txt_path):
    """Writes one token per line, 0-based line id corresponds to the id of the token.

    The file is always written as UTF-8 so that non-ASCII tokens do not depend
    on (or fail under) the platform's default locale encoding.

    Args:
        vocab: (iterable object) yields token (each token must be a string)
        txt_path: (string) path to vocab file; created or overwritten
    """
    with open(txt_path, "w", encoding="utf-8") as f:
        for token in vocab:
            f.write(token + '\n')
            

def save_dict_to_json(d, json_path):
    """Dump a mapping to a JSON file, pretty-printed with a 4-space indent.

    Args:
        d: (dict) mapping to serialize; a shallow copy is what gets written
        json_path: (string) destination path; created or overwritten
    """
    with open(json_path, 'w') as out_file:
        # Shallow snapshot of the mapping, taken after the file is opened.
        snapshot = dict(d.items())
        json.dump(snapshot, out_file, indent=4)


def update_vocab(txt_path, vocab):
    """Update word and tag vocabulary from dataset

    Args:
        txt_path: (string) path to file, one sentence per line
        vocab: (dict or Counter) with update method

    Returns:
        dataset_size: (int) number of elements (lines) in the dataset;
            0 when the file is empty
    """
    # Start at 0 so an empty file yields 0 instead of hitting an unbound loop variable.
    dataset_size = 0
    with open(txt_path) as f:
        for dataset_size, line in enumerate(f, start=1):
            # Tokens are separated by single spaces; a blank line therefore
            # contributes one empty-string token (unchanged from before).
            vocab.update(line.strip().split(' '))

    return dataset_size


def get_stats(data_iterator, words, tags):
    """Accumulate token and label counts for one dataset split.

    Sentences from `data_iterator.get_next_X()` and labels from
    `data_iterator.get_next_Y()` are walked in lockstep.

    Args:
        data_iterator: object providing get_next_X() and get_next_Y()
        words: (Counter or similar) updated in place with every sentence
        tags: (Counter or similar) updated in place with every label

    Returns:
        (int) number of non-empty sentences seen

    Raises:
        AssertionError: if a label does not have length 1, or if the count of
            non-empty sentences differs from the count of labels
    """
    sentence_total = 0
    label_total = 0
    paired = zip(data_iterator.get_next_X(), data_iterator.get_next_Y())
    for tokens, labels in paired:
        # Exactly one label per sentence is expected.
        assert len(labels) == 1
        words.update(tokens)
        tags.update(labels)
        sentence_total += 1 if tokens else 0
        label_total += 1 if labels else 0
    assert sentence_total == label_total
    return sentence_total

In [18]:
# Toggle for the k-fold layout: when True, the configuration cell below
# redirects data_folder to its 'split_1' subfolder.
k_fold = False

In [25]:
# Minimum-count thresholds for words and tags.
# NOTE(review): neither threshold is referenced in the cells visible here —
# confirm whether vocabulary filtering is still intended.
min_word_count = 1
min_count_tag = 1

# Dataset root, resolved relative to root_path (set in the sys.path cell).
data_folder = os.path.join(root_path, 'src/tc/data/sst_binary')
# feats_folder and json_folder are derived before the optional k-fold override,
# so they keep pointing at the dataset root even when data_folder is redirected.
feats_folder = os.path.join(data_folder, 'feats')
json_folder = data_folder
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(feats_folder, exist_ok=True)
if k_fold:
    # With k-fold enabled, read the splits from the 'split_1' subfolder.
    data_folder = os.path.join(data_folder, 'split_1')

In [26]:
def _open_split(split_dir, banner):
    """Print `banner`, then build the SLIterator for `split_dir` under data_folder."""
    print(banner)
    return SLIterator(os.path.join(data_folder, split_dir))


# One iterator per split, loaded in train / val / test order.
train_data_iterator = _open_split('train', 'loading train')
val_data_iterator = _open_split('val', 'loading val')
test_data_iterator = _open_split('test', 'loading test')
data_iterators = [train_data_iterator, val_data_iterator, test_data_iterator]

loading train
loading val
loading test


In [27]:
print('getting stats')
# A single pair of counters is shared by all three splits, so the word and tag
# tallies cover train, val and test together.
words, tags = Counter(), Counter()
num_sentences_train = get_stats(data_iterator=train_data_iterator, words=words, tags=tags)
num_sentences_val = get_stats(data_iterator=val_data_iterator, words=words, tags=tags)
num_sentences_test = get_stats(data_iterator=test_data_iterator, words=words, tags=tags)

getting stats


In [32]:
# for all types of features for which vocab has to be created, load encoders
print('encoding')
from collections import OrderedDict
# Ordered so that maps are created and saved in the order encoders are registered.
data_encoders = OrderedDict()
label_encoders = OrderedDict()
data_encoders[WordEncoder.FEATURE_NAME] = WordEncoder(os.path.join(root_path, 'src/tc/embeddings/glove.6B.300d.txt'),
                                                      dim=300,
                                                      type='glove')
data_encoders[CharEncoder.FEATURE_NAME] = CharEncoder()
label_encoders[ClassEncoder.FEATURE_NAME] = ClassEncoder()

print('creating and saving maps..')
# create vocabs with iterators and encoders: input-feature encoders first, then label encoders.
for encoder_group in (data_encoders, label_encoders):
    for feature_name, encoder in encoder_group.items():
        print('creating map: {}'.format(feature_name))
        encoder.create_map(data_iterators)
        # save maps in the feats folder.
        print('saving map: {}'.format(feature_name))
        # Named `feature_map`, not `map`: a notebook-global `map` would shadow
        # the builtin for every cell executed afterwards.
        for map_name, feature_map in encoder.maps.items():
            utils.save_map(feature_map, map_name, feats_folder)

encoding
creating and saving maps..
creating map: WORD
saving map: WORD
creating map: CHAR
Char distribution:  Counter({'e': 465215, 't': 366247, 'a': 336081, 'i': 320054, 'o': 292440, 'n': 284376, 's': 281161, 'r': 244672, 'l': 198539, 'h': 173988, 'd': 140714, 'c': 130265, 'u': 119907, 'm': 118677, 'f': 100059, 'g': 94585, 'y': 86157, 'p': 78367, 'b': 65565, 'w': 59734, 'v': 49595, ',': 37314, 'k': 33378, '-': 30484, '.': 30147, "'": 22831, 'x': 7893, 'j': 7403, 'q': 4462, 'z': 4343, '`': 3779, '0': 1381, '1': 1160, '9': 775, '2': 707, ':': 665, 'Ã': 612, '\\': 606, '©': 487, ';': 434, '/': 422, '8': 395, '5': 376, '?': 337, '3': 302, '!': 216, '4': 213, '7': 211, '*': 184, '6': 182, '&': 95, '$': 58, '¯': 20, '±': 20, '\xa0': 17, '#': 17, 'Â': 16, '¨': 13, '¦': 13, '¢': 12, '+': 11, '³': 10, '¼': 9, '\xad': 8, '¡': 8, '=': 7, '£': 3, '´': 3, '§': 3, '__PAD__': 1, '__UNK__': 1, '¶': 1, '%': 1, '»': 1})
saving map: CHAR
creating map: CLASS
Class Distribution:  Counter({'1': 47222, '0'

In [33]:
# Save vocabularies to file: one text file per counter, next to the dataset JSON.
print("Saving vocabularies to file...")
for vocab, file_name in ((words, 'words.txt'), (tags, 'tags.txt')):
    save_vocab_to_txt_file(vocab, os.path.join(json_folder, file_name))
print("- done.")

# Save datasets properties in json file
print('saving dataset stats')
encoders = [data_encoders, label_encoders]
encoder_params = dict()
# Record every encoder's PAD / UNK symbols under 'pad_<name>' and 'unk_<name>'.
for encoder in encoders:
    for encoder_name, enc in encoder.items():
        encoder_params.update({'pad_'+encoder_name: enc.PAD,
                               'unk_'+encoder_name: enc.UNK})

# Assemble the summary key by key; insertion order fixes the order in the JSON and the log.
sizes = dict()
sizes['train_size'] = num_sentences_train
sizes['dev_size'] = num_sentences_val
sizes['test_size'] = num_sentences_test
sizes['vocab_size'] = len(words)
sizes['number_of_tags'] = len(tags)
sizes['special_tokens'] = encoder_params
sizes['data_iterators'] = {
    split: iterator.__class__.__name__
    for split, iterator in (('train', train_data_iterator),
                            ('val', val_data_iterator),
                            ('test', test_data_iterator))
}
save_dict_to_json(sizes, os.path.join(json_folder, 'dataset_params.json'))

# Logging sizes
summary_lines = ["- {}: {}".format(k, v) for k, v in sizes.items()]
to_print = "\n".join(summary_lines)
print("Characteristics of the dataset:\n{}".format(to_print))

Saving vocabularies to file...
- done.
saving dataset stats
Characteristics of the dataset:
- train_size: 83881
- dev_size: 872
- test_size: 1821
- vocab_size: 18844
- number_of_tags: 2
- special_tokens: {'pad_WORD': '__PAD__', 'unk_WORD': '__UNK__', 'pad_CHAR': '__PAD__', 'unk_CHAR': '__UNK__', 'pad_CLASS': None, 'unk_CLASS': None}
- data_iterators: {'train': 'SLIterator', 'val': 'SLIterator', 'test': 'SLIterator'}
