Instructions to run:

1. Specify the data folder (`data_folder`). The notebook currently assumes that it contains `train`, `val`, and `test` subfolders.
2. Specify the word embeddings to use. The notebook currently assumes that they are stored as a `.txt` file in the `src/ner/embeddings` folder.

In [12]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import sys
# Put the repository root on sys.path so the `src.*` imports in the next cell resolve.
# NOTE(review): the relative path assumes the kernel's working directory sits
# three levels below the repository root - confirm before running elsewhere.
root_path = '../../../'
sys.path.append(root_path)

In [14]:
import argparse
import json
import os
from collections import Counter, OrderedDict

from src.booster.progressive_encoder import WordEncoder, CharEncoder, EntityEncoder
from src.ner import utils
from src.ner.data import SLIterator

In [15]:
def save_vocab_to_txt_file(vocab, txt_path):
    """Write a vocabulary to a text file, one token per line.

    The 0-based line number of a token in the file is the id of that token.

    Args:
        vocab: (iterable object) yields tokens (strings) in id order
        txt_path: (string) path to the vocab file; overwritten if it exists
    """
    # Explicit UTF-8: the platform default encoding (e.g. cp1252) cannot
    # represent every token and would make the file machine-dependent.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(token + '\n' for token in vocab)
            

def save_dict_to_json(d, json_path):
    """Dump a dictionary to a JSON file, pretty-printed with a 4-space indent.

    Args:
        d: (dict) mapping to serialize
        json_path: (string) destination file; overwritten if it exists
    """
    with open(json_path, 'w') as out_file:
        # Shallow-copy the items into a plain dict before serializing.
        plain = dict(d.items())
        json.dump(plain, out_file, indent=4)


def update_vocab(txt_path, vocab):
    """Update word and tag vocabulary from dataset

    Each line is stripped and split on single spaces; the resulting tokens
    are passed to ``vocab.update``.

    Args:
        txt_path: (string) path to file, one sentence per line
        vocab: (dict or Counter) with update method

    Returns:
        dataset_size: (int) number of lines in the file (0 for an empty file)
    """
    # Initialised up front so an empty file returns 0 instead of raising
    # UnboundLocalError on a loop variable that was never assigned.
    dataset_size = 0
    with open(txt_path) as f:
        # start=1 makes the loop variable the running line count.
        for dataset_size, line in enumerate(f, start=1):
            vocab.update(line.strip().split(' '))

    return dataset_size


def get_stats(data_iterator, words, tags):
    """Accumulate token and label counts for one dataset split.

    Args:
        data_iterator: object whose get_next_X() yields sentences and whose
            get_next_Y() yields the matching label sequences
        words: (Counter or similar) updated in place with every sentence
        tags: (Counter or similar) updated in place with every label sequence

    Returns:
        (int) number of non-empty sentences seen
    """
    sentence_total = 0
    label_total = 0
    pairs = zip(data_iterator.get_next_X(), data_iterator.get_next_Y())
    for tokens, token_tags in pairs:
        # Every token must carry exactly one label.
        assert len(tokens) == len(token_tags)
        words.update(tokens)
        tags.update(token_tags)
        # Empty sentences / label sequences are not counted.
        sentence_total += 1 if tokens else 0
        label_total += 1 if token_tags else 0
    assert sentence_total == label_total
    return sentence_total

In [16]:
# Configuration: specify the data directory, containing train, val, and test folders.
# NOTE(review): min_word_count and min_count_tag are not read by any of the
# cells visible in this notebook - confirm whether they are still needed.

min_word_count = 1
min_count_tag=1
data_folder = os.path.join(root_path, 'src/ner/data/bc5cdr_iobes_id')

In [17]:
# Load the datasets: one SLIterator per split, each pointed at the
# corresponding sub-folder of data_folder.

print('loading train')
train_data_iterator = SLIterator(os.path.join(data_folder, 'train'))
print('loading val')
val_data_iterator = SLIterator(os.path.join(data_folder, 'val'))
print('loading test')
test_data_iterator = SLIterator(os.path.join(data_folder, 'test'))
# All three splits together; later cells pass this list to the encoders' create_map.
data_iterators = [train_data_iterator, val_data_iterator, test_data_iterator]

loading train
loading val
loading test


In [18]:
# Get the stats: per-split sentence counts plus word/tag frequency counters.
# The same `words` and `tags` counters are passed to all three calls, so they
# accumulate over train, val, and test combined.

words = Counter()
tags = Counter()
print('getting stats')
num_sentences_train = get_stats(train_data_iterator, words, tags)
num_sentences_val = get_stats(val_data_iterator, words, tags)
num_sentences_test = get_stats(test_data_iterator, words, tags)

getting stats


In [19]:
# For every feature type that needs a vocabulary, instantiate its encoder.
# Encoders are keyed by their FEATURE_NAME; input-feature encoders and label
# encoders are kept in separate ordered dicts.
# (OrderedDict is imported in the import cell at the top of the notebook.)
print('encoding')
data_encoders = OrderedDict()
label_encoders = OrderedDict()
data_encoders[WordEncoder.FEATURE_NAME] = WordEncoder(
    os.path.join(root_path, 'src/ner/embeddings/glove.6B.100d.txt'),
    dim=100)
data_encoders[CharEncoder.FEATURE_NAME] = CharEncoder()
label_encoders[EntityEncoder.FEATURE_NAME] = EntityEncoder()

encoding


In [20]:
# encode the dataset

print('creating and saving maps..')
feats_folder = os.path.join(data_folder, 'feats')
# exist_ok=True replaces the check-then-create pattern and keeps the cell
# safe to re-run.
os.makedirs(feats_folder, exist_ok=True)

# Create vocabs with iterators and encoders: first every input-feature
# encoder, then every label encoder, saving each encoder's maps in the
# feats folder.
for encoder_group in (data_encoders, label_encoders):
    for feature_name, encoder in encoder_group.items():
        print('creating map: {}'.format(feature_name))
        encoder.create_map(data_iterators)
        # Named `feature_map` rather than `map` so the builtin `map` is not
        # shadowed in the kernel namespace for every later cell.
        for map_name, feature_map in encoder.maps.items():
            utils.save_map(feature_map, map_name, feats_folder)


creating and saving maps..
creating map: WORD
creating map: CHAR
Char distribution:  Counter({'e': 188548, 'i': 135258, 't': 132076, 'a': 130865, 'n': 119724, 'o': 113597, 'r': 100813, 's': 95445, 'd': 67059, 'c': 63740, 'l': 61000, 'h': 52613, 'p': 41964, 'm': 41760, 'u': 40683, 'f': 33563, 'y': 27208, 'g': 26253, 'w': 18493, '.': 18226, 'v': 18100, 'b': 17712, ',': 12223, '-': 10548, 'T': 6583, '0': 6422, ')': 6068, '(': 6033, '1': 5812, 'x': 5671, 'A': 5281, 'k': 4908, 'C': 4472, '2': 4397, 'S': 4390, 'I': 3863, 'P': 3654, '5': 3498, 'D': 3059, 'N': 3034, '3': 2895, 'E': 2868, '/': 2866, 'z': 2737, 'O': 2706, 'R': 2500, '4': 2499, 'M': 2318, 'H': 2212, 'L': 2087, ':': 1939, '6': 1871, 'B': 1868, '%': 1709, '7': 1572, '8': 1564, '9': 1522, 'U': 1400, 'F': 1381, 'G': 1363, 'j': 1282, 'q': 1061, 'W': 995, 'V': 922, '+': 890, '=': 683, ';': 662, 'K': 591, "'": 436, '<': 370, 'J': 259, 'X': 254, 'Y': 191, ']': 188, '[': 187, 'Z': 185, 'Q': 165, '"': 147, '>': 131, '?': 30, '_': 4, '~': 2

In [21]:
# Save vocabularies to file
# `words` and `tags` are the counters filled by get_stats; iterating them
# yields their keys, so each file gets one token / tag per line.
print("Saving vocabularies to file...")
save_vocab_to_txt_file(words, os.path.join(data_folder, 'words.txt'))
save_vocab_to_txt_file(tags, os.path.join(data_folder, 'tags.txt'))
print("- done.")

# Save datasets properties in json file
print('saving dataset stats')
# Collect the PAD / UNK special tokens of every encoder, keyed as
# 'pad_<feature name>' / 'unk_<feature name>'.
encoder_params = dict()
encoders = [data_encoders, label_encoders]
# Note: `encoder` here is a whole encoder dict (data or label), and `enc` is
# the individual encoder object inside it.
for encoder in encoders:
    for encoder_name, enc in encoder.items():
        encoder_params['pad_'+encoder_name] = enc.PAD
        encoder_params['unk_'+encoder_name] = enc.UNK

# Despite its name, `sizes` holds more than sizes: split sizes, vocabulary and
# tag counts, the special tokens, and the iterator class name used per split.
sizes = {
    'train_size': num_sentences_train,
    'val_size': num_sentences_val,
    'test_size': num_sentences_test,
    'vocab_size': len(words),
    'number_of_tags': len(tags),
    'special_tokens': encoder_params,
    'data_iterators': {'train': train_data_iterator.__class__.__name__,
                       'val': val_data_iterator.__class__.__name__,
                       'test': test_data_iterator.__class__.__name__}
}
save_dict_to_json(sizes, os.path.join(data_folder, 'dataset_params.json'))

# Logging sizes
to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items())
print("Characteristics of the dataset:\n{}".format(to_print))

Saving vocabularies to file...
- done.
saving dataset stats
Characteristics of the dataset:
- train_size: 4559
- val_size: 4580
- test_size: 4796
- vocab_size: 17379
- number_of_tags: 9
- special_tokens: {'pad_WORD': '__PAD__', 'unk_WORD': '__UNK__', 'pad_CHAR': '__PAD__', 'unk_CHAR': '__UNK__', 'pad_ENTITY': '__PAD__', 'unk_ENTITY': None}
- data_iterators: {'train': 'SLIterator', 'val': 'SLIterator', 'test': 'SLIterator'}
