In [4]:
# importing libraries
import os

## Build Vocabulary

In [None]:
"""Build vocabularies of words and tags from datasets"""

import argparse
from collections import Counter
import json
import os


# Special tokens reserved in the vocabularies
PAD_WORD = '<pad>'   # padding token for sentences
PAD_TAG = 'O'        # tag used for padding positions
UNK_WORD = 'UNK'     # stand-in for words missing from the vocabulary

# Command-line options: frequency thresholds and dataset location
parser = argparse.ArgumentParser()
parser.add_argument(
    '--min_count_word',
    type=int,
    default=1,
    help="Minimum count for words in the dataset",
)
parser.add_argument(
    '--min_count_tag',
    type=int,
    default=1,
    help="Minimum count for tags in the dataset",
)
parser.add_argument(
    '--data_dir',
    default='data/small',
    help="Directory containing the dataset",
)


def save_vocab_to_txt_file(vocab, txt_path):
    """Dump a vocabulary to a text file, one token per line.

    The 0-based line number of a token in the file is the id of that token.

    Args:
        vocab: (iterable object) yields string tokens
        txt_path: (string) path to the vocab file to (over)write
    """
    with open(txt_path, "w") as out:
        # Each token is terminated by a newline, including the last one
        out.writelines(token + '\n' for token in vocab)
            

def save_dict_to_json(d, json_path):
    """Serialize a dictionary to a JSON file with 4-space indentation.

    Args:
        d: (dict) mapping to serialize; a shallow copy is written, `d` itself is not modified
        json_path: (string) path to the json file to (over)write
    """
    with open(json_path, 'w') as out:
        snapshot = dict(d.items())
        json.dump(snapshot, out, indent=4)


def update_vocab(txt_path, vocab):
    """Update a word or tag vocabulary with the tokens of a dataset file.

    Every line is stripped of surrounding whitespace and split on single
    spaces; the resulting tokens are passed to `vocab.update`. Note that a
    blank line therefore contributes one empty-string token.

    Args:
        txt_path: (string) path to file, one sentence per line
        vocab: (Counter) or any object whose `update` method accepts an
            iterable of tokens

    Returns:
        dataset_size: (int) number of lines in the file, 0 if the file is empty
    """
    # Initialised up front so an empty file yields 0 instead of failing on an
    # unbound loop variable.
    dataset_size = 0
    with open(txt_path) as f:
        for dataset_size, line in enumerate(f, start=1):
            vocab.update(line.strip().split(' '))

    return dataset_size


if __name__ == '__main__':
    # Script entry point: count words and tags over the train/val/test splits,
    # filter them by minimum frequency, add the special tokens, and write
    # words.txt, tags.txt and dataset_params.json into the data directory.
    # NOTE(review): inside a Jupyter kernel sys.argv presumably carries the
    # kernel's own arguments, which parse_args() would reject — confirm before
    # running this cell from the notebook rather than as a script.
    args = parser.parse_args()

    # Build word vocab with train, val and test datasets
    print("Building word vocabulary...")
    words = Counter()
    size_train_sentences = update_vocab(os.path.join(args.data_dir, 'train/sentences.txt'), words)
    size_dev_sentences = update_vocab(os.path.join(args.data_dir, 'val/sentences.txt'), words)
    size_test_sentences = update_vocab(os.path.join(args.data_dir, 'test/sentences.txt'), words)
    print("- done.")

    # Build tag vocab with train, val and test datasets
    print("Building tag vocabulary...")
    tags = Counter()
    size_train_tags = update_vocab(os.path.join(args.data_dir, 'train/labels.txt'), tags)
    size_dev_tags = update_vocab(os.path.join(args.data_dir, 'val/labels.txt'), tags)
    size_test_tags = update_vocab(os.path.join(args.data_dir, 'test/labels.txt'), tags)
    print("- done.")

    # Assert same number of examples in datasets
    assert size_train_sentences == size_train_tags
    assert size_dev_sentences == size_dev_tags
    assert size_test_sentences == size_test_tags

    # Only keep tokens that occur at least the requested number of times
    words = [tok for tok, count in words.items() if count >= args.min_count_word]
    tags = [tok for tok, count in tags.items() if count >= args.min_count_tag]

    # Add pad tokens
    if PAD_WORD not in words: words.append(PAD_WORD)
    if PAD_TAG not in tags: tags.append(PAD_TAG)

    # Add the token for unknown words. Guarded like the pad tokens so that a
    # corpus already containing the literal UNK_WORD does not end up with a
    # duplicate line in words.txt (and an inflated vocab_size).
    if UNK_WORD not in words: words.append(UNK_WORD)

    # Save vocabularies to file
    print("Saving vocabularies to file...")
    save_vocab_to_txt_file(words, os.path.join(args.data_dir, 'words.txt'))
    save_vocab_to_txt_file(tags, os.path.join(args.data_dir, 'tags.txt'))
    print("- done.")

    # Save datasets properties in json file
    sizes = {
        'train_size': size_train_sentences,
        'dev_size': size_dev_sentences,
        'test_size': size_test_sentences,
        'vocab_size': len(words),
        'number_of_tags': len(tags),
        'pad_word': PAD_WORD,
        'pad_tag': PAD_TAG,
        'unk_word': UNK_WORD
    }
    save_dict_to_json(sizes, os.path.join(args.data_dir, 'dataset_params.json'))

    # Logging sizes
    to_print = "\n".join("- {}: {}".format(k, v) for k, v in sizes.items())
    print("Characteristics of the dataset:\n{}".format(to_print))


In [6]:
# Build the word -> id lookup: the id of a word is its 0-based line number in words.txt
# NOTE(review): absolute, machine-specific path
words_path = "/Users/Prasann/Desktop/cs230-code-examples/pytorch/nlp/data/small/words.txt"
vocab = {}
with open(words_path) as f:
    vocab.update((word, idx) for idx, word in enumerate(f.read().splitlines()))

In [8]:
# Build the tag -> id lookup: the id of a tag is its 0-based line number in tags.txt
# NOTE(review): absolute, machine-specific path
tags_path = "/Users/Prasann/Desktop/cs230-code-examples/pytorch/nlp/data/small/tags.txt"
tag_map = {}
with open(tags_path) as f:
    tag_map.update((tag, idx) for idx, tag in enumerate(f.read().splitlines()))

In [12]:
# NOTE(review): absolute, machine-specific paths
train_sentences_file = '/Users/Prasann/Desktop/cs230-code-examples/pytorch/nlp/data/small/train/sentences.txt'
train_labels_file = '/Users/Prasann/Desktop/cs230-code-examples/pytorch/nlp/data/small/train/labels.txt'
train_sentences = []
train_labels = []

# Encode every sentence as a list of word ids; a word that is not in the
# vocabulary is looked up under 'UNK' instead
with open(train_sentences_file) as f:
    for line in f.read().splitlines():
        train_sentences.append(
            [vocab[tok if tok in vocab else 'UNK'] for tok in line.split(' ')]
        )

# Encode every label line as a list of tag ids (an unknown tag raises KeyError)
with open(train_labels_file) as f:
    for line in f.read().splitlines():
        train_labels.append([tag_map[tag] for tag in line.split(' ')])

In [17]:
# Number of encoded label sequences (one per line of the train labels file)
len(train_labels)

10

## Preparing a Batch

In [None]:
# NOTE(review): this cell relies on batch_sentences, batch_tags, np, torch and
# Variable, none of which are defined in the visible cells — confirm they are
# provided before running.

# compute length of longest sentence in batch
batch_max_len = max([len(s) for s in batch_sentences])

# prepare a numpy array with the data, initializing the data with the padding
# word and all labels with -1; initializing labels to -1 differentiates tokens
# with tags from padding tokens.
# The padding word is looked up as '<pad>' because that is the token (PAD_WORD)
# the build-vocabulary cell writes to words.txt; the key 'PAD' is never added
# to the vocabulary.
batch_data = vocab['<pad>']*np.ones((len(batch_sentences), batch_max_len))
batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))

# copy the data to the numpy array; positions past the end of a sentence keep
# their padding values
for j in range(len(batch_sentences)):
    cur_len = len(batch_sentences[j])
    batch_data[j][:cur_len] = batch_sentences[j]
    batch_labels[j][:cur_len] = batch_tags[j]

# since all data are indices, we convert them to torch LongTensors
batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)

# convert Tensors to Variables
batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
