In [3]:
# import libraries that will help us preprocess data, map to word embeddings
import torchtext
from torchtext.vocab import Vectors, GloVe

In [5]:
# this will be our input x to the classifiers (a Field created with the library defaults)
TEXT = torchtext.data.Field()

# this will be what we map to, the tag y
# sequential=False marks the label as a single value rather than a token sequence
# (presumably this also skips tokenization -- confirm against the torchtext Field docs)
LABEL = torchtext.data.Field(sequential=False)

In [12]:
# Load the SST dataset as train / validation / test splits, using TEXT for the
# sentence and LABEL for the sentiment tag. The filter predicate drops every
# example labelled 'neutral', leaving only positive and negative examples.
train, val, test = torchtext.datasets.SST.splits(
    TEXT,
    LABEL,
    filter_pred=lambda example: example.label != 'neutral'
)

In [13]:
# each example consists of a label and its original words
# first show how many examples the training split holds ...
print('len(train)', len(train))
# ... then dump the attribute dict of the first example to see its fields
print('vars(train[0])', vars(train[0]))

len(train) 6920
vars(train[0]) {'text': ['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st', 'Century', "'s", 'new', '``', 'Conan', "''", 'and', 'that', 'he', "'s", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'Arnold', 'Schwarzenegger', ',', 'Jean-Claud', 'Van', 'Damme', 'or', 'Steven', 'Segal', '.'], 'label': 'positive'}


In [16]:
# Give every unique word and every unique label an integer index, built from
# the training split only (loosely comparable to fitting a CountVectorizer).
# The reported sizes can include special entries: the label vocabulary comes
# out as 3 even though only two classes remain (presumably an extra <unk> slot).

# word vocabulary
TEXT.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))

# label vocabulary
LABEL.build_vocab(train)
print('len(LABEL.vocab)', len(LABEL.vocab))

len(TEXT.vocab) 16284
len(LABEL.vocab) 3


In [18]:
# Build one batch iterator per split (train, val, test); each iterator serves
# 10 examples per batch.
# device=-1 is passed through unchanged (presumably selects the CPU in this
# torchtext version -- confirm before upgrading the library).
train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test),
    batch_size=10,
    device=-1
)

In [20]:
# Consider a single batch generated by the training iterator.
# batch.text has shape [max_sent_len, batch_size]: every column is one sentence
# and the entries running down dim 0 are that sentence's word IDs.
batch = next(iter(train_iter))
print("Size of text batch [max sent length, batch size]", batch.text.size())
# Column 0 is the FIRST example of the batch; the label previously read
# "Second in batch", which did not match the index being printed.
print("First in batch", batch.text[:, 0])
# Map the word IDs of that same example back to tokens through the vocabulary
# (sentences shorter than the batch maximum show trailing <pad> tokens).
print("Converted back to string: ", " ".join([TEXT.vocab.itos[i] for i in batch.text[:, 0].data]))

Size of text batch [max sent length, batch size] torch.Size([31, 10])
Second in batch Variable containing:
   835
     6
    14
  2490
     5
     4
  2458
     9
    35
   871
     3
 11218
   219
   180
  1217
     6
     4
   851
 11645
    15
    88
  4308
   128
     8
 14154
    15
     4
   196
 12687
     2
     1
[torch.LongTensor of size 31]

Converted back to string:  Much of The Lady and the Duke is about quiet , decisive moments between members of the cultural elite as they determine how to proceed as the world implodes . <pad>


In [21]:
# Attach pretrained word embeddings to the vocabulary built above.
# The vector file 'wiki.simple.vec' is fetched from `url` when needed and then
# loaded into TEXT.vocab, giving one embedding row per vocabulary entry.
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(
    vectors=Vectors('wiki.simple.vec', url=url)
)

# Sanity checks: overall embedding matrix shape, then the first ten
# components of the vector stored for the word 'follows'.
print("Word embeddings size ", TEXT.vocab.vectors.size())
print(
    "Word embedding of 'follows', first 10 dim ",
    TEXT.vocab.vectors[TEXT.vocab.stoi['follows']][:10]
)

.vector_cache/wiki.simple.vec: 293MB [13:09, 398kB/s]                               
  0%|          | 0/111052 [00:00<?, ?it/s]Skipping token 111051 with 1-dimensional vector ['300']; likely a header
100%|██████████| 111052/111052 [00:12<00:00, 9224.36it/s]


Word embeddings size  torch.Size([16284, 300])
Word embedding of 'follows', first 10 dim  
 0.3925
-0.4770
 0.1754
-0.0845
 0.1396
 0.3722
-0.0878
-0.2398
 0.0367
 0.2800
[torch.FloatTensor of size 10]

