In [1]:
# import libraries
import torchtext
from torchtext.vocab import Vectors

In [2]:
# initialize inputs: TEXT is the field handed to the dataset loader below and
# later holds the vocabulary (TEXT.vocab)
TEXT = torchtext.data.Field()

In [46]:
# load the PTB data from ../data (presumably the 10k-vocabulary version — the
# vocabulary built from it below has 10001 entries), using TEXT as the text field
# NOTE(review): `test` points at valid.txt, the same file as `validation`, so
# `val` and `test` hold the same data — presumably no separate test file is
# available here; confirm before reporting any "test" numbers.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path="../data", 
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

In [8]:
# we train on the entire corpus, modeled as a single sentence:
# the dataset holds one example, so len(train) prints 1
print('len(train)', len(train))

len(train) 1


In [33]:
# build the vocabulary. 10001 because the vocab has <unk> but then torchtext adds its own
# NOTE(review): the decoded batch further down shows <unk> at index 0, so the
# corpus <unk> and torchtext's <unk> appear to share one entry; the extra
# entry is presumably torchtext's <pad> special — confirm via TEXT.vocab.itos[:2].
TEXT.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))

len(TEXT.vocab) 10001


In [32]:
# for debugging, reduce vocabulary.
# Disabled: the `if False` guard means this never runs. Flip the condition to
# rebuild TEXT's vocabulary with max_size=1000 and print its size
# (presumably rarer words then fall back to <unk> — confirm).
if False:
    TEXT.build_vocab(train, max_size=1000)
    print(len(TEXT.vocab))

In [15]:
# make batch iterators, one per split, with batch_size=10 and bptt_len=32
# (the first training batch printed below has size [32, 10])
# NOTE(review): device=-1 presumably selects the CPU in this torchtext version,
# and repeat=False presumably makes each iterator stop after a single pass over
# its split — confirm against the installed torchtext.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=10, device=-1, bptt_len=32, repeat=False)

In [16]:
# each batch is a string of length 32 and sentences are ended with a special <eos> token
# batch.text is laid out as [bptt length, batch size], so batch.text[:, j] is
# one column of 32 token ids; TEXT.vocab.itos maps each id back to its word.
# NOTE: the printed label says "Second", but index 2 is zero-based, i.e. this
# is the third column of the batch.
it = iter(train_iter)
batch = next(it) 
print("Size of text batch [max bptt length, batch size]", batch.text.size())
print("Second in batch", batch.text[:, 2])
print("Converted back to string: ", " ".join([TEXT.vocab.itos[i] for i in batch.text[:, 2].data]))

Size of text batch [max bptt length, batch size] torch.Size([32, 10])
Second in batch Variable containing:
   8
 202
  77
   5
 183
 561
   0
  18
 975
 976
   7
 943
   5
 157
  78
   0
 289
 645
   3
  30
 132
   0
  20
   2
 273
   0
  17
   9
 117
   0
 969
   6
[torch.LongTensor of size 32]

Converted back to string:  in part because of buy programs <unk> by stock-index arbitrage a form of program trading <unk> futures contracts <eos> but interest <unk> as the day <unk> on and investors <unk> ahead to


In [17]:
# Batches are contiguous: this one continues the text of the previous batch,
# and the batch carries no separate label field.
batch = next(it)
print(
    "Converted back to string: ",
    " ".join(TEXT.vocab.itos[token_id] for token_id in batch.text[:, 2].data),
)

Converted back to string:  the <unk> later this week of two important economic reports <eos> the first is wednesday 's <unk> of <unk> managers considered a good <unk> of how the nation 's manufacturing <unk> <unk>


In [18]:
# the task is such that given a 10 word prefix of sentences, 
# we predict 10 possible next word candidates
# (each line of input.txt ends in `___`, the slot to be predicted)
# NOTE(review): the text above says 10 candidates, but the baseline cell
# further down writes 20 words per line — confirm the required count.
!head input.txt

but while the new york stock exchange did n't fall ___
some circuit breakers installed after the october N crash failed ___
the N stock specialist firms on the big board floor ___
big investment banks refused to step up to the plate ___
heavy selling of standard & poor 's 500-stock index futures ___
seven big board stocks ual amr bankamerica walt disney capital ___
once again the specialists were not able to handle the ___
<unk> james <unk> chairman of specialists henderson brothers inc. it ___
when the dollar is in a <unk> even central banks ___
speculators are calling for a degree of liquidity that is ___


In [19]:
# As a sample Kaggle submission, build a majority baseline (naive unigram model):
# every line of input.txt gets the same 20 most frequent training tokens.
from collections import Counter

# Tally token ids over one pass of the training iterator.
count = Counter()
for batch in train_iter:
    count.update(batch.text.view(-1).data.tolist())

# Zero out <eos> so the sentence-boundary marker is never emitted as a guess.
count[TEXT.vocab.stoi["<eos>"]] = 0
predictions = [TEXT.vocab.itos[i] for i, _ in count.most_common(20)]

# The guess list is identical for every row, so format it once.
prediction_line = " ".join(predictions)

# Open both files in one `with` so the input handle is closed as well
# (a bare open("input.txt") in the loop header would never be closed).
with open("input.txt") as fin, open("sample.txt", "w") as fout:
    print("id,word", file=fout)
    # Row ids are 1-based; the line content itself is not used by this baseline.
    for i, _ in enumerate(fin, 1):
        print("%d,%s" % (i, prediction_line), file=fout)

  This is separate from the ipykernel package so we can avoid doing imports until
