In [1]:
!echo "=== Acquiring datasets ==="
!echo "---"

!mkdir -p data



=== Acquiring datasets ===
---


In [4]:
%cd data

/content/data


In [5]:
!ls

In [6]:
!echo "- Downloading Penn Treebank (PTB)"
!wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
!tar -xzf simple-examples.tgz

- Downloading Penn Treebank (PTB)


In [7]:
!mkdir -p penn
%cd penn
!mv ../simple-examples/data/ptb.train.txt train.txt
!mv ../simple-examples/data/ptb.test.txt test.txt
!mv ../simple-examples/data/ptb.valid.txt valid.txt

/content/data/penn


In [8]:
!echo "- Downloading Penn Treebank (Character)"
!mkdir -p ../pennchar
%cd ../pennchar

- Downloading Penn Treebank (Character)
/content/data/pennchar


In [9]:
!mv ../simple-examples/data/ptb.char.train.txt train.txt
!mv ../simple-examples/data/ptb.char.test.txt test.txt
!mv ../simple-examples/data/ptb.char.valid.txt valid.txt

In [10]:
!rm -rf ../simple-examples/

In [59]:
import torch.nn as nn


class RNNModel(nn.Module):

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp) # Token2Embeddings
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) #(seq_len, batch_size, emb_size)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.05
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        # input size(bptt, bsz)
        emb = self.drop(self.encoder(input))
        # emb size(bptt, bsz, embsize)
        # hid size(layers, bsz, nhid)
        output, hidden = self.rnn(emb, hidden)
        # output size(bptt, bsz, nhid)
        output = self.drop(output)
        # decoder: nhid -> ntoken
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, hidden

    def init_hidden(self, bsz):
        # LSTM h and c
        weight = next(self.parameters()).data
        return weight.new_zeros(self.nlayers, bsz, self.nhid), weight.new_zeros(self.nlayers, bsz, self.nhid)

In [60]:

import os
import torch

from collections import Counter


class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = Counter()
        self.total = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        token_id = self.word2idx[word]
        self.counter[token_id] += 1
        self.total += 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

In [61]:
import torch
import time
import math

In [17]:
%cd ..

/content/data


In [19]:
%cd ..

/content


In [95]:
data = '/content/data/pennchar'
batch_size = 256
emsize = 256
nlayers = 1
nhid = 1000
lr = 0.0001
dropout = 0.5
checkpoint = ''
clip = 1
bptt = 35
epochs = 10
save = '/content/output/model_test.pt'

torch.manual_seed(1111)

# Load data
corpus = Corpus(data)


In [96]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data

In [97]:
eval_batch_size = 256
train_data = batchify(corpus.train, batch_size) # size(total_len//bsz, bsz)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [98]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [99]:
train_data.to(device)

tensor([[ 0,  1,  9,  ...,  3, 20,  0],
        [ 1,  2, 10,  ...,  7,  7, 24],
        [ 2,  3,  7,  ..., 17, 13,  0],
        ...,
        [ 8,  4,  7,  ..., 21, 28, 26],
        [ 7, 10, 15,  ...,  3,  3, 10],
        [ 4,  9,  2,  ..., 16, 12,  5]], device='cuda:0')

In [100]:
val_data.to(device)
test_data.to(device)

tensor([[ 5,  3,  3,  ...,  7,  3, 14],
        [ 7, 29, 24,  ..., 15, 18,  3],
        [ 3,  3,  0,  ...,  5,  2, 33],
        ...,
        [ 3,  3,  3,  ...,  7,  2, 17],
        [ 7,  0,  8,  ..., 18, 21,  3],
        [ 2, 16, 20,  ...,  1,  0, 13]], device='cuda:0')

In [117]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [118]:
model.to(device)

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)

In [119]:
def repackage_hidden(h):
    # detach
    return tuple(v.clone().detach() for v in h)


In [120]:

def get_batch(source, i):
    # source: size(total_len//bsz, bsz)
    seq_len = min(bptt, len(source) - 1 - i)
    #data = torch.tensor(source[i:i+seq_len]) # size(bptt, bsz)
    data = source[i:i+seq_len].clone().detach()
    target = source[i+1:i+1+seq_len].clone().detach().view(-1)
    #target = torch.tensor(source[i+1:i+1+seq_len].view(-1)) # size(bptt * bsz)
    return data, target

In [121]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    with torch.no_grad():
        model.eval()
        total_loss = 0
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(eval_batch_size) #hidden size(nlayers, bsz, hdsize)
        for i in range(0, data_source.size(0) - 1, bptt):# iterate over every timestep
            data, targets = get_batch(data_source, i)
            output, hidden = model(data.to(device), hidden)
            # model input and output
            # inputdata size(bptt, bsz), and size(bptt, bsz, embsize) after embedding
            # output size(bptt*bsz, ntoken)
            total_loss += len(data) * criterion(output.to(device), targets.to(device)).data
            hidden = repackage_hidden(hidden)
        return total_loss / len(data_source)


In [122]:
def train():
    # choose a optimizer

    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        # print(hidden.to(device))
        output, hidden = model(data.to(device), hidden)
        loss = criterion(output.to(device), targets.to(device))
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        opt.step()

        total_loss += loss.data

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()


In [123]:
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)

| epoch   1 |    50/  559 batches | lr 0.0001 | ms/batch 78.50 | loss  3.99 | ppl    54.01 | bpc    5.755
| epoch   1 |   100/  559 batches | lr 0.0001 | ms/batch 76.49 | loss  3.90 | ppl    49.16 | bpc    5.619
| epoch   1 |   150/  559 batches | lr 0.0001 | ms/batch 76.98 | loss  3.87 | ppl    47.93 | bpc    5.583
| epoch   1 |   200/  559 batches | lr 0.0001 | ms/batch 77.79 | loss  3.84 | ppl    46.49 | bpc    5.539
| epoch   1 |   250/  559 batches | lr 0.0001 | ms/batch 78.49 | loss  3.81 | ppl    44.93 | bpc    5.489
| epoch   1 |   300/  559 batches | lr 0.0001 | ms/batch 79.27 | loss  3.77 | ppl    43.36 | bpc    5.438
| epoch   1 |   350/  559 batches | lr 0.0001 | ms/batch 79.58 | loss  3.73 | ppl    41.80 | bpc    5.385
| epoch   1 |   400/  559 batches | lr 0.0001 | ms/batch 79.00 | loss  3.70 | ppl    40.28 | bpc    5.332
| epoch   1 |   450/  559 batches | lr 0.0001 | ms/batch 78.86 | loss  3.66 | ppl    38.78 | bpc    5.277
| epoch   1 |   500/  559 batches | lr 0.0001 

In [135]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)
checkpoint = "/content/output/model_test.pt"

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [139]:
from torch.nn.utils import vector_to_parameters, parameters_to_vector
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        model.to(device)

        param_vector = parameters_to_vector(model.parameters())
        param_vector.to(device)
        n_params = len(param_vector)
        noise = torch.distributions.Normal(0, 0.075).sample_n(n_params)
        param_vector.add_(noise.to(device))
        
        
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)



| epoch   1 |    50/  559 batches | lr 0.0001 | ms/batch 81.26 | loss  3.03 | ppl    20.74 | bpc    4.374
| epoch   1 |   100/  559 batches | lr 0.0001 | ms/batch 81.02 | loss  2.97 | ppl    19.44 | bpc    4.281
| epoch   1 |   150/  559 batches | lr 0.0001 | ms/batch 81.36 | loss  2.97 | ppl    19.40 | bpc    4.278
| epoch   1 |   200/  559 batches | lr 0.0001 | ms/batch 81.28 | loss  2.97 | ppl    19.40 | bpc    4.278
| epoch   1 |   250/  559 batches | lr 0.0001 | ms/batch 82.24 | loss  2.96 | ppl    19.24 | bpc    4.266
| epoch   1 |   300/  559 batches | lr 0.0001 | ms/batch 83.44 | loss  2.95 | ppl    19.15 | bpc    4.259
| epoch   1 |   350/  559 batches | lr 0.0001 | ms/batch 84.40 | loss  2.95 | ppl    19.07 | bpc    4.253
| epoch   1 |   400/  559 batches | lr 0.0001 | ms/batch 85.20 | loss  2.94 | ppl    18.99 | bpc    4.247
| epoch   1 |   450/  559 batches | lr 0.0001 | ms/batch 85.59 | loss  2.94 | ppl    18.88 | bpc    4.239
| epoch   1 |   500/  559 batches | lr 0.0001 