In [1]:
!echo "=== Acquiring datasets ==="
!echo "---"

!mkdir -p data



=== Acquiring datasets ===
---


In [2]:
%cd data

/content/data


In [3]:
!ls

In [4]:
!echo "- Downloading Penn Treebank (PTB)"
!wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
!tar -xzf simple-examples.tgz

- Downloading Penn Treebank (PTB)


In [5]:
!mkdir -p penn
%cd penn
!mv ../simple-examples/data/ptb.train.txt train.txt
!mv ../simple-examples/data/ptb.test.txt test.txt
!mv ../simple-examples/data/ptb.valid.txt valid.txt

/content/data/penn


In [6]:
!echo "- Downloading Penn Treebank (Character)"
!mkdir -p ../pennchar
%cd ../pennchar

- Downloading Penn Treebank (Character)
/content/data/pennchar


In [7]:
!mv ../simple-examples/data/ptb.char.train.txt train.txt
!mv ../simple-examples/data/ptb.char.test.txt test.txt
!mv ../simple-examples/data/ptb.char.valid.txt valid.txt

In [8]:
!rm -rf ../simple-examples/

**Character - Without Noise**

In [9]:
import torch.nn as nn


class RNNModel(nn.Module):

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp) # Token2Embeddings
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) #(seq_len, batch_size, emb_size)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.05
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        # input size(bptt, bsz)
        emb = self.drop(self.encoder(input))
        # emb size(bptt, bsz, embsize)
        # hid size(layers, bsz, nhid)
        output, hidden = self.rnn(emb, hidden)
        # output size(bptt, bsz, nhid)
        output = self.drop(output)
        # decoder: nhid -> ntoken
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, hidden

    def init_hidden(self, bsz):
        # LSTM h and c
        weight = next(self.parameters()).data
        return weight.new_zeros(self.nlayers, bsz, self.nhid), weight.new_zeros(self.nlayers, bsz, self.nhid)

In [10]:

import os
import torch

from collections import Counter


class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = Counter()
        self.total = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        token_id = self.word2idx[word]
        self.counter[token_id] += 1
        self.total += 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

In [11]:
import torch
import time
import math

In [None]:
%cd ..

/content/data


In [None]:
%cd ..

/content


In [12]:
data = '/content/data/pennchar'
batch_size = 256
emsize = 256
nlayers = 1
nhid = 1000
lr = 0.0001
dropout = 0.5
checkpoint = ''
clip = 1
bptt = 35
epochs = 10
save = '/content/output/model_test_character_none.pt'

torch.manual_seed(1111)

# Load data
corpus = Corpus(data)


In [13]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data

In [14]:
eval_batch_size = 256
train_data = batchify(corpus.train, batch_size) # size(total_len//bsz, bsz)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [16]:
train_data.to(device)

tensor([[ 0,  1,  9,  ...,  3, 20,  0],
        [ 1,  2, 10,  ...,  7,  7, 24],
        [ 2,  3,  7,  ..., 17, 13,  0],
        ...,
        [ 8,  4,  7,  ..., 21, 28, 26],
        [ 7, 10, 15,  ...,  3,  3, 10],
        [ 4,  9,  2,  ..., 16, 12,  5]], device='cuda:0')

In [17]:
val_data.to(device)
test_data.to(device)

tensor([[ 5,  3,  3,  ...,  7,  3, 14],
        [ 7, 29, 24,  ..., 15, 18,  3],
        [ 3,  3,  0,  ...,  5,  2, 33],
        ...,
        [ 3,  3,  3,  ...,  7,  2, 17],
        [ 7,  0,  8,  ..., 18, 21,  3],
        [ 2, 16, 20,  ...,  1,  0, 13]], device='cuda:0')

In [None]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [None]:
model.to(device)

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)

In [36]:
def repackage_hidden(h):
    # detach
    return tuple(v.clone().detach() for v in h)


In [37]:

def get_batch(source, i):
    # source: size(total_len//bsz, bsz)
    seq_len = min(bptt, len(source) - 1 - i)
    #data = torch.tensor(source[i:i+seq_len]) # size(bptt, bsz)
    data = source[i:i+seq_len].clone().detach()
    target = source[i+1:i+1+seq_len].clone().detach().view(-1)
    #target = torch.tensor(source[i+1:i+1+seq_len].view(-1)) # size(bptt * bsz)
    return data, target

In [38]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    with torch.no_grad():
        model.eval()
        total_loss = 0
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(eval_batch_size) #hidden size(nlayers, bsz, hdsize)
        for i in range(0, data_source.size(0) - 1, bptt):# iterate over every timestep
            data, targets = get_batch(data_source, i)
            output, hidden = model(data.to(device), hidden)
            # model input and output
            # inputdata size(bptt, bsz), and size(bptt, bsz, embsize) after embedding
            # output size(bptt*bsz, ntoken)
            total_loss += len(data) * criterion(output.to(device), targets.to(device)).data
            hidden = repackage_hidden(hidden)
        return total_loss / len(data_source)


In [31]:
def train():
    # choose a optimizer

    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        # print(hidden.to(device))
        output, hidden = model(data.to(device), hidden)
        loss = criterion(output.to(device), targets.to(device))
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        opt.step()

        total_loss += loss.data

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()


In [39]:
print("Number of tokens:")
print("Train: ", len(corpus.train))
print("Valid: ", len(corpus.valid))
print("Test:  ", len(corpus.test))

Number of tokens:
Train:  5017483
Valid:  393043
Test:   442424


In [None]:
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)

| epoch   1 |    50/  559 batches | lr 0.0001 | ms/batch 179.50 | loss  3.99 | ppl    53.92 | bpc    5.753
| epoch   1 |   100/  559 batches | lr 0.0001 | ms/batch 175.88 | loss  3.89 | ppl    49.09 | bpc    5.617
| epoch   1 |   150/  559 batches | lr 0.0001 | ms/batch 175.70 | loss  3.87 | ppl    47.90 | bpc    5.582
| epoch   1 |   200/  559 batches | lr 0.0001 | ms/batch 175.62 | loss  3.84 | ppl    46.49 | bpc    5.539
| epoch   1 |   250/  559 batches | lr 0.0001 | ms/batch 175.46 | loss  3.81 | ppl    44.95 | bpc    5.490
| epoch   1 |   300/  559 batches | lr 0.0001 | ms/batch 175.06 | loss  3.77 | ppl    43.41 | bpc    5.440
| epoch   1 |   350/  559 batches | lr 0.0001 | ms/batch 174.96 | loss  3.73 | ppl    41.87 | bpc    5.388
| epoch   1 |   400/  559 batches | lr 0.0001 | ms/batch 174.64 | loss  3.70 | ppl    40.36 | bpc    5.335
| epoch   1 |   450/  559 batches | lr 0.0001 | ms/batch 174.95 | loss  3.66 | ppl    38.87 | bpc    5.281
| epoch   1 |   500/  559 batches | l

In [None]:
bptt

35

In [None]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)
save = '/content/output/model_test_character_noise.pt'
checkpoint = "/content/output/model_test_character_none.pt"

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [None]:
model.rnn.parameters()

<generator object Module.parameters at 0x7f5749274150>

In [None]:
from torch.nn.utils import vector_to_parameters, parameters_to_vector
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
epochs = 25
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        model.to(device)

        param_vector = parameters_to_vector(model.rnn.parameters())
        param_vector.to(device)
        n_params = len(param_vector)
        noise = torch.distributions.Normal(loc=torch.tensor(0.), scale=torch.tensor(0.075)).sample_n(n_params)
        param_vector.add_(noise.to(device))
        
        vector_to_parameters(param_vector, model.rnn.parameters())
        model.to(device)
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)



| epoch   1 |    50/  559 batches | lr 0.0000 | ms/batch 180.71 | loss  3.15 | ppl    23.41 | bpc    4.549
| epoch   1 |   100/  559 batches | lr 0.0000 | ms/batch 176.41 | loss  3.09 | ppl    22.00 | bpc    4.460
| epoch   1 |   150/  559 batches | lr 0.0000 | ms/batch 175.86 | loss  3.09 | ppl    22.03 | bpc    4.462
| epoch   1 |   200/  559 batches | lr 0.0000 | ms/batch 175.55 | loss  3.09 | ppl    22.07 | bpc    4.464
| epoch   1 |   250/  559 batches | lr 0.0000 | ms/batch 175.15 | loss  3.09 | ppl    22.04 | bpc    4.462
| epoch   1 |   300/  559 batches | lr 0.0000 | ms/batch 175.00 | loss  3.09 | ppl    22.01 | bpc    4.460
| epoch   1 |   350/  559 batches | lr 0.0000 | ms/batch 174.57 | loss  3.09 | ppl    22.01 | bpc    4.460
| epoch   1 |   400/  559 batches | lr 0.0000 | ms/batch 175.26 | loss  3.09 | ppl    22.03 | bpc    4.461
| epoch   1 |   450/  559 batches | lr 0.0000 | ms/batch 174.58 | loss  3.09 | ppl    22.02 | bpc    4.460
| epoch   1 |   500/  559 batches | l

In [35]:

# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)
save = '/content/output/model_test_character_adaptive.pt'
checkpoint = "/content/output/model_test_character_noise.pt"

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)

criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [44]:
def train():
    # choose a optimizer

    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        # print(hidden.to(device))
        output, hidden = model(data.to(device), hidden)
        loss = criterion(output.to(device), targets.to(device))
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        opt.step()

        l2_lambda = 0.01
        l2_reg = torch.tensor(0.).to(device)
        for param in model.rnn.parameters():
            l2_reg += torch.norm(param.to(device))

        total_loss += loss.data
        total_loss += l2_lambda * l2_reg

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()


In [45]:
from torch.nn.utils import vector_to_parameters, parameters_to_vector
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        model.to(device)

        param_vector = parameters_to_vector(model.rnn.parameters())
        param_vector.to(device)
        n_params = len(param_vector)
        noise = torch.distributions.Normal(loc=torch.tensor(0.), scale=torch.tensor(0.075)).sample_n(n_params)
        param_vector.add_(noise.to(device))
        
        vector_to_parameters(param_vector, model.rnn.parameters())
        model.to(device)
        
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)



| epoch   1 |    50/  559 batches | lr 0.0001 | ms/batch 175.09 | loss  8.39 | ppl  4399.20 | bpc   12.103
| epoch   1 |   100/  559 batches | lr 0.0001 | ms/batch 170.32 | loss  8.19 | ppl  3598.25 | bpc   11.813
| epoch   1 |   150/  559 batches | lr 0.0001 | ms/batch 169.06 | loss  8.13 | ppl  3395.52 | bpc   11.729
| epoch   1 |   200/  559 batches | lr 0.0001 | ms/batch 169.75 | loss  8.07 | ppl  3199.12 | bpc   11.643
| epoch   1 |   250/  559 batches | lr 0.0001 | ms/batch 169.55 | loss  8.02 | ppl  3030.41 | bpc   11.565
| epoch   1 |   300/  559 batches | lr 0.0001 | ms/batch 169.52 | loss  7.98 | ppl  2920.70 | bpc   11.512
| epoch   1 |   350/  559 batches | lr 0.0001 | ms/batch 168.80 | loss  7.95 | ppl  2826.23 | bpc   11.465
| epoch   1 |   400/  559 batches | lr 0.0001 | ms/batch 169.23 | loss  7.92 | ppl  2756.35 | bpc   11.429
| epoch   1 |   450/  559 batches | lr 0.0001 | ms/batch 168.68 | loss  7.90 | ppl  2700.34 | bpc   11.399
| epoch   1 |   500/  559 batches | l

In [60]:
data = '/content/data/penn'
batch_size = 64
emsize = 256
nlayers = 1
nhid = 1000
lr = 0.0001
dropout = 0.5
checkpoint = ''
clip = 1
bptt = 35
epochs = 10
save = '/content/output/model_test_word_none.pt'

torch.manual_seed(1111)

# Load data
corpus = Corpus(data)


In [61]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data

In [62]:
eval_batch_size = 64
train_data = batchify(corpus.train, batch_size) # size(total_len//bsz, bsz)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [63]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [64]:
train_data.to(device)

tensor([[   0,  988,   48,  ...,   32, 3490,  556],
        [   1,   40,   32,  ..., 6789,  119,   27],
        [   2, 2756,  189,  ..., 1168,  129, 1880],
        ...,
        [1825,   54,   32,  ...,  416,   26,   35],
        [  35, 3940, 2361,  ...,   27,  373,  198],
        [ 101, 1305, 4923,  ...,   24,   42,   42]], device='cuda:0')

In [65]:
val_data.to(device)
test_data.to(device)

tensor([[ 142,  712,  439,  ..., 1940,   64, 3981],
        [  78, 4480,   48,  ...,   64, 4500,  500],
        [  54,  556,   40,  ...,  872,  398,   32],
        ...,
        [ 555,   64, 2380,  ...,  801,   32,   26],
        [1319,   26,  301,  ..., 2030, 6851,   64],
        [ 410,  119,   32,  ...,  159,  548,  220]], device='cuda:0')

In [74]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

  "num_layers={}".format(dropout, num_layers))


RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(10000, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=10000, bias=True)
)


In [75]:
model.to(device)

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(10000, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=10000, bias=True)
)

In [76]:
def repackage_hidden(h):
    # detach
    return tuple(v.clone().detach() for v in h)


In [77]:

def get_batch(source, i):
    # source: size(total_len//bsz, bsz)
    seq_len = min(bptt, len(source) - 1 - i)
    #data = torch.tensor(source[i:i+seq_len]) # size(bptt, bsz)
    data = source[i:i+seq_len].clone().detach()
    target = source[i+1:i+1+seq_len].clone().detach().view(-1)
    #target = torch.tensor(source[i+1:i+1+seq_len].view(-1)) # size(bptt * bsz)
    return data, target

In [78]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    with torch.no_grad():
        model.eval()
        total_loss = 0
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(eval_batch_size) #hidden size(nlayers, bsz, hdsize)
        for i in range(0, data_source.size(0) - 1, bptt):# iterate over every timestep
            data, targets = get_batch(data_source, i)
            output, hidden = model(data.to(device), hidden)
            # model input and output
            # inputdata size(bptt, bsz), and size(bptt, bsz, embsize) after embedding
            # output size(bptt*bsz, ntoken)
            total_loss += len(data) * criterion(output.to(device), targets.to(device)).data
            hidden = repackage_hidden(hidden)
        return total_loss / len(data_source)


In [79]:
def train():
    # choose a optimizer

    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        # print(hidden.to(device))
        output, hidden = model(data.to(device), hidden)
        loss = criterion(output.to(device), targets.to(device))
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        opt.step()

        total_loss += loss.data

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()


In [80]:
print("Number of tokens:")
print("Train: ", len(corpus.train))
print("Valid: ", len(corpus.valid))
print("Test:  ", len(corpus.test))

Number of tokens:
Train:  929589
Valid:  73760
Test:   82430


In [81]:
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)

| epoch   1 |    50/  414 batches | lr 0.0001 | ms/batch 135.47 | loss  9.40 | ppl 12033.63 | bpc   13.555
| epoch   1 |   100/  414 batches | lr 0.0001 | ms/batch 133.15 | loss  9.21 | ppl  9968.75 | bpc   13.283
| epoch   1 |   150/  414 batches | lr 0.0001 | ms/batch 132.97 | loss  9.20 | ppl  9903.14 | bpc   13.274
| epoch   1 |   200/  414 batches | lr 0.0001 | ms/batch 132.67 | loss  9.19 | ppl  9825.97 | bpc   13.262
| epoch   1 |   250/  414 batches | lr 0.0001 | ms/batch 133.06 | loss  9.18 | ppl  9735.82 | bpc   13.249
| epoch   1 |   300/  414 batches | lr 0.0001 | ms/batch 132.34 | loss  9.17 | ppl  9644.49 | bpc   13.235
| epoch   1 |   350/  414 batches | lr 0.0001 | ms/batch 132.51 | loss  9.16 | ppl  9546.44 | bpc   13.221
| epoch   1 |   400/  414 batches | lr 0.0001 | ms/batch 132.47 | loss  9.15 | ppl  9447.08 | bpc   13.206
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 57.40s | valid loss  9.14 |

In [82]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)
save = '/content/output/model_test_word_noise.pt'
checkpoint = "/content/output/model_test_word_none.pt"

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

  "num_layers={}".format(dropout, num_layers))


RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(10000, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=10000, bias=True)
)


In [83]:
from torch.nn.utils import vector_to_parameters, parameters_to_vector
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
epochs = 25
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        model.to(device)

        param_vector = parameters_to_vector(model.rnn.parameters())
        param_vector.to(device)
        n_params = len(param_vector)
        noise = torch.distributions.Normal(loc=torch.tensor(0.), scale=torch.tensor(0.075)).sample_n(n_params)
        param_vector.add_(noise.to(device))
        
        vector_to_parameters(param_vector, model.rnn.parameters())
        model.to(device)
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)



| epoch   1 |    50/  414 batches | lr 0.0001 | ms/batch 136.33 | loss  7.64 | ppl  2083.54 | bpc   11.025
| epoch   1 |   100/  414 batches | lr 0.0001 | ms/batch 133.73 | loss  7.01 | ppl  1106.82 | bpc   10.112
| epoch   1 |   150/  414 batches | lr 0.0001 | ms/batch 133.12 | loss  6.90 | ppl   997.01 | bpc    9.961
| epoch   1 |   200/  414 batches | lr 0.0001 | ms/batch 133.57 | loss  6.89 | ppl   983.50 | bpc    9.942
| epoch   1 |   250/  414 batches | lr 0.0001 | ms/batch 133.09 | loss  6.90 | ppl   988.29 | bpc    9.949
| epoch   1 |   300/  414 batches | lr 0.0001 | ms/batch 132.93 | loss  6.87 | ppl   959.97 | bpc    9.907
| epoch   1 |   350/  414 batches | lr 0.0001 | ms/batch 133.00 | loss  6.85 | ppl   939.19 | bpc    9.875
| epoch   1 |   400/  414 batches | lr 0.0001 | ms/batch 132.95 | loss  6.82 | ppl   915.84 | bpc    9.839
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 57.63s | valid loss  6.74 |