In [1]:
import torch.nn as nn
import numpy as np

import os
import torch

from collections import Counter
import torch
import time
import math

from torch.nn.utils import vector_to_parameters, parameters_to_vector
import matplotlib.pyplot as plt

In [2]:
!echo "=== Acquiring datasets ==="
!echo "---"

!mkdir -p data



=== Acquiring datasets ===
---


In [3]:
%cd data

/ssd_scratch/cvit/sashank.sridhar/data


In [4]:
!ls

penn  pennchar	simple-examples.tgz


In [5]:
!echo "- Downloading Penn Treebank (PTB)"
!wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
!tar -xzf simple-examples.tgz

- Downloading Penn Treebank (PTB)


In [6]:
!mkdir -p penn
%cd penn
!mv ../simple-examples/data/ptb.train.txt train.txt
!mv ../simple-examples/data/ptb.test.txt test.txt
!mv ../simple-examples/data/ptb.valid.txt valid.txt

/ssd_scratch/cvit/sashank.sridhar/data/penn


In [7]:
!echo "- Downloading Penn Treebank (Character)"
!mkdir -p ../pennchar
%cd ../pennchar

- Downloading Penn Treebank (Character)
/ssd_scratch/cvit/sashank.sridhar/data/pennchar


In [8]:
!mv ../simple-examples/data/ptb.char.train.txt train.txt
!mv ../simple-examples/data/ptb.char.test.txt test.txt
!mv ../simple-examples/data/ptb.char.valid.txt valid.txt

In [9]:
!rm -rf ../simple-examples/

**Character - Without Noise**

In [10]:
class RNNModel(nn.Module):

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp) # Token2Embeddings
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout) #(seq_len, batch_size, emb_size)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.05
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        # input size(bptt, bsz)
        emb = self.drop(self.encoder(input))
        # emb size(bptt, bsz, embsize)
        # hid size(layers, bsz, nhid)
        output, hidden = self.rnn(emb, hidden)
        # output size(bptt, bsz, nhid)
        output = self.drop(output)
        # decoder: nhid -> ntoken
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded, hidden

    def init_hidden(self, bsz):
        # LSTM h and c
        weight = next(self.parameters()).data
        return weight.new_zeros(self.nlayers, bsz, self.nhid), weight.new_zeros(self.nlayers, bsz, self.nhid)

In [11]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = Counter()
        self.total = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        token_id = self.word2idx[word]
        self.counter[token_id] += 1
        self.total += 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with open(path, 'r') as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

In [12]:
%cd ..

/ssd_scratch/cvit/sashank.sridhar/data


In [13]:
%cd ..

/ssd_scratch/cvit/sashank.sridhar


In [14]:
!ls

Char_Noise_Adaptive.png  data				validation.txt
Char_Noise.png		 losses.txt			Word_Noise.png
Char_None.png		 output
checkpoint.pt		 PennTreebankExperiments.ipynb


In [15]:
!mkdir -p output

In [16]:
data = 'data/pennchar'
batch_size = 256
emsize = 256
nlayers = 1
nhid = 1000
lr = 0.0001
dropout = 0.5
checkpoint = ''
clip = 1
bptt = 35
save = 'output/model_test_character_none.pt'

torch.manual_seed(1111)

# Load data
corpus = Corpus(data)


In [17]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data

In [18]:
eval_batch_size = 256
train_data = batchify(corpus.train, batch_size) # size(total_len//bsz, bsz)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [20]:
train_data.to(device)

tensor([[ 0,  1,  9,  ...,  3, 20,  0],
        [ 1,  2, 10,  ...,  7,  7, 24],
        [ 2,  3,  7,  ..., 17, 13,  0],
        ...,
        [ 8,  4,  7,  ..., 21, 28, 26],
        [ 7, 10, 15,  ...,  3,  3, 10],
        [ 4,  9,  2,  ..., 16, 12,  5]], device='cuda:0')

In [21]:
val_data.to(device)
test_data.to(device)

tensor([[ 5,  3,  3,  ...,  7,  3, 14],
        [ 7, 29, 24,  ..., 15, 18,  3],
        [ 3,  3,  0,  ...,  5,  2, 33],
        ...,
        [ 3,  3,  3,  ...,  7,  2, 17],
        [ 7,  0,  8,  ..., 18, 21,  3],
        [ 2, 16, 20,  ...,  1,  0, 13]], device='cuda:0')

In [22]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
epochs = 50
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)




In [23]:
model.to(device)

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)

In [24]:
def repackage_hidden(h):
    # detach
    return tuple(v.clone().detach() for v in h)


In [25]:

def get_batch(source, i):
    # source: size(total_len//bsz, bsz)
    seq_len = min(bptt, len(source) - 1 - i)
    #data = torch.tensor(source[i:i+seq_len]) # size(bptt, bsz)
    data = source[i:i+seq_len].clone().detach()
    target = source[i+1:i+1+seq_len].clone().detach().view(-1)
    #target = torch.tensor(source[i+1:i+1+seq_len].view(-1)) # size(bptt * bsz)
    return data, target

In [26]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    with torch.no_grad():
        model.eval()
        total_loss = 0
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(eval_batch_size) #hidden size(nlayers, bsz, hdsize)
        for i in range(0, data_source.size(0) - 1, bptt):# iterate over every timestep
            data, targets = get_batch(data_source, i)
            output, hidden = model(data.to(device), hidden)
            # model input and output
            # inputdata size(bptt, bsz), and size(bptt, bsz, embsize) after embedding
            # output size(bptt*bsz, ntoken)
            total_loss += len(data) * criterion(output.to(device), targets.to(device)).data
            hidden = repackage_hidden(hidden)
        return total_loss / len(data_source)


In [27]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [29]:
def train():
    # choose a optimizer

    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        # print(hidden.to(device))
        output, hidden = model(data.to(device), hidden)
        loss = criterion(output.to(device), targets.to(device))
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        opt.step()

        total_loss += loss.data

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()


In [28]:
print("Number of tokens:")
print("Train: ", len(corpus.train))
print("Valid: ", len(corpus.valid))
print("Test:  ", len(corpus.test))

Number of tokens:
Train:  5017483
Valid:  393043
Test:   442424


In [30]:
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
val_losses = []
train_losses = []
early_stopping = EarlyStopping(patience=10, verbose=True)
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        train_loss = evaluate(train_data)
        
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        val_losses.append(val_loss)
        train_losses.append(train_loss)
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)

| epoch   1 |    50/  559 batches | lr 0.0001 | ms/batch 38.79 | loss  3.99 | ppl    53.98 | bpc    5.754
| epoch   1 |   100/  559 batches | lr 0.0001 | ms/batch 36.95 | loss  3.89 | ppl    49.16 | bpc    5.619
| epoch   1 |   150/  559 batches | lr 0.0001 | ms/batch 36.91 | loss  3.87 | ppl    47.96 | bpc    5.584
| epoch   1 |   200/  559 batches | lr 0.0001 | ms/batch 37.22 | loss  3.84 | ppl    46.56 | bpc    5.541
| epoch   1 |   250/  559 batches | lr 0.0001 | ms/batch 37.26 | loss  3.81 | ppl    45.04 | bpc    5.493
| epoch   1 |   300/  559 batches | lr 0.0001 | ms/batch 37.21 | loss  3.77 | ppl    43.52 | bpc    5.444
| epoch   1 |   350/  559 batches | lr 0.0001 | ms/batch 37.23 | loss  3.74 | ppl    42.01 | bpc    5.393
| epoch   1 |   400/  559 batches | lr 0.0001 | ms/batch 37.37 | loss  3.70 | ppl    40.53 | bpc    5.341
| epoch   1 |   450/  559 batches | lr 0.0001 | ms/batch 37.34 | loss  3.67 | ppl    39.08 | bpc    5.288
| epoch   1 |   500/  559 batches | lr 0.0001 

| epoch   6 |   400/  559 batches | lr 0.0001 | ms/batch 42.01 | loss  3.02 | ppl    20.52 | bpc    4.359
| epoch   6 |   450/  559 batches | lr 0.0001 | ms/batch 42.03 | loss  3.02 | ppl    20.49 | bpc    4.357
| epoch   6 |   500/  559 batches | lr 0.0001 | ms/batch 42.00 | loss  3.02 | ppl    20.49 | bpc    4.357
| epoch   6 |   550/  559 batches | lr 0.0001 | ms/batch 41.97 | loss  3.02 | ppl    20.44 | bpc    4.354
-----------------------------------------------------------------------------------------
| end of epoch   6 | time: 31.73s | valid loss  3.01 | valid ppl    20.30 | bpc    4.344
-----------------------------------------------------------------------------------------
Validation loss decreased (3.015363 --> 3.010698).  Saving model ...
| epoch   7 |    50/  559 batches | lr 0.0001 | ms/batch 39.51 | loss  3.08 | ppl    21.78 | bpc    4.445
| epoch   7 |   100/  559 batches | lr 0.0001 | ms/batch 40.47 | loss  3.02 | ppl    20.45 | bpc    4.354
| epoch   7 |   150/  559 

| epoch  12 |    50/  559 batches | lr 0.0001 | ms/batch 41.05 | loss  3.07 | ppl    21.59 | bpc    4.432
| epoch  12 |   100/  559 batches | lr 0.0001 | ms/batch 41.96 | loss  3.01 | ppl    20.27 | bpc    4.341
| epoch  12 |   150/  559 batches | lr 0.0001 | ms/batch 41.96 | loss  3.01 | ppl    20.30 | bpc    4.343
| epoch  12 |   200/  559 batches | lr 0.0001 | ms/batch 41.97 | loss  3.01 | ppl    20.37 | bpc    4.349
| epoch  12 |   250/  559 batches | lr 0.0001 | ms/batch 41.98 | loss  3.01 | ppl    20.28 | bpc    4.342
| epoch  12 |   300/  559 batches | lr 0.0001 | ms/batch 41.98 | loss  3.01 | ppl    20.29 | bpc    4.343
| epoch  12 |   350/  559 batches | lr 0.0001 | ms/batch 41.98 | loss  3.01 | ppl    20.29 | bpc    4.343
| epoch  12 |   400/  559 batches | lr 0.0001 | ms/batch 41.96 | loss  3.01 | ppl    20.29 | bpc    4.343
| epoch  12 |   450/  559 batches | lr 0.0001 | ms/batch 41.92 | loss  3.01 | ppl    20.28 | bpc    4.342
| epoch  12 |   500/  559 batches | lr 0.0001 

| epoch  17 |   400/  559 batches | lr 0.0001 | ms/batch 41.90 | loss  3.00 | ppl    20.13 | bpc    4.331
| epoch  17 |   450/  559 batches | lr 0.0001 | ms/batch 41.91 | loss  3.00 | ppl    20.11 | bpc    4.330
| epoch  17 |   500/  559 batches | lr 0.0001 | ms/batch 41.98 | loss  3.00 | ppl    20.11 | bpc    4.330
| epoch  17 |   550/  559 batches | lr 0.0001 | ms/batch 41.97 | loss  3.00 | ppl    20.07 | bpc    4.327
-----------------------------------------------------------------------------------------
| end of epoch  17 | time: 32.01s | valid loss  2.99 | valid ppl    19.97 | bpc    4.320
-----------------------------------------------------------------------------------------
Validation loss decreased (2.995986 --> 2.994123).  Saving model ...
| epoch  18 |    50/  559 batches | lr 0.0001 | ms/batch 40.19 | loss  3.06 | ppl    21.37 | bpc    4.418
| epoch  18 |   100/  559 batches | lr 0.0001 | ms/batch 40.77 | loss  3.00 | ppl    20.08 | bpc    4.328
| epoch  18 |   150/  559 

| epoch  23 |    50/  559 batches | lr 0.0001 | ms/batch 40.01 | loss  3.04 | ppl    21.00 | bpc    4.392
| epoch  23 |   100/  559 batches | lr 0.0001 | ms/batch 41.13 | loss  2.98 | ppl    19.73 | bpc    4.303
| epoch  23 |   150/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.98 | ppl    19.74 | bpc    4.303
| epoch  23 |   200/  559 batches | lr 0.0001 | ms/batch 41.86 | loss  2.99 | ppl    19.81 | bpc    4.308
| epoch  23 |   250/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.98 | ppl    19.71 | bpc    4.301
| epoch  23 |   300/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.98 | ppl    19.71 | bpc    4.301
| epoch  23 |   350/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.98 | ppl    19.70 | bpc    4.300
| epoch  23 |   400/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.98 | ppl    19.71 | bpc    4.301
| epoch  23 |   450/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.98 | ppl    19.68 | bpc    4.299
| epoch  23 |   500/  559 batches | lr 0.0001 

| epoch  28 |   400/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.94 | ppl    18.83 | bpc    4.235
| epoch  28 |   450/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.93 | ppl    18.79 | bpc    4.232
| epoch  28 |   500/  559 batches | lr 0.0001 | ms/batch 41.74 | loss  2.93 | ppl    18.79 | bpc    4.232
| epoch  28 |   550/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.93 | ppl    18.73 | bpc    4.227
-----------------------------------------------------------------------------------------
| end of epoch  28 | time: 31.90s | valid loss  2.92 | valid ppl    18.61 | bpc    4.218
-----------------------------------------------------------------------------------------
Validation loss decreased (2.935608 --> 2.923865).  Saving model ...
| epoch  29 |    50/  559 batches | lr 0.0001 | ms/batch 41.18 | loss  2.99 | ppl    19.90 | bpc    4.315
| epoch  29 |   100/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.93 | ppl    18.70 | bpc    4.225
| epoch  29 |   150/  559 

| epoch  34 |    50/  559 batches | lr 0.0001 | ms/batch 42.55 | loss  2.94 | ppl    18.82 | bpc    4.234
| epoch  34 |   100/  559 batches | lr 0.0001 | ms/batch 41.69 | loss  2.87 | ppl    17.71 | bpc    4.146
| epoch  34 |   150/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.87 | ppl    17.72 | bpc    4.147
| epoch  34 |   200/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.88 | ppl    17.76 | bpc    4.151
| epoch  34 |   250/  559 batches | lr 0.0001 | ms/batch 41.74 | loss  2.87 | ppl    17.68 | bpc    4.144
| epoch  34 |   300/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.87 | ppl    17.61 | bpc    4.139
| epoch  34 |   350/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.87 | ppl    17.62 | bpc    4.139
| epoch  34 |   400/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.87 | ppl    17.64 | bpc    4.141
| epoch  34 |   450/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.87 | ppl    17.59 | bpc    4.137
| epoch  34 |   500/  559 batches | lr 0.0001 

| epoch  39 |   400/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.81 | ppl    16.60 | bpc    4.053
| epoch  39 |   450/  559 batches | lr 0.0001 | ms/batch 41.84 | loss  2.81 | ppl    16.55 | bpc    4.049
| epoch  39 |   500/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.81 | ppl    16.55 | bpc    4.049
| epoch  39 |   550/  559 batches | lr 0.0001 | ms/batch 41.84 | loss  2.80 | ppl    16.48 | bpc    4.043
-----------------------------------------------------------------------------------------
| end of epoch  39 | time: 32.00s | valid loss  2.79 | valid ppl    16.33 | bpc    4.029
-----------------------------------------------------------------------------------------
Validation loss decreased (2.807595 --> 2.792894).  Saving model ...
| epoch  40 |    50/  559 batches | lr 0.0001 | ms/batch 40.17 | loss  2.86 | ppl    17.46 | bpc    4.126
| epoch  40 |   100/  559 batches | lr 0.0001 | ms/batch 41.42 | loss  2.80 | ppl    16.46 | bpc    4.041
| epoch  40 |   150/  559 

| epoch  45 |    50/  559 batches | lr 0.0001 | ms/batch 39.21 | loss  2.76 | ppl    15.86 | bpc    3.988
| epoch  45 |   100/  559 batches | lr 0.0001 | ms/batch 40.93 | loss  2.71 | ppl    14.97 | bpc    3.904
| epoch  45 |   150/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.71 | ppl    14.97 | bpc    3.904
| epoch  45 |   200/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.71 | ppl    15.00 | bpc    3.907
| epoch  45 |   250/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.70 | ppl    14.89 | bpc    3.897
| epoch  45 |   300/  559 batches | lr 0.0001 | ms/batch 41.48 | loss  2.70 | ppl    14.82 | bpc    3.889
| epoch  45 |   350/  559 batches | lr 0.0001 | ms/batch 41.56 | loss  2.70 | ppl    14.81 | bpc    3.888
| epoch  45 |   400/  559 batches | lr 0.0001 | ms/batch 41.68 | loss  2.70 | ppl    14.82 | bpc    3.890
| epoch  45 |   450/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.69 | ppl    14.78 | bpc    3.885
| epoch  45 |   500/  559 batches | lr 0.0001 

| epoch  50 |   400/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.60 | ppl    13.48 | bpc    3.752
| epoch  50 |   450/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.60 | ppl    13.45 | bpc    3.750
| epoch  50 |   500/  559 batches | lr 0.0001 | ms/batch 41.64 | loss  2.60 | ppl    13.45 | bpc    3.749
| epoch  50 |   550/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.59 | ppl    13.38 | bpc    3.742
-----------------------------------------------------------------------------------------
| end of epoch  50 | time: 31.92s | valid loss  2.57 | valid ppl    13.10 | bpc    3.712
-----------------------------------------------------------------------------------------
Validation loss decreased (2.587694 --> 2.572656).  Saving model ...
| End of training | test loss  2.57 | test ppl    13.05 | bpc    3.706


In [31]:
validation_losses = [i.item() for i in val_losses]
training_losses = [i.item() for i in train_losses]

In [35]:
plt.plot(range(50), training_losses)
plt.plot(range(50), validation_losses, c='#00ff00')
plt.xlim(0, 50)
plt.ylim(0, 5.0)
plt.xlabel('EPOCH')
plt.ylabel('Loss')
plt.legend(['train', 'val'])
plt.title('Loss')
plt.savefig('Char_None'+'.png')
plt.close()

**With Gaussian Noise**

In [38]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)
save = 'output/model_test_character_noise.pt'
checkpoint = "output/model_test_character_none.pt"

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
model.to(device)
criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)


In [39]:
lr = 0.0001
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
val_losses = []
train_losses = []
opts = 'SGD'
epochs = 100
early_stopping = EarlyStopping(patience=25, verbose=True)
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        model.to(device)

        param_vector = parameters_to_vector(model.rnn.parameters())
        param_vector.to(device)
        n_params = len(param_vector)
        noise = torch.distributions.Normal(loc=torch.tensor(0.), scale=torch.tensor(0.075)).sample_n(n_params)
        param_vector.add_(noise.to(device))
        
        vector_to_parameters(param_vector, model.rnn.parameters())
        model.to(device)
        train()
        val_loss = evaluate(val_data)
        train_loss = evaluate(train_data)
        val_losses.append(val_loss)
        train_losses.append(train_loss)
        print('-' * 89)
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
    
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)



| epoch   1 |    50/  559 batches | lr 0.0001 | ms/batch 38.04 | loss  2.92 | ppl    18.59 | bpc    4.217
| epoch   1 |   100/  559 batches | lr 0.0001 | ms/batch 36.91 | loss  2.82 | ppl    16.70 | bpc    4.061
| epoch   1 |   150/  559 batches | lr 0.0001 | ms/batch 36.89 | loss  2.79 | ppl    16.21 | bpc    4.018
| epoch   1 |   200/  559 batches | lr 0.0001 | ms/batch 36.95 | loss  2.76 | ppl    15.83 | bpc    3.985
| epoch   1 |   250/  559 batches | lr 0.0001 | ms/batch 37.00 | loss  2.74 | ppl    15.45 | bpc    3.950
| epoch   1 |   300/  559 batches | lr 0.0001 | ms/batch 37.04 | loss  2.72 | ppl    15.11 | bpc    3.917
| epoch   1 |   350/  559 batches | lr 0.0001 | ms/batch 37.08 | loss  2.70 | ppl    14.94 | bpc    3.901
| epoch   1 |   400/  559 batches | lr 0.0001 | ms/batch 37.16 | loss  2.69 | ppl    14.79 | bpc    3.887
| epoch   1 |   450/  559 batches | lr 0.0001 | ms/batch 37.15 | loss  2.68 | ppl    14.63 | bpc    3.871
| epoch   1 |   500/  559 batches | lr 0.0001 

| epoch   6 |   450/  559 batches | lr 0.0001 | ms/batch 41.65 | loss  2.93 | ppl    18.78 | bpc    4.231
| epoch   6 |   500/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.93 | ppl    18.64 | bpc    4.220
| epoch   6 |   550/  559 batches | lr 0.0001 | ms/batch 41.70 | loss  2.92 | ppl    18.46 | bpc    4.206
-----------------------------------------------------------------------------------------
EarlyStopping counter: 5 out of 25
| end of epoch   6 | time: 31.90s | valid loss  2.82 | valid ppl    16.80 | bpc    4.071
-----------------------------------------------------------------------------------------
| epoch   7 |    50/  559 batches | lr 0.0001 | ms/batch 40.23 | loss  3.09 | ppl    21.87 | bpc    4.451
| epoch   7 |   100/  559 batches | lr 0.0001 | ms/batch 40.49 | loss  3.01 | ppl    20.26 | bpc    4.341
| epoch   7 |   150/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.99 | ppl    19.97 | bpc    4.320
| epoch   7 |   200/  559 batches | lr 0.0001 | ms/batch 41.

| epoch  12 |   200/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.90 | ppl    18.13 | bpc    4.180
| epoch  12 |   250/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.89 | ppl    18.05 | bpc    4.174
| epoch  12 |   300/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.89 | ppl    17.94 | bpc    4.166
| epoch  12 |   350/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.88 | ppl    17.80 | bpc    4.154
| epoch  12 |   400/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.88 | ppl    17.75 | bpc    4.150
| epoch  12 |   450/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.88 | ppl    17.75 | bpc    4.150
| epoch  12 |   500/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.87 | ppl    17.71 | bpc    4.146
| epoch  12 |   550/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.87 | ppl    17.70 | bpc    4.145
-----------------------------------------------------------------------------------------
EarlyStopping counter: 11 out of 25
| end of epoch  12 | time:

-----------------------------------------------------------------------------------------
EarlyStopping counter: 16 out of 25
| end of epoch  17 | time: 31.90s | valid loss  2.72 | valid ppl    15.18 | bpc    3.924
-----------------------------------------------------------------------------------------
| epoch  18 |    50/  559 batches | lr 0.0001 | ms/batch 39.91 | loss  2.88 | ppl    17.80 | bpc    4.154
| epoch  18 |   100/  559 batches | lr 0.0001 | ms/batch 40.75 | loss  2.82 | ppl    16.75 | bpc    4.066
| epoch  18 |   150/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.82 | ppl    16.85 | bpc    4.075
| epoch  18 |   200/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.83 | ppl    16.97 | bpc    4.084
| epoch  18 |   250/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.83 | ppl    16.92 | bpc    4.081
| epoch  18 |   300/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.83 | ppl    16.88 | bpc    4.077
| epoch  18 |   350/  559 batches | lr 0.0001 | ms/batch 41

| epoch  23 |   350/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.82 | ppl    16.73 | bpc    4.065
| epoch  23 |   400/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.81 | ppl    16.57 | bpc    4.050
| epoch  23 |   450/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.80 | ppl    16.38 | bpc    4.034
| epoch  23 |   500/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.79 | ppl    16.23 | bpc    4.021
| epoch  23 |   550/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.78 | ppl    16.04 | bpc    4.004
-----------------------------------------------------------------------------------------
EarlyStopping counter: 22 out of 25
| end of epoch  23 | time: 32.01s | valid loss  2.67 | valid ppl    14.39 | bpc    3.847
-----------------------------------------------------------------------------------------
| epoch  24 |    50/  559 batches | lr 0.0001 | ms/batch 41.39 | loss  2.84 | ppl    17.18 | bpc    4.102
| epoch  24 |   100/  559 batches | lr 0.0001 | ms/batch 41

| epoch  29 |    50/  559 batches | lr 0.0001 | ms/batch 42.54 | loss  2.71 | ppl    15.05 | bpc    3.912
| epoch  29 |   100/  559 batches | lr 0.0001 | ms/batch 41.84 | loss  2.66 | ppl    14.25 | bpc    3.833
| epoch  29 |   150/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.66 | ppl    14.34 | bpc    3.842
| epoch  29 |   200/  559 batches | lr 0.0001 | ms/batch 41.85 | loss  2.67 | ppl    14.44 | bpc    3.852
| epoch  29 |   250/  559 batches | lr 0.0001 | ms/batch 41.87 | loss  2.67 | ppl    14.45 | bpc    3.853
| epoch  29 |   300/  559 batches | lr 0.0001 | ms/batch 41.85 | loss  2.67 | ppl    14.42 | bpc    3.850
| epoch  29 |   350/  559 batches | lr 0.0001 | ms/batch 41.84 | loss  2.67 | ppl    14.44 | bpc    3.852
| epoch  29 |   400/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.67 | ppl    14.47 | bpc    3.855
| epoch  29 |   450/  559 batches | lr 0.0001 | ms/batch 41.87 | loss  2.67 | ppl    14.51 | bpc    3.859
| epoch  29 |   500/  559 batches | lr 0.0001 

| epoch  34 |   450/  559 batches | lr 0.0001 | ms/batch 41.87 | loss  2.97 | ppl    19.43 | bpc    4.280
| epoch  34 |   500/  559 batches | lr 0.0001 | ms/batch 41.84 | loss  3.04 | ppl    20.94 | bpc    4.388
| epoch  34 |   550/  559 batches | lr 0.0001 | ms/batch 41.85 | loss  3.04 | ppl    20.91 | bpc    4.386
-----------------------------------------------------------------------------------------
EarlyStopping counter: 7 out of 25
| end of epoch  34 | time: 31.90s | valid loss  2.91 | valid ppl    18.45 | bpc    4.205
-----------------------------------------------------------------------------------------
| epoch  35 |    50/  559 batches | lr 0.0001 | ms/batch 40.40 | loss  3.07 | ppl    21.53 | bpc    4.429
| epoch  35 |   100/  559 batches | lr 0.0001 | ms/batch 40.86 | loss  2.99 | ppl    19.79 | bpc    4.307
| epoch  35 |   150/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.98 | ppl    19.60 | bpc    4.292
| epoch  35 |   200/  559 batches | lr 0.0001 | ms/batch 41.

| epoch  40 |   150/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.63 | ppl    13.94 | bpc    3.801
| epoch  40 |   200/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.63 | ppl    13.90 | bpc    3.797
| epoch  40 |   250/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.62 | ppl    13.80 | bpc    3.786
| epoch  40 |   300/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.62 | ppl    13.70 | bpc    3.776
| epoch  40 |   350/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.62 | ppl    13.70 | bpc    3.777
| epoch  40 |   400/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.62 | ppl    13.74 | bpc    3.780
| epoch  40 |   450/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.62 | ppl    13.75 | bpc    3.782
| epoch  40 |   500/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.62 | ppl    13.73 | bpc    3.780
| epoch  40 |   550/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.61 | ppl    13.67 | bpc    3.773
----------------------------------------------

| epoch  45 |   550/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.60 | ppl    13.45 | bpc    3.749
-----------------------------------------------------------------------------------------
Validation loss decreased (2.493998 --> 2.475320).  Saving model ...
| end of epoch  45 | time: 31.95s | valid loss  2.48 | valid ppl    11.89 | bpc    3.571
-----------------------------------------------------------------------------------------
| epoch  46 |    50/  559 batches | lr 0.0001 | ms/batch 42.61 | loss  2.66 | ppl    14.26 | bpc    3.834
| epoch  46 |   100/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.60 | ppl    13.51 | bpc    3.755
| epoch  46 |   150/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.60 | ppl    13.51 | bpc    3.756
| epoch  46 |   200/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.60 | ppl    13.49 | bpc    3.754
| epoch  46 |   250/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.59 | ppl    13.37 | bpc    3.741
| epoch  46 |   300/  559 

| epoch  51 |   250/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.95 | ppl    19.18 | bpc    4.261
| epoch  51 |   300/  559 batches | lr 0.0001 | ms/batch 41.73 | loss  2.93 | ppl    18.77 | bpc    4.230
| epoch  51 |   350/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.91 | ppl    18.41 | bpc    4.202
| epoch  51 |   400/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.89 | ppl    17.97 | bpc    4.168
| epoch  51 |   450/  559 batches | lr 0.0001 | ms/batch 41.73 | loss  2.86 | ppl    17.49 | bpc    4.128
| epoch  51 |   500/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.84 | ppl    17.12 | bpc    4.098
| epoch  51 |   550/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.81 | ppl    16.67 | bpc    4.059
-----------------------------------------------------------------------------------------
EarlyStopping counter: 4 out of 25
| end of epoch  51 | time: 31.93s | valid loss  2.65 | valid ppl    14.20 | bpc    3.828
--------------------------------------------

| epoch  57 |    50/  559 batches | lr 0.0001 | ms/batch 42.58 | loss  2.61 | ppl    13.59 | bpc    3.765
| epoch  57 |   100/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.56 | ppl    12.90 | bpc    3.690
| epoch  57 |   150/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.56 | ppl    12.95 | bpc    3.695
| epoch  57 |   200/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.56 | ppl    12.95 | bpc    3.695
| epoch  57 |   250/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.55 | ppl    12.86 | bpc    3.685
| epoch  57 |   300/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.55 | ppl    12.74 | bpc    3.672
| epoch  57 |   350/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.55 | ppl    12.78 | bpc    3.676
| epoch  57 |   400/  559 batches | lr 0.0001 | ms/batch 41.84 | loss  2.55 | ppl    12.81 | bpc    3.679
| epoch  57 |   450/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.55 | ppl    12.83 | bpc    3.681
| epoch  57 |   500/  559 batches | lr 0.0001 

| epoch  62 |   400/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.53 | ppl    12.49 | bpc    3.643
| epoch  62 |   450/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.53 | ppl    12.56 | bpc    3.651
| epoch  62 |   500/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.53 | ppl    12.53 | bpc    3.648
| epoch  62 |   550/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.52 | ppl    12.48 | bpc    3.641
-----------------------------------------------------------------------------------------
EarlyStopping counter: 1 out of 25
| end of epoch  62 | time: 32.00s | valid loss  2.39 | valid ppl    10.91 | bpc    3.447
-----------------------------------------------------------------------------------------
| epoch  63 |    50/  559 batches | lr 0.0001 | ms/batch 39.88 | loss  2.58 | ppl    13.21 | bpc    3.723
| epoch  63 |   100/  559 batches | lr 0.0001 | ms/batch 41.17 | loss  2.53 | ppl    12.57 | bpc    3.652
| epoch  63 |   150/  559 batches | lr 0.0001 | ms/batch 41.

| epoch  68 |   150/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.71 | ppl    14.98 | bpc    3.905
| epoch  68 |   200/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.75 | ppl    15.60 | bpc    3.964
| epoch  68 |   250/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.76 | ppl    15.83 | bpc    3.985
| epoch  68 |   300/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.76 | ppl    15.79 | bpc    3.981
| epoch  68 |   350/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.76 | ppl    15.77 | bpc    3.979
| epoch  68 |   400/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.76 | ppl    15.78 | bpc    3.980
| epoch  68 |   450/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.76 | ppl    15.78 | bpc    3.980
| epoch  68 |   500/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.77 | ppl    16.04 | bpc    4.003
| epoch  68 |   550/  559 batches | lr 0.0001 | ms/batch 41.76 | loss  2.78 | ppl    16.08 | bpc    4.007
----------------------------------------------

| epoch  73 |   550/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.56 | ppl    12.96 | bpc    3.696
-----------------------------------------------------------------------------------------
EarlyStopping counter: 12 out of 25
| end of epoch  73 | time: 31.93s | valid loss  2.42 | valid ppl    11.26 | bpc    3.493
-----------------------------------------------------------------------------------------
| epoch  74 |    50/  559 batches | lr 0.0001 | ms/batch 40.47 | loss  2.62 | ppl    13.70 | bpc    3.776
| epoch  74 |   100/  559 batches | lr 0.0001 | ms/batch 41.73 | loss  2.56 | ppl    13.00 | bpc    3.700
| epoch  74 |   150/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.57 | ppl    13.00 | bpc    3.701
| epoch  74 |   200/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.57 | ppl    13.02 | bpc    3.703
| epoch  74 |   250/  559 batches | lr 0.0001 | ms/batch 41.80 | loss  2.56 | ppl    12.94 | bpc    3.694
| epoch  74 |   300/  559 batches | lr 0.0001 | ms/batch 41

| epoch  79 |   200/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.51 | ppl    12.32 | bpc    3.623
| epoch  79 |   250/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.51 | ppl    12.28 | bpc    3.619
| epoch  79 |   300/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.50 | ppl    12.18 | bpc    3.606
| epoch  79 |   350/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.50 | ppl    12.20 | bpc    3.608
| epoch  79 |   400/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.51 | ppl    12.27 | bpc    3.617
| epoch  79 |   450/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.51 | ppl    12.33 | bpc    3.624
| epoch  79 |   500/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.51 | ppl    12.29 | bpc    3.619
| epoch  79 |   550/  559 batches | lr 0.0001 | ms/batch 41.77 | loss  2.51 | ppl    12.25 | bpc    3.615
-----------------------------------------------------------------------------------------
EarlyStopping counter: 1 out of 25
| end of epoch  79 | time: 

-----------------------------------------------------------------------------------------
EarlyStopping counter: 6 out of 25
| end of epoch  84 | time: 31.92s | valid loss  2.35 | valid ppl    10.47 | bpc    3.389
-----------------------------------------------------------------------------------------
| epoch  85 |    50/  559 batches | lr 0.0001 | ms/batch 40.69 | loss  2.55 | ppl    12.85 | bpc    3.683
| epoch  85 |   100/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.50 | ppl    12.23 | bpc    3.612
| epoch  85 |   150/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.51 | ppl    12.25 | bpc    3.615
| epoch  85 |   200/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.50 | ppl    12.23 | bpc    3.612
| epoch  85 |   250/  559 batches | lr 0.0001 | ms/batch 41.84 | loss  2.50 | ppl    12.18 | bpc    3.606
| epoch  85 |   300/  559 batches | lr 0.0001 | ms/batch 41.86 | loss  2.49 | ppl    12.06 | bpc    3.592
| epoch  85 |   350/  559 batches | lr 0.0001 | ms/batch 41.

| epoch  90 |   250/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.46 | ppl    11.72 | bpc    3.551
| epoch  90 |   300/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.45 | ppl    11.60 | bpc    3.537
| epoch  90 |   350/  559 batches | lr 0.0001 | ms/batch 41.86 | loss  2.45 | ppl    11.63 | bpc    3.539
| epoch  90 |   400/  559 batches | lr 0.0001 | ms/batch 41.78 | loss  2.46 | ppl    11.65 | bpc    3.542
| epoch  90 |   450/  559 batches | lr 0.0001 | ms/batch 41.81 | loss  2.46 | ppl    11.73 | bpc    3.552
| epoch  90 |   500/  559 batches | lr 0.0001 | ms/batch 41.82 | loss  2.46 | ppl    11.66 | bpc    3.543
| epoch  90 |   550/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.45 | ppl    11.59 | bpc    3.535
-----------------------------------------------------------------------------------------
Validation loss decreased (2.294201 --> 2.285452).  Saving model ...
| end of epoch  90 | time: 32.08s | valid loss  2.29 | valid ppl     9.83 | bpc    3.297
----------

-----------------------------------------------------------------------------------------
Validation loss decreased (2.268887 --> 2.268609).  Saving model ...
| end of epoch  95 | time: 31.86s | valid loss  2.27 | valid ppl     9.67 | bpc    3.273
-----------------------------------------------------------------------------------------
| epoch  96 |    50/  559 batches | lr 0.0001 | ms/batch 39.76 | loss  2.49 | ppl    12.11 | bpc    3.598
| epoch  96 |   100/  559 batches | lr 0.0001 | ms/batch 41.69 | loss  2.45 | ppl    11.57 | bpc    3.533
| epoch  96 |   150/  559 batches | lr 0.0001 | ms/batch 41.75 | loss  2.45 | ppl    11.59 | bpc    3.535
| epoch  96 |   200/  559 batches | lr 0.0001 | ms/batch 41.83 | loss  2.45 | ppl    11.59 | bpc    3.535
| epoch  96 |   250/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.44 | ppl    11.51 | bpc    3.525
| epoch  96 |   300/  559 batches | lr 0.0001 | ms/batch 41.79 | loss  2.43 | ppl    11.40 | bpc    3.511
| epoch  96 |   350/  559 

In [40]:
validation_losses = [i.item() for i in val_losses]
training_losses = [i.item() for i in train_losses]

In [47]:
plt.figure(figsize=(6,6))
plt.plot(range(100), training_losses, c='#00ff00')
plt.plot(range(100), validation_losses)
plt.xlim(0, 100)
plt.ylim(0, 5.0)
plt.xlabel('EPOCH')
plt.ylabel('Loss')
plt.legend(['train', 'val'])
plt.title('Loss')
plt.savefig('Char_Noise'+'.png')
plt.close()

**Adaptive Weight Noise**

In [48]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)
save = 'output/model_test_character_adaptive.pt'
checkpoint = "output/model_test_character_noise.pt"

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)

criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)


In [49]:
model.to(device)

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(50, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=50, bias=True)
)

In [50]:
def train():
    # choose a optimizer

    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        # print(hidden.to(device))
        output, hidden = model(data.to(device), hidden)
        loss = criterion(output.to(device), targets.to(device))
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        opt.step()

        l2_lambda = 0.01
        l2_reg = torch.tensor(0.).to(device)
        for param in model.rnn.parameters():
            l2_reg += torch.norm(param.to(device))

        total_loss += loss.data
        total_loss += l2_lambda * l2_reg

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()


In [51]:
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
val_losses = []
train_losses = []
epochs = 100
early_stopping = EarlyStopping(patience=25, verbose=True)
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        model.to(device)

        param_vector = parameters_to_vector(model.rnn.parameters())
        param_vector.to(device)
        n_params = len(param_vector)
        noise = torch.distributions.Normal(loc=torch.tensor(0.), scale=torch.tensor(0.075)).sample_n(n_params)
        param_vector.add_(noise.to(device))
        
        vector_to_parameters(param_vector, model.rnn.parameters())
        model.to(device)
        
        train()
        train_loss = evaluate(train_data)
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        val_losses.append(val_loss)
        train_losses.append(train_loss)
        print('-' * 89)
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)

| epoch   1 |    50/  559 batches | lr 0.0001 | ms/batch 38.24 | loss 29.73 | ppl 8154984876036.43 | bpc   42.891
| epoch   1 |   100/  559 batches | lr 0.0001 | ms/batch 37.13 | loss 29.15 | ppl 4556627983698.00 | bpc   42.051
| epoch   1 |   150/  559 batches | lr 0.0001 | ms/batch 37.22 | loss 29.15 | ppl 4571636064217.23 | bpc   42.056
| epoch   1 |   200/  559 batches | lr 0.0001 | ms/batch 37.27 | loss 29.15 | ppl 4588207303604.57 | bpc   42.061
| epoch   1 |   250/  559 batches | lr 0.0001 | ms/batch 37.31 | loss 29.15 | ppl 4576329672629.55 | bpc   42.057
| epoch   1 |   300/  559 batches | lr 0.0001 | ms/batch 37.35 | loss 29.15 | ppl 4551806987473.66 | bpc   42.050
| epoch   1 |   350/  559 batches | lr 0.0001 | ms/batch 37.39 | loss 29.15 | ppl 4567356693798.47 | bpc   42.054
| epoch   1 |   400/  559 batches | lr 0.0001 | ms/batch 37.50 | loss 29.16 | ppl 4599387575973.59 | bpc   42.065
| epoch   1 |   450/  559 batches | lr 0.0001 | ms/batch 37.45 | loss 29.17 | ppl 463867

| epoch   6 |   200/  559 batches | lr 0.0001 | ms/batch 42.22 | loss 29.76 | ppl 8421719152865.82 | bpc   42.937
| epoch   6 |   250/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 29.76 | ppl 8409696440029.57 | bpc   42.935
| epoch   6 |   300/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 29.75 | ppl 8333340840419.02 | bpc   42.922
| epoch   6 |   350/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 29.76 | ppl 8368190864005.85 | bpc   42.928
| epoch   6 |   400/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 29.76 | ppl 8421478208994.88 | bpc   42.937
| epoch   6 |   450/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 29.77 | ppl 8481346859225.07 | bpc   42.947
| epoch   6 |   500/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 29.76 | ppl 8440324744517.50 | bpc   42.940
| epoch   6 |   550/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 29.76 | ppl 8415617365683.05 | bpc   42.936
----------------------------------------------------------------------------------------

| epoch  11 |   350/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 30.36 | ppl 15285931710242.78 | bpc   43.797
| epoch  11 |   400/  559 batches | lr 0.0001 | ms/batch 42.30 | loss 30.36 | ppl 15383446207433.62 | bpc   43.806
| epoch  11 |   450/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 30.37 | ppl 15465173488065.68 | bpc   43.814
| epoch  11 |   500/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 30.37 | ppl 15444156081042.98 | bpc   43.812
| epoch  11 |   550/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 30.36 | ppl 15378869599509.65 | bpc   43.806
-----------------------------------------------------------------------------------------
| end of epoch  11 | time: 32.05s | valid loss  2.22 | valid ppl     9.24 | bpc    3.208
-----------------------------------------------------------------------------------------
EarlyStopping counter: 3 out of 25
| epoch  12 |    50/  559 batches | lr 0.0001 | ms/batch 43.10 | loss 31.08 | ppl 31449399850419.43 | bpc   44.838
| epoc

| epoch  16 |   550/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 30.98 | ppl 28385948271996.80 | bpc   44.690
-----------------------------------------------------------------------------------------
| end of epoch  16 | time: 32.15s | valid loss  2.22 | valid ppl     9.20 | bpc    3.202
-----------------------------------------------------------------------------------------
Validation loss decreased (2.221001 --> 2.219491).  Saving model ...
| epoch  17 |    50/  559 batches | lr 0.0001 | ms/batch 40.52 | loss 31.70 | ppl 58693609780092.65 | bpc   45.738
| epoch  17 |   100/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 31.08 | ppl 31615515509542.40 | bpc   44.846
| epoch  17 |   150/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 31.09 | ppl 31722853627340.87 | bpc   44.851
| epoch  17 |   200/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 31.09 | ppl 31788207412545.24 | bpc   44.854
| epoch  17 |   250/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 31.09 | ppl 31732

| epoch  22 |    50/  559 batches | lr 0.0001 | ms/batch 40.40 | loss 32.30 | ppl 106650587664343.92 | bpc   46.600
| epoch  22 |   100/  559 batches | lr 0.0001 | ms/batch 42.17 | loss 31.67 | ppl 56908851088529.23 | bpc   45.694
| epoch  22 |   150/  559 batches | lr 0.0001 | ms/batch 42.23 | loss 31.67 | ppl 56995101127414.42 | bpc   45.696
| epoch  22 |   200/  559 batches | lr 0.0001 | ms/batch 42.23 | loss 31.68 | ppl 57157962825380.11 | bpc   45.700
| epoch  22 |   250/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 31.67 | ppl 57034249996730.34 | bpc   45.697
| epoch  22 |   300/  559 batches | lr 0.0001 | ms/batch 42.21 | loss 31.67 | ppl 56670984729956.05 | bpc   45.688
| epoch  22 |   350/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 31.67 | ppl 56902989959330.06 | bpc   45.694
| epoch  22 |   400/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 31.68 | ppl 57326647398358.46 | bpc   45.704
| epoch  22 |   450/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 31.68 | p

| epoch  27 |   150/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 32.22 | ppl 98141745224716.19 | bpc   46.480
| epoch  27 |   200/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 32.22 | ppl 98308486275483.59 | bpc   46.482
| epoch  27 |   250/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 32.22 | ppl 98047446532936.78 | bpc   46.479
| epoch  27 |   300/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 32.21 | ppl 97410694739493.14 | bpc   46.469
| epoch  27 |   350/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 32.21 | ppl 97827395276204.48 | bpc   46.475
| epoch  27 |   400/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 32.22 | ppl 98162338342538.06 | bpc   46.480
| epoch  27 |   450/  559 batches | lr 0.0001 | ms/batch 42.07 | loss 32.23 | ppl 98945822312071.42 | bpc   46.492
| epoch  27 |   500/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 32.22 | ppl 98478515896226.97 | bpc   46.485
| epoch  27 |   550/  559 batches | lr 0.0001 | ms/batch 41.97 | loss 32.22 | pp

| epoch  32 |   300/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 32.74 | ppl 165058314118117.47 | bpc   47.230
| epoch  32 |   350/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 32.74 | ppl 165524911531242.44 | bpc   47.234
| epoch  32 |   400/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 32.75 | ppl 166554803279874.72 | bpc   47.243
| epoch  32 |   450/  559 batches | lr 0.0001 | ms/batch 42.22 | loss 32.75 | ppl 167637139581599.03 | bpc   47.252
| epoch  32 |   500/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 32.75 | ppl 167047932880808.94 | bpc   47.247
| epoch  32 |   550/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 32.75 | ppl 166816139471823.06 | bpc   47.245
-----------------------------------------------------------------------------------------
| end of epoch  32 | time: 32.16s | valid loss  2.20 | valid ppl     9.06 | bpc    3.179
-----------------------------------------------------------------------------------------
EarlyStopping counter: 4 out of 25


| epoch  37 |   450/  559 batches | lr 0.0001 | ms/batch 42.30 | loss 33.28 | ppl 283465206778034.88 | bpc   48.010
| epoch  37 |   500/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 33.28 | ppl 282949879473966.44 | bpc   48.008
| epoch  37 |   550/  559 batches | lr 0.0001 | ms/batch 42.30 | loss 33.27 | ppl 281823109906308.44 | bpc   48.002
-----------------------------------------------------------------------------------------
| end of epoch  37 | time: 32.29s | valid loss  2.20 | valid ppl     9.02 | bpc    3.172
-----------------------------------------------------------------------------------------
Validation loss decreased (2.199034 --> 2.198988).  Saving model ...
| epoch  38 |    50/  559 batches | lr 0.0001 | ms/batch 40.11 | loss 34.03 | ppl 602276815071442.50 | bpc   49.097
| epoch  38 |   100/  559 batches | lr 0.0001 | ms/batch 42.18 | loss 33.37 | ppl 310490666501995.75 | bpc   48.142
| epoch  38 |   150/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 33.37 | ppl 

-----------------------------------------------------------------------------------------
| end of epoch  42 | time: 32.28s | valid loss  2.20 | valid ppl     9.06 | bpc    3.179
-----------------------------------------------------------------------------------------
EarlyStopping counter: 5 out of 25
| epoch  43 |    50/  559 batches | lr 0.0001 | ms/batch 40.31 | loss 34.54 | ppl 1003760718727007.50 | bpc   49.834
| epoch  43 |   100/  559 batches | lr 0.0001 | ms/batch 42.10 | loss 33.87 | ppl 512068813373238.25 | bpc   48.863
| epoch  43 |   150/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 33.87 | ppl 514224015750287.25 | bpc   48.869
| epoch  43 |   200/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 33.87 | ppl 514757850288655.88 | bpc   48.871
| epoch  43 |   250/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 33.87 | ppl 513604520781464.12 | bpc   48.868
| epoch  43 |   300/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 33.87 | ppl 509897356844888.19 | bpc   48.857

| epoch  48 |    50/  559 batches | lr 0.0001 | ms/batch 39.81 | loss 35.05 | ppl 1668009525676174.75 | bpc   50.567
| epoch  48 |   100/  559 batches | lr 0.0001 | ms/batch 41.71 | loss 34.37 | ppl 842328599474267.12 | bpc   49.581
| epoch  48 |   150/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 34.37 | ppl 844870878868673.88 | bpc   49.586
| epoch  48 |   200/  559 batches | lr 0.0001 | ms/batch 42.30 | loss 34.37 | ppl 845564092328517.50 | bpc   49.587
| epoch  48 |   250/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 34.37 | ppl 845431854257707.75 | bpc   49.587
| epoch  48 |   300/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 34.36 | ppl 838743859483282.75 | bpc   49.575
| epoch  48 |   350/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 34.36 | ppl 838417568468625.12 | bpc   49.575
| epoch  48 |   400/  559 batches | lr 0.0001 | ms/batch 42.31 | loss 34.37 | ppl 844957902369217.62 | bpc   49.586
| epoch  48 |   450/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 3

| epoch  53 |   200/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 34.86 | ppl 1376355325780711.50 | bpc   50.290
| epoch  53 |   250/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 34.86 | ppl 1373669755266221.50 | bpc   50.287
| epoch  53 |   300/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 34.85 | ppl 1363785910780885.00 | bpc   50.277
| epoch  53 |   350/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 34.85 | ppl 1365509002853157.00 | bpc   50.278
| epoch  53 |   400/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 34.86 | ppl 1372056741686326.75 | bpc   50.285
| epoch  53 |   450/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 34.86 | ppl 1382195602413645.50 | bpc   50.296
| epoch  53 |   500/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 34.86 | ppl 1377048550313726.00 | bpc   50.291
| epoch  53 |   550/  559 batches | lr 0.0001 | ms/batch 42.36 | loss 34.85 | ppl 1368977387547158.25 | bpc   50.282
----------------------------------------------------------------

| epoch  58 |   300/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 35.33 | ppl 2211018390006668.75 | bpc   50.974
| epoch  58 |   350/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 35.34 | ppl 2219231811374555.00 | bpc   50.979
| epoch  58 |   400/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 35.34 | ppl 2229065276811655.00 | bpc   50.985
| epoch  58 |   450/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 35.35 | ppl 2242566580904983.50 | bpc   50.994
| epoch  58 |   500/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 35.34 | ppl 2238754422920073.75 | bpc   50.992
| epoch  58 |   550/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 35.34 | ppl 2227280317924844.75 | bpc   50.984
-----------------------------------------------------------------------------------------
| end of epoch  58 | time: 32.27s | valid loss  2.20 | valid ppl     9.00 | bpc    3.170
-----------------------------------------------------------------------------------------
EarlyStopping counter: 3 out 

| epoch  63 |   450/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 35.81 | ppl 3571860293352589.00 | bpc   51.666
| epoch  63 |   500/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 35.81 | ppl 3560840618780383.00 | bpc   51.661
| epoch  63 |   550/  559 batches | lr 0.0001 | ms/batch 42.30 | loss 35.80 | ppl 3546511736065403.00 | bpc   51.655
-----------------------------------------------------------------------------------------
| end of epoch  63 | time: 32.09s | valid loss  2.20 | valid ppl     8.99 | bpc    3.168
-----------------------------------------------------------------------------------------
EarlyStopping counter: 8 out of 25
| epoch  64 |    50/  559 batches | lr 0.0001 | ms/batch 42.95 | loss 36.61 | ppl 7950861895094005.00 | bpc   52.820
| epoch  64 |   100/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 35.90 | ppl 3892125409154791.00 | bpc   51.789
| epoch  64 |   150/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 35.90 | ppl 3904944829950284.00 | bpc   5

| epoch  68 |   550/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 36.25 | ppl 5548775502868778.00 | bpc   52.301
-----------------------------------------------------------------------------------------
| end of epoch  68 | time: 32.26s | valid loss  2.19 | valid ppl     8.95 | bpc    3.161
-----------------------------------------------------------------------------------------
EarlyStopping counter: 13 out of 25
| epoch  69 |    50/  559 batches | lr 0.0001 | ms/batch 40.57 | loss 37.07 | ppl 12533059415703168.00 | bpc   53.477
| epoch  69 |   100/  559 batches | lr 0.0001 | ms/batch 42.00 | loss 36.35 | ppl 6101860671573165.00 | bpc   52.438
| epoch  69 |   150/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 36.35 | ppl 6107076892138710.00 | bpc   52.439
| epoch  69 |   200/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 36.35 | ppl 6115866047515208.00 | bpc   52.441
| epoch  69 |   250/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 36.35 | ppl 6103583394303567.00 | bpc  

| epoch  74 |    50/  559 batches | lr 0.0001 | ms/batch 40.15 | loss 37.53 | ppl 19835939629836552.00 | bpc   54.139
| epoch  74 |   100/  559 batches | lr 0.0001 | ms/batch 42.19 | loss 36.80 | ppl 9556824981485938.00 | bpc   53.085
| epoch  74 |   150/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 36.80 | ppl 9578431554695824.00 | bpc   53.089
| epoch  74 |   200/  559 batches | lr 0.0001 | ms/batch 42.31 | loss 36.80 | ppl 9576129885832632.00 | bpc   53.088
| epoch  74 |   250/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 36.80 | ppl 9557043722353968.00 | bpc   53.085
| epoch  74 |   300/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 36.79 | ppl 9481802093409618.00 | bpc   53.074
| epoch  74 |   350/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 36.79 | ppl 9505305638453390.00 | bpc   53.078
| epoch  74 |   400/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 36.79 | ppl 9545129641009636.00 | bpc   53.084
| epoch  74 |   450/  559 batches | lr 0.0001 | ms/batch 42.28 

| epoch  79 |   150/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 37.24 | ppl 14865956412880502.00 | bpc   53.723
| epoch  79 |   200/  559 batches | lr 0.0001 | ms/batch 42.22 | loss 37.24 | ppl 14849010051320500.00 | bpc   53.721
| epoch  79 |   250/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 37.23 | ppl 14807152245064520.00 | bpc   53.717
| epoch  79 |   300/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 37.23 | ppl 14694724601112714.00 | bpc   53.706
| epoch  79 |   350/  559 batches | lr 0.0001 | ms/batch 42.23 | loss 37.23 | ppl 14754714449061948.00 | bpc   53.712
| epoch  79 |   400/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 37.24 | ppl 14829593700699056.00 | bpc   53.719
| epoch  79 |   450/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 37.24 | ppl 14931087328989472.00 | bpc   53.729
| epoch  79 |   500/  559 batches | lr 0.0001 | ms/batch 42.22 | loss 37.24 | ppl 14862384167382718.00 | bpc   53.723
| epoch  79 |   550/  559 batches | lr 0.0001 | ms/batch

| epoch  84 |   250/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 37.68 | ppl 23048307017215616.00 | bpc   54.356
| epoch  84 |   300/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 37.67 | ppl 22834946107126776.00 | bpc   54.342
| epoch  84 |   350/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 37.67 | ppl 22925981156895788.00 | bpc   54.348
| epoch  84 |   400/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 37.67 | ppl 23009390324354800.00 | bpc   54.353
| epoch  84 |   450/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 37.68 | ppl 23168545597422756.00 | bpc   54.363
| epoch  84 |   500/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 37.68 | ppl 23075403004635684.00 | bpc   54.357
| epoch  84 |   550/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 37.67 | ppl 22963530358950120.00 | bpc   54.350
-----------------------------------------------------------------------------------------
| end of epoch  84 | time: 32.18s | valid loss  2.19 | valid ppl     8.92 | bpc    3

| epoch  89 |   350/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 38.11 | ppl 35584389562360140.00 | bpc   54.982
| epoch  89 |   400/  559 batches | lr 0.0001 | ms/batch 42.30 | loss 38.12 | ppl 35762522408642844.00 | bpc   54.989
| epoch  89 |   450/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 38.12 | ppl 35969664722485868.00 | bpc   54.998
| epoch  89 |   500/  559 batches | lr 0.0001 | ms/batch 42.30 | loss 38.12 | ppl 35854726724696984.00 | bpc   54.993
| epoch  89 |   550/  559 batches | lr 0.0001 | ms/batch 42.24 | loss 38.11 | ppl 35675318056858220.00 | bpc   54.986
-----------------------------------------------------------------------------------------
| end of epoch  89 | time: 32.29s | valid loss  2.19 | valid ppl     8.90 | bpc    3.153
-----------------------------------------------------------------------------------------
EarlyStopping counter: 13 out of 25
| epoch  90 |    50/  559 batches | lr 0.0001 | ms/batch 42.56 | loss 38.96 | ppl 83318638406528560.00 | 

| epoch  94 |   450/  559 batches | lr 0.0001 | ms/batch 42.26 | loss 38.54 | ppl 54851792157088848.00 | bpc   55.606
| epoch  94 |   500/  559 batches | lr 0.0001 | ms/batch 42.25 | loss 38.54 | ppl 54719918614086480.00 | bpc   55.603
| epoch  94 |   550/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 38.54 | ppl 54402929420955344.00 | bpc   55.595
-----------------------------------------------------------------------------------------
| end of epoch  94 | time: 32.28s | valid loss  2.19 | valid ppl     8.93 | bpc    3.158
-----------------------------------------------------------------------------------------
EarlyStopping counter: 18 out of 25
| epoch  95 |    50/  559 batches | lr 0.0001 | ms/batch 43.02 | loss 39.39 | ppl 127810384348012608.00 | bpc   56.827
| epoch  95 |   100/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 38.62 | ppl 59219176994834912.00 | bpc   55.717
| epoch  95 |   150/  559 batches | lr 0.0001 | ms/batch 42.31 | loss 38.62 | ppl 59390659449865168.00 |

| epoch  99 |   550/  559 batches | lr 0.0001 | ms/batch 42.32 | loss 38.95 | ppl 82739656372233344.00 | bpc   56.199
-----------------------------------------------------------------------------------------
| end of epoch  99 | time: 32.27s | valid loss  2.19 | valid ppl     8.90 | bpc    3.153
-----------------------------------------------------------------------------------------
EarlyStopping counter: 23 out of 25
| epoch 100 |    50/  559 batches | lr 0.0001 | ms/batch 43.13 | loss 39.82 | ppl 195934148998829568.00 | bpc   57.443
| epoch 100 |   100/  559 batches | lr 0.0001 | ms/batch 42.27 | loss 39.04 | ppl 90258857367376096.00 | bpc   56.325
| epoch 100 |   150/  559 batches | lr 0.0001 | ms/batch 42.28 | loss 39.04 | ppl 90301561933522640.00 | bpc   56.326
| epoch 100 |   200/  559 batches | lr 0.0001 | ms/batch 42.29 | loss 39.04 | ppl 90448426783086448.00 | bpc   56.328
| epoch 100 |   250/  559 batches | lr 0.0001 | ms/batch 42.31 | loss 39.04 | ppl 90308107160022720.00 |

In [52]:
validation_losses = [i.item() for i in val_losses]
training_losses = [i.item() for i in train_losses]

In [56]:
plt.figure(figsize=(6,6))
plt.plot(range(100), training_losses, c='#00ff00')
plt.plot(range(100), validation_losses)
plt.xlim(0, 100)
plt.ylim(0, 3.0)
plt.xlabel('EPOCH')
plt.ylabel('Loss')
plt.legend(['train', 'val'])
plt.title('Loss')
plt.savefig('Char_Noise_Adaptive'+'.png')
plt.close()

**Word based**

In [57]:
data = 'data/penn'
batch_size = 64
emsize = 256
nlayers = 1
nhid = 1000
lr = 0.0001
dropout = 0.5
checkpoint = ''
clip = 1
bptt = 35
epochs = 10
save = 'output/model_test_word_none.pt'

torch.manual_seed(1111)

# Load data
corpus = Corpus(data)


In [58]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data

In [59]:
eval_batch_size = 64
train_data = batchify(corpus.train, batch_size) # size(total_len//bsz, bsz)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [60]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [61]:
train_data.to(device)

tensor([[   0,  988,   48,  ...,   32, 3490,  556],
        [   1,   40,   32,  ..., 6789,  119,   27],
        [   2, 2756,  189,  ..., 1168,  129, 1880],
        ...,
        [1825,   54,   32,  ...,  416,   26,   35],
        [  35, 3940, 2361,  ...,   27,  373,  198],
        [ 101, 1305, 4923,  ...,   24,   42,   42]], device='cuda:0')

In [62]:
val_data.to(device)
test_data.to(device)

tensor([[ 142,  712,  439,  ..., 1940,   64, 3981],
        [  78, 4480,   48,  ...,   64, 4500,  500],
        [  54,  556,   40,  ...,  872,  398,   32],
        ...,
        [ 555,   64, 2380,  ...,  801,   32,   26],
        [1319,   26,  301,  ..., 2030, 6851,   64],
        [ 410,  119,   32,  ...,  159,  548,  220]], device='cuda:0')

In [63]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(10000, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=10000, bias=True)
)


In [64]:
model.to(device)

RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(10000, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=10000, bias=True)
)

In [65]:
def repackage_hidden(h):
    # detach
    return tuple(v.clone().detach() for v in h)


In [66]:

def get_batch(source, i):
    # source: size(total_len//bsz, bsz)
    seq_len = min(bptt, len(source) - 1 - i)
    #data = torch.tensor(source[i:i+seq_len]) # size(bptt, bsz)
    data = source[i:i+seq_len].clone().detach()
    target = source[i+1:i+1+seq_len].clone().detach().view(-1)
    #target = torch.tensor(source[i+1:i+1+seq_len].view(-1)) # size(bptt * bsz)
    return data, target

In [67]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    with torch.no_grad():
        model.eval()
        total_loss = 0
        ntokens = len(corpus.dictionary)
        hidden = model.init_hidden(eval_batch_size) #hidden size(nlayers, bsz, hdsize)
        for i in range(0, data_source.size(0) - 1, bptt):# iterate over every timestep
            data, targets = get_batch(data_source, i)
            output, hidden = model(data.to(device), hidden)
            # model input and output
            # inputdata size(bptt, bsz), and size(bptt, bsz, embsize) after embedding
            # output size(bptt*bsz, ntoken)
            total_loss += len(data) * criterion(output.to(device), targets.to(device)).data
            hidden = repackage_hidden(hidden)
        return total_loss / len(data_source)


In [68]:
def train():
    # choose a optimizer

    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    # train_data size(batchcnt, bsz)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        # print(hidden.to(device))
        output, hidden = model(data.to(device), hidden)
        loss = criterion(output.to(device), targets.to(device))
        opt.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        opt.step()

        total_loss += loss.data

        if batch % interval == 0 and batch > 0:
            cur_loss = total_loss / interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / interval, cur_loss, math.exp(cur_loss), cur_loss / math.log(2)))
            total_loss = 0
            start_time = time.time()


In [69]:
print("Number of tokens:")
print("Train: ", len(corpus.train))
print("Valid: ", len(corpus.valid))
print("Test:  ", len(corpus.test))

Number of tokens:
Train:  929589
Valid:  73760
Test:   82430


In [70]:
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'

val_losses = []
train_losses = []
epochs = 100
early_stopping = EarlyStopping(patience=25, verbose=True)

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        train_loss = evaluate(train_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        val_losses.append(val_loss)
        train_losses.append(train_loss)
        print('-' * 89)
        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)

| epoch   1 |    50/  414 batches | lr 0.0001 | ms/batch 31.79 | loss  9.39 | ppl 12016.95 | bpc   13.553
| epoch   1 |   100/  414 batches | lr 0.0001 | ms/batch 30.27 | loss  9.21 | ppl  9955.06 | bpc   13.281
| epoch   1 |   150/  414 batches | lr 0.0001 | ms/batch 30.40 | loss  9.20 | ppl  9892.48 | bpc   13.272
| epoch   1 |   200/  414 batches | lr 0.0001 | ms/batch 30.41 | loss  9.19 | ppl  9820.05 | bpc   13.262
| epoch   1 |   250/  414 batches | lr 0.0001 | ms/batch 30.48 | loss  9.18 | ppl  9731.62 | bpc   13.248
| epoch   1 |   300/  414 batches | lr 0.0001 | ms/batch 30.47 | loss  9.17 | ppl  9641.81 | bpc   13.235
| epoch   1 |   350/  414 batches | lr 0.0001 | ms/batch 30.47 | loss  9.16 | ppl  9546.74 | bpc   13.221
| epoch   1 |   400/  414 batches | lr 0.0001 | ms/batch 30.50 | loss  9.15 | ppl  9450.07 | bpc   13.206
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 17.41s | valid loss  9.14 | valid p

| epoch   8 |    50/  414 batches | lr 0.0001 | ms/batch 32.90 | loss  7.13 | ppl  1247.38 | bpc   10.285
| epoch   8 |   100/  414 batches | lr 0.0001 | ms/batch 32.45 | loss  6.98 | ppl  1070.85 | bpc   10.065
| epoch   8 |   150/  414 batches | lr 0.0001 | ms/batch 34.57 | loss  6.93 | ppl  1024.39 | bpc   10.001
| epoch   8 |   200/  414 batches | lr 0.0001 | ms/batch 34.81 | loss  6.95 | ppl  1040.86 | bpc   10.024
| epoch   8 |   250/  414 batches | lr 0.0001 | ms/batch 34.78 | loss  6.96 | ppl  1053.25 | bpc   10.041
| epoch   8 |   300/  414 batches | lr 0.0001 | ms/batch 34.80 | loss  6.94 | ppl  1031.45 | bpc   10.010
| epoch   8 |   350/  414 batches | lr 0.0001 | ms/batch 34.82 | loss  6.92 | ppl  1017.29 | bpc    9.991
| epoch   8 |   400/  414 batches | lr 0.0001 | ms/batch 34.50 | loss  6.90 | ppl   994.64 | bpc    9.958
-----------------------------------------------------------------------------------------
| end of epoch   8 | time: 19.50s | valid loss  6.84 | valid p

| epoch  15 |    50/  414 batches | lr 0.0001 | ms/batch 34.52 | loss  6.86 | ppl   956.59 | bpc    9.902
| epoch  15 |   100/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.72 | ppl   831.83 | bpc    9.700
| epoch  15 |   150/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.69 | ppl   803.82 | bpc    9.651
| epoch  15 |   200/  414 batches | lr 0.0001 | ms/batch 34.82 | loss  6.71 | ppl   820.22 | bpc    9.680
| epoch  15 |   250/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.73 | ppl   837.07 | bpc    9.709
| epoch  15 |   300/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.72 | ppl   829.71 | bpc    9.696
| epoch  15 |   350/  414 batches | lr 0.0001 | ms/batch 34.81 | loss  6.71 | ppl   818.57 | bpc    9.677
| epoch  15 |   400/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.69 | ppl   805.70 | bpc    9.654
-----------------------------------------------------------------------------------------
| end of epoch  15 | time: 19.74s | valid loss  6.62 | valid p

| epoch  22 |    50/  414 batches | lr 0.0001 | ms/batch 33.13 | loss  6.80 | ppl   897.79 | bpc    9.810
| epoch  22 |   100/  414 batches | lr 0.0001 | ms/batch 32.79 | loss  6.66 | ppl   784.09 | bpc    9.615
| epoch  22 |   150/  414 batches | lr 0.0001 | ms/batch 33.76 | loss  6.63 | ppl   757.04 | bpc    9.564
| epoch  22 |   200/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.65 | ppl   769.99 | bpc    9.589
| epoch  22 |   250/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.67 | ppl   790.53 | bpc    9.627
| epoch  22 |   300/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.66 | ppl   783.73 | bpc    9.614
| epoch  22 |   350/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.65 | ppl   773.91 | bpc    9.596
| epoch  22 |   400/  414 batches | lr 0.0001 | ms/batch 34.82 | loss  6.64 | ppl   762.15 | bpc    9.574
-----------------------------------------------------------------------------------------
| end of epoch  22 | time: 19.51s | valid loss  6.57 | valid p

| epoch  29 |    50/  414 batches | lr 0.0001 | ms/batch 32.60 | loss  6.77 | ppl   873.44 | bpc    9.771
| epoch  29 |   100/  414 batches | lr 0.0001 | ms/batch 32.52 | loss  6.64 | ppl   761.37 | bpc    9.572
| epoch  29 |   150/  414 batches | lr 0.0001 | ms/batch 33.90 | loss  6.60 | ppl   734.05 | bpc    9.520
| epoch  29 |   200/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.62 | ppl   747.28 | bpc    9.546
| epoch  29 |   250/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.65 | ppl   769.07 | bpc    9.587
| epoch  29 |   300/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.64 | ppl   762.83 | bpc    9.575
| epoch  29 |   350/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.63 | ppl   754.21 | bpc    9.559
| epoch  29 |   400/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.61 | ppl   743.28 | bpc    9.538
-----------------------------------------------------------------------------------------
| end of epoch  29 | time: 19.48s | valid loss  6.55 | valid p

| epoch  36 |    50/  414 batches | lr 0.0001 | ms/batch 35.40 | loss  6.76 | ppl   860.13 | bpc    9.748
| epoch  36 |   100/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.62 | ppl   749.87 | bpc    9.551
| epoch  36 |   150/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.58 | ppl   722.90 | bpc    9.498
| epoch  36 |   200/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.60 | ppl   735.51 | bpc    9.523
| epoch  36 |   250/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.63 | ppl   756.22 | bpc    9.563
| epoch  36 |   300/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.62 | ppl   750.62 | bpc    9.552
| epoch  36 |   350/  414 batches | lr 0.0001 | ms/batch 34.80 | loss  6.61 | ppl   744.79 | bpc    9.541
| epoch  36 |   400/  414 batches | lr 0.0001 | ms/batch 34.81 | loss  6.60 | ppl   732.87 | bpc    9.517
-----------------------------------------------------------------------------------------
| end of epoch  36 | time: 19.78s | valid loss  6.54 | valid p

| epoch  43 |    50/  414 batches | lr 0.0001 | ms/batch 33.63 | loss  6.75 | ppl   850.63 | bpc    9.732
| epoch  43 |   100/  414 batches | lr 0.0001 | ms/batch 34.57 | loss  6.61 | ppl   742.75 | bpc    9.537
| epoch  43 |   150/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.57 | ppl   715.75 | bpc    9.483
| epoch  43 |   200/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.59 | ppl   728.11 | bpc    9.508
| epoch  43 |   250/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.62 | ppl   749.63 | bpc    9.550
| epoch  43 |   300/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.61 | ppl   743.33 | bpc    9.538
| epoch  43 |   350/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.60 | ppl   736.46 | bpc    9.524
| epoch  43 |   400/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.59 | ppl   725.67 | bpc    9.503
-----------------------------------------------------------------------------------------
| end of epoch  43 | time: 19.69s | valid loss  6.54 | valid p

| epoch  50 |    50/  414 batches | lr 0.0001 | ms/batch 32.38 | loss  6.74 | ppl   844.70 | bpc    9.722
| epoch  50 |   100/  414 batches | lr 0.0001 | ms/batch 33.14 | loss  6.60 | ppl   735.92 | bpc    9.523
| epoch  50 |   150/  414 batches | lr 0.0001 | ms/batch 34.72 | loss  6.57 | ppl   709.96 | bpc    9.472
| epoch  50 |   200/  414 batches | lr 0.0001 | ms/batch 34.88 | loss  6.58 | ppl   722.70 | bpc    9.497
| epoch  50 |   250/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.61 | ppl   742.70 | bpc    9.537
| epoch  50 |   300/  414 batches | lr 0.0001 | ms/batch 34.88 | loss  6.61 | ppl   739.54 | bpc    9.530
| epoch  50 |   350/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.59 | ppl   730.84 | bpc    9.513
| epoch  50 |   400/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.58 | ppl   720.04 | bpc    9.492
-----------------------------------------------------------------------------------------
| end of epoch  50 | time: 19.54s | valid loss  6.53 | valid p

| epoch  57 |    50/  414 batches | lr 0.0001 | ms/batch 32.64 | loss  6.73 | ppl   837.47 | bpc    9.710
| epoch  57 |   100/  414 batches | lr 0.0001 | ms/batch 33.33 | loss  6.60 | ppl   732.31 | bpc    9.516
| epoch  57 |   150/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.56 | ppl   705.40 | bpc    9.462
| epoch  57 |   200/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.58 | ppl   718.37 | bpc    9.489
| epoch  57 |   250/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.61 | ppl   739.73 | bpc    9.531
| epoch  57 |   300/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.60 | ppl   733.18 | bpc    9.518
| epoch  57 |   350/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.59 | ppl   727.04 | bpc    9.506
| epoch  57 |   400/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.57 | ppl   715.29 | bpc    9.482
-----------------------------------------------------------------------------------------
| end of epoch  57 | time: 19.57s | valid loss  6.53 | valid p

| epoch  64 |    50/  414 batches | lr 0.0001 | ms/batch 32.46 | loss  6.73 | ppl   835.56 | bpc    9.707
| epoch  64 |   100/  414 batches | lr 0.0001 | ms/batch 33.37 | loss  6.59 | ppl   728.95 | bpc    9.510
| epoch  64 |   150/  414 batches | lr 0.0001 | ms/batch 34.81 | loss  6.55 | ppl   702.33 | bpc    9.456
| epoch  64 |   200/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.57 | ppl   714.87 | bpc    9.482
| epoch  64 |   250/  414 batches | lr 0.0001 | ms/batch 34.89 | loss  6.60 | ppl   736.47 | bpc    9.524
| epoch  64 |   300/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.59 | ppl   730.00 | bpc    9.512
| epoch  64 |   350/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.59 | ppl   724.18 | bpc    9.500
| epoch  64 |   400/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.57 | ppl   712.28 | bpc    9.476
-----------------------------------------------------------------------------------------
| end of epoch  64 | time: 19.57s | valid loss  6.53 | valid p

| epoch  71 |    50/  414 batches | lr 0.0001 | ms/batch 33.43 | loss  6.72 | ppl   830.75 | bpc    9.698
| epoch  71 |   100/  414 batches | lr 0.0001 | ms/batch 34.18 | loss  6.59 | ppl   725.82 | bpc    9.503
| epoch  71 |   150/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.55 | ppl   698.51 | bpc    9.448
| epoch  71 |   200/  414 batches | lr 0.0001 | ms/batch 34.91 | loss  6.57 | ppl   712.74 | bpc    9.477
| epoch  71 |   250/  414 batches | lr 0.0001 | ms/batch 34.90 | loss  6.60 | ppl   732.41 | bpc    9.516
| epoch  71 |   300/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.59 | ppl   727.81 | bpc    9.507
| epoch  71 |   350/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.58 | ppl   721.14 | bpc    9.494
| epoch  71 |   400/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.57 | ppl   710.15 | bpc    9.472
-----------------------------------------------------------------------------------------
| end of epoch  71 | time: 19.67s | valid loss  6.53 | valid p

| epoch  78 |    50/  414 batches | lr 0.0001 | ms/batch 35.48 | loss  6.72 | ppl   828.58 | bpc    9.694
| epoch  78 |   100/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.59 | ppl   724.84 | bpc    9.502
| epoch  78 |   150/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.55 | ppl   698.12 | bpc    9.447
| epoch  78 |   200/  414 batches | lr 0.0001 | ms/batch 34.88 | loss  6.56 | ppl   709.02 | bpc    9.470
| epoch  78 |   250/  414 batches | lr 0.0001 | ms/batch 34.88 | loss  6.59 | ppl   730.65 | bpc    9.513
| epoch  78 |   300/  414 batches | lr 0.0001 | ms/batch 34.91 | loss  6.59 | ppl   726.18 | bpc    9.504
| epoch  78 |   350/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.58 | ppl   719.06 | bpc    9.490
| epoch  78 |   400/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.56 | ppl   707.57 | bpc    9.467
-----------------------------------------------------------------------------------------
| end of epoch  78 | time: 19.80s | valid loss  6.53 | valid p

| epoch  85 |    50/  414 batches | lr 0.0001 | ms/batch 35.52 | loss  6.72 | ppl   826.96 | bpc    9.692
| epoch  85 |   100/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.58 | ppl   722.63 | bpc    9.497
| epoch  85 |   150/  414 batches | lr 0.0001 | ms/batch 34.88 | loss  6.55 | ppl   696.10 | bpc    9.443
| epoch  85 |   200/  414 batches | lr 0.0001 | ms/batch 34.85 | loss  6.56 | ppl   707.18 | bpc    9.466
| epoch  85 |   250/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.59 | ppl   729.21 | bpc    9.510
| epoch  85 |   300/  414 batches | lr 0.0001 | ms/batch 34.88 | loss  6.58 | ppl   723.56 | bpc    9.499
| epoch  85 |   350/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.57 | ppl   716.80 | bpc    9.485
| epoch  85 |   400/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.56 | ppl   706.22 | bpc    9.464
-----------------------------------------------------------------------------------------
| end of epoch  85 | time: 19.80s | valid loss  6.53 | valid p

| epoch  92 |    50/  414 batches | lr 0.0001 | ms/batch 33.83 | loss  6.72 | ppl   825.17 | bpc    9.689
| epoch  92 |   100/  414 batches | lr 0.0001 | ms/batch 34.33 | loss  6.58 | ppl   721.34 | bpc    9.495
| epoch  92 |   150/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.54 | ppl   693.36 | bpc    9.437
| epoch  92 |   200/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.56 | ppl   705.89 | bpc    9.463
| epoch  92 |   250/  414 batches | lr 0.0001 | ms/batch 34.86 | loss  6.59 | ppl   725.98 | bpc    9.504
| epoch  92 |   300/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.58 | ppl   722.21 | bpc    9.496
| epoch  92 |   350/  414 batches | lr 0.0001 | ms/batch 34.88 | loss  6.57 | ppl   715.00 | bpc    9.482
| epoch  92 |   400/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.56 | ppl   704.43 | bpc    9.460
-----------------------------------------------------------------------------------------
| end of epoch  92 | time: 19.69s | valid loss  6.52 | valid p

| epoch  99 |    50/  414 batches | lr 0.0001 | ms/batch 32.56 | loss  6.71 | ppl   823.44 | bpc    9.686
| epoch  99 |   100/  414 batches | lr 0.0001 | ms/batch 32.85 | loss  6.58 | ppl   718.91 | bpc    9.490
| epoch  99 |   150/  414 batches | lr 0.0001 | ms/batch 32.85 | loss  6.54 | ppl   692.81 | bpc    9.436
| epoch  99 |   200/  414 batches | lr 0.0001 | ms/batch 34.74 | loss  6.56 | ppl   703.59 | bpc    9.459
| epoch  99 |   250/  414 batches | lr 0.0001 | ms/batch 34.84 | loss  6.59 | ppl   726.07 | bpc    9.504
| epoch  99 |   300/  414 batches | lr 0.0001 | ms/batch 34.83 | loss  6.58 | ppl   719.25 | bpc    9.490
| epoch  99 |   350/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.57 | ppl   713.36 | bpc    9.478
| epoch  99 |   400/  414 batches | lr 0.0001 | ms/batch 34.87 | loss  6.55 | ppl   702.67 | bpc    9.457
-----------------------------------------------------------------------------------------
| end of epoch  99 | time: 19.44s | valid loss  6.52 | valid p

In [71]:
validation_losses = [i.item() for i in val_losses]
training_losses = [i.item() for i in train_losses]

In [72]:
plt.figure(figsize=(6,6))
plt.plot(range(100), training_losses, c='#00ff00')
plt.plot(range(100), validation_losses)
plt.xlim(0, 100)
plt.ylim(0, 3.0)
plt.xlabel('EPOCH')
plt.ylabel('Loss')
plt.legend(['train', 'val'])
plt.title('Loss')
plt.savefig('Word_None'+'.png')
plt.close()

**Word Noise**

In [82]:
# Build the model
interval = 50 # interval to report
ntokens = len(corpus.dictionary) # 10000
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout)
save = '/content/output/model_test_word_noise.pt'
checkpoint = "/content/output/model_test_word_none.pt"

# Load checkpoint
if checkpoint != '':
    model = torch.load(checkpoint, map_location=lambda storage, loc: storage)

print(model)
criterion = torch.nn.CrossEntropyLoss()

  "num_layers={}".format(dropout, num_layers))


RNNModel(
  (drop): Dropout(p=0.5, inplace=False)
  (encoder): Embedding(10000, 256)
  (rnn): LSTM(256, 1000, dropout=0.5)
  (decoder): Linear(in_features=1000, out_features=10000, bias=True)
)


In [83]:
from torch.nn.utils import vector_to_parameters, parameters_to_vector
lr = lr
best_val_loss = None
opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.99)
opts = 'SGD'
epochs = 25
# if opt == 'Adam':
#     opt = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.99))
#     lr = 0.001
# if args.opt == 'Momentum':
#     opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
# if args.opt == 'RMSprop':
#     opt = torch.optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
#     lr = 0.001

try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        model.to(device)

        param_vector = parameters_to_vector(model.rnn.parameters())
        param_vector.to(device)
        n_params = len(param_vector)
        noise = torch.distributions.Normal(loc=torch.tensor(0.), scale=torch.tensor(0.075)).sample_n(n_params)
        param_vector.add_(noise.to(device))
        
        vector_to_parameters(param_vector, model.rnn.parameters())
        model.to(device)
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | bpc {:8.3f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            if opts == 'SGD' or opts == 'Momentum':
                lr /= 4.0
                for group in opt.param_groups:
                    group['lr'] = lr

except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | bpc {:8.3f}'.format(
    test_loss, math.exp(test_loss), test_loss / math.log(2)))
print('=' * 89)



| epoch   1 |    50/  414 batches | lr 0.0001 | ms/batch 136.33 | loss  7.64 | ppl  2083.54 | bpc   11.025
| epoch   1 |   100/  414 batches | lr 0.0001 | ms/batch 133.73 | loss  7.01 | ppl  1106.82 | bpc   10.112
| epoch   1 |   150/  414 batches | lr 0.0001 | ms/batch 133.12 | loss  6.90 | ppl   997.01 | bpc    9.961
| epoch   1 |   200/  414 batches | lr 0.0001 | ms/batch 133.57 | loss  6.89 | ppl   983.50 | bpc    9.942
| epoch   1 |   250/  414 batches | lr 0.0001 | ms/batch 133.09 | loss  6.90 | ppl   988.29 | bpc    9.949
| epoch   1 |   300/  414 batches | lr 0.0001 | ms/batch 132.93 | loss  6.87 | ppl   959.97 | bpc    9.907
| epoch   1 |   350/  414 batches | lr 0.0001 | ms/batch 133.00 | loss  6.85 | ppl   939.19 | bpc    9.875
| epoch   1 |   400/  414 batches | lr 0.0001 | ms/batch 132.95 | loss  6.82 | ppl   915.84 | bpc    9.839
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 57.63s | valid loss  6.74 |