In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext

from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
#!pip install spacy --upgrade

Since in this assignment, we are only working on English, spacy for english only is needed.

In [3]:
%%bash
python -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [4]:
spacy_en = spacy.load('en_core_web_sm')

In [5]:
def tokenize_n(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [6]:
QUE = Field(tokenize = tokenize_n,
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

ANS = Field(tokenize = tokenize_n, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [7]:
#STR1 = Field(sequential = True, tokenize = tokenize_n, init_token='<sos>',eos_token='<eos>', lower=True)
#STR2 = Field(sequential = True, tokenize = tokenize_n, init_token='<sos>',eos_token='<eos>', lower=True)

In [8]:
fields = [('que', QUE),('ans',ANS)]

In [9]:
import pandas as pd
from google.colab import files

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Overview of the Question Answer Dataset

The file "question_answer_pairs.txt" contains the questions and answers. The first line of the file contains 
column names for the tab-separated data fields in the file. This first line follows:

ArticleTitle    Question        Answer  DifficultyFromQuestioner        DifficultyFromAnswerer  ArticleFile

Field 1 is the name of the Wikipedia article from which questions and answers initially came.
Field 2 is the question.
Field 3 is the answer.
Field 4 is the prescribed difficulty rating for the question as given to the question-writer. 
Field 5 is a difficulty rating assigned by the individual who evaluated and answered the question, 
which may differ from the difficulty in field 4.

Since we are only focussing on the text part of the dataset, which is question and answer, I am only picking up these two columns.

In [11]:
df=pd.read_csv('/content/drive/MyDrive/dataset/cmu/S08/question_answer_pairs.txt',sep='\t',
               header=0,lineterminator='\n',dtype=pd.StringDtype(),encoding = "ISO-8859-1",
               names= ['ArticleTitle','Question','Answer','DifficultyFromQuestioner','DifficultyFromAnswerer','ArticleFile'],usecols = ['Question','Answer'])

In [12]:
for col in df.columns:
    print(f'\t column {col} | {type(df[col][0])}')

	 column Question | <class 'str'>
	 column Answer | <class 'str'>


In [13]:
df.shape

(1714, 2)

Some Examples from the dataset

In [14]:
df.head()

Unnamed: 0,Question,Answer
0,Was Abraham Lincoln the sixteenth President of...,yes
1,Was Abraham Lincoln the sixteenth President of...,Yes.
2,Did Lincoln sign the National Banking Act of 1...,yes
3,Did Lincoln sign the National Banking Act of 1...,Yes.
4,Did his mother die of pneumonia?,no


In [15]:
df1=pd.read_csv('/content/drive/MyDrive/dataset/cmu/S09/question_answer_pairs.txt',sep='\t',
               header=0,lineterminator='\n',dtype=pd.StringDtype(),encoding = "ISO-8859-1",
               names= ['ArticleTitle','Question','Answer','DifficultyFromQuestioner','DifficultyFromAnswerer','ArticleFile'],usecols = ['Question','Answer'])

In [16]:
df=df.append(df1)

In [17]:
df2=pd.read_csv('/content/drive/MyDrive/dataset/cmu/S10/question_answer_pairs.txt',sep='\t',
               header=0,lineterminator='\n',dtype=pd.StringDtype(),encoding = "ISO-8859-1",
               names= ['ArticleTitle','Question','Answer','DifficultyFromQuestioner','DifficultyFromAnswerer','ArticleFile'],usecols = ['Question','Answer'])

In [18]:
df=df.append(df2)

In [19]:
df.shape

(3997, 2)

In [20]:
df=df.reset_index(drop=True)

Lets first create the DataFrame

In [21]:
example = [torchtext.legacy.data.Example.fromlist([str(df.Question[i]),str(df.Answer[i])], fields) for i in range(df.shape[0])]

# New Section

Now create the dataset

In [22]:
from torchtext.legacy import data

In [23]:
# Creating dataset
QADataset = data.Dataset(example, fields)

Finally, we can split into training  and test sets by using the split() method:

In [24]:
(train_set, test_set) = QADataset.split(split_ratio=[0.70, 0.30], random_state=random.seed(SEED))

In [25]:
print(f"Number of training examples: {len(train_set.examples)}")
print(f"Number of testing examples: {len(test_set.examples)}")

Number of training examples: 2798
Number of testing examples: 1199


In [74]:
print(vars(train_set.examples[0]))

{'que': ['in', 'central', 'and', 'south', 'america', ',', 'what', 'instrument', 'was', 'the', 'xylophone', 'changed', 'into', '?'], 'ans': ['<', 'na', '>']}


In [27]:
QUE.build_vocab(train_set, min_freq = 2)
ANS.build_vocab(train_set, min_freq = 2)

In [28]:
print(f"Unique tokens in source  vocabulary: {len(QUE.vocab)}")
print(f"Unique tokens in target vocabulary: {len(ANS.vocab)}")

Unique tokens in source  vocabulary: 2215
Unique tokens in target vocabulary: 1293


In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [30]:
BATCH_SIZE = 32

train_iterator,  test_iterator = BucketIterator.splits(
    (train_set, test_set), sort_key = lambda x: len(x.que),
    sort_within_batch=True,
    batch_size = BATCH_SIZE, 
    device = device)

#Building Transformer from Scratch

In [31]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        # Get number of training examples
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = values.reshape(N,value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        query = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)  # (N, value_len, heads, head_dim)
        keys = self.keys(keys)  # (N, key_len, heads, head_dim)
        queries = self.queries(query)  # (N, query_len, heads, heads_dim)

        # Einsum does matrix mult. for query*keys for each training example
        # with every other training example, don't be confused by einsum
        # it's just how I like doing matrix multiplication & bmm

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # queries shape: (N, query_len, heads, heads_dim),
        # keys shape: (N, key_len, heads, heads_dim)
        # energy: (N, heads, query_len, key_len)

        # Mask padded indices so their weights become 0
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        # Normalize energy values similarly to seq2seq + attention
        # so that they sum to 1. Also divide by scaling factor for
        # better stability
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
        # attention shape: (N, heads, query_len, key_len)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N,query_len, self.heads * self.head_dim
        )
        # attention shape: (N, heads, query_len, key_len)
        # values shape: (N, value_len, heads, heads_dim)
        # out after matrix multiply: (N, query_len, heads, head_dim), then
        # we reshape and flatten the last two dimensions.

        out = self.fc_out(out)
        # Linear layer doesn't modify the shape, final shape will be
        # (N, query_len, embed_size)

        return out

Transformer Block

In [32]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add skip connection, run through normalization and finally dropout
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

Ecoder

In [33]:
class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        embed_size,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )

        # In the Encoder the query, key, value are all the same, it's in the
        # decoder this will change. This might look a bit odd in this case.
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out

Decoder Block


In [34]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out

And then Decoder

In [35]:
class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        embed_size,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_size, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length= x.shape
        positions =  torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out

Then Finally transformer

In [36]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        embed_size=512,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
        device="cpu",
        max_length=100,
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            embed_size,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            embed_size,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        
        #src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len= trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )

        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

In [37]:
'''
INPUT_DIM = len(QUE.vocab)
OUTPUT_DIM = len(ANS.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)
'''

'\nINPUT_DIM = len(QUE.vocab)\nOUTPUT_DIM = len(ANS.vocab)\nENC_EMB_DIM = 256\nDEC_EMB_DIM = 256\nHID_DIM = 512\nN_LAYERS = 2\nENC_DROPOUT = 0.5\nDEC_DROPOUT = 0.5\n\nenc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)\ndec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)\n\nmodel = Seq2Seq(enc, dec, device).to(device)\n'

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = True
save_model = True

# Training hyperparameters
num_epochs = 10
learning_rate = 3e-4
BATCH_SIZE = 32

# Model hyperparameters
src_vocab_size = len(QUE.vocab) #len(vocab_transform[SRC_LANGUAGE]) #len(german.vocab)
trg_vocab_size = len(ANS.vocab) #len(vocab_transform[TGT_LANGUAGE]) #len(english.vocab)

src_pad_idx = QUE.vocab.stoi["<pad>"]
trg_pad_idx = ANS.vocab.stoi["<pad>"]
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device).to(
        device)

#optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [39]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Transformer(
  (encoder): Encoder(
    (word_embedding): Embedding(2215, 512)
    (position_embedding): Embedding(100, 512)
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=64, out_features=64, bias=False)
          (keys): Linear(in_features=64, out_features=64, bias=False)
          (queries): Linear(in_features=64, out_features=64, bias=False)
          (fc_out): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0, inplace=False)
      )
      (1): TransformerBlock(
        (attention): SelfAttention(
          (values):

In [40]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 32,738,061 trainable parameters


In [41]:
optimizer = optim.Adam(model.parameters())

In [42]:
TRG_PAD_IDX = ANS.vocab.stoi[ANS.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [43]:
b=next(iter(train_iterator))

In [44]:
src,trg=b.que,b.ans

In [45]:
src.shape

torch.Size([11, 32])

In [46]:
src=torch.transpose(src,0,1)

In [47]:
trg.shape

torch.Size([83, 32])

In [48]:
trg=torch.transpose(trg,0,1)

In [49]:
trg[:,:-1].shape

torch.Size([32, 82])

In [50]:
len(src[0])

11

In [51]:
o=model(src,trg[:,:-1])

In [52]:
o.shape

torch.Size([32, 82, 1293])

In [53]:
top1 = o.argmax(1)

In [54]:
top1

tensor([[60,  0, 38,  ..., 39, 40, 68],
        [ 6,  0, 38,  ..., 39, 40, 38],
        [ 5,  0, 38,  ..., 39, 40, 68],
        ...,
        [60, 10, 38,  ..., 39, 40, 38],
        [60,  0, 38,  ..., 39, 40, 68],
        [ 6,  8, 38,  ..., 39, 40, 38]])

In [55]:
top1.shape

torch.Size([32, 1293])

In [56]:
 ou_dim = o.shape[-1]

In [57]:
output = o.reshape(-1, o.shape[2])
trg1 = trg[:,1:].reshape(-1)
        
              
        

In [58]:
loss = criterion(output, trg1)

In [59]:
loss.item()

7.174862384796143

In [60]:
     
loss.backward()


In [61]:
clip=1      
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
optimizer.step()

In [62]:
len(train_iterator)

88

In [69]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.que
        trg = batch.ans
        src=torch.transpose(src,0,1)
        trg=torch.transpose(trg,0,1)
        optimizer.zero_grad()
        
        output = model(src, trg[:,:-1])
        output = output.reshape(-1, output.shape[2])
        trg1 = trg[:,1:].reshape(-1)
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        #output_dim = output.shape[-1]
        
        #output = output[1:].view(-1, output_dim)
        #trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg1)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        if i==20:
          break
        exampls=i*BATCH_SIZE
    return epoch_loss / exampls

In [64]:
len(test_iterator)

38

In [65]:
len(df)

3997

In [70]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.que
            trg = batch.ans
            src=torch.transpose(src,0,1)
            trg=torch.transpose(trg,0,1)
            output = model(src, trg[:,:-1]) #turn off teacher forcing
            output = output.reshape(-1, output.shape[2])
            trg1 = trg[:,1:].reshape(-1)
            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            #output_dim = output.shape[-1]
            
            #output = output[1:].view(-1, output_dim)
            #trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg1)
            
            epoch_loss += loss.item()
            if i==20:
              break
            exampls=i*BATCH_SIZE
    return epoch_loss / exampls

In [67]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [71]:
#The name VALID_LOSS actually indicated TEST_LOSS. There was no validation data. The model was evaluated on Test Data.
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, test_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 23s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 0.135 |  Val. PPL:   1.145
Epoch: 02 | Time: 0m 23s
	Train Loss: 0.160 | Train PPL:   1.173
	 Val. Loss: 0.135 |  Val. PPL:   1.144
Epoch: 03 | Time: 0m 24s
	Train Loss: 0.157 | Train PPL:   1.170
	 Val. Loss: 0.134 |  Val. PPL:   1.143
Epoch: 04 | Time: 0m 25s
	Train Loss: 0.154 | Train PPL:   1.166
	 Val. Loss: 0.134 |  Val. PPL:   1.144
Epoch: 05 | Time: 0m 22s
	Train Loss: 0.158 | Train PPL:   1.171
	 Val. Loss: 0.135 |  Val. PPL:   1.144
Epoch: 06 | Time: 0m 24s
	Train Loss: 0.160 | Train PPL:   1.174
	 Val. Loss: 0.134 |  Val. PPL:   1.144
Epoch: 07 | Time: 0m 23s
	Train Loss: 0.160 | Train PPL:   1.174
	 Val. Loss: 0.134 |  Val. PPL:   1.144
Epoch: 08 | Time: 0m 22s
	Train Loss: 0.155 | Train PPL:   1.168
	 Val. Loss: 0.135 |  Val. PPL:   1.144
Epoch: 09 | Time: 0m 26s
	Train Loss: 0.161 | Train PPL:   1.174
	 Val. Loss: 0.134 |  Val. PPL:   1.144
Epoch: 10 | Time: 0m 25s
	Train Loss: 0.156 | Train PPL

In [72]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 0.133 | Test PPL:   1.143 |
