# Model Evaluation
## Load Model

In [1]:
import math
import torchtext
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
from torch import Tensor
import time
import io
from utils import *

%matplotlib inline

# Fix the RNG seed so any stochastic step below is repeatable.
torch.manual_seed(0)
model_pth = "./models/"
model_name = "transformer-5-20-3-ckpt-90"
# NOTE: torch.load unpickles arbitrary Python objects; only load checkpoints
# that come from a trusted source.
# map_location remaps the stored tensors onto whatever device is available,
# so a checkpoint saved on a GPU machine still loads on a CPU-only one.
model = torch.load(
    model_pth + model_name + ".pth.tar",
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)
# Switch to inference mode; as the last expression this also displays the module tree.
model.eval()

Seq2SeqTransformer(
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)


In [2]:
def count_parameters(model):
    """Return the total element count of all parameters that require gradients.

    Args:
        model: any object exposing ``parameters()`` whose items provide
            ``requires_grad`` and ``numel()``.

    Returns:
        int: sum of ``numel()`` over the parameters with ``requires_grad`` set.
    """
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad unset are excluded from the count.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 43,016,278 trainable parameters


## Build Vocabulary

In [3]:
# Directory holding the raw Multi30k task-1 archives.
pth_base = "./.data/multi30k/task1/raw/"

# Archive names are ordered (German, English).
train_pths = ('train.de.gz', 'train.en.gz')
test_pths = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

# Unpack every archive and keep the first path that extract_archive returns.
train_filepaths = list(map(lambda name: extract_archive(pth_base + name)[0], train_pths))
test_filepaths = list(map(lambda name: extract_archive(pth_base + name)[0], test_pths))

# spaCy-backed tokenizers, one per language.
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Vocabularies are built from the training files only (German first, then English).
de_vocab, en_vocab = (
    build_vocab(path, tokenizer, min_freq=1)
    for path, tokenizer in zip(train_filepaths, (de_tokenizer, en_tokenizer))
)

# Special-token indices are looked up in the German vocabulary.
BOS_IDX, EOS_IDX = de_vocab['<bos>'], de_vocab['<eos>']

# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Sanity check: run a single German sentence through the loaded model.
# translate is not defined in this notebook — presumably provided by the
# `from utils import *` star import; confirm its contract there.
translate(model, "Eine Gruppe von Menschen steht vor einem Iglu .", de_vocab, en_vocab, de_tokenizer, BOS_IDX, EOS_IDX, device)

' A group of people stand in front of an igloo . '

## Prepare Reference and Predictions

In [5]:
'''load reference'''
with open(test_filepaths[0], 'r', encoding='utf8') as f:
    test_data = f.readlines()
    
'''update reference.txt'''
with open(test_filepaths[1], 'r', encoding='utf8') as f:
    reference = f.readlines()

for i in range(len(reference)):
    reference[i] = " ".join(en_tokenizer(reference[i]))

with open("reference.txt",'w+') as f:
    f.writelines(reference)

In [6]:
'''make predictions'''
predictions = []
for data in test_data:
    temp_trans = translate(model, data, de_vocab, en_vocab, de_tokenizer, BOS_IDX, EOS_IDX, device)
    predictions.append(temp_trans[1:-3]+" . \n")

'''update predictions.txt'''
with open("predictions.txt",'w+') as f:
    f.writelines(predictions)

## Calculate BLEU

In [7]:
! F:/ProgramData/Strawberry/perl/bin/perl.exe ./multi-bleu.perl -lc reference.txt < predictions.txt

with open(model_pth + model_name + ".txt",'w+') as f:    
    f.writelines(predictions)

BLEU = 30.76, 60.6/37.1/24.3/16.4 (BP=1.000, ratio=1.142, hyp_len=14906, ref_len=13058)


In [8]:
from torchtext.data.metrics import bleu_score

# Build the two parallel corpora passed to bleu_score: one token list per
# hypothesis, and a one-element list of token lists per reference.
candidate_corpus, references_corpus = [], []
for hyp_line, ref_line in zip(predictions, reference):
    # Trailing spaces/newlines are stripped before splitting on single spaces.
    candidate_corpus.append(hyp_line.rstrip(" \n").split(" "))
    references_corpus.append([ref_line.rstrip(" \n").split(" ")])

bleu_torchtext = bleu_score(candidate_corpus, references_corpus)
print(f'BLEU score = {bleu_torchtext*100:.2f}')

BLEU score = 30.53


In [9]:
# Inspect the first reference: its tokens after stripping trailing spaces/newlines,
# followed by the stored string itself (printed as a one-element list).
print(en_tokenizer(reference[0].rstrip(" \n")))
print(reference[:1])

['A', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']
['A man in an orange hat starring at something . \n']


In [10]:
# Show the first hypothesis next to its reference entry as passed to bleu_score.
print(candidate_corpus[0])
print(references_corpus[0])

['A', 'man', 'in', 'an', 'orange', 'hat', 'struggles', 'to', 'a', 'as', 'he', 'explores', 'something', '.']
[['A', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']]


In [11]:
# Token count of the longest reference listed for the first test sentence
# (0 if that list is empty).
max_len = max((len(tokens) for tokens in references_corpus[0]), default=0)
print(max_len)

10


### Check BLEU from txt

In [12]:
# Optional, disabled: re-score a previously saved predictions file.
# Uncommenting reads the saved .txt for the named model, overwrites
# predictions.txt with it, and runs multi-bleu.perl again.
# with open(model_pth + "transformer-5-20-2-best" + ".txt",'r') as f:    
#     predictions = f.readlines()

# with open("predictions.txt",'w+') as f:
#     f.writelines(predictions)

# ! F:/ProgramData/Strawberry/perl/bin/perl.exe ./multi-bleu.perl -lc reference.txt < predictions.txt