# Model Evaluation
## Load Model

In [3]:
import torch
from torchtext.data.utils import get_tokenizer
from src.utils import *

%matplotlib inline

SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
model_pth = "./models/"
model_name = "transformer-6-5-1-best"
model = torch.load(model_pth + model_name + ".pth.tar")
model.eval()

MyTf(
  (transformer_encoder): MyTfEncoder(
    (layers): ModuleList(
      (0): MyTfEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
        (activation): ReLU()
      )
      (1): MyTfEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (

In [4]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,667,147 trainable parameters


## Build Vocabulary

In [5]:
pth_base = "./.data/multi30k/task1/raw/"
train_pths = ('train.de', 'train.en')
val_pths = ('val.de', 'val.en')
test_pths = ('test_2016_flickr.de', 'test_2016_flickr.en')
train_filepaths = [(pth_base + pth) for pth in train_pths]
test_filepaths = [(pth_base + pth) for pth in test_pths]

de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

de_vocab = build_vocab(train_filepaths[0], de_tokenizer, min_freq=3)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer, min_freq=3)

BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
print(len(de_vocab))
print(len(en_vocab))

5374
4555


In [7]:
translate(model, "eine gruppe von menschen steht vor einem iglu .", de_vocab, en_vocab, de_tokenizer, BOS_IDX, EOS_IDX, "greedy", device)

' a group of people standing in front of a <unk> sign . '

## Prepare Reference and Predictions

In [8]:
'''load reference'''
with open(test_filepaths[0], 'r', encoding='utf8') as f:
    test_data = f.readlines()
for i in range(len(test_data)):
    test_data[i] = test_data[i].rstrip("\n").lower()
    
'''update reference.txt'''
with open(test_filepaths[1], 'r', encoding='utf8') as f:
    reference = f.readlines()

for i in range(len(reference)):
    reference[i] = " ".join(en_tokenizer(reference[i])).lower()

with open("reference.txt",'w+') as f:
    f.writelines(reference)

In [10]:
'''make predictions'''
predictions = []
for data in test_data:
    temp_trans = translate(model, data.lower(), de_vocab, en_vocab, de_tokenizer, BOS_IDX, EOS_IDX, "greedy", device)
    predictions.append(temp_trans+"\n")

'''update predictions.txt'''
with open("predictions.txt",'w+') as f:
    f.writelines(predictions)

In [11]:
# for i,pre in enumerate(predictions):
#     predictions[i] = pre.replace(" <unk> "," ")
# '''update predictions.txt'''
# with open("predictions.txt",'w+') as f:
#     f.writelines(predictions)

## Calculate BLEU

In [12]:
! perl ./src/multi-bleu.perl -lc reference.txt < predictions.txt

with open(model_pth + model_name + ".txt",'w+') as f:    
    f.writelines(predictions)

BLEU = 37.39, 70.5/46.3/31.2/21.4 (BP=0.972, ratio=0.973, hyp_len=12701, ref_len=13058)


In [13]:
from torchtext.data.metrics import bleu_score
references_corpus  = []
candidate_corpus = []
for pred,ref in zip(predictions, reference):
    temp = pred.rstrip(" \n").split(" ")
    candidate_corpus.append(temp)
    temp = ref.rstrip(" \n").split(" ")
    references_corpus.append([temp])
bleu_torchtext = bleu_score(candidate_corpus, references_corpus)
print(f'BLEU score = {bleu_torchtext*100:.2f}')

BLEU score = 35.27


In [14]:
print(references_corpus[3])
print(candidate_corpus[3])

[['five', 'people', 'wearing', 'winter', 'jackets', 'and', 'helmets', 'stand', 'in', 'the', 'snow', ',', 'with', 'snowmobiles', 'in', 'the', 'background', '.']]
['', 'five', 'people', 'in', 'winter', 'jackets', 'and', 'helmets', 'are', 'standing', 'in', 'the', 'snow', 'with', '<unk>', 'in', 'the', 'background', '.']


### Check BLEU from txt

In [15]:
# with open(model_pth + "transformer-5-21-7-best" + ".txt",'r') as f:    
#     predictions = f.readlines()

# with open("predictions.txt",'w+') as f:
#     f.writelines(predictions)

!perl ./multi-bleu.perl -lc reference.txt < predictions.txt

Can't open perl script "./multi-bleu.perl": No such file or directory
