In [2]:
import random
%reload_ext autoreload
%autoreload 2
# %load_ext jupyter_black

from tqdm import tqdm
import sys
from pathlib import Path

# Make the project importable from the notebook's working directory:
# "../src" first, then the repository root "..", in that order.
sys.path.extend(["../src", ".."])

from src.trainutil import *
from src.metrics import *

import yaml

# Directory holding the dev.src / dev.tgt files read by the evaluation cells.
data_root = "../data"

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
# NOTE(review): `torch` is not imported explicitly in this notebook; it is
# presumably re-exported by one of the star imports above -- confirm.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"



In [3]:
# Root of the experiment whose artefacts (config, tokenizers, checkpoint)
# are loaded by the following cells.
experiment_dir = Path("../checkpoints/paper")

# Parse the YAML configuration stored with the experiment into `cfg`.
config_path = experiment_dir / "history/config.yaml"
with config_path.open("r") as stream:
    cfg = yaml.safe_load(stream)

In [4]:
# Restore the source and target tokenizers saved next to the checkpoint.
# NOTE(review): torch.load unpickles the file contents, which can execute
# arbitrary code -- only load files from a trusted experiment directory.
src_tokenizer, tgt_tokenizer = (
    torch.load(experiment_dir / filename)
    for filename in ("src_tokenizer.pt", "tgt_tokenizer.pt")
)

# Vocabularies are used below for their sizes and special-token lookups.
src_vocab, tgt_vocab = src_tokenizer.vocab, tgt_tokenizer.vocab

In [5]:
# Arguments derived from the loaded vocabularies. No embedding vectors are
# passed in (both are None).
model_kwargs = {
    "src_vocab_size": len(src_vocab),
    "tgt_vocab_size": len(tgt_vocab),
    "src_embedding_vector": None,
    "tgt_embedding_vector": None,
    "tgt_pad_index": tgt_vocab["<PAD>"],
    "tgt_sos_index": tgt_vocab["<SOS>"],
    "tgt_eos_index": tgt_vocab["<EOS>"],
}

# Architecture hyperparameters are taken verbatim from the experiment config.
model_kwargs.update(
    {
        key: cfg[key]
        for key in (
            "hidden_size",
            "bidirectional",
            "num_layers",
            "src_embedding_size",
            "tgt_embedding_size",
            "dropout",
        )
    }
)

model = Seq2Seq(**model_kwargs)

# Last expression of the cell: moves the model to `device` and displays it.
model.to(device)

Seq2Seq(
  (src_embedding): Embedding(45000, 300)
  (tgt_embedding): Embedding(28000, 300)
  (encoder): Encoder(
    (embedding): Embedding(45000, 300)
    (layers): Sequential(
      (0): Embedding(45000, 300)
      (1): LSTM(300, 600, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(28000, 300)
    (lstm): LSTM(300, 600, num_layers=2, batch_first=True, dropout=0.3)
    (attention): Attention(
      (projection_layer): Linear(in_features=1200, out_features=600, bias=True)
    )
    (decoder_linear): Sequential(
      (0): Linear(in_features=1800, out_features=600, bias=True)
      (1): Tanh()
      (2): Linear(in_features=600, out_features=28000, bias=True)
    )
  )
)

In [6]:
# Restore the model from the experiment's "best" checkpoint. load_checkpoint
# returns four values; only the model and the epoch are kept here.
checkpoint_path = experiment_dir / "model_best.pt"
restored = load_checkpoint(model, checkpoint_path)
model, _, _, epoch = restored

2024-05-06 19:22:54,663 🎉 Loaded existing model. Epoch: 13


In [7]:
%%time
with open(f"{data_root}/dev.src") as srcfile:
    sources = srcfile.readlines()

with open(f"{data_root}/dev.tgt") as tgtfile:
    references = tgtfile.readlines()


hypotheses = []
for source in tqdm(sources):
    hyp, _ = generate(model, source, src_tokenizer, tgt_tokenizer, cfg, method="greedy")
    hypotheses.append(hyp[0])

metrics = compute_metrics(hypotheses, references)
metrics = {k: v * 100 for k, v in metrics.items()}
metrics

  1%|          | 61/10570 [00:10<29:03,  6.03it/s]


KeyboardInterrupt: 

In [8]:
%%time
with open(f"{data_root}/dev.src") as srcfile:
    sources = srcfile.readlines()

with open(f"{data_root}/dev.tgt") as tgtfile:
    references = tgtfile.readlines()

batch_size = 16

hypotheses = []

i = 0
while i < len(sources):
    hyp, _ = generate(model, sources[i: i+batch_size], src_tokenizer, tgt_tokenizer, cfg, method="nucleus", p=0.45)
    hypotheses.extend(hyp)
    i += batch_size
    print(f"{i} / {len(sources)}", end="\r")

metrics = compute_metrics(hypotheses, references)
metrics = {k: v * 100 for k, v in metrics.items()}
metrics

240 / 10570

KeyboardInterrupt: 

In [9]:
# Read the dev sources and references as lists of lines (trailing newlines
# kept) so the qualitative cell below can index into them.
with open(f"{data_root}/dev.src") as srcfile:
    sources = list(srcfile)

with open(f"{data_root}/dev.tgt") as tgtfile:
    references = list(tgtfile)

In [20]:
# Qualitative comparison of nucleus, greedy and beam decoding on one randomly
# chosen dev example.
#
# `idx` is defined here so the cell runs on a fresh kernel; it was previously
# commented out and only worked on leftover kernel state. The upper bound
# follows the real dataset size and leaves room for the two-line slice used by
# the nucleus call. To inspect a specific example, replace this with a fixed
# integer.
idx = random.randrange(max(len(sources) - 1, 1))

# nucleus
# NOTE(review): the nucleus call receives a two-line batch but only its first
# hypothesis is shown -- presumably intentional; confirm.
hyp_nucleus, _ = generate(model, sources[idx: idx+2], src_tokenizer, tgt_tokenizer, cfg, method="nucleus", p=0.7)

hyp_greedy, _ = generate(model, sources[idx], src_tokenizer, tgt_tokenizer, cfg, method="greedy")

hyp_beam, _ = generate(model, sources[idx], src_tokenizer, tgt_tokenizer, cfg, method="beam")


print(sources[idx], references[idx], sep='\n\n')
# Each result is a list of hypotheses; print the first one as plain text for
# all three methods (beam and greedy were previously printed as raw lists).
print(f"nucleus: {hyp_nucleus[0]}\n\nbeam: {hyp_beam[0]}\n\ngreedy: {hyp_greedy[0]}")

surveys of plague pit remains in france and england indicate the first variant entered europe through the port of marseille around november 1347 and spread through france over the next two years , eventually reaching england in the spring of 1349 , where it spread through the country in three epidemics . 


how and when did the first variant of y. pestis enter europe ?

nucleus: when did the first peasants begin to expand europe ?

beam: ['when did england begin the first ?']

greedy: ['what did england use in france to the port of <UNK>']
