In [1]:
# Load the jupyter_black extension (presumably auto-formats code cells with Black).
%load_ext jupyter_black
# Load autoreload and set mode 2 so edits to local .py modules are picked up live.
%load_ext autoreload
%autoreload 2

Reference: PyTorch chatbot tutorial, "Define Evaluation" section — https://pytorch.org/tutorials/beginner/chatbot_tutorial.html?highlight=chatbot#define-evaluation

# Import

In [1]:
from pathlib import Path
import io

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torchtext
from torchtext.vocab import build_vocab_from_iterator, GloVe
from nltk.translate import bleu

import torch
from torch import nn

from torch.utils.data import Dataset, DataLoader
from typing import List

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

  from .autonotebook import tqdm as notebook_tqdm


# Look at Data
The data has already been preprocessed; it was collected from the original repo.

In [3]:
# Directory that holds the preprocessed src-*/tgt-* text files.
data_root = Path("data") / "processed"

In [4]:
def _read_lines(path):
    """Return every line of the text file at ``path`` with surrounding whitespace stripped."""
    # The corpus contains non-ASCII text (e.g. the IPA transcription "pʌb"), so decode
    # explicitly as UTF-8 instead of relying on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]


# src-* files hold the source sentences, tgt-* files the matching target lines.
src_train = _read_lines(data_root / "src-train.txt")
tgt_train = _read_lines(data_root / "tgt-train.txt")

src_dev = _read_lines(data_root / "src-dev.txt")
tgt_dev = _read_lines(data_root / "tgt-dev.txt")

src_test = _read_lines(data_root / "src-test.txt")
tgt_test = _read_lines(data_root / "tgt-test.txt")

# Number of examples in the train / dev / test splits.
print(len(src_train), len(src_dev), len(src_test))

70484 10570 11877


# Vocabulary

In [5]:
import datahandler as dh

In [6]:
# Build a single vocabulary object from the training source and target files.
vocab = dh.load_and_build_vocab(
    data_root.joinpath("src-train.txt"),
    data_root.joinpath("tgt-train.txt"),
)

In [7]:
# Vocabulary indices of the padding, start-of-sequence and end-of-sequence tokens.
pad_index, sos_index, eos_index = (
    vocab[token] for token in ("<PAD>", "<SOS>", "<EOS>")
)

In [8]:
# Pretrained GloVe embeddings for the vocabulary; "data/" is passed as the cache location.
# Expected to be a tensor of shape (vocab_size, embedding_dim) — TODO confirm against
# datahandler.load_pretrained_glove.
embedding_vector = dh.load_pretrained_glove(vocab, cache="data/")

# Batch Dataloader

In [9]:
# Settings read by the tokenization, data-loading and optimizer cells.
config = dict(
    src_max_seq=150,  # max_seq passed when encoding source sentences
    tgt_max_seq=50,  # max_seq passed when encoding target questions
    batch_size=64,  # batch size of the training DataLoader
    lr=1.0,  # learning rate handed to the optimizer
)

In [10]:
from tokenization import Tokenizer

In [11]:
tokenizer = Tokenizer(vocab, pad_index, sos_index, eos_index)

# Keyword arguments shared by every source / target encode call below.
_src_encode_kwargs = {"max_seq": config["src_max_seq"]}
_tgt_encode_kwargs = {
    "add_sos": True,
    "add_eos": True,
    "max_seq": config["tgt_max_seq"],
}

# Training split.
src_train_tensor, src_train_mask = tokenizer.encode(src_train, **_src_encode_kwargs)
tgt_train_tensor, tgt_train_mask = tokenizer.encode(tgt_train, **_tgt_encode_kwargs)

# Test split.
src_test_tensor, src_test_mask = tokenizer.encode(src_test, **_src_encode_kwargs)
tgt_test_tensor, tgt_test_mask = tokenizer.encode(tgt_test, **_tgt_encode_kwargs)

# Dev split.
src_dev_tensor, src_dev_mask = tokenizer.encode(src_dev, **_src_encode_kwargs)
tgt_dev_tensor, tgt_dev_mask = tokenizer.encode(tgt_dev, **_tgt_encode_kwargs)

In [12]:
# Round-trip check: decode the first encoded training sentence (as a batch of one)
# without special tokens.
tokenizer.decode(src_train_tensor[:1], keep_specials=False)

['a pub / <UNK> / , or public house is , despite its name , a private house , but is called a public house because it is licensed to sell alcohol to the general public .']

In [13]:
# Raw first training sentence, for comparison with the decoded output above: the token
# "pʌb" comes back as <UNK> after the encode/decode round trip.
src_train[0]

'a pub / pʌb / , or public house is , despite its name , a private house , but is called a public house because it is licensed to sell alcohol to the general public .'

In [14]:
# Sanity check of the source mask's shape (one row per training sentence is expected).
src_train_mask.shape

torch.Size([70484, 150])

In [15]:
class SentenceQuestionDataset(Dataset):
    """Dataset of already-encoded (sentence, question) pairs with optional masks."""

    def __init__(
        self,
        sentences: torch.Tensor,
        questions: torch.Tensor,
        sentences_mask=None,
        questions_mask=None,
    ):
        """
        Store pre-tokenized sentence/question tensors together with their masks.

        No tokenization, truncation or padding happens here; the inputs are kept
        exactly as given.

        Args:
            sentences (torch.Tensor): Encoded sentences. The first dimension
                indexes the examples and defines the dataset length.
            questions (torch.Tensor): Encoded questions, indexed in parallel
                with ``sentences``.
            sentences_mask (optional): Mask indexed in parallel with
                ``sentences``. Defaults to ``None`` (no mask).
            questions_mask (optional): Mask indexed in parallel with
                ``questions``. Defaults to ``None`` (no mask).
        """
        self.sentences = sentences
        self.questions = questions
        self.sentences_mask = sentences_mask
        self.questions_mask = questions_mask

    def __len__(self):
        """Return the number of examples (size of the first dimension of ``sentences``)."""
        return self.sentences.size(0)

    def __getitem__(self, index):
        """
        Return ``(sentence, question, sentence_mask, question_mask)`` for ``index``.

        A mask that was not supplied is returned as ``None`` instead of raising
        ``TypeError``. Note that ``DataLoader``'s default collate function cannot
        batch ``None`` entries, so pass both masks (or a custom ``collate_fn``)
        when batching this dataset.
        """
        # The masks default to None in __init__, so guard before indexing them.
        sentence_mask = (
            None if self.sentences_mask is None else self.sentences_mask[index]
        )
        question_mask = (
            None if self.questions_mask is None else self.questions_mask[index]
        )
        return (
            self.sentences[index],
            self.questions[index],
            sentence_mask,
            question_mask,
        )

In [16]:
# Wrap each split's encoded sentences/questions and their masks in a dataset.
train_ds = SentenceQuestionDataset(
    src_train_tensor, tgt_train_tensor, src_train_mask, tgt_train_mask
)
# The test split must use the *test* targets; pairing it with tgt_train_tensor would
# mismatch the test sentences and tgt_test_mask.
test_ds = SentenceQuestionDataset(
    src_test_tensor, tgt_test_tensor, src_test_mask, tgt_test_mask
)
dev_ds = SentenceQuestionDataset(
    src_dev_tensor, tgt_dev_tensor, src_dev_mask, tgt_dev_mask
)

In [17]:
# Training batches are shuffled; dev and test batches keep the dataset order.
train_dl = DataLoader(
    dataset=train_ds,
    batch_size=config["batch_size"],
    shuffle=True,
)
print(len(train_dl))  # number of training batches per epoch
dev_dl = DataLoader(dataset=dev_ds, batch_size=8, shuffle=False)
test_dl = DataLoader(dataset=test_ds, batch_size=32, shuffle=False)

1102


# Models

In [18]:
from models import Encoder, Decoder, Seq2SeqEncoderDecoder

# Trainer

In [23]:
from simple_trainer import fit

In [None]:
config["lr"] = 1.0


def _lr_factor(epoch):
    """Multiplicative LR factor: 1.0 while ``epoch`` <= 8, 0.5 afterwards."""
    if epoch > 8:
        return 0.5
    return 1.0


# Sequence-to-sequence model initialised with the pretrained embedding matrix.
net = Seq2SeqEncoderDecoder(
    vocab_size=len(vocab),
    embedding_vector=embedding_vector,
    embedding_dim=300,
    pad_index=pad_index,
    sos_index=sos_index,
    eos_index=eos_index,
    hidden_dim=8,
    bidirectional=True,
    num_layers=2,
)

# Plain SGD over all model parameters.
optim = torch.optim.SGD(net.parameters(), lr=config["lr"])

# MultiplicativeLR multiplies the current LR by the factor on each scheduler step, so the
# LR is halved on every step whose epoch index exceeds 8 (not just once), following the
# schedule of the original paper. How often `fit` steps the scheduler is not visible
# here — TODO confirm it is once per epoch.
lr_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optim, lr_lambda=_lr_factor)

fit(
    net,
    optim,
    train_dl,
    dev_dl,
    tokenizer,
    config,
    15,
    lr_scheduler,
    max_step=2,
    validation_data=tgt_dev,
)

  0%|                                                                                                                        | 1/1102 [00:04<1:18:48,  4.30s/batch, loss=10.8]

In [None]:
# NOTE(review): `trainer` is not defined in any cell visible in this notebook (only `fit`
# is imported from simple_trainer), so unless it is defined elsewhere this call raises
# NameError on a fresh kernel — confirm the intended evaluation entry point.
trainer.test(test_dl, tgt_test)