In [None]:
import sys
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as func

from typing import List, Tuple, Optional
from collections import namedtuple
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [None]:
class ModelEmbeddings(nn.Module):
    """
    Holds the two word-embedding tables (source and target language) used by the model.
    """

    def __init__(self, embed_size, vocab):
        """
        Build one embedding table per language.

        @param embed_size (int): Embedding size (dimensionality) shared by both tables
        @param vocab (Vocab): Vocabulary object containing src and tgt languages.
                              Only `len(vocab.src)`, `len(vocab.tgt)` and the `"<pad>"`
                              lookups on each side are used here.
        """
        super().__init__()
        self.embed_size = embed_size

        # The "<pad>" rows are registered as padding_idx on the embedding layers.
        pad_ids = (vocab.src["<pad>"], vocab.tgt["<pad>"])

        # Source is created before target so parameter initialisation order is stable.
        self.source = nn.Embedding(
            num_embeddings=len(vocab.src),
            embedding_dim=embed_size,
            padding_idx=pad_ids[0],
        )
        self.target = nn.Embedding(
            num_embeddings=len(vocab.tgt),
            embedding_dim=embed_size,
            padding_idx=pad_ids[1],
        )

In [None]:

Hypothesis = namedtuple("Hypothesis", ["value", "score"])


class NMT(nn.Module):
    """Simple Neural Machine Translation Model:
    - Bidirectional LSTM Encoder
    - Unidirectional LSTM Decoder
    - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, applied to the combined
                                     attention output in `step`
        """
        super().__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # For sanity check only, not relevant to implementation
        self.gen_sanity_check = False
        self.counter = 0

        # Bidirectional encoder: its outputs are 2 * hidden_size wide.
        self.encoder = torch.nn.LSTM(
            input_size=embed_size,
            hidden_size=self.hidden_size,
            bias=True,
            bidirectional=True,
        )
        # Decoder input is [target embedding ; previous combined output].
        self.decoder = torch.nn.LSTMCell(
            input_size=embed_size + hidden_size,
            hidden_size=self.hidden_size,
            bias=True,
        )
        # Project the concatenated final encoder states (2h) to decoder size (h).
        self.h_projection = torch.nn.Linear(
            in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        self.c_projection = torch.nn.Linear(
            in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        # Project encoder hidden states (2h) to h for the attention scores.
        self.att_projection = torch.nn.Linear(
            in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        # Combine [decoder hidden (h) ; attention context (2h)] into h.
        self.combined_output_projection = torch.nn.Linear(
            in_features=3 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        self.target_vocab_projection = torch.nn.Linear(
            in_features=self.hidden_size, out_features=len(self.vocab.tgt), bias=False
        )
        self.dropout = torch.nn.Dropout(p=self.dropout_rate)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source: list of source sentence tokens
        @param target: list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device
        )  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device
        )  # Tensor: (tgt_len, b)
        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(
            enc_hiddens, enc_masks, dec_init_state, target_padded
        )
        P = func.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt["<pad>"]).float()

        # Compute log probability of generating true target words.
        # Row t of P is scored against target row t + 1 (the next gold word).
        target_gold_words_log_prob = (
            torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1)
            * target_masks[1:]
        )
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded: Tensor of padded source sentences with shape (src_len, b), where
                              b = batch_size, src_len = maximum source sentence length. Note that
                              these have already been sorted in order of longest to shortest sentence.
        @param source_lengths: List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens: Tensor of hidden units with shape (b, src_len, h*2), where
                              b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                         hidden state and cell.
        """
        X = self.model_embeddings.source(source_padded)

        # pack_padded_sequence is called with its default arguments, which is why
        # the batch must already be ordered longest-first.
        X_packed = pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X_packed)
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens, batch_first=True)

        # Index 0 / 1 of the final states are the two directions of the encoder.
        concat_last_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1)
        init_decoder_hidden = self.h_projection(concat_last_hidden)

        concat_last_cell = torch.cat((last_cell[0], last_cell[1]), 1)
        init_decoder_cell = self.c_projection(concat_last_cell)

        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(
        self,
        enc_hiddens: torch.Tensor,
        enc_masks: torch.Tensor,
        dec_init_state: Tuple[torch.Tensor, torch.Tensor],
        target_padded: torch.Tensor,
    ) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens: Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks: Tensor of sentence masks (b, src_len), where
                          b = batch size, src_len = maximum source sentence length.
        @param dec_init_state: Initial state and cell for decoder
        @param target_padded: Gold-standard padded target sentences (tgt_len, b), where
                              tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs: combined output tensor (tgt_len - 1, b, h): one
                                   vector per decoder step, and the last row of
                                   `target_padded` is never fed to the decoder.
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings.target(target_padded)
        for Y_t in torch.split(Y, 1):
            # Drop only the leading time axis. A bare squeeze() would also remove
            # the batch axis when b == 1 (or the embedding axis when e == 1) and
            # make the concatenation below fail.
            Y_t_squeezed = Y_t.squeeze(dim=0)
            Y_bar_t = torch.cat([Y_t_squeezed, o_prev], 1)
            dec_state, o_t, _ = self.step(
                Ybar_t=Y_bar_t,
                dec_state=dec_state,
                enc_hiddens=enc_hiddens,
                enc_hiddens_proj=enc_hiddens_proj,
                enc_masks=enc_masks,
            )
            combined_outputs.append(o_t)
            o_prev = o_t

        combined_outputs = torch.stack(combined_outputs, dim=0)

        return combined_outputs

    def step(
        self,
        Ybar_t: torch.Tensor,
        dec_state: Tuple[torch.Tensor, torch.Tensor],
        enc_hiddens: torch.Tensor,
        enc_hiddens_proj: torch.Tensor,
        enc_masks: Optional[torch.Tensor],
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t: Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                       where b = batch size, e = embedding size, h = hidden size.
        @param dec_state: Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                          First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens: Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                            src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj: Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                 where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks: Tensor of sentence masks shape (b, src_len),
                          where b = batch size, src_len is maximum source length.
                          May be None, in which case no position is masked.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at time step t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). The attention scores
                                (after masking, before the softmax). Returned only so
                                that callers can sanity check the implementation.
        """

        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state

        # Attention scores: (b, src_len, h) x (b, h, 1) -> (b, src_len)
        e_t = torch.squeeze(
            torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, dim=2)), dim=2
        )

        # Set e_t to -inf where enc_masks has 1, so those positions get zero weight
        # after the softmax.
        if enc_masks is not None:
            e_t = e_t.masked_fill(enc_masks.bool(), -float("inf"))

        alpha_t = func.softmax(e_t, dim=-1)

        # Attention context: (b, 1, src_len) x (b, src_len, 2h) -> (b, 2h)
        a_t = torch.squeeze(
            torch.bmm(torch.unsqueeze(alpha_t, dim=1), enc_hiddens), dim=1
        )

        U_t = torch.cat([dec_hidden, a_t], dim=1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(
        self, enc_hiddens: torch.Tensor, source_lengths: List[int]
    ) -> torch.Tensor:
        """Generate sentence masks for encoder hidden states.

        @param enc_hiddens: encodings of shape (b, src_len, 2*h), where b = batch size,
                            src_len = max source length, h = hidden size.
        @param source_lengths: List of actual lengths for each of the sentences in the batch.

        @returns enc_masks: Float tensor of sentence masks of shape (b, src_len) on the
                            model's device; 1 marks positions at or beyond the sentence
                            length, 0 marks real tokens.
        """
        enc_masks = torch.zeros(
            enc_hiddens.size(0),
            enc_hiddens.size(1),
            dtype=torch.float,
            device=self.device,
        )
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks

    def beam_search(
        self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70
    ) -> List[Hypothesis]:
        """Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent: a single source sentence (words)
        @param beam_size: beam size
        @param max_decoding_time_step: maximum number of time steps to unroll the decoding RNN
        @returns hypotheses: a list of hypothesis, sorted by score (best first); each
                hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        # Every live hypothesis starts from the start-of-sentence token.
        hypotheses = [["<s>"]]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Broadcast the single source encoding across all live hypotheses.
            exp_src_encodings = src_encodings.expand(
                hyp_num, src_encodings.size(1), src_encodings.size(2)
            )

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num,
                src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2),
            )

            # Feed the last word of every live hypothesis.
            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device,
            )
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(
                x,
                h_tm1,
                exp_src_encodings,
                exp_src_encodings_att_linear,
                enc_masks=None,
            )

            # log probabilities over target words
            log_p_t = func.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            # Only as many candidates as there are free beam slots are kept.
            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t
            ).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num
            )

            # Positions index the flattened (hypothesis, word) grid.
            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores
            ):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == "</s>":
                    # Finished: strip the leading <s> and trailing </s>.
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)
                    )
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Carry forward only the decoder states of hypotheses that survived.
            live_hyp_ids = torch.tensor(
                live_hyp_ids, dtype=torch.long, device=self.device
            )
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(
                new_hyp_scores, dtype=torch.float, device=self.device
            )

        # Nothing reached </s> within the step budget: fall back to the first
        # live hypothesis (without its leading <s>).
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())
            )

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """Determine which device to place the Tensors upon, CPU or GPU.

        Reported as the device of the source embedding weight.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """Load the model from a file.
        @param model_path: path to model, in the layout written by `save`
        @returns model (NMT): model rebuilt from the stored args, vocab and state_dict
        """
        # NOTE(review): torch.load unpickles the file (the checkpoint stores the
        # vocab object alongside the weights), so only load trusted checkpoints.
        # The map_location callback keeps every storage where it was deserialized.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params["args"]
        model = NMT(vocab=params["vocab"], **args)
        model.load_state_dict(params["state_dict"])

        return model

    def save(self, path: str):
        """Save the model to a file.

        The checkpoint holds the constructor arguments, the vocab object and the
        state_dict, which is what `load` needs to rebuild the model.
        @param path: path to the model
        """
        print("save model parameters to [%s]" % path, file=sys.stderr)

        params = {
            "args": dict(
                embed_size=self.model_embeddings.embed_size,
                hidden_size=self.hidden_size,
                dropout_rate=self.dropout_rate,
            ),
            "vocab": self.vocab,
            "state_dict": self.state_dict(),
        }

        torch.save(params, path)

In [None]:
!pip install sentencepiece


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 15.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
!pip install docopt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... done
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13721 sha256=112aa97ea290ccee3bfb2c41b67f6ee82ca004cee0d75d7f52b1ae9c0010c749
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt
Successfully installed docopt-0.6.2


In [None]:
import math
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk
import sentencepiece as spm
from collections import Counter
from docopt import docopt
from itertools import chain
import json
#from utils import read_corpus, pad_sents


In [None]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def pad_sents(sents, pad_token):
    """Pad list of sentences according to the longest sentence in the batch.
        The paddings are appended at the end of each sentence.

    The input sentences are left untouched: every returned sentence is a new list.

    @param sents (list[list[str]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (str): padding token
    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentence in the batch now has equal length. An empty batch yields [].
    """
    # max() has nothing to work with on an empty batch.
    if not sents:
        return []

    max_length = max(len(s) for s in sents)
    # list(s) copies each sentence so the caller's data is never extended in place.
    return [list(s) + [pad_token] * (max_length - len(s)) for s in sents]

In [None]:
def read_corpus(file_path, source, vocab_size=2500):
    """Read a corpus file, one sentence per line, and split every line into subwords.

    The SentencePiece model `<source>.model` is loaded from the working directory
    and used to tokenize each line.

    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @param vocab_size (int): number of unique subwords in vocabulary; accepted
        but not read by this function
    @returns data (list[list[str]]): one list of subword pieces per line
    """
    processor = spm.SentencePieceProcessor()
    processor.load("{}.model".format(source))

    # only target sentences get wrapped in <s> ... </s>
    is_target = source == "tgt"

    sentences = []
    with open(file_path, "r", encoding="utf8") as corpus_file:
        for raw_line in corpus_file:
            pieces = processor.encode_as_pieces(raw_line)
            sentences.append(["<s>"] + pieces + ["</s>"] if is_target else pieces)

    return sentences

In [None]:
def autograder_read_corpus(file_path, source):
    """Read a corpus file, one sentence per line, tokenizing with nltk.

    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one list of word tokens per line
    """
    data = []
    # The context manager closes the file once reading is done; the explicit
    # encoding matches what read_corpus uses for the same kind of file.
    with open(file_path, "r", encoding="utf8") as f:
        for line in f:
            sent = nltk.word_tokenize(line)
            # only append <s> and </s> to the target sentence
            if source == "tgt":
                sent = ["<s>"] + sent + ["</s>"]
            data.append(sent)

    return data

In [None]:
def batch_iter(data, batch_size, shuffle=False):
    """Generate (src_sents, tgt_sents) batches, each ordered longest source sentence first.

    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    """
    n_batches = math.ceil(len(data) / batch_size)

    order = list(range(len(data)))
    if shuffle:
        np.random.shuffle(order)

    for batch_no in range(n_batches):
        start = batch_no * batch_size
        chunk = [data[k] for k in order[start : start + batch_size]]

        # Sort within the batch only, by source length, longest first.
        chunk.sort(key=lambda pair: len(pair[0]), reverse=True)

        yield [pair[0] for pair in chunk], [pair[1] for pair in chunk]

In [None]:
class VocabEntry(object):
    """Vocabulary Entry, i.e. structure containing either
    src or tgt language terms.
    """

    def __init__(self, word2id=None):
        """Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices. When omitted
            (or empty) the entry starts with only the four special tokens.
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id["<pad>"] = 0  # Pad Token
            self.word2id["<s>"] = 1  # Start Token
            self.word2id["</s>"] = 2  # End Token
            self.word2id["<unk>"] = 3  # Unknown Token
        self.unk_id = self.word2id["<unk>"]
        # Reverse mapping (index -> word). This instance attribute takes
        # precedence over the `id2word` method defined further down.
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """Raise error, if one tries to edit the VocabEntry."""
        raise ValueError("vocabulary is readonly")

    def __len__(self):
        """Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """Representation of VocabEntry to be used
        when printing the object.
        """
        return "Vocabulary[size=%d]" % len(self)

    def id2word(self, wid):
        """Return mapping of index to word.

        NOTE(review): `__init__` stores the index -> word dict under this same
        name, so on an instance `entry.id2word` resolves to that dict (use
        `entry.id2word[wid]`) and this method is only reachable through the class.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            # New words get the next free index, i.e. the current vocabulary size.
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """Convert list of words or list of sentences of words
        into list or list of list of indices.
        @param sents (list[str] or list[list[str]]): sentence(s) in words
        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices;
            an empty input gives an empty list
        """
        # Nothing to inspect (and nothing to convert) for an empty input.
        if not sents:
            return []
        # The first element decides whether this is a batch or a single sentence.
        if isinstance(sents[0], list):
            return [[self[w] for w in s] for s in sents]
        return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(
        self, sents: List[List[str]], device: torch.device
    ) -> torch.Tensor:
        """Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self["<pad>"])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        # Transpose from (batch, length) to (length, batch).
        return torch.t(sents_var)

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """Given a corpus construct a Vocab Entry.
        @param corpus (list[str]): corpus of text produced by read_corpus function
        @param size (int): # of words in vocabulary
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print(
            "number of word types: {}, number of word types w/ frequency >= {}: {}".format(
                len(word_freq), freq_cutoff, len(valid_words)
            )
        )
        # Keep the `size` most frequent words among those passing the cutoff.
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[
            :size
        ]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry

    @staticmethod
    def from_subword_list(subword_list):
        """Build a VocabEntry from an iterable of subwords, added in order.
        @param subword_list (list[str]): subwords to add after the special tokens
        @returns vocab_entry (VocabEntry): VocabEntry containing the given subwords
        """
        vocab_entry = VocabEntry()
        for subword in subword_list:
            vocab_entry.add(subword)
        return vocab_entry


class Vocab(object):
    """Vocab encapsulating src and target languages."""

    def __init__(self, src_vocab: "VocabEntry", tgt_vocab: "VocabEntry"):
        """Init Vocab.
        @param src_vocab (VocabEntry): VocabEntry for source language
        @param tgt_vocab (VocabEntry): VocabEntry for target language
        """
        self.src = src_vocab
        self.tgt = tgt_vocab

    @staticmethod
    def build(src_sents, tgt_sents) -> "Vocab":
        """Build Vocabulary.
        @param src_sents (list[str]): Source subwords provided by SentencePiece
        @param tgt_sents (list[str]): Target subwords provided by SentencePiece
        @returns Vocab object holding one VocabEntry per language
        """

        print("initialize source vocabulary ..")
        src = VocabEntry.from_subword_list(src_sents)

        print("initialize target vocabulary ..")
        tgt = VocabEntry.from_subword_list(tgt_sents)

        return Vocab(src, tgt)

    def save(self, file_path):
        """Save Vocab to file as JSON dump.

        The file holds two word -> index mappings under the keys
        `src_word2id` and `tgt_word2id`.
        @param file_path (str): file path to vocab file
        """
        with open(file_path, "w") as f:
            json.dump(
                dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id),
                f,
                indent=2,
            )

    @staticmethod
    def load(file_path):
        """Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file, in the layout written by `save`
        @returns Vocab object loaded from JSON dump
        """
        # The context manager closes the file once it has been parsed.
        with open(file_path, "r") as f:
            entry = json.load(f)
        src_word2id = entry["src_word2id"]
        tgt_word2id = entry["tgt_word2id"]

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """Representation of Vocab to be used
        when printing the object.
        """
        return "Vocab(source %d words, target %d words)" % (
            len(self.src),
            len(self.tgt),
        )



In [None]:
def get_vocab_list(file_path, source, vocab_size):
    """Train a SentencePiece model on a corpus and return its subword pieces.

    @param file_path (str): file path to corpus
    @param source (str): tgt or src; used as the model file prefix
    @param vocab_size: desired vocabulary size
    @returns pieces (list[str]): every piece known to the trained model, in id order
    """
    # Training writes the model under the given prefix; `<source>.model` is
    # loaded right below (a `.vocab` file is presumably written next to it).
    spm.SentencePieceTrainer.train(
        input=file_path, model_prefix=source, vocab_size=vocab_size
    )

    processor = spm.SentencePieceProcessor()
    processor.load("{}.model".format(source))  # loads tgt.model or src.model

    pieces = []
    for piece_id in range(processor.get_piece_size()):
        pieces.append(processor.id_to_piece(piece_id))
    return pieces

In [None]:
if __name__ == "__main__":
    # Build the source/target vocabularies and save them as one JSON file.
    # The argument values are hard-coded here; the keys mirror command-line
    # style option names. NOTE(review): the paths point into a mounted
    # /content/drive folder, so they only resolve in that environment.
    args = {
        "--train-src": "/content/drive/MyDrive/ISR Spring'23/src.vocab",
        "--train-tgt": "/content/drive/MyDrive/ISR Spring'23/tgt.vocab",
        "VOCAB_FILE": "/content/drive/MyDrive/ISR Spring'23/vocab_file.json"
    }

    print("read in source sentences: %s" % args["--train-src"])
    print("read in target sentences: %s" % args["--train-tgt"])

    # Each call trains a SentencePiece model on the given file and returns the
    # list of subword pieces; the vocabulary sizes are fixed per language.
    src_sents = get_vocab_list(args["--train-src"], source="src", vocab_size=14128)
    tgt_sents = get_vocab_list(args["--train-tgt"], source="tgt", vocab_size=6748)
    vocab = Vocab.build(src_sents, tgt_sents)
    # The counts printed here are the lengths of the subword lists, not of the
    # built vocab entries.
    print(
        "generated vocabulary, source %d words, target %d words"
        % (len(src_sents), len(tgt_sents))
    )

    vocab.save(args["VOCAB_FILE"])
    print("vocabulary saved to %s" % args["VOCAB_FILE"])


read in source sentences: /content/drive/MyDrive/ISR Spring'23/src.vocab
read in target sentences: /content/drive/MyDrive/ISR Spring'23/tgt.vocab
initialize source vocabulary ..
initialize target vocabulary ..
generated vocabulary, source 14128 words, target 6748 words
vocabulary saved to /content/drive/MyDrive/ISR Spring'23/vocab_file.json


In [None]:
!pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 118.9/118.9 kB 3.4 MB/s eta 0:00:00
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [None]:
import math
import sys
import pickle
import time


from docopt import docopt

# from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
import sacrebleu
#from nmt_model import Hypothesis, NMT
import numpy as np
from typing import List, Tuple, Dict, Set, Union
from tqdm import tqdm
#from utils import read_corpus, batch_iter
#from vocab import Vocab, VocabEntry


In [None]:
def evaluate_ppl(model, dev_data, batch_size=32):
    """Evaluate perplexity on dev sentences.

    The model is put into evaluation mode for the duration of the call. Its
    previous training mode is restored afterwards, including when evaluation
    is aborted by an exception or an interrupt.

    @param model (NMT): NMT Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): number of sentence pairs per evaluation batch
    @returns ppl (float): perplexity on dev sentences, computed as
                          exp(total loss / number of predicted target words)
    @raises ZeroDivisionError: if dev_data yields no target words to predict
    """
    was_training = model.training
    model.eval()

    cum_loss = 0.0
    cum_tgt_words = 0.0

    try:
        # no_grad() disables gradient tracking; nothing here is back-propagated
        with torch.no_grad():
            for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
                # the negated model output, summed over the batch, is the batch loss
                loss = -model(src_sents, tgt_sents).sum()

                cum_loss += loss.item()
                tgt_word_num_to_predict = sum(
                    len(s[1:]) for s in tgt_sents
                )  # omitting leading `<s>`
                cum_tgt_words += tgt_word_num_to_predict

            ppl = np.exp(cum_loss / cum_tgt_words)
    finally:
        # restore the caller's mode even if a batch failed part-way through
        if was_training:
            model.train()

    return ppl

In [None]:
def compute_corpus_level_bleu_score(
    references: List[List[str]], hypotheses: List[Hypothesis]
) -> float:
    """Score decoded hypotheses against their references with corpus-level BLEU.

    Both sides are turned back into plain text (pieces concatenated, every
    `▁` replaced by a space) before being handed to sacreBLEU.

    @param references (List[List[str]]): gold-standard target sentences, one per hypothesis;
                                         if the first one starts with `<s>`, the first and
                                         last token of every reference are dropped
    @param hypotheses (List[Hypothesis]): decoded hypotheses, one for each reference
    @returns bleu_score: the `score` of the sacreBLEU corpus-level result
    """

    def _to_text(pieces):
        # glue the subword pieces together and restore the spaces
        return "".join(pieces).replace("▁", " ")

    # only the first reference is inspected to decide whether the
    # start/end tokens have to be stripped from all of them
    has_boundary_tokens = references[0][0] == "<s>"

    ref_texts = []
    for ref in references:
        ref_texts.append(_to_text(ref[1:-1] if has_boundary_tokens else ref))

    hyp_texts = []
    for hyp in hypotheses:
        hyp_texts.append(_to_text(hyp.value))

    # sacreBLEU accepts several reference streams; exactly one is supplied here
    return sacrebleu.corpus_bleu(hyp_texts, [ref_texts]).score

In [None]:
def train(args: Dict):
    """Train the NMT Model.

    Runs maximum-likelihood training with Adam in an open-ended loop over
    epochs. Every `--valid-niter` iterations the dev-set perplexity is
    evaluated:

    - on an improvement, the model is saved to `--save-to` and the optimizer
      state to `--save-to` + ".optim";
    - after `--patience` consecutive validations without improvement, that
      checkpoint is reloaded and the learning rate is multiplied by
      `--lr-decay`;
    - after `--max-num-trial` such decays, training stops.

    @param args (Dict): args from cmd line. Keys read: `--train-src`,
        `--train-tgt`, `--dev-src`, `--dev-tgt`, `--vocab`, `--batch-size`,
        `--clip-grad`, `--valid-niter`, `--log-every`, `--save-to`,
        `--dropout`, `--uniform-init`, `--cuda`, `--lr`, `--patience`,
        `--max-num-trial`, `--lr-decay`, `--max-epoch`.
    @raises SystemExit: with code 0, on early stop or when the epoch limit is
        reached; the function does not return normally.
    """
    train_data_src = read_corpus(args["--train-src"], source="src", vocab_size=21000)
    train_data_tgt = read_corpus(args["--train-tgt"], source="tgt", vocab_size=8000)

    dev_data_src = read_corpus(args["--dev-src"], source="src", vocab_size=3000)
    dev_data_tgt = read_corpus(args["--dev-tgt"], source="tgt", vocab_size=2000)

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    # Parse every option once, before any training work starts, so that a
    # missing or malformed value fails immediately instead of mid-run.
    train_batch_size = int(args["--batch-size"])
    clip_grad = float(args["--clip-grad"])
    valid_niter = int(args["--valid-niter"])
    log_every = int(args["--log-every"])
    model_save_path = args["--save-to"]
    max_patience = int(args["--patience"])
    max_num_trial = int(args["--max-num-trial"])
    lr_decay = float(args["--lr-decay"])
    max_epoch = int(args["--max-epoch"])

    vocab = Vocab.load(args["--vocab"])

    # NOTE: embedding and hidden sizes are fixed at 1024 here; the
    # `--embed-size` / `--hidden-size` options are not consulted.
    model = NMT(
        embed_size=1024,
        hidden_size=1024,
        dropout_rate=float(args["--dropout"]),
        vocab=vocab,
    )

    model.train()

    uniform_init = float(args["--uniform-init"])
    if np.abs(uniform_init) > 0.0:
        print(
            "uniformly initialize parameters [-%f, +%f]" % (uniform_init, uniform_init),
            file=sys.stderr,
        )
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args["--cuda"] else "cpu")
    print("use device: %s" % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args["--lr"]))

    # `report_*` counters cover the window since the last log line,
    # `cum_*` counters cover the window since the last validation.
    num_trial = 0
    train_iter = patience = 0
    cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print("begin Maximum Likelihood training")

    while True:
        epoch += 1

        for src_sents, tgt_sents in batch_iter(
            train_data, batch_size=train_batch_size, shuffle=True
        ):
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(src_sents)

            example_losses = -model(src_sents, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            # optimize the per-example average, but report the summed loss
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents
            )  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print(
                    "epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f "
                    "cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec"
                    % (
                        epoch,
                        train_iter,
                        report_loss / report_examples,
                        math.exp(report_loss / report_tgt_words),
                        cum_examples,
                        report_tgt_words / (time.time() - train_time),
                        time.time() - begin_time,
                    ),
                    file=sys.stderr,
                )

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.0

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    "epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d"
                    % (
                        epoch,
                        train_iter,
                        cum_loss / cum_examples,
                        np.exp(cum_loss / cum_tgt_words),
                        cum_examples,
                    ),
                    file=sys.stderr,
                )

                cum_loss = cum_examples = cum_tgt_words = 0.0

                print("begin validation ...", file=sys.stderr)

                # compute dev. ppl; lower perplexity is better, so the
                # negated value is used as the "higher is better" metric
                dev_ppl = evaluate_ppl(
                    model, dev_data, batch_size=128
                )  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print(
                    "validation: iter %d, dev. ppl %f" % (train_iter, dev_ppl),
                    file=sys.stderr,
                )

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(
                    hist_valid_scores
                )
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print(
                        "save currently the best model to [%s]" % model_save_path,
                        file=sys.stderr,
                    )
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + ".optim")
                elif patience < max_patience:
                    patience += 1
                    print("hit patience %d" % patience, file=sys.stderr)

                    if patience == max_patience:
                        num_trial += 1
                        print("hit #%d trial" % num_trial, file=sys.stderr)
                        if num_trial == max_num_trial:
                            print("early stop!", file=sys.stderr)
                            # sys.exit rather than the interactive-only
                            # `exit` helper; still raises SystemExit(0)
                            sys.exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]["lr"] * lr_decay
                        print(
                            "load previously best model and decay learning rate to %f"
                            % lr,
                            file=sys.stderr,
                        )

                        # load model (onto CPU first, then move to the device)
                        params = torch.load(
                            model_save_path, map_location=lambda storage, loc: storage
                        )
                        model.load_state_dict(params["state_dict"])
                        model = model.to(device)

                        print("restore parameters of the optimizers", file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + ".optim")
                        )

                        # set new lr (the restored optimizer state carries the old one)
                        for param_group in optimizer.param_groups:
                            param_group["lr"] = lr

                        # reset patience
                        patience = 0

                # NOTE: the epoch limit is only checked on validation
                # iterations, so training ends at the first validation that
                # happens during epoch `--max-epoch`.
                if epoch == max_epoch:
                    print("reached maximum number of epochs!", file=sys.stderr)
                    sys.exit(0)

In [None]:
def beam_search(
    model: NMT,
    test_data_src: List[List[str]],
    beam_size: int,
    max_decoding_time_step: int,
) -> List[List[Hypothesis]]:
    """Run beam search to construct hypotheses for a list of src-language sentences.

    The model is put into evaluation mode while decoding. Its previous
    training mode is restored afterwards, including when decoding is aborted
    by an exception or an interrupt.

    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    try:
        # decoding needs no gradients
        with torch.no_grad():
            for src_sent in tqdm(test_data_src, desc="Decoding", file=sys.stdout):
                example_hyps = model.beam_search(
                    src_sent,
                    beam_size=beam_size,
                    max_decoding_time_step=max_decoding_time_step,
                )

                hypotheses.append(example_hyps)
    finally:
        # restore the caller's mode even if decoding stopped part-way through
        if was_training:
            model.train(was_training)

    return hypotheses

In [None]:
def decode(args: Dict[str, str]):
    """Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.

    The top hypothesis of every source sentence is written to `OUTPUT_FILE`
    as UTF-8 text, one detokenized sentence per line.

    @param args (Dict): args from cmd line. Keys read: `TEST_SOURCE_FILE`,
        `TEST_TARGET_FILE` (may be empty/None to skip BLEU), `MODEL_PATH`,
        `OUTPUT_FILE`, `--cuda`, `--max-decoding-time-step`.
    """

    print(
        "load test source sentences from [{}]".format(args["TEST_SOURCE_FILE"]),
        file=sys.stderr,
    )
    test_data_src = read_corpus(args["TEST_SOURCE_FILE"], source="src", vocab_size=3000)
    if args["TEST_TARGET_FILE"]:
        print(
            "load test target sentences from [{}]".format(args["TEST_TARGET_FILE"]),
            file=sys.stderr,
        )
        test_data_tgt = read_corpus(
            args["TEST_TARGET_FILE"], source="tgt", vocab_size=2000
        )

    print("load model from {}".format(args["MODEL_PATH"]), file=sys.stderr)
    model = NMT.load(args["MODEL_PATH"])

    if args["--cuda"]:
        model = model.to(torch.device("cuda:0"))

    # NOTE: the beam size is fixed at 10; `--beam-size` is not consulted.
    hypotheses = beam_search(
        model,
        test_data_src,
        beam_size=10,
        max_decoding_time_step=int(args["--max-decoding-time-step"]),
    )

    if args["TEST_TARGET_FILE"]:
        # only the first hypothesis of each sentence is scored
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        print("Corpus BLEU: {}".format(bleu_score), file=sys.stderr)

    # Explicit encoding: the output must not depend on the platform's default
    # text encoding.
    with open(args["OUTPUT_FILE"], "w", encoding="utf-8") as f:
        for hyps in hypotheses:
            top_hyp = hyps[0]
            # join the subword pieces and turn the `▁` markers back into spaces
            hyp_sent = "".join(top_hyp.value).replace("▁", " ")
            f.write(hyp_sent + "\n")

In [None]:
def main():
    """Main func.

    Parses the command line with docopt against the module docstring
    (`__doc__`), seeds the random number generators from `--seed`, and
    dispatches to `train` or `decode`.

    NOTE(review): `__doc__` must hold the docopt usage text. The recorded
    output of this cell is a DocoptLanguageError, presumably because no such
    docstring is available when the cell runs inside the notebook — confirm.

    @raises RuntimeError: if neither the `train` nor the `decode` mode is selected
    """
    args = docopt(__doc__)

    # Check pytorch version. Compare the numeric major component: comparing
    # the raw version strings would be lexicographic, not numeric.
    torch_major = int(str(torch.__version__).split(".")[0])
    assert (
        torch_major >= 1
    ), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format(
        torch.__version__
    )

    # seed the random number generators
    seed = int(args["--seed"])
    torch.manual_seed(seed)
    if args["--cuda"]:
        torch.cuda.manual_seed(seed)
    # numpy gets a different seed, derived from the same value
    np.random.seed(seed * 13 // 7)

    if args["train"]:
        train(args)
    elif args["decode"]:
        decode(args)
    else:
        raise RuntimeError("invalid run mode")


# Script entry point.
# NOTE(review): inside a notebook kernel `__name__` is also "__main__", so
# executing this cell calls main() immediately — confirm that is intended.
if __name__ == "__main__":
    main()

DocoptLanguageError: ignored

In [None]:
# `!cd` only changes directory inside a throwaway subshell; the %cd magic
# changes the working directory of the notebook itself, so it persists for
# the following cells.
%cd /content/drive/MyDrive/ChrEn


In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; a `!pip` subshell may resolve to a different Python.
# NOTE: an earlier cell already imports `docopt`, so on a fresh runtime this
# install has to be run before that cell.
%pip install docopt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docopt
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13707 sha256=9e3df16199f1db6d57a88dce9c9161a51a587b7de900f240c48e4e0466f3d61d
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
Successfully built docopt
Installing collected packages: docopt
Successfully installed docopt-0.6.2


In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; a `!pip` subshell may resolve to a different Python.
%pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel; a `!pip` subshell may resolve to a different Python.
# NOTE: sacrebleu is also installed by an earlier cell; this repeat only
# matters after a runtime reset.
%pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [None]:
# Launch the project's run.sh wrapper in "train" mode as a shell subprocess.
# NOTE(review): run.sh is not shown here; presumably it invokes the training
# entry point with its own argument list — confirm against the script.
!bash /content/drive/MyDrive/ChrEn/run.sh train

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
uniformly initialize parameters [-0.100000, +0.100000]
use device: cuda:0
begin Maximum Likelihood training
epoch 1, iter 10, avg. loss 115.61, avg. ppl 53.48 cum. examples 320, speed 2820.80 words/sec, time elapsed 3.30 sec
epoch 1, iter 20, avg. loss 36.12, avg. ppl 3.43 cum. examples 640, speed 3298.13 words/sec, time elapsed 6.14 sec
epoch 1, iter 30, avg. loss 16.78, avg. ppl 1.81 cum. examples 960, speed 3512.10 words/sec, time elapsed 8.72 sec
epoch 1, iter 40, avg. loss 13.88, avg. ppl 1.57 cum. examples 1280, speed 3555.87 words/sec, time elapsed 11.47 sec
epoch 1, iter 50, avg. loss 13.74, avg. ppl 1.58 cum. examples 1600, speed 3726.07 words/sec, time elapsed 14.05 sec
epoch 1, iter 60, avg. loss 12.59, avg. ppl 1.55 cum. examples 1920, speed 3756.25 words/sec, time elapsed 16.50 sec
epoch 1, iter 70, avg. loss 12.30, avg. ppl 1.51 cum. examples 2240, speed 3490.96 words

In [None]:
# Launch the project's run.sh wrapper in "test" mode as a shell subprocess.
# NOTE(review): run.sh is not shown here; judging by the recorded log it
# decodes the test set with a saved model and reports corpus BLEU — confirm.
!bash /content/drive/MyDrive/ChrEn/run.sh test

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
load test source sentences from [./chr_en_data/test.chr]
load test target sentences from [./chr_en_data/test.en]
load model from model.bin
Decoding: 100% 14855/14855 [10:25<00:00, 23.75it/s]
Corpus BLEU: 0.09467879568752796
