In [1]:
# connect to google drive
import os
import numpy as np

# mount google drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
#!pip install psutil
#!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# Colab exposes at most one GPU and does not guarantee one, so fall back to
# None instead of raising IndexError on a CPU-only runtime.
gpu = GPUs[0] if GPUs else None


def printm():
    """Print host RAM / process size and, when a GPU was detected, its memory usage."""
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
    if gpu is None:
        # No device to query; report that explicitly rather than crashing.
        print("GPU RAM Free: n/a (no GPU detected)")
        return
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))


printm()

Gen RAM Free: 12.9 GB  | Proc size: 120.8 MB
GPU RAM Free: 11441MB | Used: 0MB | Util   0% | Total 11441MB


In [3]:
root_dir = "/content/gdrive/My Drive/NLP/MT_ENSP"
os.chdir(root_dir)
!ls

collect_submission.sh	   model_embeddings.py	run.sh
Debugging.ipynb		   NMT_model		sanity_check_en_es_data
en_es_data		   NMT_model.optim	sanity_check.py
gpu_requirements.txt	   nmt_model.py		test_output.txt
__init__.py		   __pycache__		utils.py
local_env.yml		   README.md		vocab.json
Machine_Translation.ipynb  run.py		vocab.py


## Imports

In [0]:
import math
import sys

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from collections import Counter, namedtuple
from docopt import docopt
from itertools import chain
import json
from typing import List, Tuple, Dict, Set, Union

from docopt import docopt
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from tqdm import tqdm

## Utility Functions

In [0]:
# post-padding for source/target sequences

def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.
    Padding is appended at the end of each sentence (post-padding).
    @param sents (list[list[str]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (str): padding token
    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentence in the batch now has equal length. The input lists are not
        modified; an empty batch yields an empty list.
    """
    # max() raises ValueError on an empty sequence, so handle the empty batch up front.
    if not sents:
        return []

    max_len = max(len(sent) for sent in sents)
    # `sent + [...]` always builds a new list, so callers' sentences stay untouched.
    return [sent + [pad_token] * (max_len - len(sent)) for sent in sents]

In [6]:
# verify padding
sents = [['a','clear','day'],['it','is','not','raining','today']]
pad_sents(sents,'<Pad>')

[['a', 'clear', 'day', '<Pad>', '<Pad>'],
 ['it', 'is', 'not', 'raining', 'today']]

In [0]:
# read from corpus: vocab building

def read_corpus(file_path, source):
    """ Read file, where each sentence is delineated by a `\\n`.
    Each line is stripped and split on single spaces, so consecutive spaces
    inside a line produce empty-string tokens.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    @returns data (list[list[str]]): one token list per line; target sentences
        are wrapped in `<s>` ... `</s>`
    """
    data = []
    # Use a context manager so the file handle is closed deterministically.
    with open(file_path) as corpus_file:
        for line in corpus_file:
            sent = line.strip().split(' ')
            # only append <s> and </s> to the target sentence
            if source == 'tgt':
                sent = ['<s>'] + sent + ['</s>']
            data.append(sent)

    return data

In [8]:
# verify if read_corpus is working
file_path = 'en_es_data/dev.en'
data = read_corpus(file_path, 'src')
data[1]

['But',
 'this',
 'understates',
 'the',
 'seriousness',
 'of',
 'this',
 'particular',
 'problem',
 '',
 'because',
 'it',
 "doesn't",
 'show',
 'the',
 'thickness',
 'of',
 'the',
 'ice.']

In [0]:
# generate batches for training

def batch_iter(data, batch_size, shuffle=False):
    """ Generate mini-batches of (source sentences, target sentences).
    Within every batch the examples are ordered by source length, longest first.
    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): number of examples per batch (the last batch may be smaller)
    @param shuffle (boolean): whether to randomly shuffle the dataset before batching
    """
    n_examples = len(data)
    n_batches = math.ceil(n_examples / batch_size)
    order = list(range(n_examples))

    if shuffle:
        np.random.shuffle(order)

    for batch_idx in range(n_batches):
        lo = batch_idx * batch_size
        picked = (data[j] for j in order[lo:lo + batch_size])
        # Stable sort: examples with equal source length keep their relative order.
        batch = sorted(picked, key=lambda pair: len(pair[0]), reverse=True)

        src_sents, tgt_sents = [], []
        for pair in batch:
            src_sents.append(pair[0])
            tgt_sents.append(pair[1])

        yield src_sents, tgt_sents

In [10]:
# check batch_iter
es_path = 'en_es_data/dev.es'
en_path = 'en_es_data/dev.en'

es_data = read_corpus(es_path, source = 'src')
en_data = read_corpus(en_path, source = 'tgt')

data = list(zip(es_data, en_data))

for src, tgt in batch_iter(data[:4],2):
  print(src, tgt)

[['El', 'ao', 'pasado', 'proyect', 'estas', 'dos', 'diapositivas', 'para', 'demostrar', 'que', 'la', 'capa', 'de', 'hielo', 'rtico,', 'que', 'durante', 'los', 'ltimos', 'tres', 'millones', 'de', 'aos', 'ha', 'sido', 'del', 'tamao', 'de', 'los', '48', 'estados,', 'se', 'ha', 'reducido', 'en', 'un', '40', 'por', 'ciento.'], ['Pero', 'esto', 'minimiza', 'la', 'seriedad', 'de', 'este', 'problema', 'concreto', 'porque', 'no', 'muestra', 'el', 'grosor', 'del', 'hielo.']] [['<s>', 'Last', 'year', 'I', 'showed', 'these', 'two', 'slides', 'so', 'that', '', 'demonstrate', 'that', 'the', 'arctic', 'ice', 'cap,', '', 'which', 'for', 'most', 'of', 'the', 'last', 'three', 'million', 'years', '', 'has', 'been', 'the', 'size', 'of', 'the', 'lower', '48', 'states,', '', 'has', 'shrunk', 'by', '40', 'percent.', '</s>'], ['<s>', 'But', 'this', 'understates', 'the', 'seriousness', 'of', 'this', 'particular', 'problem', '', 'because', 'it', "doesn't", 'show', 'the', 'thickness', 'of', 'the', 'ice.', '</s>'

## Build Vocab

In [0]:
class VocabEntry(object):
    """ Vocabulary Entry, i.e. structure containing either
    src or tgt language terms.
    """
    def __init__(self, word2id=None):
        """ Init VocabEntry Instance.
        @param word2id (dict): dictionary mapping words 2 indices. When omitted
            (or empty) a fresh vocabulary holding only the four special tokens is created.
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1 # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        # Reverse lookup table (index -> word), kept in sync by `add`.
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    # NOTE(review): this method is shadowed on every instance by the
    # `self.id2word` dict assigned in __init__, so `entry.id2word(wid)` raises
    # TypeError ('dict' object is not callable). Use `entry.id2word[wid]` or
    # `indices2words` instead. Kept so the class-level attribute still exists.
    def id2word(self, wid):
        """ Return mapping of index to word.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            # New words take the next free index, i.e. the current vocabulary size.
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.
        @param sents (list[str] or list[list[str]]): sentence(s) in words
        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices;
            an empty input yields an empty list
        """
        # Nothing to inspect or convert; avoids IndexError on sents[0].
        if not sents:
            return []
        # The first element decides whether this is a batch or a single sentence.
        if isinstance(sents[0], list):
            return [[self[w] for w in s] for s in sents]
        else:
            return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tensor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        # Transpose (batch, max_len) -> (max_len, batch).
        return torch.t(sents_var)

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[list[str]]): corpus of text produced by read_corpus function
        @param size (int): maximum # of corpus words to add on top of the special tokens
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = VocabEntry()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        # Keep the `size` most frequent words; ties keep first-seen order (stable sort).
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry

In [12]:
# Check Vocab Entry Object
lang = VocabEntry()

print('Check if <pad> token is inside vocab:')
print('<pad>' in lang)  # __contains__
print('Length of vocab:{}'.format(len(lang)))  # __len__
print('Adding a new word')
lang.add('new')  # `add` assigns the next free index
print(lang)  # __repr__
print('The index of "new" is:{}'.format(lang.word2id['new']))

print('Generate a vocab with the from_corpus static method:')
en_vocab = VocabEntry.from_corpus(en_data, 100)
print('The token for the word "the" is:{}'.format(en_vocab.word2id['the']))

print('check to_input_tensor method:')
# Use the GPU when one is available, otherwise fall back to the CPU so this
# check also runs on a CPU-only runtime (torch.tensor(1).cuda() raises there).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# `sents` is the two-sentence example defined in the padding check cell above.
temp = en_vocab.to_input_tensor(sents, device)
print(temp)
# Note: output tensor shape --> (max_len, batch)

Check if <pad> token is inside vocab:
True
Length of vocab:4
Adding a new word
Vocabulary[size=5]
The index of "new" is:4
Generate a vocab with the from_corpus static method:
number of word types: 3955, number of word types w/ frequency >= 2: 1339
The token for the word "the" is:5
check to_input_tensor method:
tensor([[10, 18],
        [ 3, 11],
        [ 3, 37],
        [ 0,  3],
        [ 0,  3]], device='cuda:0')


In [0]:
class Vocab(object):
    """ Vocab encapsulating src and target languages.
    """
    def __init__(self, src_vocab: 'VocabEntry', tgt_vocab: 'VocabEntry'):
        """ Init Vocab.
        @param src_vocab (VocabEntry): VocabEntry for source language
        @param tgt_vocab (VocabEntry): VocabEntry for target language
        """
        self.src = src_vocab
        self.tgt = tgt_vocab

    @staticmethod
    def build(src_sents, tgt_sents, vocab_size, freq_cutoff) -> 'Vocab':
        """ Build Vocabulary.
        @param src_sents (list[list[str]]): Source sentences provided by read_corpus() function
        @param tgt_sents (list[list[str]]): Target sentences provided by read_corpus() function
        @param vocab_size (int): Size of vocabulary for both source and target languages
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word.
        @returns Vocab object holding one VocabEntry per language
        """
        assert len(src_sents) == len(tgt_sents)

        print('initialize source vocabulary ..')
        src = VocabEntry.from_corpus(src_sents, vocab_size, freq_cutoff)

        print('initialize target vocabulary ..')
        tgt = VocabEntry.from_corpus(tgt_sents, vocab_size, freq_cutoff)

        return Vocab(src, tgt)

    def save(self, file_path):
        """ Save Vocab to file as JSON dump.
        Only the word -> index maps are stored.
        @param file_path (str): file path to vocab file
        """
        payload = dict(src_word2id=self.src.word2id, tgt_word2id=self.tgt.word2id)
        # Context manager guarantees the file is flushed and closed.
        with open(file_path, 'w') as vocab_file:
            json.dump(payload, vocab_file, indent=2)

    @staticmethod
    def load(file_path):
        """ Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file
        @returns Vocab object loaded from JSON dump
        """
        with open(file_path, 'r') as vocab_file:
            entry = json.load(vocab_file)
        src_word2id = entry['src_word2id']
        tgt_word2id = entry['tgt_word2id']

        return Vocab(VocabEntry(src_word2id), VocabEntry(tgt_word2id))

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocab(source %d words, target %d words)' % (len(self.src), len(self.tgt))


In [14]:
# Now build source [Spanish] and target [English] vocab

train_es = 'en_es_data/train.es'
train_en = 'en_es_data/train.en'
vocab_file = 'en_es_data/vocab.json'

src_sents = read_corpus(train_es, source='src')
tgt_sents = read_corpus(train_en, source='tgt')

size = 50000
freq_cutoff= 2

vocab = Vocab.build(src_sents, tgt_sents, size, freq_cutoff)
print('generated vocabulary, source %d words, target %d words' % (len(vocab.src), len(vocab.tgt)))

vocab.save(vocab_file)
print('vocabulary saved to %s' % vocab_file)

#  
print('Note that the <s> and </s> tokens are added while vocab initialization.\n These tokens are also present in target top frequent words. \nThat is why vocab size for target language is lesser by 2.')

initialize source vocabulary ..
number of word types: 172418, number of word types w/ frequency >= 2: 80623
initialize target vocabulary ..
number of word types: 128873, number of word types w/ frequency >= 2: 64215
generated vocabulary, source 50004 words, target 50002 words
vocabulary saved to en_es_data/vocab.json
Note that the <s> and </s> tokens are added while vocab initialization.
 These tokens are also present in target top frequent words. 
That is why vocab size for target language is lesser by 2.


## Embedding

In [0]:
class ModelEmbeddings(nn.Module):
    """
    Holds one embedding table per language (source and target) and maps
    word indices to their embedding vectors.
    """
    def __init__(self, embed_size, vocab):
        """
        Build the source and target embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        """
        super().__init__()
        self.embed_size = embed_size

        # Index of the <pad> token in each vocabulary; passed as padding_idx below.
        pad_src = vocab.src['<pad>']
        pad_tgt = vocab.tgt['<pad>']

        # Source table is created first, then target (keeps registration order).
        self.source = nn.Embedding(
            num_embeddings=len(vocab.src),
            embedding_dim=embed_size,
            padding_idx=pad_src,
        )
        self.target = nn.Embedding(
            num_embeddings=len(vocab.tgt),
            embedding_dim=embed_size,
            padding_idx=pad_tgt,
        )


## Encoder-Decoder model

In [0]:
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])


In [0]:
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, applied to the combined output
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Layers are created in a fixed order so module registration
        # (state_dict keys, RNG consumption at init) stays stable.
        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        # Decoder input is [target embedding ; previous combined output].
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(dropout_rate)

        # Extra linear map applied to the projected encoder states before the
        # dot product with the decoder state (multiplicative attention).
        self.mult_atten = nn.Linear(hidden_size, hidden_size)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)

        # Run the network forward
        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words.
        # target_padded[1:] drops <s>: step t predicts the token at position t+1.
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        X = self.model_embeddings.source(source_padded)  # (src_len, b, embed_size)
        # pack_padded_sequence is called with its default enforce_sorted, hence
        # the longest-to-shortest ordering requirement on the batch.
        X = pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)  # (h0, c0) default to zero
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens, batch_first=True)  # (b, src_len, 2h)

        # Concatenate the two directions' final states: (2, b, h) -> (b, 2h)
        last_hidden = torch.cat((last_hidden[0, :], last_hidden[1, :]), 1)
        last_cell = torch.cat((last_cell[0, :], last_cell[1, :]), 1)
        init_decoder_hidden = self.h_projection(last_hidden)
        init_decoder_cell = self.c_projection(last_cell)
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
                dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor (tgt_len - 1, b, h), where
                                        tgt_len = maximum target sentence length, b = batch_size, h = hidden size.
                                        One step fewer than the input because the last position is chopped off.
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Collect the combined output o_t of each step
        combined_outputs = []

        enc_hiddens_proj = self.att_projection(enc_hiddens)  # (b, src_len, h)
        Y = self.model_embeddings.target(target_padded)  # (tgt_len, b, e)
        Y_splited = torch.split(Y, 1, dim=0)
        tgt_len = target_padded.size(0)
        for i in range(tgt_len):
            Y_t = torch.squeeze(Y_splited[i], dim=0)  # (b, e) after removing the time dim
            Ybar_t = torch.cat((Y_t, o_prev), dim=1)  # (b, e + h)
            dec_state, o_prev, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_prev)
        combined_outputs = torch.stack(combined_outputs)

        return combined_outputs

    def step(self, Ybar_t: torch.Tensor,
            dec_state: Tuple[torch.Tensor, torch.Tensor],
            enc_hiddens: torch.Tensor,
            enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length.
                                    May be None, in which case no position is masked.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution
                                (pre-softmax, with masked positions set to -inf).
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state
        aug_dec_hidden = torch.unsqueeze(dec_hidden, dim=2)  # (b, h, 1)

        # NOTE: this transform does not depend on the decoder state, so it is
        # recomputed with the same input on every step of a decode pass.
        mul_enc_proj = self.mult_atten(enc_hiddens_proj)

        # (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = torch.bmm(mul_enc_proj, aug_dec_hidden)
        e_t = torch.squeeze(e_t, dim=2)

        # Set e_t to -inf where enc_masks has 1 (padding positions).
        # masked_fill_ expects a boolean mask on current PyTorch (uint8 masks
        # are deprecated/rejected), hence .bool() rather than .byte().
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        alpha_t = F.softmax(e_t, dim=1)  # (b, src_len)
        aug_att = torch.unsqueeze(alpha_t, 2)  # (b, src_len, 1)
        tr_hiddens = enc_hiddens.transpose(1, 2)  # (b, 2h, src_len)
        a_t = torch.bmm(tr_hiddens, aug_att)  # (b, 2h, 1)
        a_t = torch.squeeze(a_t, dim=2)  # (b, 2h)

        U_t = torch.cat((a_t, dec_hidden), dim=1)  # (b, 3h)
        V_t = self.combined_output_projection(U_t)  # (b, h)
        combined_output = self.dropout(torch.tanh(V_t))

        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length. Entries are 1.0 at
                                    padding positions and 0.0 elsewhere.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single sentence's encodings across all live hypotheses.
            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Score of every (hypothesis, next word) pair, flattened to 1-D.
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_hyp_scores, k=live_hyp_num)

            # Recover (hypothesis index, word index) from the flat position.
            # Must be integer division: `/` on integer tensors performs true
            # division on current PyTorch and would yield float list indices.
            prev_hyp_ids = top_cand_hyp_pos // vocab_size
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Strip the <s> / </s> wrappers from finished hypotheses.
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Carry forward only the decoder states of hypotheses still alive.
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing finished within the step budget: return the best live hypothesis.
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        Reported as the device holding the source embedding weights.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        @returns model (NMT): model rebuilt from the stored args, vocab and state_dict
        """
        # WARNING: torch.load unpickles arbitrary objects (the checkpoint stores
        # the Vocab instance), so only load checkpoints from a trusted source.
        # map_location keeps all tensors on CPU regardless of where they were saved.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)


## Train

In [0]:
# load data
train_es = 'en_es_data/train.es'
train_en = 'en_es_data/train.en'

dev_es = 'en_es_data/dev.es'
dev_en = 'en_es_data/dev.en'

test_es = 'en_es_data/test.es'
test_en = 'en_es_data/test.en'

vocab_file = 'en_es_data/vocab.json'


train_data_src = read_corpus(train_es, source='src')
train_data_tgt = read_corpus(train_en, source='tgt')

dev_data_src = read_corpus(dev_es, source='src')
dev_data_tgt = read_corpus(dev_en, source='tgt')

test_data_src = read_corpus(test_es, source='src')
test_data_tgt = read_corpus(test_en, source='tgt')

train_data = list(zip(train_data_src,train_data_tgt))
dev_data = list(zip(dev_data_src,dev_data_tgt))
test_data = list(zip(test_data_src,test_data_tgt))

vocab = Vocab.load(vocab_file)

In [0]:
# Training hyper-parameters: edit the values in this cell to change the run.

# optimisation
max_epoch = 30            # epoch limit checked by the training loop
train_batch_size = 64     # sentence pairs per training batch
clip_grad = 5.0           # max gradient norm passed to clip_grad_norm_

# reporting / validation cadence (in training iterations)
log_every = 100
valid_niter = 2000

# learning-rate decay and early stopping
max_patience = 3          # consecutive non-improving validations before the lr is decayed
max_trial = 3             # times patience may run out before training stops
lr_decay = 0.5            # factor the learning rate is multiplied by on decay

# checkpoint path; the optimizer state is saved next to it with a '.optim' suffix
model_save_path = 'NMT_model_mul_atten'


In [20]:
# Build the model and switch it to training mode; `model.train()` is the
# cell's last expression, so the module summary is displayed as the output.
model = NMT(
    vocab=vocab,
    embed_size=256,
    hidden_size=256,
    dropout_rate=0.3,
)
model.train()

NMT(
  (model_embeddings): ModelEmbeddings(
    (source): Embedding(50004, 256, padding_idx=0)
    (target): Embedding(50002, 256, padding_idx=0)
  )
  (encoder): LSTM(256, 256, bidirectional=True)
  (decoder): LSTMCell(512, 256)
  (h_projection): Linear(in_features=512, out_features=256, bias=False)
  (c_projection): Linear(in_features=512, out_features=256, bias=False)
  (att_projection): Linear(in_features=512, out_features=256, bias=False)
  (combined_output_projection): Linear(in_features=768, out_features=256, bias=False)
  (target_vocab_projection): Linear(in_features=256, out_features=50002, bias=False)
  (dropout): Dropout(p=0.3)
  (mult_atten): Linear(in_features=256, out_features=256, bias=True)
)

In [21]:
# Overwrite every model parameter with samples from U(-uniform_init, +uniform_init);
# setting uniform_init to 0 skips this step.
uniform_init = 0.1
if np.abs(uniform_init) > 0.:
    print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr)
    for param in model.parameters():
        param.data.uniform_(-uniform_init, uniform_init)

uniformly initialize parameters [-0.100000, +0.100000]


In [0]:
# mask pad tokens: a vector with one entry per item of `vocab.tgt`, set to 1
# everywhere except at the position `vocab.tgt['<pad>']`, which is set to 0
vocab_mask = torch.ones(len(vocab.tgt))
vocab_mask[vocab.tgt['<pad>']] = 0

In [23]:
# Use the first CUDA device when at least one is visible, otherwise the CPU
if torch.cuda.device_count() > 0:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print('use device: %s' % device)
model = model.to(device)

use device: cuda:0


In [0]:
# Adam over all model parameters; the training loop later rescales this
# learning rate by `lr_decay` whenever patience runs out
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001,
)

In [25]:
import time

# progress counters
train_iter = 0
epoch = 0
valid_num = 0

# early-stopping bookkeeping
patience = 0
num_trial = 0
hist_valid_scores = []

# running totals since the last log line (`report_*`) and since the last
# validation (`cum_*`); the training loop resets each group after printing
report_loss = 0
report_tgt_words = 0
report_examples = 0
cum_loss = 0
cum_tgt_words = 0
cum_examples = 0

# both timers start from the same instant
begin_time = time.time()
train_time = begin_time

print('Start Maximum Likelihood training:')

Start Maximum Likelihood training:


In [0]:
def evaluate_ppl(model, dev_data, batch_size=32):
    """ Evaluate perplexity on dev sentences
    @param model (NMT): NMT Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): number of sentence pairs scored per forward pass
    @returns ppl (float): perplexity on dev sentences, i.e. exp(summed loss / number of target words)
    @raises ValueError: if `dev_data` yields no target words to predict
    """
    was_training = model.training
    model.eval()

    cum_loss = 0.
    cum_tgt_words = 0.

    try:
        # no_grad() disables gradient tracking; nothing is back-propagated here
        with torch.no_grad():
            for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
                # the model output is negated and summed over the batch to get the batch loss
                loss = -model(src_sents, tgt_sents).sum()

                cum_loss += loss.item()
                cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
    finally:
        # restore the caller's train/eval mode even if scoring raised or was interrupted
        model.train(was_training)

    if cum_tgt_words == 0:
        # would otherwise surface as an unexplained ZeroDivisionError below
        raise ValueError('cannot compute perplexity: dev_data contains no target words')

    return np.exp(cum_loss / cum_tgt_words)

In [27]:
# Training runs until one of two stop conditions holds:
#   * `max_epoch` full passes over `train_data` have been completed, or
#   * early stopping: patience ran out for the `max_trial`-th time.
# Both are signalled through `stop_training` instead of `exit(0)`: calling exit
# would shut down the notebook kernel, and a bare `break` only leaves the inner
# batch loop, so the outer loop would keep starting new epochs forever.
stop_training = False

while not stop_training:
    epoch += 1

    for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
        train_iter += 1

        optimizer.zero_grad()

        batch_size = len(src_sents)

        # the model output is negated to obtain one loss value per example
        example_losses = -model(src_sents, tgt_sents) # (batch_size,)
        batch_loss = example_losses.sum()
        # back-propagate the per-example average of the batch loss
        loss = batch_loss / batch_size

        loss.backward()

        # clip gradient
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

        optimizer.step()

        batch_losses_val = batch_loss.item()
        report_loss += batch_losses_val
        cum_loss += batch_losses_val

        tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
        report_tgt_words += tgt_words_num_to_predict
        cum_tgt_words += tgt_words_num_to_predict
        report_examples += batch_size
        cum_examples += batch_size

        if train_iter % log_every == 0:
            print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                  'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                     report_loss / report_examples,
                                                                                     math.exp(report_loss / report_tgt_words),
                                                                                     cum_examples,
                                                                                     report_tgt_words / (time.time() - train_time),
                                                                                     time.time() - begin_time), file=sys.stderr)

            # start a new logging window
            train_time = time.time()
            report_loss = report_tgt_words = report_examples = 0.

        # perform validation
        if train_iter % valid_niter == 0:
            print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                     cum_loss / cum_examples,
                                                                                     np.exp(cum_loss / cum_tgt_words),
                                                                                     cum_examples), file=sys.stderr)

            cum_loss = cum_examples = cum_tgt_words = 0.
            valid_num += 1

            print('begin validation ...', file=sys.stderr)

            # compute dev. ppl; it is negated so that a higher metric means a better model
            dev_ppl = evaluate_ppl(model, dev_data, batch_size=128)   # dev batch size can be a bit larger
            valid_metric = -dev_ppl

            print('validation: iter %d, dev. ppl %f' % (train_iter, dev_ppl), file=sys.stderr)

            is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
            hist_valid_scores.append(valid_metric)

            if is_better:
                patience = 0
                print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
            elif patience < int(max_patience):
                patience += 1
                print('hit patience %d' % patience, file=sys.stderr)

                if patience == int(max_patience):
                    num_trial += 1
                    print('hit #%d trial' % num_trial, file=sys.stderr)
                    if num_trial == int(max_trial):
                        print('early stop!', file=sys.stderr)
                        # leave the batch loop now; the flag ends the epoch loop too.
                        # The best checkpoint stays on disk at `model_save_path`.
                        stop_training = True
                        break

                    # decay lr, and restore from previously best checkpoint
                    lr = optimizer.param_groups[0]['lr'] * float(lr_decay)
                    print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                    # load model
                    params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

                    # reset patience
                    patience = 0

    # checked once per completed epoch, so the limit cannot be skipped when an
    # epoch happens to contain no validation step
    if not stop_training and epoch >= int(max_epoch):
        print('reached maximum number of epochs!', file=sys.stderr)
        stop_training = True

epoch 1, iter 100, avg. loss 132.87, avg. ppl 1846.13 cum. examples 6400, speed 2936.88 words/sec, time elapsed 38.50 sec
epoch 1, iter 200, avg. loss 117.89, avg. ppl 797.09 cum. examples 12800, speed 3043.33 words/sec, time elapsed 75.61 sec
epoch 1, iter 300, avg. loss 111.04, avg. ppl 526.82 cum. examples 19200, speed 3050.67 words/sec, time elapsed 112.78 sec
epoch 1, iter 400, avg. loss 104.73, avg. ppl 382.67 cum. examples 25600, speed 3026.76 words/sec, time elapsed 150.02 sec
epoch 1, iter 500, avg. loss 102.50, avg. ppl 312.44 cum. examples 32000, speed 3042.45 words/sec, time elapsed 187.55 sec
epoch 1, iter 600, avg. loss 97.50, avg. ppl 249.20 cum. examples 38400, speed 3046.70 words/sec, time elapsed 224.67 sec
epoch 1, iter 700, avg. loss 93.93, avg. ppl 207.86 cum. examples 44800, speed 3011.69 words/sec, time elapsed 262.07 sec
epoch 1, iter 800, avg. loss 90.99, avg. ppl 174.51 cum. examples 51200, speed 2978.75 words/sec, time elapsed 299.94 sec
epoch 1, iter 900, av

KeyboardInterrupt: ignored

## Test

In [0]:
def beam_search(model: NMT, test_data_src: List[List[str]], beam_size: int, max_decoding_time_step: int) -> List[List[Hypothesis]]:
    """ Run beam search to construct hypotheses for a list of src-language sentences.
    The model is put in eval mode while decoding and returned to its previous
    train/eval mode afterwards, including when decoding raises or is interrupted.
    @param model (NMT): NMT Model
    @param test_data_src (List[List[str]]): List of sentences (words) in source language, from test set.
    @param beam_size (int): beam_size (# of hypotheses to hold for a translation at every step)
    @param max_decoding_time_step (int): maximum sentence length that Beam search can produce
    @returns hypotheses (List[List[Hypothesis]]): List of Hypothesis translations for every source sentence.
    """
    was_training = model.training
    model.eval()

    hypotheses = []
    try:
        # decoding needs no gradients
        with torch.no_grad():
            # sentences are decoded one at a time; tqdm draws the progress bar on stdout
            for src_sent in tqdm(test_data_src, desc='Decoding', file=sys.stdout):
                example_hyps = model.beam_search(src_sent, beam_size=beam_size, max_decoding_time_step=max_decoding_time_step)

                hypotheses.append(example_hyps)
    finally:
        # a long decode is easy to interrupt from a notebook; do not leave the
        # model stuck in eval mode when that happens
        model.train(was_training)

    return hypotheses

In [0]:
def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """ Compute the corpus-level BLEU score of decoded hypotheses against gold references.
    @param references (List[List[str]]): gold-standard target sentences, one per hypothesis
    @param hypotheses (List[Hypothesis]): decoded hypotheses; the `value` of each is scored
    @returns bleu_score: corpus-level BLEU score
    """
    # Only the first reference is inspected: if it opens with `<s>`, the first
    # and last token are dropped from every reference.
    has_boundary_tokens = references[0][0] == '<s>'
    if has_boundary_tokens:
        references = [ref[1:-1] for ref in references]

    # corpus_bleu takes a list of candidate references per hypothesis; here each has exactly one
    reference_sets = [[ref] for ref in references]
    candidates = [hyp.value for hyp in hypotheses]
    return corpus_bleu(reference_sets, candidates)

In [0]:
##Test


In [0]:
def decode(beam_size: int = 5, max_decoding_time_step: int = 70, output_file: str = 'test_output.txt'):
    """ Performs decoding on the test set, and saves the best-scoring decoding results.
    The gold-standard target sentences are loaded as well and used to compute the
    corpus-level BLEU score, which is printed to stderr.

    Reads the notebook-level names `test_es`, `test_en` and `model_save_path`.
    @param beam_size (int): number of hypotheses kept at every beam-search step
    @param max_decoding_time_step (int): maximum length of a decoded sentence
    @param output_file (str): file the top hypothesis of every sentence is written to, one per line
    """

    print("load test source sentences", file=sys.stderr)
    test_data_src = read_corpus(test_es, source='src')

    print("load test target sentences", file=sys.stderr)
    test_data_tgt = read_corpus(test_en, source='tgt')

    print("load trained model", file=sys.stderr)
    model = NMT.load(model_save_path)

    # decode on the first CUDA device when one is visible, otherwise stay on the CPU
    if torch.cuda.device_count()>0:
        print("Transfer to cuda!!")
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model, test_data_src,
                             beam_size=beam_size,
                             max_decoding_time_step=max_decoding_time_step)

    # hypotheses come back sorted best-first, so index 0 is the top-scoring one
    top_hypotheses = [hyps[0] for hyps in hypotheses]
    bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
    print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    # explicit encoding so the output does not depend on the platform default
    with open(output_file, 'w', encoding='utf-8') as f:
        for top_hyp in top_hypotheses:
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')

In [31]:
# Decode the test set with the checkpoint at `model_save_path`; this prints the
# corpus BLEU score and writes the top hypotheses to test_output.txt
decode()

load test source sentences
load test target sentences
load trained model


Transfer to cuda!!
Decoding: 100%|██████████| 8064/8064 [11:32<00:00, 11.65it/s]


Corpus BLEU: 22.08793926290774


## B