In [1]:
import numpy as np
from utils import batch_iter
from vocab import Vocab
import torch
import torch.nn as nn
import nltk

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shaozhetao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


helper fcn

In [2]:
def read_corpus(file_path, source):
    # Understood
    """ Read file, where each sentence is dilineated by a `\n`.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    """
    data = []
    for line in open(file_path):
        sent = nltk.word_tokenize(line)
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)

    return data

define config

In [3]:
BATCH_SIZE = 5
EMBED_SIZE = 4
HIDDEN_SIZE = 3
DROPOUT_RATE = 0.0
seed = 1234

In [4]:
# Load training data & vocabulary
train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src')
train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt')
train_data = list(zip(train_data_src, train_data_tgt))

for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
    src_sents = src_sents
    tgt_sents = tgt_sents
    break
vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') 

### Section 0 - Context 

1. understanding vocab

In [5]:
vocab.src.word2id 
# 1. shows dict(key=word, val=id) # rank <pad>:0, <s>:1, </s>:2
# 2. vocab.src/tgt  is VocabEntry object
type(vocab.tgt )

vocab.VocabEntry

2. Define backbone Embedding and NMT - assume nothing added

In [6]:
from typing import List, Tuple, Dict, Set, Union
class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """
    def __init__(self, embed_size, vocab):
        super(ModelEmbeddings, self).__init__()
        self.embed_size = embed_size

        # default values
        self.source = None
        self.target = None

        src_pad_token_idx = vocab.src['<pad>']
        tgt_pad_token_idx = vocab.tgt['<pad>']

        self.source = nn.Embedding(len(vocab.src), embed_size, src_pad_token_idx)
        self.target = nn.Embedding(len(vocab.tgt), embed_size, tgt_pad_token_idx)

class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
#         self.encoder = None 
#         self.decoder = None
#         self.h_projection = None
#         self.c_projection = None
#         self.att_projection = None
#         self.combined_output_projection = None
#         self.target_vocab_projection = None
#         self.dropout = None
        # For sanity check only, not relevant to implementation
        self.gen_sanity_check = False
        self.counter = 0
        
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.hidden_size, bidirectional=True, bias=True)
        self.decoder = nn.LSTMCell(input_size=embed_size+self.hidden_size, hidden_size=self.hidden_size, bias=True)

        self.h_projection = nn.Linear(self.hidden_size*2, self.hidden_size, bias=False)
        self.c_projection = nn.Linear(self.hidden_size*2, self.hidden_size, bias=False)

        self.att_projection = nn.Linear(self.hidden_size*2, self.hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(self.hidden_size*3, self.hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)
        self.dropout = nn.Dropout(self.dropout_rate)

In [7]:
model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

In [8]:
model

NMT(
  (model_embeddings): ModelEmbeddings(
    (source): Embedding(77, 4, padding_idx=0)
    (target): Embedding(85, 4, padding_idx=0)
  )
  (encoder): LSTM(4, 3, bidirectional=True)
  (decoder): LSTMCell(7, 3)
  (h_projection): Linear(in_features=6, out_features=3, bias=False)
  (c_projection): Linear(in_features=6, out_features=3, bias=False)
  (att_projection): Linear(in_features=6, out_features=3, bias=False)
  (combined_output_projection): Linear(in_features=9, out_features=3, bias=False)
  (target_vocab_projection): Linear(in_features=3, out_features=85, bias=False)
  (dropout): Dropout(p=0.0, inplace=False)
)

3. The data we work on testing src_sents

In [9]:
wordids = [[vocab.src[w] for w in s] for s in src_sents]

### Section1: Encoder

In [10]:
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [11]:
source_lengths = [len(s) for s in src_sents]
source_padded = model.vocab.src.to_input_tensor(src_sents, device='cpu') # (src_len, batch)

In [12]:
source_padded.shape  # torch.Size([22, 5]) dim x batch_size

torch.Size([25, 5])

In [13]:
model_embeddings = ModelEmbeddings(EMBED_SIZE, vocab)
X = model_embeddings.source(source_padded) #(src_len, b, e)

In [14]:
Xz = pack_padded_sequence(X, torch.tensor(source_lengths)) #packed (src_len, b, e)

Define encoder

In [15]:
encoder = nn.LSTM(input_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, bidirectional=True, bias=True)
enc_hiddens_pack, (last_hidden, last_cell) = encoder(Xz) #(src_len, b, h*2)

In [16]:
encoder # input size is embedding size 4, hidden size is hidden size???

LSTM(4, 3, bidirectional=True)

In [17]:
last_hidden.shape # (Bidirectional * b, hidden)

torch.Size([2, 5, 3])

In [18]:
last_cell.shape #(Bidirectional * b, hidden)

torch.Size([2, 5, 3])

In [19]:
enc_hiddens, source_lengths = pad_packed_sequence(enc_hiddens_pack, batch_first=True)

# source_lengths: tensor([22, 14, 10, 10,  6]), defined earlier in embedding
# enc_hiddens.shape, (b, src_len, h*2) e.g. torch.Size([5, 22, 6]) 这里的src_len可以想象成每一步 rnn

### Section 2 decoder+attention

target prep

In [20]:
target_padded = vocab.tgt.to_input_tensor(tgt_sents, device='cpu') #torch.Size([24, 5])

In [21]:
target_padded = target_padded[:-1] # torch.Size([23, 5])

In [22]:
Y = model_embeddings.target(target_padded) #torch.Size([23, 5, 4])

In [23]:
Y_ts = torch.split(Y, 1) #spliting into a tuple of 23 elements, each is tensor of [5,4]

#### section 2.1 decode step

In [24]:
hidden_size = HIDDEN_SIZE
h_projection = nn.Linear(hidden_size*2, hidden_size, bias=False)
c_projection = nn.Linear(hidden_size*2, hidden_size, bias=False)

In [25]:
last_hidden.shape

torch.Size([2, 5, 3])

In [26]:
# init_decoder_hidden/cell: first concate bidirectional hidden together to get h*2, and then a linear layer to h
init_decoder_hidden = h_projection(torch.cat((last_hidden[0], last_hidden[1]),1))  #(b, h)
init_decoder_cell = c_projection(torch.cat((last_cell[0], last_cell[1]), 1)) #(b, h)

dec_init_state = (init_decoder_hidden, init_decoder_cell) # a tuple of 2 (b,h)

#### Section 2.2 now attention, in the decoder

In [27]:
att_projection = nn.Linear(hidden_size*2, hidden_size, bias=False)
enc_hiddens_proj = att_projection(enc_hiddens) #(b, src_len, h*2) -> (b, src_len, h)

In [28]:
o_prev = torch.zeros(BATCH_SIZE, HIDDEN_SIZE, device='cpu') #=(b, h)

**note of one step**



In [29]:
import torch.nn.functional as F
combined_output_projection = nn.Linear(hidden_size*3, hidden_size, bias=False)
dropout = nn.Dropout(0)

example of one step in attention (not in the final, only show)

In [30]:
dec_state = dec_init_state
dec_hidden = dec_state[0] #(b,e)
Y_t = Y_ts[0]
Y_t = torch.squeeze(Y_t)  # torch.Size([1, 5, 4]) -> torch.Size([5, 4]), (b,e) target embedding
Ybar_t = torch.cat((Y_t, o_prev), 1) # (b, e+h)
# enc_hiddens_proj.shape

In [31]:
unsqueeze_dec_hidden = dec_hidden.unsqueeze(2) # torch.Size([5, 3, 1]) (b, h) -> (b, h, 1)

In [32]:
e_t = torch.bmm(enc_hiddens_proj, unsqueeze_dec_hidden).squeeze(2) #    enc_hiddens_proj=(b, src_len, h)
# enc_hiddens_proj=(b, src_len, h) -> e_t =(b, src_len, 1) -> e_t =(b, src_len)
# all encoding steps are saved in src_len of enc_hiddens_proj
# here key = enc_hiddens, query is specific dec_hidden, in this example is dec_init_state[0]

In [33]:
alpha_t = F.softmax(e_t, dim=1) # (b, src_lens): This shows attention to all encoding layer

a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1) #(b, h*2): This provides attention results 
# Here value set is enc_hiddens

u_t = torch.cat((a_t, dec_hidden), dim=1) #(b, h*3)  #concat attention with hidden and pass to next state
v_t = combined_output_projection(u_t) #(b, h) #
O_t = dropout(torch.tanh(v_t)) #(b, h)

**note of one step compelte**

Complete one step example of attention!

Let's put things together! Following is for all loop of decoder step (each loop walks through all encoder for attention)

In [34]:
# Define decoder 
decoder = nn.LSTMCell(input_size=EMBED_SIZE+HIDDEN_SIZE, hidden_size=HIDDEN_SIZE, bias=True)
# 不直接用LSTM作为decoder是因为， 每个decoder出来的(dec_hidden, dec_cell)将被用来求attention

In [35]:
dec_state[1]

tensor([[-0.1309, -0.0195, -0.1640],
        [-0.1616, -0.0361, -0.1463],
        [ 0.1730, -0.0910, -0.3136],
        [-0.1822, -0.0143, -0.0988],
        [ 0.2522, -0.0716, -0.3942]], grad_fn=<MmBackward>)

In [36]:
## Define a mask
enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
for e_id, src_len in enumerate(source_lengths):
    enc_masks[e_id, src_len:] = 1

In [37]:
# already defined o_prev, dec_state, Y, Y_ts, enc_hidden_proj
combined_outputs = []
for i in range(len(Y_ts)):
    Y_t = Y_ts[i]
    Y_t = torch.squeeze(Y_t)
    Ybar_t = torch.cat((Y_t, o_prev), 1)
    # one loop
    dec_hidden = dec_state[0]
    unsqueeze_dec_hidden = dec_hidden.unsqueeze(2) 
    e_t = torch.bmm(enc_hiddens_proj, unsqueeze_dec_hidden).squeeze(2)
    # 这里可能要一个masks
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))
    alpha_t = F.softmax(e_t, dim=1) 
    a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)
    u_t = torch.cat((a_t, dec_hidden), dim=1)
    v_t = combined_output_projection(u_t)
    o_t = dropout(torch.tanh(v_t))
    o_prev = o_t
    # one loop complete
    dec_state = decoder(Ybar_t, dec_state) # refresh dec_state
    combined_outputs.append(o_t)
combined_outputs = torch.stack(combined_outputs, dim=0)
    

In [38]:
len(combined_outputs)

23

In [39]:
combined_outputs[0]

tensor([[-0.0898,  0.0389,  0.0026],
        [-0.1102,  0.0464,  0.0089],
        [-0.0738,  0.0496,  0.0310],
        [-0.0970,  0.0252,  0.0082],
        [-0.0419,  0.0445,  0.0305]], grad_fn=<SelectBackward>)

#### Section 2.X validation - no way to validate due to embed_size and seed

In [None]:
combined_outputs_target = torch.load('./sanity_check_en_es_data/combined_outputs.pkl')
print(combined_outputs_target.shape)

In [None]:
combined_outputs_target[0]

In [None]:
dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl')

In [None]:
dec_init_state[0]

### Section 3: Loss and forward

In [None]:
target_vocab_projection = nn.Linear(HIDDEN_SIZE, len(vocab.tgt), bias=False)

In [None]:
P = F.log_softmax(target_vocab_projection(combined_outputs), dim=-1)

In [None]:
target_masks = (target_padded != vocab.tgt['<pad>']).float()

In [None]:
target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]

In [None]:
scores = target_gold_words_log_prob.sum(dim=0)