In [1]:
import numpy as np
from utils import batch_iter
from vocab import Vocab
import torch
import torch.nn as nn
import nltk

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shaozhetao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def read_corpus(file_path, source):
    # Understood
    """ Read file, where each sentence is dilineated by a `\n`.
    @param file_path (str): path to file containing corpus
    @param source (str): "tgt" or "src" indicating whether text
        is of the source language or target language
    """
    data = []
    for line in open(file_path):
        sent = nltk.word_tokenize(line)
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)

    return data

In [3]:
BATCH_SIZE = 5
EMBED_SIZE = 3
HIDDEN_SIZE = 3
DROPOUT_RATE = 0.0

In [4]:
seed = 1234
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed * 13 // 7)

# Load training data & vocabulary
train_data_src = read_corpus('./sanity_check_en_es_data/train_sanity_check.es', 'src')
train_data_tgt = read_corpus('./sanity_check_en_es_data/train_sanity_check.en', 'tgt')
train_data = list(zip(train_data_src, train_data_tgt))

for src_sents, tgt_sents in batch_iter(train_data, batch_size=BATCH_SIZE, shuffle=True):
    src_sents = src_sents
    tgt_sents = tgt_sents
    break
vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') 

In [5]:
import json
file_path = './sanity_check_en_es_data/vocab_sanity_check.json'
entry = json.load(open(file_path, 'r'))
# src_word2id = entry['src_word2id']
# tgt_word2id = entry['tgt_word2id']

In [6]:
type(vocab.src) # class of VocabEntry)

vocab.VocabEntry

In [7]:
'Comencemos' in vocab.src.word2id
vocab.src.word2id['por']

29

In [8]:
wordids = [[vocab.src[w] for w in s] for s in src_sents  ]

In [9]:
wordids

[[3, 29, 3, 7, 12, 30, 3, 4, 8, 3, 3, 3, 8, 3, 15, 8, 3, 11, 6, 3, 3, 3],
 [3, 6, 3, 4, 3, 4, 3, 3, 3, 3, 9, 3, 3, 3],
 [3, 5, 47, 3, 6, 3, 3, 6, 3, 3],
 [3, 34, 20, 35, 24, 7, 8, 3, 3, 3],
 [3, 3, 3, 3, 3, 3]]

In [10]:
# torch.t(torch.tensor(pad_sents(wordids, 0)))

In [253]:
model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=vocab)

In [254]:
model

NMT(
  (model_embeddings): ModelEmbeddings(
    (source): Embedding(77, 3, padding_idx=0)
    (target): Embedding(85, 3, padding_idx=0)
  )
)

In [16]:
source_lengths = [len(s) for s in src_sents]
source_padded = model.vocab.src.to_input_tensor(src_sents, device='cpu') # (src_len, batch)

In [17]:
source_lengths

[22, 14, 10, 10, 6]

In [18]:
source_padded.T

tensor([[ 3, 29,  3,  7, 12, 30,  3,  4,  8,  3,  3,  3,  8,  3, 15,  8,  3, 11,
          6,  3,  3,  3],
        [ 3,  6,  3,  4,  3,  4,  3,  3,  3,  3,  9,  3,  3,  3,  0,  0,  0,  0,
          0,  0,  0,  0],
        [ 3,  5, 47,  3,  6,  3,  3,  6,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [ 3, 34, 20, 35, 24,  7,  8,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [ 3,  3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0]])

In [19]:
model_embeddings = ModelEmbeddings(EMBED_SIZE, vocab)
X = model_embeddings.source(source_padded)

In [20]:
X.shape

torch.Size([22, 5, 3])

In [21]:
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [22]:
Xz = pack_padded_sequence(X, torch.tensor(source_lengths)) #packed (src_len, b, e)

In [237]:
# X

In [238]:
# Xz

In [101]:
encoder = nn.LSTM(input_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, bidirectional=True, bias=True)
enc_hiddens, (last_hidden, last_cell) = encoder(Xz) #(src_len, b, h*2)

In [102]:
X.shape

torch.Size([22, 5, 3])

In [103]:
last_hidden[0].shape

torch.Size([5, 3])

In [104]:
enc_hiddens

PackedSequence(data=tensor([[-1.8370e-02, -8.4587e-03,  1.5130e-01,  5.3777e-02,  2.7016e-01,
         -2.1867e-01],
        [-1.8370e-02, -8.4587e-03,  1.5130e-01,  1.2228e-01,  2.7418e-01,
         -4.2072e-01],
        [-1.8370e-02, -8.4587e-03,  1.5130e-01,  1.7103e-01,  2.6669e-01,
         -1.9632e-01],
        [-1.8370e-02, -8.4587e-03,  1.5130e-01,  7.4291e-02,  2.7799e-01,
         -1.5331e-01],
        [-1.8370e-02, -8.4587e-03,  1.5130e-01, -7.2549e-02,  2.4953e-01,
         -6.9482e-02],
        [-1.0473e-02,  1.4300e-01,  1.5000e-01,  1.8946e-01, -6.6002e-03,
         -1.8303e-01],
        [-2.8854e-02,  1.0664e-01, -3.0446e-02,  1.2274e-01,  1.1017e-01,
         -1.7836e-01],
        [ 1.6446e-02,  2.7651e-01,  1.6091e-01,  1.8394e-01, -6.2075e-02,
         -1.5496e-01],
        [-2.2691e-02,  1.4687e-01,  2.2541e-01,  2.7052e-01, -7.9282e-02,
         -1.6494e-01],
        [-2.7597e-02, -1.6700e-02,  2.0493e-01, -7.3162e-02,  2.5088e-01,
         -6.3658e-02],
        [-

In [105]:
last_hidden

tensor([[[-0.0439, -0.0101,  0.2179],
         [-0.0289, -0.0146,  0.2324],
         [-0.0392,  0.0023,  0.1961],
         [-0.0282, -0.0126,  0.2293],
         [-0.0393, -0.0263,  0.2346]],

        [[ 0.0538,  0.2702, -0.2187],
         [ 0.1223,  0.2742, -0.4207],
         [ 0.1710,  0.2667, -0.1963],
         [ 0.0743,  0.2780, -0.1533],
         [-0.0725,  0.2495, -0.0695]]], grad_fn=<StackBackward>)

In [106]:
enc_hiddens, b = pad_packed_sequence(enc_hiddens, batch_first=True)

In [107]:
enc_hiddens.shape

torch.Size([5, 22, 6])

In [110]:
enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)

In [111]:
source_lengths

[22, 14, 10, 10, 6]

In [112]:
for e_id, src_len in enumerate(source_lengths):
    print (e_id, src_len)
    enc_masks[e_id, src_len:]=1

0 22
1 14
2 10
3 10
4 6


In [113]:
enc_masks

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.]])

In [26]:
source_padded.shape

torch.Size([22, 5])

In [27]:
X.shape

torch.Size([22, 5, 3])

In [28]:
enc_hiddens_target = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl')
dec_init_state_target = torch.load('./sanity_check_en_es_data/dec_init_state.pkl')

In [29]:
enc_hiddens_target.shape #torch.Size([5, 22, 6])

torch.Size([5, 22, 6])

In [30]:
dec_init_state_target[1].shape #([5,3], [5,3])

torch.Size([5, 3])

## Loss function

In [184]:
# target_padded

In [186]:
vocab.tgt['<pad>']

0

In [189]:
target_masks=(target_padded != vocab.tgt['<pad>']).float()

## Question 1e

In [291]:
enc_hiddens.shape

torch.Size([5, 22, 6])

In [292]:
hidden_size = HIDDEN_SIZE

In [293]:
att_projection = nn.Linear(hidden_size*2, hidden_size, bias=False)

In [294]:
att_projection(enc_hiddens).shape

torch.Size([5, 22, 3])

In [295]:
model_embeddings.target(target_padded)

tensor([[[-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926]],

        [[-2.0323, -0.3952,  0.7286],
         [ 0.1590,  0.6634, -0.2352],
         [-0.2974, -0.6724,  0.8060],
         [-2.0323, -0.3952,  0.7286],
         [ 0.4922,  1.7666,  0.0404]],

        [[-2.0323, -0.3952,  0.7286],
         [ 1.3579,  1.3583, -0.5818],
         [-2.0323, -0.3952,  0.7286],
         [-1.4359,  0.7409, -1.6357],
         [ 0.4461, -1.6570,  0.6247]],

        [[-2.0323, -0.3952,  0.7286],
         [-2.0323, -0.3952,  0.7286],
         [ 2.9154, -0.1985,  0.7507],
         [-1.2673, -0.1306, -0.3743],
         [-0.3204,  1.5569, -1.4629]],

        [[ 0.3047,  0.8214, -2.6705],
         [-0.3204,  1.5569, -1.4629],
         [-0.7392, -0.1709,  1.2120],
         [-0.0135,  0.9174, -1.8431],
         [-2.0323, -0.3952,  0.7286]],

        [[ 1.3099, -1.3060, -1.4669],
  

**target explanation**

In [153]:
target_padded.shape

torch.Size([24, 5])

In [154]:
target_padded = target_padded[:-1]

In [156]:
target_padded.shape

torch.Size([23, 5])

In [123]:
[len(t) for t in tgt_sents]

[24, 15, 15, 15, 13]

In [129]:
mytgt = vocab.tgt.to_input_tensor(tgt_sents, device='cpu')
# tgt_sents #english while src_sents is spanish
# mytgt = target_padded
# target_padded

In [200]:
#tgt_sents[0] ['<s>', 'Let', "'s'", 'start', ...]

#src_sents[0] ['Comencemos', 'por', 'pensar', ...]

look back to target paddwed

In [290]:
target_padded.shape

torch.Size([23, 5])

In [164]:
Y = model_embeddings.target(target_padded)
Y.shape

torch.Size([23, 5, 3])

In [168]:
x = torch.split(Y, 1, dim=0)

In [177]:
x[0].size()

torch.Size([1, 5, 3])

In [202]:
x[0] # this is embedding for <s> and you need learn embedding

tensor([[[-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926],
         [-0.3120,  2.4534,  0.7926]]], grad_fn=<SplitBackward>)

In [203]:
torch.squeeze(x[0]).shape

torch.Size([5, 3])

In [175]:
Yt.shape

torch.Size([23, 5, 3])

In [303]:
dec_state #(two tensor = dec_init_state)
enc_hiddens.shape #torch.Size([5, 22, 6])

torch.Size([5, 22, 6])

In [276]:
enc_hiddens_proj.shape #torch.Size([5, 22, 3])

torch.Size([5, 22, 3])

In [275]:
enc_hiddens_proj = att_projection(enc_hiddens)

In [279]:
enc_masks.shape #torch.Size([5, 22])

torch.Size([5, 22])

In [280]:
Ybar_t.shape  #torch.Size([5, 6])

torch.Size([5, 6])

In [302]:
o_prev.shape # torch.Size([5, 3])

torch.Size([5, 3])

In [304]:
len(Y_ts)

23

In [347]:
COUNTER = [0]
def stepFunction(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks):
   dec_state = torch.load('./sanity_check_en_es_data/step_dec_state_{}.pkl'.format(COUNTER[0]))
   o_t = torch.load('./sanity_check_en_es_data/step_o_t_{}.pkl'.format(COUNTER[0]))
   COUNTER[0]+=1
   return dec_state, o_t, None
model.step = stepFunction

In [348]:
o_prev = torch.zeros(5, 3, device='cpu')
combined_outputs = []
Y = model_embeddings.target(target_padded) # 23, 已经去掉最后一个</s>
Y_ts = torch.split(Y, 1)
for i in range(len(Y_ts)):
#     print (i)
    Y_t = Y_ts[i]
    Y_t = torch.squeeze(Y_t)
    Ybar_t = torch.cat((Y_t, o_prev), 1)
    dec_state, o_t, _ = model.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj,
                                                        enc_masks)
#     o_prev = o_t
    combined_outputs.append(o_t)
    o_prev = o_t
#     print (o_t.shape)
combined_outputs = torch.stack(combined_outputs, dim=0)

In [353]:
Y.shape

torch.Size([23, 5, 3])

In [352]:
len(Y_ts)

23

In [351]:
len(combined_outputs)

23

In [344]:
# combined_outputs consistent with combined_outputs_target from loading output

**loading output**

In [114]:
dec_init_state = torch.load('./sanity_check_en_es_data/dec_init_state.pkl')
enc_hiddens = torch.load('./sanity_check_en_es_data/enc_hiddens.pkl')
enc_masks = torch.load('./sanity_check_en_es_data/enc_masks.pkl')
target_padded = torch.load('./sanity_check_en_es_data/target_padded.pkl')


In [119]:
enc_hiddens.shape

torch.Size([5, 22, 6])

In [126]:
target_padded.shape #correct with tgt_sents

torch.Size([24, 5])

In [116]:
combined_outputs_target = torch.load('./sanity_check_en_es_data/combined_outputs.pkl')
print(combined_outputs_target.shape)

torch.Size([23, 5, 3])


In [258]:
enc_hiddens.shape

torch.Size([5, 22, 6])

In [262]:
xx = combined_outputs_target.numpy()

In [268]:
len(xx[0]) # 22*5*3

5

In [259]:
# combined_outputs_pred = model.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)

## Question 1f

In [27]:
def reinitialize_layers(model):
    """ Reinitialize the Layer Weights for Sanity Checks.
    """
    def init_weights(m):
        if type(m) == nn.Linear:
            m.weight.data.fill_(0.3)
            if m.bias is not None:
                m.bias.data.fill_(0.1)
        elif type(m) == nn.Embedding:
            m.weight.data.fill_(0.15)
        elif type(m) == nn.Dropout:
            nn.Dropout(DROPOUT_RATE)
    with torch.no_grad():
        model.apply(init_weights)

In [252]:
from typing import List, Tuple, Dict, Set, Union
class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """
    def __init__(self, embed_size, vocab):
        super(ModelEmbeddings, self).__init__()
        self.embed_size = embed_size

        # default values
        self.source = None
        self.target = None

        src_pad_token_idx = vocab.src['<pad>']
        tgt_pad_token_idx = vocab.tgt['<pad>']

        self.source = nn.Embedding(len(vocab.src), embed_size, src_pad_token_idx)
        self.target = nn.Embedding(len(vocab.tgt), embed_size, tgt_pad_token_idx)

class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None 
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None
        # For sanity check only, not relevant to implementation
        self.gen_sanity_check = False
        self.counter = 0

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
                dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop of the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device='cpu')

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings.target(target_padded)
        Y_ts = torch.split(Y, 1)
        for Y_t in Y_ts:
            Y_t = torch.squeeze(Y_t)
            Ybar_t = torch.cat((Y_t, o_prev), 1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj,
                                                        enc_masks)
            combined_outputs.append(o_t.tolist())
            o_prev = o_t
        combined_outputs = torch.stack(combined_outputs, dim=0)



In [None]:
def words2indices(sents):
    """ Convert list of words or list of sentences of words
    into list or list of list of indices.
    @param sents (list[str] or list[list[str]]): sentence(s) in words
    @return word_ids (list[int] or list[list[int]]): sentence(s) in indices
    """
    if type(sents[0]) == list:
        return [[[w] for w in s] for s in sents]
    else:
        return [[w] for w in sents]

In [229]:
t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

In [234]:
t.gather(0, torch.tensor([[2,1, 0]]))

tensor([[7, 5, 3]])

In [235]:
model

NMT(
  (model_embeddings): ModelEmbeddings(
    (source): Embedding(77, 3, padding_idx=0)
    (target): Embedding(85, 3, padding_idx=0)
  )
)

In [236]:
model.train()

NMT(
  (model_embeddings): ModelEmbeddings(
    (source): Embedding(77, 3, padding_idx=0)
    (target): Embedding(85, 3, padding_idx=0)
  )
)