In [1]:
from torch.autograd import Function, NestedIOFunction, Variable
import torch.backends.cudnn as cudnn
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend

In [1]:
# torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, Iterator, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchnlp.metrics import get_moses_multi_bleu
from NMTutils import get_parser, build_data, get_model_config, evaluation

from decoder import Decoder
from encoder import Encoder
from attention import Attention
# others
import argparse
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from collections import defaultdict

In [2]:
# USE_CUDA = torch.cuda.is_available()
# DEVICE = torch.cuda.current_device()
USE_CUDA = False
DEVICE = -1

In [3]:
lang1 = 'eng'
lang2 = 'fra'
modelcode_small = ['111301', '111311', '111301', '111311', '111311', 
                  '111311', '111311', '122421', '122521', '122621',
                  '122622', '222421', '222521', '222421']
modelcode_filtered = ['322521', '322421']

model_idx = 1
config, test_data, test_loader, SOURCE, TARGET = get_model_config(modelcode_small[model_idx-1], lang1, lang2,
                                                                 device=DEVICE)

In [4]:
for batch in test_loader:
    inputs, lengths = batch.so
    targets = batch.ta
    break
lengths = lengths.tolist()

---


## Beam Search

https://github.com/IBM/pytorch-seq2seq/blob/master/seq2seq/models/TopKDecoder.py

In [45]:
emd_size = 10
hid_size = 12
embed = nn.Embedding(len(SOURCE.vocab), emd_size)
gru = nn.GRU(emd_size, hid_size, 3, batch_first=True, bidirectional=True)

In [46]:
embed_dec = nn.Embedding(len(TARGET.vocab), emd_size)
gru_dec = nn.GRU(emd_size+2*hid_size, 2*hid_size, 1, batch_first=True, bidirectional=False)
linear = nn.Linear(2*2*hid_size, len(TARGET.vocab))
attention = Attention(2*hid_size)

In [47]:
sos = torch.LongTensor([2]*inputs.size(0)).unsqueeze(1)

In [48]:
embeded = embed(inputs)
print(embeded.size())
packed = pack_padded_sequence(embeded, lengths, batch_first=True)
outputs, hidden = gru(packed)
print(hidden.size())
outputs, output_lengths = pad_packed_sequence(outputs, batch_first=True)
print(outputs.size())
hidden = torch.cat([h for h in hidden[-2:]], 1).unsqueeze(1)
print(hidden.size())

torch.Size([64, 9, 10])
torch.Size([6, 64, 12])
torch.Size([64, 9, 24])
torch.Size([64, 1, 24])


In [72]:
embeded_dec = embed_dec(sos)
print(embeded_dec.size())
context, weights = attention(hidden, outputs, output_lengths.tolist(), return_weight=True)
gru_input = torch.cat([embeded_dec, context], 2)
print(gru_input.size())
_, hidden = gru_dec(gru_input, hidden.transpose(0, 1))
print(hidden.size())
hidden = hidden.transpose(0, 1)
concated = torch.cat([hidden, context], 2)
print(concated.size())
score = linear(concated.squeeze(1))

torch.Size([64, 1, 10])
torch.Size([64, 1, 34])
torch.Size([1, 64, 24])
torch.Size([64, 1, 48])


In [124]:
beam_idxes = defaultdict(dict)
scores = []

In [115]:
sorted_scores, sorted_idxes = score.sort(dim=1, descending=True)
ktop_idxes = sorted_idxes[:, :5]
ktop_scores = sorted_scores[:, :5]

In [121]:
scores.append(ktop_scores)

In [125]:
beam_idxes[1]['score'] = ktop_scores
beam_idxes[1]['idxes'] = ktop_idxes

In [101]:
x = torch.randint(0, 10, (4, 5))

In [102]:
x

tensor([[ 1.,  0.,  6.,  6.,  3.],
        [ 1.,  3.,  1.,  0.,  2.],
        [ 8.,  1.,  8.,  6.,  6.],
        [ 4.,  0.,  6.,  8.,  9.]])

In [104]:
score.sort(dim=1, descending=True)

(tensor([[ 6.0359e-01,  5.8120e-01,  5.5645e-01,  ..., -6.0914e-01,
          -6.5717e-01, -6.7255e-01],
         [ 5.9112e-01,  5.7375e-01,  5.5910e-01,  ..., -6.0132e-01,
          -6.3573e-01, -6.4145e-01],
         [ 6.0446e-01,  5.8701e-01,  5.8411e-01,  ..., -6.1869e-01,
          -6.6643e-01, -6.8972e-01],
         ...,
         [ 6.0359e-01,  5.8120e-01,  5.5645e-01,  ..., -6.0914e-01,
          -6.5717e-01, -6.7255e-01],
         [ 6.1332e-01,  5.4864e-01,  5.3648e-01,  ..., -5.7529e-01,
          -6.4074e-01, -6.4379e-01],
         [ 6.0347e-01,  6.0111e-01,  5.9818e-01,  ..., -6.1534e-01,
          -6.7679e-01, -7.0563e-01]]),
 tensor([[ 9308,  7745,  5762,  ...,  4880,  1378,  1262],
         [ 9308,  7745,  3605,  ...,  4880,  1378,  1262],
         [ 9308,  7745,  2550,  ...,  4880,  1378,  1262],
         ...,
         [ 9308,  7745,  5762,  ...,  4880,  1378,  1262],
         [ 9308,  7745,  4197,  ...,  4880,  1262,  1378],
         [ 9308,  4197,  7745,  ...,  4880,  

---

## Layer Norm

In [9]:
input_size = 5
hidden_size = 6
seq_len = 7
bias = True
B = 10
num_layer = 2
x = torch.randn((seq_len, B, input_size))
h = torch.randn((num_layer*2, B, hidden_size))

In [10]:
w_ih = nn.Linear(input_size, 3*hidden_size, bias=bias)
w_hh = nn.Linear(hidden_size, 3*hidden_size, bias=bias)

In [12]:
gi = w_ih(x[0])
gh = w_hh(h[0])
print(gi.size(), gh.size())
i_r, i_i, i_n = gi.chunk(3, 1)
h_r, h_i, h_n = gh.chunk(3, 1)

a_r = i_r + h_r
a_i = i_i + h_i
print(a_r.size(), a_i.size())

torch.Size([10, 18]) torch.Size([10, 18])
torch.Size([10, 6]) torch.Size([10, 6])


In [13]:
lm_r = nn.LayerNorm(hidden_size, elementwise_affine=True) 

In [14]:
lm_r(a_r)

tensor([[ 0.8483,  0.5183,  1.2515, -0.6050, -0.2809, -1.7322],
        [ 1.1328, -0.1554, -0.1978, -0.2964, -1.7330,  1.2498],
        [ 1.6602, -0.5964,  0.5125,  0.5637, -0.9766, -1.1634],
        [-0.0735,  0.1378,  1.0084,  0.4167, -2.1003,  0.6110],
        [-0.2187, -0.7102, -0.1035,  0.5517, -1.3435,  1.8242],
        [ 0.5345,  1.8563, -0.5719, -0.5729,  0.0238, -1.2698],
        [-0.1510, -0.0145, -0.5039,  1.1937,  1.1801, -1.7045],
        [ 0.0017, -1.8937,  1.2953,  0.2352, -0.3739,  0.7355],
        [ 1.0443,  0.2331,  1.0121,  0.2053, -0.6657, -1.8290],
        [ 0.8642,  0.8665,  0.7942, -1.1174, -1.6072,  0.1996]])

https://github.com/pytorch/pytorch/issues/4930

In [15]:
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence
import numpy as np

class LayerNormGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_first=False, 
                 bidirectional=False, bias=True, use_cuda=False):
        super(LayerNormGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bidrectional = bidirectional
        self.num_directions = 2 if self.bidrectional else 1
        self.bias = bias
        self.use_cuda = use_cuda
        self.gate_num = 3
        
        self.weight_ih = nn.Linear(input_size, self.gate_num*hidden_size, bias=bias)
        self.weight_hh = nn.Linear(hidden_size, self.gate_num*hidden_size, bias=bias)
        if self.use_cuda:
            self.weight_ih = self.weight_ih.cuda()
            self.weight_hh = self.weight_hh.cuda()
            
        self.lm_r = nn.LayerNorm(hidden_size)
        self.lm_i = nn.LayerNorm(hidden_size)
        self.lm_n = nn.LayerNorm(hidden_size)
    
    def gru_cell(self, inpt, hidden):
        """
        input:
        * inpt: B, input_size
        * hidden: B, hidden_size
        output:
        * 
        """
        gi = self.weight_ih(inpt)
        gh = self.weight_hh(hidden)
        i_r, i_i, i_n = gi.chunk(3, 1)
        h_r, h_i, h_n = gh.chunk(3, 1)
        
        a_r = self.lm_r(i_r + h_r)
        a_i = self.lm_i(i_i + h_i)
        
        resetgate = F.sigmoid(a_r)
        inputgate = F.sigmoid(a_i)
        
        a_n = self.lm_n(i_n + resetgate * h_n)
        newgate = F.tanh(a_n)
        hy = newgate + inputgate * (hidden - newgate)
        return hy
    
    def forward(self, inpt, hidden=None):
        """
        input:
        * inpt: seq_len, B, input_size
        * hidden: num_layers * num_directions, B, hidden_size
        output:
        * output: seq_len, B, hidden_size * num_directions
        * hidden: num_layers * num_directions, B, hidden_size
        """
        is_packed = isinstance(inpt, PackedSequence)
        if is_packed:
            inpt, batch_sizes = inpt
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            max_batch_size = inpt.size(0) if self.batch_first else inpt.size(1)
        
        if hidden is None:
            hidden = self.init_hidden(inpt, max_batch_size)
            
        seq_len = inpt.size(0)
        output_forward = []
        for t in range(seq_len):
            hidden = self.gru_cell(inpt[t], hidden)
            # bidirec needed
            output_forward.append(hidden)
        output_forward = torch.stack(output)
        
        output = output_forward
        if self.batch_first:
            output = output.transpose(0, 1)
        
        return output, hidden

    def init_hidden(self, inpt, max_batch_size):
        hx = inpt.new_zeros(self.num_layers * self.num_directions,
                             max_batch_size, self.hidden_size,
                             requires_grad=False)
        if self.use_cuda:
            hx = hx.cuda()
        return hx

In [16]:
packed = pack_padded_sequence(x, [7]*9 + [6])

In [21]:
inpt, batch_sizes = packed
max_batch_size = int(batch_sizes[0])

In [22]:
inpt.size()

torch.Size([69, 5])

In [23]:
hx = inpt.new_zeros(num_layer * 2, max_batch_size, hidden_size, requires_grad=False)

In [24]:
hx.size()

torch.Size([4, 10, 6])

In [19]:
grucell = nn.GRUCell(input_size, hidden_size)

In [20]:
layernormgru = LayerNormGRU(input_size, hidden_size, 1)

In [96]:
hy = layernormgru.gru_cell(x[0], h)

In [97]:
hy.size()

torch.Size([10, 6])

In [126]:
grucell(x[0], h).size()

torch.Size([10, 6])

In [133]:
out, hid = layernormgru(x.transpose(0, 1))

RuntimeError: The size of tensor a (6) must match the size of tensor b (18) at non-singleton dimension 2

In [121]:
inpt, batch_sizes = packed

In [None]:
class Encoder(nn.Module):
    def __init__(self, V_e, m_e, n_e, num_layers=1, bidrec=False, use_dropout=False, dropout_rate=0.5, 
                 layernorm=False):
        super(Encoder, self).__init__()
        """
        vocab_size: V_e
        embed_size: m_e
        hidden_size: n_e
        """
        self.V_e = V_e
        self.m_e = m_e
        self.n_e = n_e
        self.num_layers = num_layers
        self.bidrec = bidrec
        self.n_direct = 2 if bidrec else 1
        self.use_dropout = use_dropout
        self.layernorm = layernorm

        if self.use_dropout:
            self.dropout = nn.Dropout(dropout_rate)

        if self.layernorm:
            self.lm = nn.LayerNorm()

        self.embed = nn.Embedding(V_e, m_e) 
        self.gru = nn.GRU(m_e, n_e, num_layers, batch_first=True, bidirectional=bidrec)
        
    def forward(self, inputs, lengths):
        """
        input: 
        - inputs: B, T_x
        - lengths: actual max length of batches
        output:
        - outputs: B, T_x, n_e
        """
        # embeded: (B, T_x, n_e)
        embeded = self.embed(inputs)
        if self.use_dropout:
            embeded = self.dropout(embeded)
            
        # packed: (B*T_x, n_e)
        packed = pack_padded_sequence(embeded, lengths, batch_first=True) 
        # packed outputs: (B*T_x, 2*n_e)
        # hidden: (num of layers*n_direct, B, 2*n_e)
        outputs, hidden = self.gru(packed)
        # unpacked outputs: (B, T_x, 2*n_e)
        outputs, output_lengths = pad_packed_sequence(outputs, batch_first=True)
        
        # hidden bidirection: (num of layers*n_direct(0,1,2...last one), B, n_e)
        # choosen last hidden: (B, 1, 2*n_e)
        hidden = torch.cat([h for h in hidden[-self.n_direct:]], 1).unsqueeze(1)
        
        return outputs, hidden

In [None]:
def build(config, SOURCE, TARGET):
    enc = Encoder(len(SOURCE.vocab), config.EMBED, config.HIDDEN, config.NUM_HIDDEN, bidrec=True)
    dec = Decoder(len(TARGET.vocab), config.EMBED, 2*config.HIDDEN, hidden_size2=config.HIDDEN2, \
                  sos_idx=SOURCE.vocab.stoi['<s>'], method=config.METHOD, USE_CUDA=USE_CUDA)
    if USE_CUDA:
        enc = enc.cuda()
        dec = dec.cuda()

    loss_function = nn.CrossEntropyLoss(ignore_index=TARGET.vocab.stoi['<pad>'])
    return enc, dec, loss_function