In [1]:
from torch.autograd import Function, NestedIOFunction, Variable
import torch.backends.cudnn as cudnn
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend

In [1]:
# torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, Iterator, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from NMTutils import get_parser, build_data, get_model_config, evaluation

from decoder import Decoder
from encoder import Encoder
from attention import Attention
# others
import argparse
import random
import numpy as np
from collections import defaultdict

In [2]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.cuda.current_device()
# USE_CUDA = False
# DEVICE = -1

In [3]:
lang1 = 'eng'
lang2 = 'fra'
modelcode_small = ['111301', '111311', '111301', '111311', '111311', 
                  '111311', '111311', '122421', '122521', '122621',
                  '122622', '222421', '222521', '222421']
modelcode_filtered = ['322521', '322421']

model_idx = 14
config, test_data, test_loader, SOURCE, TARGET = get_model_config(modelcode_small[model_idx-1], lang1, lang2,
                                                                 device=DEVICE)

In [13]:
for batch in test_loader:
    inputs, lengths = batch.so
    targets = batch.ta
    break

In [5]:
def build(config, SOURCE, TARGET):
    enc = Encoder(len(SOURCE.vocab), config.EMBED, config.HIDDEN, config.NUM_HIDDEN, bidrec=True)
    dec = Decoder(len(TARGET.vocab), config.EMBED, 2*config.HIDDEN, hidden_size2=config.HIDDEN2, 
                  sos_idx=SOURCE.vocab.stoi['<s>'], method=config.METHOD, dropout_rate=config.DROPOUT_RATE,
                  USE_CUDA=USE_CUDA)
    if USE_CUDA:
        enc = enc.cuda()
        dec = dec.cuda()

    loss_function = nn.CrossEntropyLoss(ignore_index=TARGET.vocab.stoi['<pad>'])
    return enc, dec, loss_function

In [6]:
def build_model(model_idx, code, lang1, lang2, file_path='./data/en_fa/', file_type='small', device=-1):
    config, test_data, test_loader, SOURCE, TARGET = get_model_config(code, lang1, lang2, device=device,
                                                                      file_path=file_path, file_type=file_type)
    enc, dec, loss_function = build(config, SOURCE, TARGET)
    enc.eval()
    dec.eval()
    
    enc_model_path = './data/model/{0}_{1}/{0}-{1}{2}.enc'.format(lang1, lang2, model_idx)
    dec_model_path = './data/model/{0}_{1}/{0}-{1}{2}.dec'.format(lang1, lang2, model_idx)
    enc.load_state_dict(torch.load(enc_model_path))
    dec.load_state_dict(torch.load(dec_model_path))
    return enc, dec, loss_function, test_loader, test_data, config

In [None]:
enc, dec, loss_function, test_loader, _, config = build_model(model_idx, modelcode_small[model_idx-1], 
                                                              lang1, lang2, file_path='./data/en_fa/', 
                                                              file_type='small', device=DEVICE)

---


## Beam Search

https://github.com/IBM/pytorch-seq2seq/blob/master/seq2seq/models/TopKDecoder.py

In [8]:
from beam_search import Beam

In [None]:
# dec = Decoder(len(TARGET.vocab), config.EMBED, 2*config.HIDDEN, hidden_size2=config.HIDDEN2, \
#                   sos_idx=SOURCE.vocab.stoi['<s>'], method=config.METHOD, USE_CUDA=USE_CUDA)

In [None]:
class Decoder(nn.Module):
    def __init__(self, V_d, m_d, n_d, sos_idx=2, num_layers=1, hidden_size2=None, decode_method='greedy',
                 method='general', ktop=5, return_weight=True, max_len=15, dropout_rate=0.0, USE_CUDA=True):
        super(Decoder, self).__init__()
        """
        vocab_size: V_d
        embed_size: m_d
        hidden_size: n_d (set this value as 2*n_e)
        methods:
        - 'dot': dot product between hidden and encoder_outputs
        - 'general': encoder_outputs through a linear layer 
        - 'concat': concat (hidden, encoder_outputs)
        - 'paper': concat + tanh
        return_weight: return attention weights
        """
        self.V_d = V_d
        self.m_d = m_d
        self.n_d = n_d
        self.sos_idx = sos_idx
        self.num_layers = num_layers
        self.return_weight = return_weight
        self.method = method
        self.dec_method = decode_method
        self.ktop = ktop
        self.use_dropout = False if dropout_rate == 0.0 else True
        self.USE_CUDA = USE_CUDA
        # attention
        self.attention = Attention(hidden_size=n_d, hidden_size2=hidden_size2, method=method)
        # embed
        self.embed = nn.Embedding(V_d, m_d)
        # dropout:
        if self.use_dropout:
            self.dropout = nn.Dropout(dropout_rate)
        # gru(W*[embed, context] + U*[hidden_prev])
        # gru: m+n
        self.gru = nn.GRU(m_d+n_d, n_d, num_layers, batch_first=True, bidirectional=False)
        # linear
        self.linear = nn.Linear(2*n_d, V_d)
        self.max_len = max_len
        
        
    def start_token(self, batch_size):
        sos = torch.LongTensor([self.sos_idx]*batch_size).unsqueeze(1)
        if self.USE_CUDA: sos = sos.cuda()
        return sos
    
    def forward(self, hidden, enc_outputs, enc_outputs_lengths=None, max_len=None):
        """
        input:
        - hidden(previous hidden): B, 1, n_d 
        - enc_outputs(source context): B, T_x, n_d
        - enc_outputs_lengths: list type
        - max_len(targer sentences max len in batch): T_y
        """
        if max_len is None: max_len = self.max_len
        
        inputs = self.start_token(hidden.size(0)) # (B, 1)
        embeded = self.embed(inputs) # (B, 1, m_d)
        if self.use_dropout:
            embeded = self.dropout(embeded)
            
        # prepare for whole targer sentence scores
        scores = []
        attn_weights = []

        for i in range(max_len):
            # context vector: previous hidden(s{i-1}), encoder_outputs(O_e) > context(c{i}), weights
            # - context: (B, 1, n_d)
            # - weights: (B, 1, T_x)
            context, weights = self.attention(hidden, enc_outputs, enc_outputs_lengths, 
                                              return_weight=self.return_weight)
            attn_weights.append(weights.squeeze(1))
            
            # concat context & embedding vectors: (B, 1, m_d+n_d)
            gru_input = torch.cat([embeded, context], 2)
            
            # gru((context&embedding), previous hidden)
            # output hidden(s{i}): (1, B, n_d)
            _, hidden = self.gru(gru_input, hidden.transpose(0, 1))
            hidden = hidden.transpose(0, 1)  # change shape to (B, 1, n_d) again
            
            # concat context and new hidden vectors: (B, 1, 2*n_d)
            concated = torch.cat([hidden, context], 2)
            
            # get score: (B, V_d)
            score = self.linear(concated.squeeze(1))
            scores.append(score)
            
            # greedy method
            decoded = self.decode_method(score, dec_method=self.dec_method, ktop=self.ktop)  # (B)
            embeded = self.embed(decoded).unsqueeze(1) # next input y{i-1} (B, 1, m_d)
            if self.use_dropout:
                embeded = self.dropout(embeded)

        # column-wise concat, reshape!! 
        # scores = [(B, V_d), (B, V_d), (B, V_d)...] > (B, V_d*max_len)
        # attn_weights = [(B, T_x), (B, T_x), (B, T_x)...] > (B*max_len, T_x)
        scores = torch.cat(scores, 1)
        return scores.view(inputs.size(0)*max_len, -1), torch.cat(attn_weights)

    def decode_method(self, score, dec_method='greedy', ktop=5):
        prob, decoded = score.max(1)
        if dec_method == 'greedy':
            return decoded
        elif dec_method == 'beam':
            pass

    def decode(self, hidden, enc_outputs, enc_outputs_lengths, eos_idx=3, max_len=50):
        
        inputs = self.start_token(hidden.size(0))  # (1, 1)
        embeded = self.embed(inputs)  # (1, 1, m_d)
        if self.use_dropout:
            embeded = self.dropout(embeded)
        
        decodes = [] 
        attn_weights = []
        decoded = torch.LongTensor([self.sos_idx]).view(1, -1)
        
        while (decoded.item() != eos_idx):
            # context: (1, 1, n_d)
            # weights: (1, 1, T_x)
            context, weights = self.attention(hidden, enc_outputs, enc_outputs_lengths, 
                                              return_weight=self.return_weight)
            attn_weights.append(weights.squeeze(1))  # (1, T_x)
            gru_input = torch.cat([embeded, context], 2)  # (1, 1, m_d+n_d)
            _, hidden = self.gru(gru_input, hidden.transpose(0, 1))  # (1, 1, n_d)
            hidden = hidden.transpose(0, 1)
            concated = torch.cat([hidden, context], 2)  # (1, 1, 2*n_d)
            score = self.linear(concated.squeeze(1))  # (1, 2*n_d) -> # (1, V_d)
            decoded = score.max(1)[1]  # (1)
            decodes.append(decoded)
            embeded = self.embed(decoded).unsqueeze(1) # (1, 1, m_d)
            if self.use_dropout:
                embeded = self.dropout(embeded)
            
            if len(decodes) >= max_len:
                break
        
        return torch.cat(decodes), torch.cat(attn_weights)

In [9]:
output, hidden = enc(inputs, lengths.tolist())

In [12]:
inputs = dec.start_token(hidden.size(0))
embeded = dec.embed(inputs)
embeded = dec.dropout(embeded)

AttributeError: 'Decoder' object has no attribute 'dropout'

In [10]:
beam_search = Beam(5, SOURCE.vocab.stoi['<pad>'], SOURCE.vocab.stoi['<s>'], SOURCE.vocab.stoi['</s>']
                   , n_best=1, cuda=USE_CUDA)

In [47]:
beam_search.advance(score)

False

In [48]:
beam_search.prevKs

[tensor([ 0,  0,  0,  0,  0])]

In [49]:
beam_search.nextYs

[tensor([ 2,  1,  1,  1,  1]), tensor([ 4854,   428,  3554,  7799,   869])]

In [124]:
beam_idxes = defaultdict(dict)
scores = []

In [115]:
sorted_scores, sorted_idxes = score.sort(dim=1, descending=True)
ktop_idxes = sorted_idxes[:, :5]
ktop_scores = sorted_scores[:, :5]

In [121]:
scores.append(ktop_scores)

In [125]:
beam_idxes[1]['score'] = ktop_scores
beam_idxes[1]['idxes'] = ktop_idxes

---

## Layer Norm

https://github.com/pytorch/pytorch/issues/4930

In [4]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence
import torch.nn.functional as F
import numpy as np

In [9]:
stacked_GRU = StackedGRU(config.EMBED, config.HIDDEN, config.NUM_HIDDEN, bidirectional=True, 
                         layernorm=False, return_all_hidden=False).cuda()

In [10]:
embed = nn.Embedding(len(SOURCE.vocab), config.EMBED).cuda()

In [14]:
embeded = embed(inputs)

In [15]:
embeded, l = pack_padded_sequence(embeded, lengths.tolist(), batch_first=True)

In [16]:
embeded.size(), l

(torch.Size([1152, 256]),
 tensor([ 128,  128,  128,  128,  128,  128,  128,  128,  128]))

In [18]:
embeded.size()

torch.Size([1152, 256])

In [59]:
o, h = stacked_GRU(embeded, hidden)

RuntimeError: size mismatch, m1: [16 x 5], m2: [512 x 1536] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:249

In [44]:

class LayerNormGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_first=False, layernorm=False
                 bidirectional=False, bias=True, use_cuda=False, return_all_hidden=False), :
        super(LayerNormGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.layernorm = layernorm
        self.batch_first = batch_first
        self.bidrectional = bidirectional
        self.return_all_hidden = return_all_hidden
        self.num_directions = 2 if self.bidrectional else 1
        self.bias = bias
        self.gate_num = 3
        
        
    def forward(self, inputs, hidden=None):
        """
        input:
        * inputs: seq_len, B, input_size
        * hidden: num_layers * num_directions, B, hidden_size
        output:
        * output: seq_len, B, hidden_size * num_directions
        * hidden: num_layers * num_directions, B, hidden_size
        """
        is_packed = isinstance(inputs, PackedSequence)
        if is_packed:
            inputs, batch_sizes = inputs
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            max_batch_size = inputs.size(0) if self.batch_first else inputs.size(1)
        
        if hidden is None:
            hidden = self.init_hidden(inputs, max_batch_size)
            
        func = stacked_GRU(input_size=self.input_size, 
                           hidden_size=self.hidden_size, 
                           num_layers=self.num_layers, 
                           bidirectional=self.bidrectional, 
                           layernorm=self.layernorm, 
                           return_all_hidden=self.return_all_hidden,
                           batch_first=self.batch_first,
                           batch_sizes=batch_sizes,
                           is_packed=is_packed)
        
        output, hidden = func(inputs, hidden)
        
        if self.batch_first:
            output = output.transpose(0, 1)
        
        return output, hidden

    def init_hidden(self, inpt, max_batch_size):
        hx = inpt.new_zeros(self.num_layers * self.num_directions, max_batch_size, self.hidden_size,
                            requires_grad=False)
        if self.use_cuda:
            hx = hx.cuda()
        return hx

In [7]:
class StackedGRU(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False, layernorm=False, 
                 return_all_hidden=False, batch_sizes=None, batch_first=False, is_packed=False):
        super(StackedGRU, self).__init__()
        # to do: add is_packed
        self.batch_first = batch_first
        self.layernorm = layernorm
        self.bidirec = bidirectional
        self.return_all_hidden = return_all_hidden
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_directions = 2 if self.bidirec else 1
        self.num_layers = num_layers
        self.build_layers(input_size, hidden_size)
        # packed seq
        self.batch_sizes = batch_sizes
        self.is_packed = is_packed
        
    def build_layers(self, input_size, hidden_size):
        self.layers = nn.ModuleList()
        for _ in range(self.num_layers):
            self.layers.append(GRUCell(input_size, hidden_size, layernorm=self.layernorm))
            input_size = hidden_size
        if self.bidirec:
            input_size = self.input_size
            self.r_layers = nn.ModuleList()
            for _ in range(self.num_layers):
                self.r_layers.append(GRUCell(input_size, hidden_size, layernorm=self.layernorm))
                input_size = hidden_size
    
    
    def forward(self, inputs, hidden, batch_sizes=None):
        """
        * input:
        inputs: 'tensor(T, B, D)' if packed, 'tensor(T*B, D)'
        hidden: 'tensor(num_layers * num_directions, B, H)'
        
        * return:
        output: 'tensor(num_layers, T, B, 2H)' if return_all_hiddens else last layer 'tensor(T, B, 2H)'
        hidden 'tensor(num_layers*num_directions, B, H)'
        """
        if self.bidirec:
            # output (num_layers, T, B, 2H)
            # last_hidden (num_layers*num_directions, B, H)
            # forward: idx of time t ~ (0, 1, ..., T-1)            
            f_idx = [i for i in range(self.num_layers * self.num_directions) if i % 2 == 0]
            f_all_outputs, f_last_hidden = self._forward(self.layers, inputs, hidden[f_idx, :])
            
            # backward: 
            r_inputs = self._flip(inputs, 0)  # (T, B, H) idx of time t ~ (T-1, ... , 0)
            b_idx = [i for i in range(self.num_layers * self.num_directions) if i % 2 != 0]
            b_all_outputs, b_last_hidden = self._forward(self.r_layers, r_inputs, hidden[b_idx, :])
            
            # concate layers
            # f: hidden[T-1], b: hidden[0]
            output = torch.cat([f_all_outputs, b_all_outputs], -1)
            idx = [int(i/self.num_directions) if i % 2 == 0 else \
                   i + int(((self.num_layers * self.num_directions) - i) / 2) \
                   for i in range(self.num_layers * self.num_directions) ]
            hidden = torch.cat([f_last_hidden, b_last_hidden])[idx, :]

            if self.return_all_hidden:
                return output, hidden
            return output[-1], hidden
            
        else:
            f_all_outputs, f_last_hidden = self._forward(self.layers, inputs, hidden)
            if self.return_all_hidden:
                return f_all_outputs, f_last_hidden
            return f_all_outputs[-1], f_last_hidden

    
    def init_hidden(self, batch_size):
        # init_hidden
        hidden = torch.zeros((self.num_layers*self.num_directions, batch_size, self.hidden_size))
        return hidden

    def _forward(self, layers, inputs, hidden, batch_sizes=None):
        """
        * input:
        layers: nn.ModuleList for one direction layers
        inp: T, B, D
        hid: num_layers, B, H (init hidden)
        
        * return:
        all_outputs: all layers a forward or backward layer
        tensor(num_layers, T, B, H)
        last_hidden: 
        tensor(num_layers, B, H)
        """
        # todo: add is_packed
        assert isinstance(layers, nn.ModuleList)
        inp = inputs
        
        all_outputs = []
        for l_idx, layer in enumerate(layers):
            hid = hidden.chunk(3, 0)[l_idx].squeeze(0)  # init hidden: 1, B, H --> B, H
            output_ith_layer = []
            for t in range(inp.size(0)):
                hid = layer(inp[t], hid)
                output_ith_layer.append(hid)
            output_ith_layer = torch.stack(output_ith_layer)  # T, B, H
            inp = output_ith_layer 
            all_outputs.append(output_ith_layer)
        
        last_hidden = torch.stack([out[-1] for out in all_outputs]) # num_layer, B, H
        return torch.stack(all_outputs), last_hidden
    
    
    def _flip(self, x, dim):
        """
        https://discuss.pytorch.org/t/optimizing-diagonal-stripe-code/17777/16
        """
        indices = [slice(None)] * x.dim()
        indices[dim] = torch.arange(x.size(dim) - 1, -1, -1,
                                    dtype=torch.long, device=x.device)
        return x[tuple(indices)]

In [8]:
class GRUCell(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True, layernorm=False, gate_num=3):
        super(GRUCell, self).__init__()
        self.input_size = input_size
        self.bias = bias
        self.hidden_size = hidden_size
        self.layernorm = layernorm
        self.gate_num = gate_num
        
        self.weight_ih = nn.Linear(input_size, gate_num*hidden_size, bias=bias)
        self.weight_hh = nn.Linear(hidden_size, gate_num*hidden_size, bias=bias)
        
        self.lm_r = nn.LayerNorm(hidden_size)
        self.lm_i = nn.LayerNorm(hidden_size)
        self.lm_n = nn.LayerNorm(hidden_size)
        
    def forward(self, inputs, hidden):
        """
        inputs:
        * inputs: B, input_size
        * hidden: B, hidden_size
        output:
        * hy: B, hidden_size
        """
        gi = self.weight_ih(inputs)
        gh = self.weight_hh(hidden)
        i_r, i_i, i_n = gi.chunk(3, 1)
        h_r, h_i, h_n = gh.chunk(3, 1)
        
        a_r = i_r + h_r
        a_i = i_i + h_i
        if self.layernorm:
            a_r = self.lm_r(a_r)
            a_i = self.lm_i(a_i)
            
        resetgate = F.sigmoid(a_r)
        inputgate = F.sigmoid(a_i)
        
        a_n = i_n + resetgate * h_n
        if self.layernorm:
            a_n = self.lm_n(a_n)
            
        newgate = F.tanh(a_n)
        hy = newgate + inputgate * (hidden - newgate)
        return hy