In [1]:
from torch.autograd import Function, NestedIOFunction, Variable
import torch.backends.cudnn as cudnn
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend

In [9]:
# torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, Iterator, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from NMTutils import get_parser, build_data, get_model_config, evaluation

from decoder import Decoder
from encoder import Encoder
from attention import Attention
# others
import argparse
import random
import numpy as np
from collections import defaultdict

In [10]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.cuda.current_device()
# USE_CUDA = False
# DEVICE = -1

In [3]:
lang1 = 'eng'
lang2 = 'fra'
modelcode_small = ['111301', '111311', '111301', '111311', '111311', 
                  '111311', '111311', '122421', '122521', '122621',
                  '122622', '222421', '222521', '222421']
modelcode_filtered = ['322521', '322421']

model_idx = 14
config, test_data, test_loader, SOURCE, TARGET = get_model_config(modelcode_small[model_idx-1], lang1, lang2,
                                                                 device=DEVICE)

In [4]:
for batch in test_loader:
    inputs, lengths = batch.so
    targets = batch.ta
    break

In [5]:
def build(config, SOURCE, TARGET):
    enc = Encoder(len(SOURCE.vocab), config.EMBED, config.HIDDEN, config.NUM_HIDDEN, bidrec=True)
    dec = Decoder(len(TARGET.vocab), config.EMBED, 2*config.HIDDEN, hidden_size2=config.HIDDEN2, 
                  sos_idx=SOURCE.vocab.stoi['<s>'], method=config.METHOD, dropout_rate=config.DROPOUT_RATE,
                  USE_CUDA=USE_CUDA)
    if USE_CUDA:
        enc = enc.cuda()
        dec = dec.cuda()

    loss_function = nn.CrossEntropyLoss(ignore_index=TARGET.vocab.stoi['<pad>'])
    return enc, dec, loss_function

In [6]:
def build_model(model_idx, code, lang1, lang2, file_path='./data/en_fa/', file_type='small', device=-1):
    config, test_data, test_loader, SOURCE, TARGET = get_model_config(code, lang1, lang2, device=device,
                                                                      file_path=file_path, file_type=file_type)
    enc, dec, loss_function = build(config, SOURCE, TARGET)
    enc.eval()
    dec.eval()
    
    enc_model_path = './data/model/{0}_{1}/{0}-{1}{2}.enc'.format(lang1, lang2, model_idx)
    dec_model_path = './data/model/{0}_{1}/{0}-{1}{2}.dec'.format(lang1, lang2, model_idx)
    enc.load_state_dict(torch.load(enc_model_path))
    dec.load_state_dict(torch.load(dec_model_path))
    return enc, dec, loss_function, test_loader, test_data, config

In [None]:
enc, dec, loss_function, test_loader, _, config = build_model(model_idx, modelcode_small[model_idx-1], 
                                                              lang1, lang2, file_path='./data/en_fa/', 
                                                              file_type='small', device=DEVICE)

---


## Beam Search

https://github.com/IBM/pytorch-seq2seq/blob/master/seq2seq/models/TopKDecoder.py

In [8]:
from beam_search import Beam

In [None]:
# dec = Decoder(len(TARGET.vocab), config.EMBED, 2*config.HIDDEN, hidden_size2=config.HIDDEN2, \
#                   sos_idx=SOURCE.vocab.stoi['<s>'], method=config.METHOD, USE_CUDA=USE_CUDA)

In [None]:
class Decoder(nn.Module):
    def __init__(self, V_d, m_d, n_d, sos_idx=2, num_layers=1, hidden_size2=None, decode_method='greedy',
                 method='general', ktop=5, return_weight=True, max_len=15, dropout_rate=0.0, USE_CUDA=True):
        super(Decoder, self).__init__()
        """
        vocab_size: V_d
        embed_size: m_d
        hidden_size: n_d (set this value as 2*n_e)
        methods:
        - 'dot': dot product between hidden and encoder_outputs
        - 'general': encoder_outputs through a linear layer 
        - 'concat': concat (hidden, encoder_outputs)
        - 'paper': concat + tanh
        return_weight: return attention weights
        """
        self.V_d = V_d
        self.m_d = m_d
        self.n_d = n_d
        self.sos_idx = sos_idx
        self.num_layers = num_layers
        self.return_weight = return_weight
        self.method = method
        self.dec_method = decode_method
        self.ktop = ktop
        self.use_dropout = False if dropout_rate == 0.0 else True
        self.USE_CUDA = USE_CUDA
        # attention
        self.attention = Attention(hidden_size=n_d, hidden_size2=hidden_size2, method=method)
        # embed
        self.embed = nn.Embedding(V_d, m_d)
        # dropout:
        if self.use_dropout:
            self.dropout = nn.Dropout(dropout_rate)
        # gru(W*[embed, context] + U*[hidden_prev])
        # gru: m+n
        self.gru = nn.GRU(m_d+n_d, n_d, num_layers, batch_first=True, bidirectional=False)
        # linear
        self.linear = nn.Linear(2*n_d, V_d)
        self.max_len = max_len
        
        
    def start_token(self, batch_size):
        sos = torch.LongTensor([self.sos_idx]*batch_size).unsqueeze(1)
        if self.USE_CUDA: sos = sos.cuda()
        return sos
    
    def forward(self, hidden, enc_outputs, enc_outputs_lengths=None, max_len=None):
        """
        input:
        - hidden(previous hidden): B, 1, n_d 
        - enc_outputs(source context): B, T_x, n_d
        - enc_outputs_lengths: list type
        - max_len(targer sentences max len in batch): T_y
        """
        if max_len is None: max_len = self.max_len
        
        inputs = self.start_token(hidden.size(0)) # (B, 1)
        embeded = self.embed(inputs) # (B, 1, m_d)
        if self.use_dropout:
            embeded = self.dropout(embeded)
            
        # prepare for whole targer sentence scores
        scores = []
        attn_weights = []

        for i in range(max_len):
            # context vector: previous hidden(s{i-1}), encoder_outputs(O_e) > context(c{i}), weights
            # - context: (B, 1, n_d)
            # - weights: (B, 1, T_x)
            context, weights = self.attention(hidden, enc_outputs, enc_outputs_lengths, 
                                              return_weight=self.return_weight)
            attn_weights.append(weights.squeeze(1))
            
            # concat context & embedding vectors: (B, 1, m_d+n_d)
            gru_input = torch.cat([embeded, context], 2)
            
            # gru((context&embedding), previous hidden)
            # output hidden(s{i}): (1, B, n_d)
            _, hidden = self.gru(gru_input, hidden.transpose(0, 1))
            hidden = hidden.transpose(0, 1)  # change shape to (B, 1, n_d) again
            
            # concat context and new hidden vectors: (B, 1, 2*n_d)
            concated = torch.cat([hidden, context], 2)
            
            # get score: (B, V_d)
            score = self.linear(concated.squeeze(1))
            scores.append(score)
            
            # greedy method
            decoded = self.decode_method(score, dec_method=self.dec_method, ktop=self.ktop)  # (B)
            embeded = self.embed(decoded).unsqueeze(1) # next input y{i-1} (B, 1, m_d)
            if self.use_dropout:
                embeded = self.dropout(embeded)

        # column-wise concat, reshape!! 
        # scores = [(B, V_d), (B, V_d), (B, V_d)...] > (B, V_d*max_len)
        # attn_weights = [(B, T_x), (B, T_x), (B, T_x)...] > (B*max_len, T_x)
        scores = torch.cat(scores, 1)
        return scores.view(inputs.size(0)*max_len, -1), torch.cat(attn_weights)

    def decode_method(self, score, dec_method='greedy', ktop=5):
        prob, decoded = score.max(1)
        if dec_method == 'greedy':
            return decoded
        elif dec_method == 'beam':
            pass

    def decode(self, hidden, enc_outputs, enc_outputs_lengths, eos_idx=3, max_len=50):
        
        inputs = self.start_token(hidden.size(0))  # (1, 1)
        embeded = self.embed(inputs)  # (1, 1, m_d)
        if self.use_dropout:
            embeded = self.dropout(embeded)
        
        decodes = [] 
        attn_weights = []
        decoded = torch.LongTensor([self.sos_idx]).view(1, -1)
        
        while (decoded.item() != eos_idx):
            # context: (1, 1, n_d)
            # weights: (1, 1, T_x)
            context, weights = self.attention(hidden, enc_outputs, enc_outputs_lengths, 
                                              return_weight=self.return_weight)
            attn_weights.append(weights.squeeze(1))  # (1, T_x)
            gru_input = torch.cat([embeded, context], 2)  # (1, 1, m_d+n_d)
            _, hidden = self.gru(gru_input, hidden.transpose(0, 1))  # (1, 1, n_d)
            hidden = hidden.transpose(0, 1)
            concated = torch.cat([hidden, context], 2)  # (1, 1, 2*n_d)
            score = self.linear(concated.squeeze(1))  # (1, 2*n_d) -> # (1, V_d)
            decoded = score.max(1)[1]  # (1)
            decodes.append(decoded)
            embeded = self.embed(decoded).unsqueeze(1) # (1, 1, m_d)
            if self.use_dropout:
                embeded = self.dropout(embeded)
            
            if len(decodes) >= max_len:
                break
        
        return torch.cat(decodes), torch.cat(attn_weights)

In [9]:
output, hidden = enc(inputs, lengths.tolist())

In [12]:
inputs = dec.start_token(hidden.size(0))
embeded = dec.embed(inputs)
embeded = dec.dropout(embeded)

AttributeError: 'Decoder' object has no attribute 'dropout'

In [10]:
beam_search = Beam(5, SOURCE.vocab.stoi['<pad>'], SOURCE.vocab.stoi['<s>'], SOURCE.vocab.stoi['</s>']
                   , n_best=1, cuda=USE_CUDA)

In [47]:
beam_search.advance(score)

False

In [48]:
beam_search.prevKs

[tensor([ 0,  0,  0,  0,  0])]

In [49]:
beam_search.nextYs

[tensor([ 2,  1,  1,  1,  1]), tensor([ 4854,   428,  3554,  7799,   869])]

In [124]:
beam_idxes = defaultdict(dict)
scores = []

In [115]:
sorted_scores, sorted_idxes = score.sort(dim=1, descending=True)
ktop_idxes = sorted_idxes[:, :5]
ktop_scores = sorted_scores[:, :5]

In [121]:
scores.append(ktop_scores)

In [125]:
beam_idxes[1]['score'] = ktop_scores
beam_idxes[1]['idxes'] = ktop_idxes

---

## Layer Norm

https://github.com/pytorch/pytorch/issues/4930

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import PackedSequence
import torch.nn.functional as F
import numpy as np
from itertools import accumulate

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


In [2]:
class LayerNormGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_first=False, layernorm=False,
                 bidirectional=False, bias=True, use_cuda=False, return_all_hidden=False):
        """
        Args:
            : input_size: The number of expected features in the input `x`
            : hidden_size: The number of features in the hidden state `h`
            : num_layers: Number of recurrent layers.
            : batch_first: If ``True``, then the input and output tensors are provided as `(batch, seq, feature)`
            : layernorm: If ``True``, then use torch.nn.Layernorm to normalize linear output in gru
            : bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False``
            : bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. Default: ``True``
            : use_cuda: If ``True``, then use cuda to init hidden state. (didn't figure out how to auto detect it, yet) Default: ``False``
            : return_all_hidden: If ``True``, return all hidden layers output. Default: ``False``

        Input:
            : inputs: tensor(seq_len, batch_size, input_size) // 'tensor(sum(batch_sizes), input_size)'
            : hidden: tensor(num_layers * num_directions, batch_size, hidden_size) if nothing then auto initialize as zeros

            output:
            : output: tensor(seq_len, batch_size, hidden_size * num_directions) // tensor(sum(batch_sizes), hidden_size * num_directions)
            : hidden: tensor(num_layers * num_directions, B, hidden_size)

        """
        super(LayerNormGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.layernorm = layernorm
        self.batch_first = batch_first
        self.bidrectional = bidirectional
        self.use_cuda = use_cuda
        self.return_all_hidden = return_all_hidden
        self.num_directions = 2 if self.bidrectional else 1
        self.bias = bias
        self.gate_num = 3

    def forward(self, inputs, hidden=None):
        """
        [no packed size // packed size]
        input:
        * inputs: tensor(seq_len, batch_size, input_size) // 'tensor(sum(batch_sizes), input_size)'
        * hidden: tensor(num_layers * num_directions, batch_size, hidden_size) if nothing then auto initialize as zeros
        output:
        * output: tensor(seq_len, batch_size, hidden_size * num_directions) // tensor(sum(batch_sizes), hidden_size * num_directions)
        * hidden: tensor(num_layers * num_directions, B, hidden_size)
        """
        is_packed = isinstance(inputs, PackedSequence)
        if is_packed:
            inputs, batch_sizes = inputs
            max_batch_size = int(batch_sizes[0])
        else:
            batch_sizes = None
            max_batch_size = inputs.size(0) if self.batch_first else inputs.size(1)

        if hidden is None:
            hidden = self.init_hidden(max_batch_size)

        self.func = StackedGRU(input_size=self.input_size,
                               hidden_size=self.hidden_size,
                               num_layers=self.num_layers,
                               bidirectional=self.bidrectional,
                               layernorm=self.layernorm,
                               return_all_hidden=self.return_all_hidden,
                               is_packed=is_packed)
        if self.batch_first and not is_packed:
            inputs = inputs.transpose(0, 1)  # B, T, D --> T, B, D

        output, hidden = self.func(inputs, hidden, batch_sizes=batch_sizes)

        if self.batch_first and not is_packed:
            output = output.transpose(0, 1)
        if is_packed:
            output = PackedSequence(output, batch_sizes)

        return output, hidden

    def init_hidden(self, max_batch_size):
        hx = torch.zeros(self.num_layers * self.num_directions, max_batch_size, self.hidden_size,
                            requires_grad=False)
        if self.use_cuda:
            hx = hx.cuda()
        return hx

In [3]:
class StackedGRU(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False, layernorm=False, 
                 return_all_hidden=False, batch_first=False, is_packed=False):
        super(StackedGRU, self).__init__()
        # to do: add is_packed
        self.batch_first = batch_first
        self.layernorm = layernorm
        self.bidirec = bidirectional
        self.return_all_hidden = return_all_hidden
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_directions = 2 if self.bidirec else 1
        self.num_layers = num_layers
        self.build_layers(input_size, hidden_size)
        # packed seq
        self.is_packed = is_packed
        
    def build_layers(self, input_size, hidden_size):
        self.layers = self.create_layers(input_size, hidden_size)
        if self.bidirec:
            self.r_layers = self.create_layers(input_size, hidden_size)
    
    def create_layers(self, input_size, hidden_size):
        layers = nn.ModuleList()
        for _ in range(self.num_layers):
            layers.append(GRUCell(input_size, hidden_size, layernorm=self.layernorm))
            input_size = hidden_size
        return layers
        
    def forward(self, inputs, hidden, batch_sizes=None):
        """
        * input:
        inputs: 'tensor(T, B, D)' if packed, 'tensor(sum(batch_sizes), D)'
        hidden: 'tensor(num_layers * num_directions, B, H)'
        
        * return:
        output: 'tensor(T, B, num_directions*H)' // tensor(sum(batch_sizes), H)
                 if return_all_hiddens 
                 - 'tensor(num_layers, T, B, num_directions*H)' // 'tensor(num_layers, sum(batch_sizes), H)' 
        hidden 'tensor(num_layers*num_directions, B, H)'
        """
        if self.bidirec:
            # output (num_layers, T, B, 2H)
            # last_hidden (num_layers*num_directions, B, H)
            # forward: idx of time t ~ (0, 1, ..., T-1)            
            f_idx = [i for i in range(self.num_layers * self.num_directions) if i % 2 == 0]
            f_all_outputs, f_last_hidden = self._forward(self.layers, inputs, hidden[f_idx, :], batch_sizes)
            
            # backward: 
            r_inputs = self._flip(inputs, 0)  # (T, B, H) idx of time t ~ (T-1, ... , 0)
            b_idx = [i for i in range(self.num_layers * self.num_directions) if i % 2 != 0]
            b_all_outputs, b_last_hidden = self._forward(self.r_layers, r_inputs, hidden[b_idx, :], batch_sizes)
            b_all_outputs = self._flip(b_all_outputs, 1) # (num_layers, T, B, H) idx of time t ~ (0, 1, ..., T-1)
            # concate layers
            # f: hidden[T-1], b: hidden[0]
            output = torch.cat([f_all_outputs, b_all_outputs], -1)
            idx = [int(i/self.num_directions) if i % 2 == 0 else \
                   i + int(((self.num_layers * self.num_directions) - i) / 2) \
                   for i in range(self.num_layers * self.num_directions) ]
            hidden = torch.cat([f_last_hidden, b_last_hidden])[idx, :]

            if self.return_all_hidden:
                return output, hidden
            return output[-1], hidden
            
        else:
            f_all_outputs, f_last_hidden = self._forward(self.layers, inputs, hidden, batch_sizes)
            if self.return_all_hidden:
                return f_all_outputs, f_last_hidden
            return f_all_outputs[-1], f_last_hidden

    def _forward(self, layers, inputs, hidden, batch_sizes=None):
        """
        * input:
        layers: nn.ModuleList for one direction layers
        inp: tensor(T, B, D) // tensor(sum(batch_sizes), D)
        hid: num_layers, B, H (init hidden)
        
        * return:
        all_outputs: all layers a forward or backward layer
        tensor(num_layers, T, B, H) // tensor(num_layers, sum(batch_sizes), H)
        last_hidden: 
        tensor(num_layers, B, H)
        """
        # todo: add is_packed
        assert isinstance(layers, nn.ModuleList)
        if self.is_packed:
            assert batch_sizes is not None, 'packed sequence must have list of batch_sizes'
            acc_bs = [0] + list(accumulate(batch_sizes.tolist())) 
        # all_outputs
        # if packed: num_layers, sum(batch_sizes), H
        # if not packed : num_layers, T, B, H
        all_outputs = []
        for l_idx, layer in enumerate(layers):
            hid = hidden.chunk(self.num_layers, 0)[l_idx].squeeze(0)  # init hidden: 1, B, H --> B, H
            output_ith_layer = []
            
            if self.is_packed:
                # packed
                for t in range(len(batch_sizes)): # input: acc_bs[t:(t+1)]
                    hid = layer(inputs[acc_bs[t]:acc_bs[t+1]], hid[:batch_sizes[t]])
                    output_ith_layer.append(hid) 
                output_ith_layer = torch.cat(output_ith_layer, 0) # sum(batch_sizes), H 
            else:
                # not packed
                for t in range(inputs.size(0)):
                    hid = layer(inputs[t], hid)
                    output_ith_layer.append(hid)    
                output_ith_layer = torch.stack(output_ith_layer)  # T, B, H 
            
            inputs = output_ith_layer 
            all_outputs.append(output_ith_layer)
        all_outputs = torch.stack(all_outputs)
        if self.is_packed:
            last_idx = self.get_last_idx(all_outputs.size(1), batch_sizes, acc_bs)
            last_hidden = torch.stack([out[last_idx] for out in all_outputs]) # num_layer, max_batch_size, H
        else:
            last_hidden = torch.stack([out[-1] for out in all_outputs]) # num_layer, B, H
        
        return all_outputs, last_hidden
    
    def get_last_idx(self, total_len, batch_sizes, acc_bs):
        batch_sizes = batch_sizes if isinstance(batch_sizes, list) else batch_sizes.tolist()
        mask = batch_sizes + [0]
        mask = [mask[i+1] - mask[i] for i in range(len(batch_sizes))]
        temp = list(range(total_len))
        result = []
        for i, m in enumerate(mask):
            if m != 0:
                result.extend(temp[acc_bs[i]:acc_bs[i+1]][m:])
        return list(reversed(result))
    
    
    def _flip(self, x, dim):
        """
        https://discuss.pytorch.org/t/optimizing-diagonal-stripe-code/17777/16
        """
        indices = [slice(None)] * x.dim()
        indices[dim] = torch.arange(x.size(dim) - 1, -1, -1,
                                    dtype=torch.long, device=x.device)
        return x[tuple(indices)]

In [4]:
class GRUCell(nn.Module):
    def __init__(self, input_size, hidden_size, bias=True, layernorm=False, gate_num=3):
        super(GRUCell, self).__init__()
        self.input_size = input_size
        self.bias = bias
        self.hidden_size = hidden_size
        self.layernorm = layernorm
        self.gate_num = gate_num
        
        self.weight_ih = nn.Linear(input_size, gate_num*hidden_size, bias=bias)
        self.weight_hh = nn.Linear(hidden_size, gate_num*hidden_size, bias=bias)
        if self.layernorm:
            self.lm_r = nn.LayerNorm(hidden_size)
            self.lm_i = nn.LayerNorm(hidden_size)
            self.lm_n = nn.LayerNorm(hidden_size)
        
    def forward(self, inputs, hidden):
        """
        inputs:
        * inputs: B, input_size
        * hidden: B, hidden_size
        output:
        * hy: B, hidden_size
        """
        gi = self.weight_ih(inputs)
        gh = self.weight_hh(hidden)
        i_r, i_i, i_n = gi.chunk(3, 1)
        h_r, h_i, h_n = gh.chunk(3, 1)
        
        a_r = i_r + h_r
        a_i = i_i + h_i
        if self.layernorm:
            a_r = self.lm_r(a_r)
            a_i = self.lm_i(a_i)
            
        resetgate = F.sigmoid(a_r)
        inputgate = F.sigmoid(a_i)
        
        a_n = i_n + resetgate * h_n
        if self.layernorm:
            a_n = self.lm_n(a_n)
            
        newgate = F.tanh(a_n)
        hy = newgate + inputgate * (hidden - newgate)
        return hy

## Packed sequences

In [5]:
V = 10  # vocab length
T = 6  # max sequence length

# batch_first!! B, T
input_seq = torch.LongTensor([[9, 8, 4, 2, 0, 0],
                              [4, 6, 1, 1, 0, 0],
                              [8, 7, 5, 0, 0, 0],
                              [4, 1, 0, 0, 0, 0],
                              [4, 6, 8, 1, 9, 2]])

# decreasing order
input_lengths = torch.LongTensor([torch.max(input_seq[i, :].data.nonzero())+1 for i in range(input_seq.size(0))])
input_lengths, sorted_idx = input_lengths.sort(0, descending=True)
input_seq = input_seq[sorted_idx]

# pack sequences
# packed_input = pack_padded_sequence(input_seq, input_lengths.tolist(), batch_first=True)

input seqence 비교

In [325]:
input_seq

tensor([[ 9,  8,  4,  2,  0,  0],
        [ 4,  6,  1,  1,  0,  0],
        [ 8,  7,  5,  0,  0,  0],
        [ 4,  1,  0,  0,  0,  0],
        [ 4,  6,  8,  1,  9,  2]])

In [326]:
input_seq[sorted_idx]

tensor([[ 4,  6,  8,  1,  9,  2],
        [ 9,  8,  4,  2,  0,  0],
        [ 4,  6,  1,  1,  0,  0],
        [ 8,  7,  5,  0,  0,  0],
        [ 4,  1,  0,  0,  0,  0]])

In [6]:
embed = nn.Embedding(V, 7)
embeded = embed(input_seq)
embeded.size()

torch.Size([5, 6, 7])

In [11]:
packed_input = pack_padded_sequence(embeded, input_lengths.tolist(), batch_first=True)

In [12]:
packed_input[0].size(), packed_input[1].tolist()

(torch.Size([19, 7]), [5, 5, 4, 3, 1, 1])

In [120]:
gru = nn.GRU(input_size=7, hidden_size=2, num_layers=1, bidirectional=True)

In [121]:
# pack 한것과 안한것 차이 seq_len 이 다름 max len으로 되어있음
# https://discuss.pytorch.org/t/lstm-hidden-cell-outputs-and-packed-sequence-for-variable-length-sequence-inputs/1183
nopack_o, nopack_h = gru(embeded)
pack_o, pack_h = gru(packed_input)

In [122]:
nopack_o.size(), pack_o[0].size(), nopack_h.size(), pack_h.size() 

(torch.Size([5, 6, 4]),
 torch.Size([19, 4]),
 torch.Size([2, 6, 2]),
 torch.Size([2, 5, 2]))

In [123]:
nopack_o

tensor([[[-0.0206, -0.3060,  0.7552, -0.5244],
         [ 0.1811,  0.1301,  0.9484, -0.5482],
         [-0.0503, -0.1805,  0.8293,  0.0213],
         [-0.3703, -0.0457,  0.8630,  0.0969],
         [ 0.3921, -0.1847,  0.7820, -0.0545],
         [-0.2316, -0.0441,  0.9626,  0.2920]],

        [[ 0.4251, -0.4135,  0.8408, -0.3390],
         [-0.0511, -0.0747,  0.9452, -0.1381],
         [-0.0222, -0.4134,  0.4404, -0.0956],
         [-0.4067, -0.0922,  0.9615,  0.2548],
         [ 0.0316, -0.2260,  0.5681,  0.0943],
         [-0.2319, -0.1229,  0.5681,  0.0943]],

        [[ 0.0877, -0.5331,  0.7641, -0.3096],
         [ 0.2005,  0.0903,  0.8415, -0.5562],
         [-0.3886, -0.3916, -0.1171,  0.1611],
         [-0.5180, -0.1251,  0.5126, -0.0278],
         [-0.1032, -0.2695,  0.5352,  0.0812],
         [-0.2282, -0.1882,  0.5352,  0.0812]],

        [[ 0.0196, -0.5854,  0.8491, -0.0361],
         [ 0.5294, -0.1875,  0.5783, -0.1855],
         [ 0.2926, -0.4325, -0.3916,  0.2826],
       

In [124]:
nopack_h

tensor([[[ 0.0239, -0.6757],
         [-0.2533, -0.1960],
         [-0.0054, -0.4493],
         [-0.3013, -0.2556],
         [-0.1848, -0.3458],
         [-0.2176, -0.2894]],

        [[ 0.7552, -0.5244],
         [ 0.9484, -0.5482],
         [ 0.8293,  0.0213],
         [ 0.8630,  0.0969],
         [ 0.7820, -0.0545],
         [ 0.9626,  0.2920]]])

In [125]:
pack_o

PackedSequence(data=tensor([[-0.0206, -0.3060,  0.7859, -0.6231],
        [ 0.3921, -0.1847,  0.8851, -0.1073],
        [-0.0206, -0.3060,  0.7315, -0.6472],
        [-0.0503, -0.1805,  0.7934,  0.0918],
        [-0.0206, -0.3060,  0.5457, -0.2945],
        [ 0.2378, -0.0514,  0.9414, -0.4815],
        [-0.0174, -0.2925,  0.9278,  0.0331],
        [ 0.2378, -0.0514,  0.7760, -0.5244],
        [ 0.5332, -0.3803,  0.3400,  0.0630],
        [-0.3859, -0.2990,  0.1914, -0.0653],
        [-0.0311, -0.1999,  0.9258,  0.0557],
        [-0.0054, -0.4792,  0.8096, -0.0991],
        [-0.3045, -0.0834,  0.3291, -0.1149],
        [ 0.6439, -0.4076, -0.5722,  0.2439],
        [-0.3863, -0.2093,  0.8071, -0.0447],
        [-0.2406, -0.4790,  0.9393,  0.2513],
        [-0.4781, -0.1169,  0.1914, -0.0653],
        [ 0.2912, -0.3629,  0.8907,  0.0489],
        [-0.1373, -0.3698,  0.9393,  0.2513]]), batch_sizes=tensor([ 5,  5,  4,  3,  1,  1]))

In [126]:
pack_h

tensor([[[-0.1373, -0.3698],
         [-0.2406, -0.4790],
         [-0.4781, -0.1169],
         [ 0.6439, -0.4076],
         [-0.3859, -0.2990]],

        [[ 0.7859, -0.6231],
         [ 0.8851, -0.1073],
         [ 0.7315, -0.6472],
         [ 0.7934,  0.0918],
         [ 0.5457, -0.2945]]])

In [48]:
packed, batch_sizes = pack_padded_sequence(embeded, input_lengths.tolist(), batch_first=True)

In [171]:
unpacked, unpacked_len = pad_packed_sequence(o)

In [231]:
a = list(range(20))
print(a[0:5])
print(a[5:10])
print(a[10:14])
print(a[14:17])
print(a[17:19])
print(a[19:20])

[0, 1, 2, 3, 4]
[5, 6, 7, 8, 9]
[10, 11, 12, 13]
[14, 15, 16]
[17, 18]
[19]


In [338]:
from itertools import accumulate, islice

In [375]:
batch_sizes = [5, 5, 4, 3, 1, 1]
total_len = 19

In [405]:
result = get_last_idx(total_len, batch_sizes)

In [406]:
pack_o[0][result]

tensor([[-0.4422, -0.0451],
        [ 0.6043, -0.4460],
        [-0.3532,  0.0088],
        [ 0.4450, -0.5224],
        [ 0.6037, -0.1997]])

In [407]:
pack_h

tensor([[[-0.4422, -0.0451],
         [-0.3532,  0.0088],
         [ 0.6043, -0.4460],
         [ 0.4450, -0.5224],
         [ 0.6037, -0.1997]]])

In [404]:
def get_last_idx(total_len, batch_sizes):
    # a: acc_of a
    # b: len of batch
    assert isinstance(batch_sizes, list)
    mask = batch_sizes + [0]
    mask = [mask[i+1] - mask[i] for i in range(len(batch_sizes))]
    acc_bs = [0] + list(accumulate(batch_sizes))
    temp = list(range(total_len))
    result = []
    for i, m in enumerate(mask):
        if m != 0:
            result.extend(temp[acc_bs[i]:acc_bs[i+1]][m:])
    return list(reversed(result))

test

In [103]:
stacked_GRU = StackedGRU(input_size=7, hidden_size=2, num_layers=3, bidirectional=True, batch_first=False, 
                         is_packed=False)

In [49]:
hidden = embeded.new_zeros(3 * 2, embeded.size(0), 2, requires_grad=False)
hidden.size()

torch.Size([6, 5, 2])

In [50]:
packed, batch_sizes = packed_input

In [109]:
nopack_o, nopack_h = stacked_GRU.forward(embeded.transpose(0, 1), hidden)

In [44]:
stacked_GRU = StackedGRU(input_size=7, hidden_size=2, num_layers=3, bidirectional=True, batch_first=False, 
                         is_packed=True)

In [52]:
pack_o, pack_h = stacked_GRU(packed, hidden, batch_sizes)

In [112]:
hid = hidden.chunk(stacked_GRU.num_layers, 0)[0].squeeze(0)

In [113]:
nopack_o.size(), nopack_h.size(), pack_o.size(), pack_h.size()

(torch.Size([6, 5, 4]),
 torch.Size([6, 5, 2]),
 torch.Size([19, 4]),
 torch.Size([6, 5, 2]))

In [115]:
pack_o

tensor([[-0.1057,  0.2791,  0.2545, -0.3794],
        [-0.1027,  0.2818,  0.3091, -0.3687],
        [-0.1057,  0.2791,  0.2597, -0.3865],
        [-0.0949,  0.2896,  0.3566, -0.3696],
        [-0.1057,  0.2791,  0.3667, -0.3633],
        [-0.1294,  0.3859,  0.3948, -0.3636],
        [-0.1241,  0.3906,  0.3158, -0.3717],
        [-0.1294,  0.3859,  0.3890, -0.3648],
        [-0.1215,  0.3928,  0.4252, -0.3616],
        [-0.1310,  0.3835,  0.3946, -0.3464],
        [-0.1223,  0.4304,  0.3714, -0.3411],
        [-0.1237,  0.4308,  0.3453, -0.3424],
        [-0.1230,  0.4286,  0.3579, -0.3406],
        [-0.1208,  0.4321,  0.4154, -0.3424],
        [-0.1049,  0.4541,  0.2971, -0.2558],
        [-0.1130,  0.4504,  0.2953, -0.2538],
        [-0.1062,  0.4522,  0.2971, -0.2558],
        [-0.0895,  0.4694,  0.2697, -0.2496],
        [-0.0738,  0.4810,  0.2953, -0.2538]])

In [119]:
torch.cat([h for h in pack_h[-2:]], 1)

tensor([[-0.0738,  0.4810,  0.2545, -0.3794],
        [-0.1062,  0.4522,  0.2597, -0.3865],
        [-0.1130,  0.4504,  0.3566, -0.3696],
        [-0.1208,  0.4321,  0.3948, -0.3636],
        [-0.1310,  0.3835,  0.3946, -0.3464]])

In [86]:
all_output=[]
for t in range(len(batch_sizes)):
    print('input',packed[acc_bs[t]:acc_bs[t+1]].size())
    print('hidden', hid[:batch_sizes[t]])
    hid = stacked_GRU.layers[0](packed[acc_bs[t]:acc_bs[t+1]], hid[:batch_sizes[t]])
    all_output.append(hid)
    print(hid.size())

input torch.Size([5, 7])
hidden tensor([[ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.]])
torch.Size([5, 2])
input torch.Size([5, 7])
hidden tensor([[ 0.0244,  0.3408],
        [ 0.1301,  0.0093],
        [ 0.0244,  0.3408],
        [ 0.3196, -0.0357],
        [ 0.0244,  0.3408]])
torch.Size([5, 2])
input torch.Size([4, 7])
hidden tensor([[-0.3063,  0.4356],
        [ 0.3605, -0.0202],
        [-0.3063,  0.4356],
        [ 0.3636, -0.1822]])
torch.Size([4, 2])
input torch.Size([3, 7])
hidden tensor([[ 0.1719,  0.1572],
        [ 0.2532,  0.2940],
        [ 0.2218, -0.0176]])
torch.Size([3, 2])
input torch.Size([1, 7])
hidden tensor([[ 0.5032, -0.0707]])
torch.Size([1, 2])
input torch.Size([1, 7])
hidden tensor([[ 0.3811, -0.0112]])
torch.Size([1, 2])


Test for layernorm gru

In [6]:
input_seq, input_lengths

(tensor([[ 4,  6,  8,  1,  9,  2],
         [ 9,  8,  4,  2,  0,  0],
         [ 4,  6,  1,  1,  0,  0],
         [ 8,  7,  5,  0,  0,  0],
         [ 4,  1,  0,  0,  0,  0]]), tensor([ 6,  4,  4,  3,  2]))

In [7]:
embed = nn.Embedding(V, 7)
embeded = embed(input_seq)
embeded.size()

torch.Size([5, 6, 7])

In [8]:
gru = LayerNormGRU(input_size=7, hidden_size=3, num_layers=2, bidirectional=True, batch_first=True, 
                   layernorm=True)

In [9]:
packed_input = pack_padded_sequence(embeded, input_lengths.tolist(), batch_first=True)

In [10]:
nopack_o, nopack_h = gru(embeded)

In [11]:
nopack_o.size(), nopack_h.size()

(torch.Size([5, 6, 6]), torch.Size([4, 5, 3]))

In [12]:
pack_o, pack_h = gru(packed_input)

In [13]:
pack_o[0].size(), pack_h.size()

(torch.Size([19, 6]), torch.Size([4, 5, 3]))

In [118]:
unpack_o, unpack_len = pad_packed_sequence(pack_o, batch_first=True)

In [120]:
unpack_o.size(), unpack_len

(torch.Size([5, 6, 6]), tensor([ 6,  4,  4,  3,  2]))