In [29]:
from torch.autograd import Function, NestedIOFunction, Variable
import torch.backends.cudnn as cudnn
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend

In [1]:
# torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext.data import Field, Iterator, BucketIterator, TabularDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchnlp.metrics import get_moses_multi_bleu
from NMTutils import get_parser, build_data, get_model_config, evaluation

from decoder import Decoder
from encoder import Encoder
from attention import Attention
# others
import argparse
import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from collections import defaultdict

In [2]:
# Runtime device configuration for this notebook.
# The auto-detecting variant is kept below for reference but is disabled, so the
# notebook is pinned to the two values assigned underneath.
# USE_CUDA = torch.cuda.is_available()
# DEVICE = torch.cuda.current_device()
USE_CUDA = False  # read by build() to decide whether the models are moved with .cuda()
DEVICE = -1  # passed as `device=` to get_model_config; presumably -1 means CPU for torchtext — TODO confirm

In [3]:
# Language codes handed to get_model_config.
lang1 = 'eng'
lang2 = 'fra'

# Available model codes. Entries are addressed by position, so repeated codes
# are kept exactly as listed.
modelcode_small = [
    '111301', '111311', '111301', '111311', '111311',
    '111311', '111311', '122421', '122521', '122621',
    '122622', '222421', '222521', '222421',
]
modelcode_filtered = ['322521', '322421']

# model_idx is 1-based, hence the -1 when indexing into modelcode_small.
model_idx = 1
config, test_data, test_loader, SOURCE, TARGET = get_model_config(
    modelcode_small[model_idx - 1],
    lang1,
    lang2,
    device=DEVICE,
)

In [4]:
# Grab only the first batch of the test loader for shape experiments.
# next(iter(...)) states that intent directly and fails immediately (StopIteration)
# on an empty loader instead of leaving `inputs`/`lengths` undefined.
batch = next(iter(test_loader))
inputs, lengths = batch.so  # the `so` field yields a (tensor, lengths) pair
targets = batch.ta
# pack_padded_sequence is later called with `lengths`; convert to a plain list.
lengths = lengths.tolist()

In [7]:
# Small throw-away layers used only to inspect tensor shapes:
# a 10-dim embedding over the source vocabulary feeding a 3-layer
# bidirectional GRU with 7 hidden units per direction.
embed = nn.Embedding(num_embeddings=len(SOURCE.vocab), embedding_dim=10)
gru = nn.GRU(
    input_size=10,
    hidden_size=7,
    num_layers=3,
    batch_first=True,
    bidirectional=True,
)

In [8]:
# Embed the token ids and show the embedded batch shape.
embeded = embed(inputs)
print(embeded.size())

# Pack the padded batch, run the GRU, and show the shape of the final hidden states.
packed = pack_padded_sequence(embeded, lengths, batch_first=True)
outputs, hidden = gru(packed)
print(hidden.size())

# Unpack the GRU outputs back into a padded tensor.
outputs, output_lengths = pad_packed_sequence(outputs, batch_first=True)
print(outputs.size())

# Concatenate the last two entries of `hidden` along the feature axis and
# insert a length-1 axis at position 1.
hidden = torch.cat((hidden[-2], hidden[-1]), dim=1).unsqueeze(1)
print(hidden.size())

torch.Size([64, 9, 10])
torch.Size([6, 64, 7])
torch.Size([64, 9, 14])
torch.Size([64, 1, 14])


In [9]:
# Display the shape of the embedded batch (same value the previous cell printed first).
embeded.size()

torch.Size([64, 9, 10])

---

## Layer Norm

In [72]:
# Toy dimensions for the layer-norm experiments in this section.
input_size, hidden_size, seq_len = 5, 6, 7
bias = True
B = 10

# Random input sequence of shape (seq_len, B, input_size) and a random
# hidden state of shape (B, hidden_size).
x = torch.randn(seq_len, B, input_size)
h = torch.randn(B, hidden_size)

In [73]:
# Linear maps producing three stacked hidden_size-wide blocks each:
# one from the input features, one from the hidden state.
w_ih = nn.Linear(in_features=input_size, out_features=3 * hidden_size, bias=bias)
w_hh = nn.Linear(in_features=hidden_size, out_features=3 * hidden_size, bias=bias)

In [74]:
# Project the first time step and the hidden state.
gi, gh = w_ih(x[0]), w_hh(h)
print(gi.size(), gh.size())

# Split each projection into three equal chunks along the feature axis.
i_r, i_i, i_n = torch.chunk(gi, 3, dim=1)
h_r, h_i, h_n = torch.chunk(gh, 3, dim=1)

# Sum the matching input/hidden chunks for the first two blocks.
a_r, a_i = i_r + h_r, i_i + h_i
print(a_r.size(), a_i.size())

torch.Size([10, 18]) torch.Size([10, 18])
torch.Size([10, 6]) torch.Size([10, 6])


In [75]:
# Layer normalization over the last (hidden_size-wide) axis, with learnable scale and shift.
lm_r = nn.LayerNorm(normalized_shape=hidden_size, elementwise_affine=True)

In [76]:
# Apply lm_r to a_r and display the normalized tensor.
lm_r(a_r)

tensor([[-1.5299,  0.6143,  0.7059, -1.2720,  0.9255,  0.5562],
        [-1.7578,  0.4601,  0.1428, -0.5778,  0.2171,  1.5156],
        [ 0.6809,  0.7926,  0.9557, -1.9366, -0.4945,  0.0020],
        [ 0.3680,  1.0435,  1.1615, -0.0356, -0.9473, -1.5900],
        [ 0.7292, -0.2817, -1.1384, -1.3739,  1.2232,  0.8417],
        [-0.6347,  0.0036, -1.4513, -0.3078,  1.7144,  0.6759],
        [-1.5679,  1.6891,  0.6106, -0.5432, -0.1318, -0.0568],
        [-1.8565,  1.3214,  0.7711, -0.3297,  0.2696, -0.1759],
        [-1.6745, -0.0496,  0.5500, -0.8805,  0.9775,  1.0771],
        [ 1.2672, -1.3910,  0.8129,  0.0841, -1.2502,  0.4769]])

In [77]:
# Built-in GRU cell with the same sizes, used as a shape reference.
grucell = nn.GRUCell(input_size=input_size, hidden_size=hidden_size)

In [78]:
# NOTE(review): LayerNormGRU is defined in a cell further down this notebook;
# on a fresh kernel that cell has to be executed before this one.
layernormgru = LayerNormGRU(input_size, hidden_size, num_layers=1)

In [79]:
# Run a single LayerNormGRU cell step on the first time step x[0] with hidden state h.
hy = layernormgru.gru_cell(x[0], h)

In [80]:
# Display the shape of the LayerNormGRU cell output.
hy.size()

torch.Size([10, 6])

In [81]:
# Shape produced by nn.GRUCell on the same inputs, for comparison with hy.size().
grucell(x[0], h).size()

torch.Size([10, 6])

In [45]:
class LayerNormGRU(nn.Module):
    """GRU whose gate pre-activations are layer-normalized.

    Only one set of input/hidden weights exists, so the recurrence implemented
    here is a single-layer, unidirectional GRU. ``num_layers`` and
    ``bidirectional`` are stored for interface compatibility; ``forward``
    raises ``NotImplementedError`` when they request anything else, while
    ``gru_cell`` can always be used on its own.

    Args:
        input_size: number of input features per time step.
        hidden_size: number of hidden units.
        num_layers: number of stacked layers (only 1 is supported by ``forward``).
        batch_first: if True, ``forward`` takes/returns (B, seq_len, *) tensors.
        bidirectional: stored only; ``forward`` supports False.
        bias: whether the two linear projections carry a bias term.
        use_cuda: if True, the whole module is moved to the GPU at construction.
    """

    def __init__(self, input_size, hidden_size, num_layers, batch_first=False,
                 bidirectional=False, bias=True, use_cuda=False):
        super(LayerNormGRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        # Misspelled attribute kept as an alias so existing readers of it keep working.
        self.bidrectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        self.bias = bias
        self.use_cuda = use_cuda
        self.gate_num = 3

        # Each projection emits gate_num stacked hidden_size-wide blocks.
        self.weight_ih = nn.Linear(input_size, self.gate_num * hidden_size, bias=bias)
        self.weight_hh = nn.Linear(hidden_size, self.gate_num * hidden_size, bias=bias)

        # One LayerNorm per gate pre-activation.
        self.lm_r = nn.LayerNorm(hidden_size)
        self.lm_i = nn.LayerNorm(hidden_size)
        self.lm_n = nn.LayerNorm(hidden_size)

        # Move every submodule together; moving only the Linear layers would
        # leave the LayerNorm parameters on the CPU and break the forward pass.
        if self.use_cuda:
            self.cuda()

    def gru_cell(self, inpt, hidden):
        """Advance the recurrence by one time step.

        input:
        * inpt: B, input_size
        * hidden: B, hidden_size
        output:
        * hy: B, hidden_size (the next hidden state)
        """
        gi = self.weight_ih(inpt)
        gh = self.weight_hh(hidden)
        i_r, i_i, i_n = gi.chunk(self.gate_num, 1)
        h_r, h_i, h_n = gh.chunk(self.gate_num, 1)

        # torch.sigmoid / torch.tanh replace the deprecated F.sigmoid / F.tanh.
        resetgate = torch.sigmoid(self.lm_r(i_r + h_r))
        inputgate = torch.sigmoid(self.lm_i(i_i + h_i))

        # The reset gate scales the hidden contribution before normalization.
        newgate = torch.tanh(self.lm_n(i_n + resetgate * h_n))
        # Equivalent to (1 - inputgate) * newgate + inputgate * hidden.
        hy = newgate + inputgate * (hidden - newgate)
        return hy

    def forward(self, inpt, hx=None):
        """Run the cell over a whole sequence.

        input:
        * inpt: seq_len, B, input_size (B, seq_len, input_size if batch_first)
        * hx: B, hidden_size or 1, B, hidden_size; zeros when omitted
        output:
        * output: seq_len, B, hidden_size (B, seq_len, hidden_size if batch_first)
        * hidden: 1, B, hidden_size (the state after the last time step)
        raises:
        * NotImplementedError if num_layers != 1 or bidirectional is set
        """
        if self.num_layers != 1 or self.num_directions != 1:
            raise NotImplementedError(
                "LayerNormGRU.forward only supports num_layers=1 and bidirectional=False")

        # Work internally in time-major layout.
        if self.batch_first:
            inpt = inpt.transpose(0, 1)
        seq_len, batch_size = inpt.size(0), inpt.size(1)

        if hx is None:
            hx = self.init_hidden(batch_size)
        elif hx.dim() == 3:
            # Accept the (1, B, hidden_size) layout this method returns.
            hx = hx[0]

        steps = []
        for t in range(seq_len):
            hx = self.gru_cell(inpt[t], hx)
            steps.append(hx)

        if steps:
            output = torch.stack(steps, 0)
        else:
            # torch.stack rejects an empty list; return an empty sequence instead.
            output = hx.new_zeros(0, batch_size, self.hidden_size)

        if self.batch_first:
            output = output.transpose(0, 1)
        return output, hx.unsqueeze(0)

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (batch_size, hidden_size).

        The tensor is created with the dtype and device of the module's own
        weights, so it matches the module wherever it currently lives.
        """
        return self.weight_hh.weight.new_zeros(batch_size, self.hidden_size)



    

In [None]:
class Encoder(nn.Module):
    """Embedding + GRU encoder for padded, length-annotated batches.

    Args:
        V_e: vocabulary size.
        m_e: embedding size.
        n_e: GRU hidden size (per direction).
        num_layers: number of stacked GRU layers.
        bidrec: if True the GRU is bidirectional.
        use_dropout: if True, dropout is applied to the embeddings.
        dropout_rate: dropout probability used when ``use_dropout`` is set.
        layernorm: if True, the embeddings are layer-normalized over their
            feature axis before dropout and the GRU.
    """

    def __init__(self, V_e, m_e, n_e, num_layers=1, bidrec=False, use_dropout=False, dropout_rate=0.5,
                 layernorm=False):
        super(Encoder, self).__init__()
        self.V_e = V_e
        self.m_e = m_e
        self.n_e = n_e
        self.num_layers = num_layers
        self.bidrec = bidrec
        self.n_direct = 2 if bidrec else 1
        self.use_dropout = use_dropout
        self.layernorm = layernorm

        if self.use_dropout:
            self.dropout = nn.Dropout(dropout_rate)

        if self.layernorm:
            # nn.LayerNorm needs the size of the normalized axis; calling it
            # without arguments raises TypeError. It is applied to the
            # embeddings, whose feature axis has size m_e.
            self.lm = nn.LayerNorm(m_e)

        self.embed = nn.Embedding(V_e, m_e)
        self.gru = nn.GRU(m_e, n_e, num_layers, batch_first=True, bidirectional=bidrec)

    def forward(self, inputs, lengths):
        """Encode a padded batch.

        input:
        - inputs: B, T_x (token ids)
        - lengths: per-sequence lengths handed to pack_padded_sequence
        output:
        - outputs: B, T, n_direct*n_e, where T is the padded length returned
          by pad_packed_sequence
        - hidden: B, 1, n_direct*n_e (last layer's final state, directions
          concatenated)
        """
        # embeded: (B, T_x, m_e)
        embeded = self.embed(inputs)
        if self.layernorm:
            embeded = self.lm(embeded)
        if self.use_dropout:
            embeded = self.dropout(embeded)

        packed = pack_padded_sequence(embeded, lengths, batch_first=True)
        # hidden: (num_layers*n_direct, B, n_e)
        outputs, hidden = self.gru(packed)
        # unpacked outputs: (B, T, n_direct*n_e)
        outputs, _ = pad_packed_sequence(outputs, batch_first=True)

        # Take the last n_direct entries of hidden and concatenate them along
        # the feature axis: (B, n_direct*n_e) -> (B, 1, n_direct*n_e).
        hidden = torch.cat(list(hidden[-self.n_direct:]), 1).unsqueeze(1)

        return outputs, hidden

In [None]:
def build(config, SOURCE, TARGET):
    """Construct the encoder, decoder and loss for one model configuration.

    Args:
        config: object exposing EMBED, HIDDEN, NUM_HIDDEN, HIDDEN2 and METHOD.
        SOURCE: source-side field; only the size of its vocab is used.
        TARGET: target-side field; its vocab supplies the decoder vocabulary
            size, the start-of-sequence index and the padding index.

    Returns:
        (enc, dec, loss_function); both models are moved to the GPU when the
        notebook-level USE_CUDA flag is set.
    """
    enc = Encoder(len(SOURCE.vocab), config.EMBED, config.HIDDEN, config.NUM_HIDDEN, bidrec=True)
    # The encoder is bidirectional, so the decoder receives 2*HIDDEN as its size.
    # The decoder is sized by the TARGET vocabulary, so the start-of-sequence
    # index is looked up in TARGET.vocab rather than SOURCE.vocab.
    dec = Decoder(len(TARGET.vocab), config.EMBED, 2 * config.HIDDEN, hidden_size2=config.HIDDEN2,
                  sos_idx=TARGET.vocab.stoi['<s>'], method=config.METHOD, USE_CUDA=USE_CUDA)
    if USE_CUDA:
        enc = enc.cuda()
        dec = dec.cuda()

    # Padded target positions do not contribute to the loss.
    loss_function = nn.CrossEntropyLoss(ignore_index=TARGET.vocab.stoi['<pad>'])
    return enc, dec, loss_function