In [11]:
import torch
from attention import *

class PositionalEncoding(nn.Module):
    def __init__(self, dim_model, max_length=2000):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_length, dim_model, requires_grad=False)
        position = torch.arange(0, max_length).unsqueeze(1).float()
        exp_term = torch.exp(torch.arange(0, dim_model, 2).float() * -(math.log(10000.0) / dim_model))
        pe[:, 0::2] = torch.sin(position * exp_term)
        pe[:, 1::2] = torch.cos(position * exp_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, input):
        return self.pe[:, :input.size(1)]

class FeedForwardNetwork(torch.nn.Module):
    def __init__(self, hidden_size, filter_size, dropout_rate):
        super(FeedForwardNetwork, self).__init__()
        self.layer1 = nn.Linear(hidden_size, filter_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.act = nn.SELU()
        self.layer2 = nn.Linear(filter_size, hidden_size)

    def forward(self, x):
        x = self.layer1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.layer2(x)
        return x

In [22]:
import torch
import torch.nn as nn
from attention import MultiHeadAttention, RelPositionMultiHeadAttention

class EncoderLayer(nn.Module):
    def __init__(self, hidden_size, filter_size, n_head, dropout_rate):
        super(EncoderLayer, self).__init__()

        self.self_attention_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.self_attention = RelPositionMultiHeadAttention(n_head, hidden_size, dropout_rate)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        self.ffn_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):  # pylint: disable=arguments-differ
        y = self.self_attention_norm(x)
        y = self.self_attention(y, y, y, mask)
        y = self.self_attention_dropout(y)
        x = x + y

        y = self.ffn_norm(x)
        y = self.ffn(y)
        y = self.ffn_dropout(y)
        x = x + y
        return x, mask
    
class Encoder(nn.Module):
    def __init__(self, hidden_size, filter_size, n_head, dropout_rate, n_layers):
        super(Encoder, self).__init__()

        self.layers = nn.ModuleList([EncoderLayer(hidden_size, filter_size, n_head, dropout_rate)
                    for _ in range(n_layers)])

        self.last_norm = nn.LayerNorm(hidden_size, eps=1e-6)

    def forward(self, inputs, mask):
        encoder_output = inputs
        for enc_layer in self.layers:
            encoder_output, mask = enc_layer(encoder_output, mask)
        return self.last_norm(encoder_output), mask

In [23]:
import torch
import torch.nn as nn
from attention import MultiHeadAttention, RelPositionMultiHeadAttention

class DecoderLayer(nn.Module):
    def __init__(self, hidden_size, filter_size, n_head, dropout_rate):
        super(DecoderLayer, self).__init__()

        self.self_attention_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.self_attention = MultiHeadAttention(n_head, hidden_size, dropout_rate)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.enc_dec_attention = MultiHeadAttention(n_head, hidden_size, dropout_rate)
        self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)

        self.ffn_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output, self_mask, i_mask, cache):
        y = self.self_attention_norm(x)
        y = self.self_attention(y, y, y, self_mask)
        y = self.self_attention_dropout(y)
        x = x + y

        if enc_output is not None:
            y = self.enc_dec_attention_norm(x)
            y = self.enc_dec_attention(y, enc_output, enc_output, i_mask,
                                       cache)
            y = self.enc_dec_attention_dropout(y)
            x = x + y

        y = self.ffn_norm(x)
        y = self.ffn(y)
        y = self.ffn_dropout(y)
        x = x + y
        return x

class Decoder(nn.Module):
    def __init__(self, hidden_size, filter_size, n_head, dropout_rate, n_layers):
        super(Decoder, self).__init__()

        self.layers = nn.ModuleList([DecoderLayer(hidden_size, filter_size, n_head, dropout_rate)
                    for _ in range(n_layers)])

        self.last_norm = nn.LayerNorm(hidden_size, eps=1e-6)

    def forward(self, targets, enc_output, tgt_mask, enc_mask, cache):
        decoder_output = targets
        for i, dec_layer in enumerate(self.layers):
            layer_cache = None
            if cache is not None:
                if i not in cache:
                    cache[i] = {}
                layer_cache = cache[i]
            decoder_output = dec_layer(decoder_output, enc_output, tgt_mask, enc_mask, layer_cache)
        return self.last_norm(decoder_output)

In [33]:
import torch
import torch.nn as nn

from feature_extractor import *

class Transformer(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        feat_extractor='vgg', 
        enc_n_layers=16, 
        dec_n_layers=1, 
        hidden_size=512, 
        filter_size=2048,
        n_head=8,
        dropout_rate=0.1, 
        pad_id=0, 
        sos_id=1, 
        eos_id=2,
        init_type="xavier_uniform"
    ):
        super(Transformer, self).__init__()
        self.pad_id = pad_id
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.init_type = init_type
        
        if feat_extractor=='vgg':
            self.conv = VGGExtractor()
        elif feat_extractor=='w2v':
            self.conv = W2VExtractor()
            
        self.encoder = Encoder(hidden_size, filter_size, n_head,
                               dropout_rate, enc_n_layers)
        
        self.decoder = Decoder(hidden_size, filter_size,n_head,
                               dropout_rate, dec_n_layers)
        
        self.initialize()

    def forward(self, padded_input, input_lengths, padded_target):
        if self.feat_extractor == 'vgg' or self.feat_extractor == 'w2v':
            padded_input = self.conv(padded_input)

        # Reshaping features
        sizes = padded_input.size() # B x H_1 (channel?) x H_2 x T
        padded_input = padded_input.view(sizes[0], sizes[1] * sizes[2], sizes[3])
        padded_input = padded_input.transpose(1, 2).contiguous()  # BxTxH

        encoder_padded_outputs, _ = self.encoder(padded_input, input_lengths)
        pred, gold, *_ = self.decoder(padded_target, encoder_padded_outputs, input_lengths)
        hyp_best_scores, hyp_best_ids = torch.topk(pred, 1, dim=2)

        hyp_seq = hyp_best_ids.squeeze(2)
        gold_seq = gold

        return pred, gold, hyp_seq, gold_seq

    def initialize(self):
        # weight init
        for p in self.parameters():
            if p.dim() > 1:
                if self.init_type == "xavier_uniform":
                    nn.init.xavier_uniform_(p.data)
                elif self.init_type == "xavier_normal":
                    nn.init.xavier_normal_(p.data)
                elif self.init_type == "kaiming_uniform":
                    nn.init.kaiming_uniform_(p.data, nonlinearity="relu")
                elif self.init_type == "kaiming_normal":
                    nn.init.kaiming_normal_(p.data, nonlinearity="relu")
                else:
                    raise ValueError("Unknown initialization: " + self.init_type)
        # bias init
        for p in self.parameters():
            if p.dim() == 1:
                p.data.zero_()

        # reset some modules with default init
        for m in self.modules():
            if isinstance(m, (nn.Embedding, nn.LayerNorm)):
                m.reset_parameters()

In [41]:
sum([p.numel() for p in Transformer(512).parameters() if p.requires_grad])

59179968

In [4]:
from kosr.model.transformer.model import Transformer

ImportError: cannot import name 'Dncoder'

In [4]:
from kosr.model import Transformer

In [6]:
model = Transformer(512)

In [19]:
import torch

model(torch.randn(16,1,52,80), torch.randint(52,(16,52)), torch.randint(450,(16,250)))

ModuleAttributeError: 'Transformer' object has no attribute 'feat_extractor'

In [18]:
torch.randint(52,(16,52)).shape

torch.Size([16, 52])

In [23]:
mask = torch.zeros(256)

In [24]:
mask[:100] = 1

In [26]:
mask.eq(0)

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, 

In [27]:
torch.arange(256).masked_fill(mask.eq(0), 0.0)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
        90, 91, 92, 93, 94, 95, 96, 97, 98, 99,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

In [5]:
from kosr.model import mask

In [3]:
mask.subsequent_mask(3)

tensor([[1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]], dtype=torch.uint8)

In [1]:
import torch
a = torch.randint(16,(16,1)).squeeze()

In [88]:
a = torch.randint(1,25,(1,280))

In [89]:
a.shape

torch.Size([1, 280])

In [90]:
a[:,-1] = 0

In [92]:
def subsequent_mask(size, device="cpu", dtype=torch.bool):
    ret = torch.ones(size, size, device=device, dtype=dtype)
    return torch.tril(ret, out=ret)


def target_mask(ys_in_pad, ignore_id):
    ys_mask = ys_in_pad != ignore_id
    print(ys_mask)
    m = subsequent_mask(ys_mask.size(-1), device=ys_mask.device).unsqueeze(0)
    print(m)
    return (ys_mask.unsqueeze(-2) & m).eq(0)

In [93]:
r = target_mask(a,0)

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  

In [94]:
r[0]

tensor([[ True, False, False,  ..., False, False, False],
        [ True,  True, False,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ...,  True, False, False],
        [ True,  True,  True,  ...,  True,  True, False],
        [ True,  True,  True,  ...,  True,  True, False]])

In [28]:
r[0].shape

torch.Size([280, 280])

In [45]:
r[0].shape

torch.Size([280, 280])

In [51]:
import torch.nn as nn

emb = nn.Embedding(30,512)

In [47]:
from torch import Tensor

def get_decoder_self_attn_mask(seq_k: Tensor, seq_q: Tensor, pad_id):
    """ For masking the decoder self attention """
    def _get_attn_key_pad_mask(seq_k, seq_q, pad_id):
        """ For masking out the padding part of key sequence. """
        len_q = seq_q.size(1)
        padding_mask = seq_k.eq(pad_id)
        padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1)  # b x lq x lk

        return padding_mask

    def _get_subsequent_mask(inputs: Tensor) -> Tensor:
        """ Makes subsequent masking """
        batch_size, seq_length = inputs.size()
        subsequent_mask = torch.triu(torch.ones((seq_length, seq_length), device=inputs.device, dtype=torch.uint8), diagonal=1)
        subsequent_mask = subsequent_mask.unsqueeze(0).expand(batch_size, -1, -1)  # BxTxT

        return subsequent_mask.bool()

    return _get_attn_key_pad_mask(seq_k, seq_q, pad_id) | _get_subsequent_mask(seq_k)


In [48]:
a = torch.randint(30,(1,280))

In [55]:
mask = get_decoder_self_attn_mask(a,a,0)

In [60]:
mask

tensor([[[False,  True,  True,  ...,  True,  True,  True],
         [False, False,  True,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True],
         ...,
         [False, False, False,  ..., False,  True,  True],
         [False, False, False,  ..., False, False,  True],
         [False, False, False,  ..., False, False, False]]])

In [56]:
b = emb(a)

In [62]:
mask

tensor([[[False,  True,  True,  ...,  True,  True,  True],
         [False, False,  True,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True],
         ...,
         [False, False, False,  ..., False,  True,  True],
         [False, False, False,  ..., False, False,  True],
         [False, False, False,  ..., False, False, False]]])

In [63]:
r[0].shape

torch.Size([280, 280])

In [95]:
b = torch.randn(1,280)

In [96]:
b.masked_fill(r[0], 0)

tensor([[ 0.0000,  0.0901,  0.3619,  ...,  1.5084, -0.4110,  0.3909],
        [ 0.0000,  0.0000,  0.3619,  ...,  1.5084, -0.4110,  0.3909],
        [ 0.0000,  0.0000,  0.0000,  ...,  1.5084, -0.4110,  0.3909],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000, -0.4110,  0.3909],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.3909],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.3909]])

In [59]:
b.masked_fill(mask, 0)

RuntimeError: The size of tensor a (280) must match the size of tensor b (512) at non-singleton dimension 2

In [43]:
a.shape

torch.Size([1, 280, 512])