# Sources

In [1]:
import torch, sys, os
import dill as pickle
import torch.nn as nn
import torch, torchtext 
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
import transformer.Constants as Constants
from transformer.Layers import EncoderLayer, DecoderLayer
from torchtext.data import Field, TabularDataset, BucketIterator, Dataset

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-95jn9ozn because the default path (/home/visionteam/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


# Some environment checks

In [2]:
print('Torchtext version:{}, Torch version:{}'.format(torchtext.__version__, torch.__version__))
print('Is CUDA available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device is{}'.format(device))

Torchtext version:0.8.0, Torch version:1.7.1
Is CUDA available:True
Device iscuda:0


# Load a preprocessed dataset

In [3]:
save_data = os.path.expanduser('~/tf_tutorials/imdb_dataset/imdb_fields_and_vocab.pkl')
print(save_data)
data = pickle.load(open(save_data, 'rb'))

/home/visionteam/tf_tutorials/imdb_dataset/imdb_fields_and_vocab.pkl


In [4]:
data.keys()

dict_keys(['fields_with_and_without_vocab', 'train_examples', 'valid_examples', 'test_examples'])

In [5]:
fields = data['fields_with_and_without_vocab']
train_data = Dataset(examples=data['train_examples'], fields=fields)
valid_data = Dataset(examples=data['valid_examples'], fields=fields)
test_data = Dataset(examples=data['test_examples'], fields=fields)

In [6]:
len(fields['text'].vocab)

30195

In [7]:
vars(fields['text'].vocab).keys()

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])

In [8]:
fields

{'text': <torchtext.data.field.Field at 0x7f6db3e60d10>,
 'label': <torchtext.data.field.Field at 0x7f6db373b810>}

In [9]:
len(train_data)

25588

In [10]:
train_data.examples[0].label

'0'

In [11]:
train_data.fields

{'text': <torchtext.data.field.Field at 0x7f6db3e60d10>,
 'label': <torchtext.data.field.Field at 0x7f6db373b810>}

In [12]:
train_iterator, test_iterator, valid_iterator = BucketIterator.splits(
    (train_data, test_data, valid_data), batch_size=2, device=device
)  #<--- bucketiterator expects a dataset object and fields that already have vocabularay built.



In [13]:
count = 0
for batch in train_iterator:
    print(batch.text.shape)
    print(batch.label.shape)
    count += 1
    if(count == 5):
        sys.exit()



torch.Size([216, 2])
torch.Size([2])
torch.Size([200, 2])
torch.Size([2])
torch.Size([221, 2])
torch.Size([2])
torch.Size([162, 2])
torch.Size([2])
torch.Size([151, 2])
torch.Size([2])


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


# Transformer modules

## Masking functions

In [14]:
def get_pad_mask(seq, pad_idx):
    return (seq != pad_idx).unsqueeze(-2)


def get_subsequent_mask(seq):
    ''' For masking out the subsequent info. '''
    sz_b, len_s = seq.size()
    subsequent_mask = (1 - torch.triu(
        torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
    return subsequent_mask

* Find the batch with smallest samples

In [15]:
largest_seq_len = max([batch.text.shape[0] for batch in train_iterator])
small_batch = None
for batch in train_iterator:
    if(batch.text.shape[0] < largest_seq_len):
        small_batch = batch
        largest_seq_len = batch.text.shape[0]
print(small_batch.text.transpose(0,1))
print(small_batch.text.shape)    

tensor([[    2,   461,  6476,    40,   780,     6,    24,   266,   305,  6943,
             5,  9759,  3787,    15,     4,   506,     9,  4144,  1526,     5,
             8,   343,   429,     9, 11562,  1396,     5,    35,     9,    61,
          1382,   131,     5,   155,    58,    29,     4,   636,    97,   854,
            22,   575,    63,     3,     1,     1,     1,     1,     1],
        [    2,    14,    11,     8,   698,     6,   135,     6,  1481,    17,
            51,     4,   561,  3129,     5,    13,    20,    35,     9,   166,
            19,  1233,    22,   180,  3876,  6334,    19,    89,    16,  1599,
            62,    53,  1647,     5,    52,    25,    41,     4,    93,    15,
            14,    35,     6,   199,    13,     8,   387,     5,     3]],
       device='cuda:0')
torch.Size([49, 2])


* Find the batch with largest samples

In [16]:
largest_seq_len = max([batch.text.shape[0] for batch in train_iterator])
smallest_seq_len = min([batch.text.shape[0] for batch in train_iterator])
print('Length of the largest sequence:{} and the smallest sequence:{}'.format(largest_seq_len,
                                                                             smallest_seq_len))

Length of the largest sequence:258 and the smallest sequence:38


### Test masking functions

In [17]:
src_pad_idx = fields['text'].vocab.stoi[Constants.PAD_WORD]
src_seq = small_batch.text.transpose(0,1)
print('Source sequence shape:{}'.format(src_seq.shape))
src_pad_mask = get_pad_mask(src_seq, src_pad_idx)
src_subseq_mask = get_subsequent_mask(src_seq)


Source sequence shape:torch.Size([2, 49])


In [18]:
print(src_pad_mask.shape)
print(src_pad_mask)

torch.Size([2, 1, 49])
tensor([[[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True, False, False, False, False, False]],

        [[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
           True,  True,  True,  True,  True,  True,  True,  True,  True]]],
       device='cuda:0')


In [19]:
print(src_subseq_mask.shape)
print(src_subseq_mask)

torch.Size([1, 49, 49])
tensor([[[ True, False, False,  ..., False, False, False],
         [ True,  True, False,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ...,  True, False, False],
         [ True,  True,  True,  ...,  True,  True, False],
         [ True,  True,  True,  ...,  True,  True,  True]]], device='cuda:0')


In [20]:
src_mask = src_pad_mask & src_subseq_mask
print(src_mask)
print(src_mask.shape)

tensor([[[ True, False, False,  ..., False, False, False],
         [ True,  True, False,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False]],

        [[ True, False, False,  ..., False, False, False],
         [ True,  True, False,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ...,  True, False, False],
         [ True,  True,  True,  ...,  True,  True, False],
         [ True,  True,  True,  ...,  True,  True,  True]]], device='cuda:0')
torch.Size([2, 49, 49])


In [21]:
print(src_mask[0])

tensor([[ True, False, False,  ..., False, False, False],
        [ True,  True, False,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False]], device='cuda:0')


In [22]:
print(src_mask[1])

tensor([[ True, False, False,  ..., False, False, False],
        [ True,  True, False,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ...,  True, False, False],
        [ True,  True,  True,  ...,  True,  True, False],
        [ True,  True,  True,  ...,  True,  True,  True]], device='cuda:0')


## Scaled dot product attention
![alt text](scaled_dot_product_attn.png "Scaled Dot Product Attention")

In [23]:
class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        #print(q.shape, k.shape, v.shape)
        attn = torch.matmul(q / self.temperature, k.transpose(2, 3))
        #print(attn.shape)

        if mask is not None:
            #print('mask:{}'.format(mask.shape))
            attn = attn.masked_fill(mask == 0, -1e9)

        attn = self.dropout(F.softmax(attn, dim=-1))
        #print(attn.shape)
        output = torch.matmul(attn, v)
        #print(output.shape)
        return output, attn

### Testing scaled dot product attention

In [None]:
n_head = 1
batch = 2
sent_len = small_batch.text.shape[0]
d_model = 512
d_k = d_model
temperature = d_k ** 0.5
q = torch.Tensor(np.random.rand(batch, sent_len, n_head, d_model)).to(device=device) ##fake sentence
k = torch.Tensor(np.random.rand(batch, sent_len, n_head, d_model)).to(device=device)
v = torch.Tensor(np.random.rand(batch, sent_len, n_head, d_model)).to(device=device)
print('Query, Key, and Value shapes:{}, {}, {}'.format(q.shape, k.shape, v.shape))
scaled_dpa = ScaledDotProductAttention(temperature=temperature)
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
print('Query, Key, and Value shapes:{}, {}, {}'.format(q.shape, k.shape, v.shape))
src_mask = src_mask.unsqueeze(1)   # For head axis broadcasting.
output, attn = scaled_dpa(q, k, v, mask=src_mask)
print('Output shape:{}, attention shape:{}'.format(output.shape, attn.shape))

In [None]:
numpy_attn = attn.squeeze(1).cpu().numpy()
first_attention_mask = np.round(numpy_attn[0,:,:], 5)
second_attention_mask = np.round(numpy_attn[1,:,:], 5)
print(first_attention_mask)

In [None]:
print(second_attention_mask)

## Multi-head attention
![alt text](multi-head-attention.png "Multi-head attention")

In [24]:
class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)


    def forward(self, q, k, v, mask=None):

        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)

        residual = q

        # Pass through the pre-attention projection: b x lq x (n*dv)
        # Separate different heads: b x lq x n x dv
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        # Transpose for attention dot product: b x n x lq x dv
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        if mask is not None:
            mask = mask.unsqueeze(1)   # For head axis broadcasting.

        q, attn = self.attention(q, k, v, mask=mask)

        # Transpose to move the head dimension back: b x lq x n x dv
        # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
        q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
        q = self.dropout(self.fc(q))
        q += residual

        q = self.layer_norm(q)

        return q, attn

### Testing multi-head attention

* Generate a mask

In [None]:
src_pad_idx = fields['text'].vocab.stoi[Constants.PAD_WORD]
src_seq = small_batch.text.transpose(0,1)
print('Source sequence shape:{}'.format(src_seq.shape))
src_pad_mask = get_pad_mask(src_seq, src_pad_idx)
src_subseq_mask = get_subsequent_mask(src_seq)
src_mask = src_pad_mask & src_subseq_mask
print(src_mask)
print(src_mask.shape)

In [None]:
n_head = 8
batch = 2
sent_len = small_batch.text.shape[0]
d_model = 512
d_k = int(d_model / n_head)
temperature = d_k ** 0.5
q = torch.Tensor(np.random.rand(batch, sent_len, n_head*d_k)).to(device=device) ##fake sentence
k = torch.Tensor(np.random.rand(batch, sent_len, n_head*d_k)).to(device=device)
v = torch.Tensor(np.random.rand(batch, sent_len, n_head*d_k)).to(device=device)
print('Shapes of Query:{}, Key:{}, and Value:{}'.format(q.shape, k.shape, v.shape))
#q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
#print('Shapes of Query:{}, Key:{}, and Value:{}'.format(q.shape, k.shape, v.shape))
mha = MultiHeadAttention(n_head=n_head, d_model=d_model, d_k=d_k, d_v=d_k)
q, attn = mha(q=q, k=k, v=v, mask=src_mask) 
print('Shapes of q:{}, and attn:{}'.format(q.shape, attn.shape))

In [None]:
print(attn[0,0,:,:])

In [None]:
print(attn[1,0,:,:])

## Positionwise feedforward

In [25]:
class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid) # position-wise
        self.w_2 = nn.Linear(d_hid, d_in) # position-wise
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        residual = x

        x = self.w_2(F.relu(self.w_1(x)))
        x = self.dropout(x)
        x += residual

        x = self.layer_norm(x)

        return x

## Encoder layer
![alt text](encoder_without_positional.png "Encoder without positional")

In [26]:
class EncoderLayer(nn.Module):
    ''' Compose with two layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, slf_attn_mask=None):
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        enc_output = self.pos_ffn(enc_output)
        return enc_output, enc_slf_attn

### Test the EncoderLayer


* Generate a mask

In [None]:
src_pad_idx = fields['text'].vocab.stoi[Constants.PAD_WORD]
src_seq = small_batch.text.transpose(0,1)
print('Source sequence shape:{}'.format(src_seq.shape))
src_pad_mask = get_pad_mask(src_seq, src_pad_idx)
src_subseq_mask = get_subsequent_mask(src_seq)
src_mask = src_pad_mask & src_subseq_mask
print(src_mask)
print(src_mask.shape)

In [None]:
d_model = 512
d_inner = 1024
n_head = 8
d_k = int(d_model / n_head)
d_v = d_k
enc_layer = EncoderLayer(d_model=d_model, d_inner=d_inner, n_head=n_head, d_k=d_k, d_v=d_v)
q = torch.Tensor(np.random.rand(batch, sent_len, n_head*d_k)).to(device=device) ##fake sentence
print('Shapes of Query:{}, Key:{}, and Value:{}'.format(q.shape, k.shape, v.shape))
enc_output, enc_self_attn = enc_layer(enc_input=q, slf_attn_mask=src_mask)
print('enc_output.shape:{}, enc_self_attn.shape:{}'.format(enc_output.shape, enc_self_attn.shape))

In [None]:
print(enc_self_attn[0,0,:,:])

In [None]:
print(enc_self_attn[1,0,:,:])

## Positional encoding
* Returns a tensor of size `n_position x d_model`, here `n_position` is the maximum number of words in your dataset. 

In [27]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_hid, n_position=300):
        super(PositionalEncoding, self).__init__()

        # Not a parameter
        self.register_buffer('pos_table', self._get_sinusoid_encoding_table(n_position, d_hid))

    def _get_sinusoid_encoding_table(self, n_position, d_hid):
        ''' Sinusoid position encoding table '''

        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

        return torch.FloatTensor(sinusoid_table).unsqueeze(0)

    def forward(self, x):
        return x + self.pos_table[:, :x.size(1)].clone().detach()

### Testing positional encoding

In [None]:
pos_enc = PositionalEncoding(100, n_position=20)
x = np.zeros(shape=(20, 100))
x = torch.Tensor(x)  ## <--- send in an empty tensor so that we can visualize what's inside
x_pos_enc = pos_enc(x)

In [None]:
x_pos_enc = x_pos_enc.detach().cpu().numpy()[0]

In [None]:
x_pos_enc.shape

### See what's in even positions

In [None]:
for i in range(50):
    plt.plot(range(20), x_pos_enc[:,2*i])

### See what's in odd positions 

In [None]:
for i in range(50):
    plt.plot(range(20), x_pos_enc[:,2*i+1])

## Whole Encoder
![alt text](whole_encoder.png "Whole encoder")

In [28]:
class Encoder(nn.Module):
    ''' A encoder model with self attention mechanism. '''

    def __init__(
            self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
            d_model, d_inner, pad_idx, dropout=0.1, n_position=300, scale_emb=False):

        super().__init__()

        self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=pad_idx)
        self.position_enc = PositionalEncoding(d_word_vec, n_position=n_position)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.scale_emb = scale_emb
        self.d_model = d_model

    def forward(self, src_seq, src_mask, return_attns=False):

        enc_slf_attn_list = []

        # -- Forward
        enc_output = self.src_word_emb(src_seq)
        if self.scale_emb:
            enc_output *= self.d_model ** 0.5
        enc_output = self.dropout(self.position_enc(enc_output))
        enc_output = self.layer_norm(enc_output)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(enc_output, slf_attn_mask=src_mask)
            enc_slf_attn_list += [enc_slf_attn] if return_attns else []

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output,

### Testing the whole encoder

````
d_k = int(d_model / n_head) #---> int() is very important or else you'll see
TypeError: new() received an invalid combination of arguments - got (float, int), but expected one of:
 * (*, torch.device device)
      didn't match because some of the arguments have invalid types: (float, int)
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, *, torch.device device)
 * (object data, *, torch.device device)
 ````
 * If you do not use `int()` then `d_k = 64.0`, and `d_k*n_head = 512.0`, and the line
 `self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)` will throw this error.

In [29]:
src_vocab_size = len(fields['text'].vocab)
d_model = 512
n_layers = 6
n_head = 8
d_k = int(d_model / n_head)
d_inner = 1024
enc = Encoder(n_src_vocab=src_vocab_size, d_word_vec=d_model, n_layers=n_layers, n_head=n_head, 
              d_k=d_k, d_v=d_k, d_model=d_model, d_inner=d_inner, pad_idx=src_pad_idx)

In [30]:
src_seq = small_batch.text.transpose(0,1)
print('Source sequence shape:{}'.format(src_seq.shape))
src_pad_mask = get_pad_mask(src_seq, src_pad_idx)
src_subseq_mask = get_subsequent_mask(src_seq)
src_mask = src_pad_mask & src_subseq_mask

Source sequence shape:torch.Size([2, 49])


In [31]:
src_pad_idx

1

In [32]:
src_seq[0,:]

tensor([    2,   461,  6476,    40,   780,     6,    24,   266,   305,  6943,
            5,  9759,  3787,    15,     4,   506,     9,  4144,  1526,     5,
            8,   343,   429,     9, 11562,  1396,     5,    35,     9,    61,
         1382,   131,     5,   155,    58,    29,     4,   636,    97,   854,
           22,   575,    63,     3,     1,     1,     1,     1,     1],
       device='cuda:0')

In [33]:
src_seq[1,:]

tensor([   2,   14,   11,    8,  698,    6,  135,    6, 1481,   17,   51,    4,
         561, 3129,    5,   13,   20,   35,    9,  166,   19, 1233,   22,  180,
        3876, 6334,   19,   89,   16, 1599,   62,   53, 1647,    5,   52,   25,
          41,    4,   93,   15,   14,   35,    6,  199,   13,    8,  387,    5,
           3], device='cuda:0')

In [34]:
print(src_mask[0,:,:])

tensor([[ True, False, False,  ..., False, False, False],
        [ True,  True, False,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False]], device='cuda:0')


In [35]:
print(src_mask[1,:,:])

tensor([[ True, False, False,  ..., False, False, False],
        [ True,  True, False,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ...,  True, False, False],
        [ True,  True,  True,  ...,  True,  True, False],
        [ True,  True,  True,  ...,  True,  True,  True]], device='cuda:0')


* One has to explicitly place the model on the GPU or else, it will reside on the CPU
* `RuntimeError: Input, output and indices must be on the current device`

In [37]:
enc = enc.to(device)

In [38]:
enc_op, enc_self_attn_list = enc(src_seq=src_seq, src_mask=src_mask, return_attns=True)

In [39]:
print('enc_op.shape:{}'.format(enc_op.shape))
print('enc_self_attn_list shapes:{}'.format([item.shape for item in enc_self_attn_list]))

enc_op.shape:torch.Size([2, 49, 512])
enc_self_attn_list shapes:[torch.Size([2, 8, 49, 49]), torch.Size([2, 8, 49, 49]), torch.Size([2, 8, 49, 49]), torch.Size([2, 8, 49, 49]), torch.Size([2, 8, 49, 49]), torch.Size([2, 8, 49, 49])]


In [40]:
print(enc_self_attn_list[0][0,0,:,:])

tensor([[1.1111, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.6745, 0.4366, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3794, 0.4647, 0.2670,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0346, 0.0122, 0.0348,  ..., 0.0000, 0.0000, 0.0000],
        [0.0323, 0.0115, 0.0353,  ..., 0.0000, 0.0000, 0.0000],
        [0.0198, 0.0138, 0.0248,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<SliceBackward>)


In [41]:
print(enc_self_attn_list[0][1,0,:,:])

tensor([[1.1111, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5094, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.3243, 0.3930, 0.3938,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0202, 0.0173, 0.0317,  ..., 0.0000, 0.0000, 0.0000],
        [0.0205, 0.0239, 0.0242,  ..., 0.0276, 0.0320, 0.0000],
        [0.0262, 0.0180, 0.0191,  ..., 0.0230, 0.0230, 0.0161]],
       device='cuda:0', grad_fn=<SliceBackward>)


In [42]:
print(enc_self_attn_list[-1][0,0,:,:])

tensor([[1.1111, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.4034, 0.7077, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2433, 0.3795, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0296, 0.0377,  ..., 0.0000, 0.0000, 0.0000],
        [0.0210, 0.0000, 0.0321,  ..., 0.0000, 0.0000, 0.0000],
        [0.0187, 0.0236, 0.0324,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<SliceBackward>)


In [43]:
print(enc_self_attn_list[-1][1,0,:,:])

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.8213, 0.2898, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.4633, 0.2447, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0134, 0.0312, 0.0221,  ..., 0.0298, 0.0000, 0.0000],
        [0.0141, 0.0180, 0.0308,  ..., 0.0000, 0.0335, 0.0000],
        [0.0294, 0.0168, 0.0000,  ..., 0.0255, 0.0186, 0.0247]],
       device='cuda:0', grad_fn=<SliceBackward>)


### Classifier with learnable pooling 

* the below code performs $R^{2 \times 65 \times 512} \rightarrow R^{2 \times 1 \times 512}$ assuming  $d\_model \in R^{512}$, $enc\_output \in R^{2 \times 65 \times 512}$, and the batch size is 2. The same is implemented in the below class `ClassificationHeadWithLearnablePooling` lines `10-14`.

`temp_layer = nn.Linear(d_model, 1)`

`temp_layer_op = temp_layer(enc_output)`

`print(temp_layer_op.shape)`

torch.Size([2, 65, 1])

`temp_layer_op = temp_layer_op.transpose(-1, 1)`

`temp_layer_op = F.softmax(temp_layer_op, dim=-1)` #softmax(g(XL)T) in R^{bx1xn}

`print(temp_layer_op.shape)`

torch.Size([2, 1, 65])

`temp_z = torch.matmul(temp_layer_op, enc_output)` #[2,1,65]x[2,65,512], softmax(g(XL)T) x XL in R^{bx1xd} 

`print(temp_z.shape)`

torch.Size([2, 1, 512])



In [44]:
class ClassificationHeadWithLearnablePooling(nn.Module):
    def __init__(self, d_model: int = 512, n_classes: int = 2):
        super().__init__()
        self.reduction_layer = nn.Linear(d_model, 1)
        self.layer_norm = nn.LayerNorm(d_model) 
        self.linear_layer = nn.Linear(d_model, n_classes)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        ### Learnable pooling
        reduction_layer_op = self.reduction_layer(x)
        reduction_layer_op = reduction_layer_op.transpose(-1, 1)
        reduction_layer_op = F.softmax(reduction_layer_op, dim=-1)
        enc_output = torch.matmul(reduction_layer_op, x)
        enc_output = enc_output.squeeze(1) #converts torch.Size([2, 1, 512]) -> torch.Size([2, 512])
        ## end of learnable pooling
        
        ## layer norm and a fully connected layer
        layer_normed_reduced = self.layer_norm(enc_output)
        output = self.linear_layer(layer_normed_reduced)
        #output = self.linear_layer(enc_output)
        return output

## Sentiment Analysis Transformer

In [71]:
class SentimentTransformer(nn.Module):
    def __init__(
            self, n_src_vocab, d_word_vec, n_layers, n_head, d_k, d_v,
            d_model, d_inner, pad_idx, n_classes, dropout=0.1, n_position=300, scale_emb=False,
    return_attns=True):
        
        super().__init__()
        self.src_pad_idx = src_pad_idx
        
        self.d_model = d_model
        self.return_attns = return_attns

        self.encoder = Encoder(
            n_src_vocab=n_src_vocab, n_position=n_position,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            pad_idx=src_pad_idx, dropout=dropout, scale_emb=scale_emb)
        
        self.classifier_head = ClassificationHeadWithLearnablePooling(d_model=d_model, 
                                                                      n_classes=n_classes)
    
    def forward(self, src_seq: torch.Tensor) -> torch.Tensor:
        src_mask = get_pad_mask(src_seq, self.src_pad_idx) & get_subsequent_mask(src_seq)
        encoder_op, enc_self_attn = self.encoder(src_seq, src_mask,return_attns=self.return_attns)
        classifier_op = self.classifier_head(encoder_op)
        return classifier_op

### Testing the complete sentiment analysis transformer 

In [72]:
src_vocab_size = len(fields['text'].vocab)
d_model = 512
n_layers = 6
n_head = 8
d_k = int(d_model / n_head)
d_inner = 1024
n_classes = 2
sentiment_trf = SentimentTransformer(n_src_vocab=src_vocab_size, d_word_vec=d_model, n_layers=n_layers,
                                    n_head=n_head, d_k=d_k, d_v=d_k, d_model=d_model, d_inner=d_inner,
                                    pad_idx=src_pad_idx, n_classes=n_classes)

In [73]:
sentiment_trf = sentiment_trf.to(device)

In [74]:
src_seq = small_batch.text.transpose(0,1)
print('Source sequence shape:{}'.format(src_seq.shape))

Source sequence shape:torch.Size([2, 49])


In [75]:
classifier_op = sentiment_trf(src_seq=src_seq)

In [77]:
print('classifier_op.shape:{}'.format(classifier_op.shape))
print('classifier_op:{}'.format(classifier_op))

classifier_op.shape:torch.Size([2, 2])
classifier_op:tensor([[-0.9916,  0.6943],
        [-1.0393,  0.9186]], device='cuda:0', grad_fn=<AddmmBackward>)
