# Chapter 17: Sequence-to-Sequence Architectures: Encoder-Decoders and Decoders
Masked attention

Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

## Modules

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [2]:
torch.manual_seed(1234)

<torch._C.Generator at 0x10aa61070>

In [3]:
torch.set_printoptions(precision=2)

## Dataset

In [4]:
# Two example sentences that both contain the word "ship"
sentence_odyssey = 'I must go back to my ship and to my crew'
sentence_amazon = 'We process and ship your order'
# in the most cost-efficient way possible
# (presumably the rest of the second sentence, left out of the example)

In [5]:
# Lowercase each sentence and split it on whitespace to obtain its word list
words_a = sentence_amazon.lower().split()
words_o = sentence_odyssey.lower().split()
words_o

['i', 'must', 'go', 'back', 'to', 'my', 'ship', 'and', 'to', 'my', 'crew']

## GloVe Embeddings

In [6]:
def read_embeddings(file):
    """
    Return the embeddings in the form of a dictionary.

    Each non-blank line of the file must hold a word followed by the
    whitespace-separated components of its vector. Blank lines are skipped.

    :param file: path of the embedding text file, read as UTF-8
    :return: dictionary mapping each word to a 1-D float tensor
    :raises ValueError: if a vector component cannot be converted to float
    """
    embeddings = {}
    with open(file, encoding='utf8') as glove:
        for line in glove:
            values = line.split()
            # A blank line (for instance a trailing empty line at the end of
            # the file) has no word: skip it instead of failing on values[0]
            if not values:
                continue
            word, *components = values
            vector = [float(value) for value in components]
            embeddings[word] = torch.FloatTensor(vector)
    return embeddings

In [7]:
PATH = '../../corpus/'

In [8]:
# Load the embedding file from the corpus folder into a word -> vector dictionary
embedding_file = PATH + 'glove.6B.50d.txt'
embeddings_dict = read_embeddings(embedding_file)

In [9]:
embeddings_dict['ship']

tensor([ 1.52,  0.11,  0.38, -0.51,  0.03, -0.13, -1.25,  0.80,  0.85, -1.10,
         0.89,  1.37,  0.43,  0.66, -0.26, -0.42, -0.49,  0.91, -1.72, -0.44,
         0.78,  0.20, -0.41, -0.54,  0.82, -1.74,  0.14,  0.28,  1.17,  0.17,
         2.23, -0.58, -0.46,  0.63,  0.54,  0.28,  0.44, -0.55, -0.36, -0.02,
         0.41, -0.87,  1.55, -0.81, -0.10, -0.28, -0.33, -0.51,  0.48, -0.66])

We build the embedding matrix of a sentence by stacking the embeddings of its words:

In [10]:
def embedding_matrix(words, embeddings_dict):
    """
    Stack the embeddings of a sequence of words into one matrix.

    :param words: sequence of words; each one is looked up in embeddings_dict
    :param embeddings_dict: mapping from a word to its embedding tensor
    :return: tensor whose i-th row is the embedding of words[i]
    """
    rows = []
    for token in words:
        rows.append(embeddings_dict[token])
    return torch.stack(rows)

In [11]:
# One embedding matrix per sentence: one row per word
X_a = embedding_matrix(words_a, embeddings_dict)
X_o = embedding_matrix(words_o, embeddings_dict)

In [12]:
X_o.size()

torch.Size([11, 50])

In [13]:
X_a.size()

torch.Size([6, 50])

In [14]:
X_o[0][:10]

tensor([ 1.19e-01,  1.53e-01, -8.21e-02, -7.41e-01,  7.59e-01, -4.83e-01,
        -3.10e-01,  5.15e-01, -9.87e-01,  6.18e-04])

## Self-attention

In [15]:
def attention(Q, K, V):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Only the last two dimensions of K are transposed, so the function
    accepts 2-D matrices as well as tensors with leading batch dimensions.

    :param Q: queries, shape (..., n_q, d_k)
    :param K: keys, shape (..., n_k, d_k)
    :param V: values, shape (..., n_k, d_v)
    :return: tuple (attn_output, attn_weights) of shapes (..., n_q, d_v)
        and (..., n_q, n_k); each row of attn_weights sums to 1
    """
    d_k = K.size(dim=-1)
    # transpose(-2, -1) rather than .T: .T reverses all the dimensions,
    # which is only correct for 2-D tensors
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d_k)
    attn_weights = F.softmax(scores, dim=-1)
    attn_output = attn_weights @ V
    return attn_output, attn_weights

The word _ship_ in Homer's context: _I must go back to my ship and to my crew_

In [16]:
attention_output_o, attn_weights_o = attention(X_o, X_o, X_o)

In [17]:
attn_weights_o

tensor([[0.36, 0.05, 0.07, 0.05, 0.04, 0.19, 0.01, 0.02, 0.04, 0.19, 0.01],
        [0.14, 0.20, 0.10, 0.06, 0.11, 0.10, 0.03, 0.05, 0.11, 0.10, 0.02],
        [0.18, 0.09, 0.14, 0.09, 0.08, 0.13, 0.02, 0.04, 0.08, 0.13, 0.02],
        [0.14, 0.05, 0.09, 0.19, 0.08, 0.12, 0.03, 0.06, 0.08, 0.12, 0.03],
        [0.11, 0.11, 0.09, 0.09, 0.15, 0.08, 0.04, 0.07, 0.15, 0.08, 0.03],
        [0.19, 0.03, 0.05, 0.04, 0.03, 0.29, 0.01, 0.02, 0.03, 0.29, 0.01],
        [0.03, 0.03, 0.03, 0.04, 0.05, 0.03, 0.55, 0.03, 0.05, 0.03, 0.13],
        [0.10, 0.08, 0.07, 0.10, 0.12, 0.09, 0.04, 0.15, 0.12, 0.09, 0.04],
        [0.11, 0.11, 0.09, 0.09, 0.15, 0.08, 0.04, 0.07, 0.15, 0.08, 0.03],
        [0.19, 0.03, 0.05, 0.04, 0.03, 0.29, 0.01, 0.02, 0.03, 0.29, 0.01],
        [0.06, 0.05, 0.05, 0.06, 0.05, 0.06, 0.21, 0.04, 0.05, 0.06, 0.31]])

The word _ship_ in another context: _We process and ship your order_

In [18]:
attention_output_a, attn_weights_a = attention(X_a, X_a, X_a)

Attention weights for this sentence; the row for _ship_ is the fourth one:

In [19]:
attn_weights_a

tensor([[0.61, 0.06, 0.06, 0.02, 0.20, 0.05],
        [0.17, 0.50, 0.08, 0.03, 0.11, 0.11],
        [0.22, 0.12, 0.30, 0.08, 0.15, 0.13],
        [0.04, 0.03, 0.04, 0.78, 0.05, 0.06],
        [0.14, 0.03, 0.03, 0.02, 0.74, 0.04],
        [0.16, 0.13, 0.10, 0.09, 0.18, 0.34]])

### Masked Attention

In [20]:
def attn_mask(size):
    """
    Build a square additive attention mask.

    :param size: number of rows and columns of the mask
    :return: (size, size) tensor with 0 on and below the main diagonal
        and -inf strictly above it
    """
    neg_inf = float('-inf')
    # Start from a matrix full of -inf, then keep only the part strictly
    # above the diagonal; triu sets the remaining entries to 0
    return torch.full((size, size), neg_inf).triu(diagonal=1)


attn_mask(6)

tensor([[0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0.]])

In [21]:
def attention_masked(Q, K, V, U):
    """
    Scaled dot-product attention with an additive mask:
    softmax(Q K^T / sqrt(d_k) + U) V.

    Only the last two dimensions of K are transposed, so the function
    accepts 2-D matrices as well as tensors with leading batch dimensions.

    :param Q: queries, shape (..., n_q, d_k)
    :param K: keys, shape (..., n_k, d_k)
    :param V: values, shape (..., n_k, d_v)
    :param U: mask added to the scores before the softmax; it must be
        broadcastable to (..., n_q, n_k). A -inf entry gives a weight of 0
    :return: tuple (attn_output, attn_weights) of shapes (..., n_q, d_v)
        and (..., n_q, n_k)
    """
    d_k = K.size(dim=-1)
    # transpose(-2, -1) rather than .T: .T reverses all the dimensions,
    # which is only correct for 2-D tensors
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d_k)
    attn_weights = F.softmax(scores + U, dim=-1)
    attn_output = attn_weights @ V
    return attn_output, attn_weights

In [22]:
# Mask with as many rows and columns as there are rows in X_o
U = attn_mask(X_o.size(dim=0))
U

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [23]:
attn_output_masked, attn_weights_masked = attention_masked(X_o, X_o, X_o, U)

In [24]:
attn_weights_masked

tensor([[1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.42, 0.58, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.44, 0.22, 0.35, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.29, 0.11, 0.19, 0.40, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.20, 0.20, 0.16, 0.17, 0.27, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.30, 0.05, 0.08, 0.07, 0.04, 0.45, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.04, 0.04, 0.04, 0.05, 0.06, 0.04, 0.73, 0.00, 0.00, 0.00, 0.00],
        [0.14, 0.10, 0.09, 0.13, 0.16, 0.12, 0.05, 0.21, 0.00, 0.00, 0.00],
        [0.12, 0.12, 0.10, 0.11, 0.16, 0.10, 0.04, 0.08, 0.16, 0.00, 0.00],
        [0.20, 0.03, 0.05, 0.05, 0.03, 0.29, 0.01, 0.02, 0.03, 0.29, 0.00],
        [0.06, 0.05, 0.05, 0.06, 0.05, 0.06, 0.21, 0.04, 0.05, 0.06, 0.31]])

## PyTorch Function

In [25]:
# PyTorch's own mask generator, here sized to the number of rows in X_a;
# it yields the same 0 / -inf pattern as attn_mask
U = nn.Transformer.generate_square_subsequent_mask(X_a.size(dim=0))
U

tensor([[0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0.]])

In [26]:
attn_output_masked, attn_weights_masked = attention_masked(X_a, X_a, X_a, U)

In [27]:
attn_weights_masked

tensor([[1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.25, 0.75, 0.00, 0.00, 0.00, 0.00],
        [0.35, 0.18, 0.47, 0.00, 0.00, 0.00],
        [0.05, 0.03, 0.05, 0.88, 0.00, 0.00],
        [0.15, 0.03, 0.03, 0.02, 0.77, 0.00],
        [0.16, 0.13, 0.10, 0.09, 0.18, 0.34]])