In [1]:
!pip install BPEmb

import math
import numpy as np
import tensorflow as tf

from bpemb import BPEmb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting BPEmb
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting sentencepiece (from BPEmb)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, BPEmb
Successfully installed BPEmb-0.3.4 sentencepiece-0.1.99


# Transformers From Scratch

In [2]:
def scaled_dot_product_attention(query, key, value, mask=None):
  key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
  scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)

  if mask is not None:
    scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)

  softmax = tf.keras.layers.Softmax()
  weights = softmax(scaled_scores) 
  return tf.matmul(weights, value), weights

In [3]:
seq_len = 3
embed_dim = 4

queries = np.random.rand(seq_len, embed_dim)
keys = np.random.rand(seq_len, embed_dim)
values = np.random.rand(seq_len, embed_dim)

print("Queries:\n", queries)

Queries:
 [[0.75214069 0.00748362 0.47214515 0.7548674 ]
 [0.41820822 0.14339506 0.27825209 0.67074722]
 [0.71687512 0.92480808 0.45985028 0.07183883]]


In [4]:
output, attn_weights = scaled_dot_product_attention(queries, keys, values)

print("Output\n", output, "\n")
print("Weights\n", attn_weights)

Output
 tf.Tensor(
[[0.5559631  0.34568194 0.62707156 0.2565914 ]
 [0.5634332  0.35336107 0.6214385  0.25486088]
 [0.5232962  0.32520714 0.6340663  0.25825214]], shape=(3, 4), dtype=float32) 

Weights
 tf.Tensor(
[[0.37073195 0.3104373  0.3188307 ]
 [0.3726771  0.31980565 0.30751723]
 [0.33857176 0.29277173 0.3686565 ]], shape=(3, 3), dtype=float32)


In [5]:
batch_size = 1
seq_len = 3
embed_dim = 12
num_heads = 3
head_dim = embed_dim // num_heads

print(f"Dimension of each head: {head_dim}")

Dimension of each head: 4


In [6]:
x = np.random.rand(batch_size, seq_len, embed_dim).round(1)
print("Input shape: ", x.shape, "\n")
print("Input:\n", x)

Input shape:  (1, 3, 12) 

Input:
 [[[0.3 0.8 0.5 0.8 0.3 0.2 0.1 0.9 0.7 0.6 0.2 0.1]
  [0.5 0.9 0.6 0.  0.3 0.7 0.1 0.6 0.9 0.6 0.6 0.5]
  [0.5 0.5 0.7 0.8 0.5 0.4 0.1 0.2 0.9 0.  0.7 0.2]]]


In [7]:
# The query weights for each head.
wq0 = np.random.rand(embed_dim, head_dim).round(1)
wq1 = np.random.rand(embed_dim, head_dim).round(1)
wq2 = np.random.rand(embed_dim, head_dim).round(1)

# The key weights for each head. 
wk0 = np.random.rand(embed_dim, head_dim).round(1)
wk1 = np.random.rand(embed_dim, head_dim).round(1)
wk2 = np.random.rand(embed_dim, head_dim).round(1)

# The value weights for each head.
wv0 = np.random.rand(embed_dim, head_dim).round(1)
wv1 = np.random.rand(embed_dim, head_dim).round(1)
wv2 = np.random.rand(embed_dim, head_dim).round(1)

In [8]:
print("The three sets of query weights (one for each head):")
print("wq0:\n", wq0)
print("wq1:\n", wq1)
print("wq2:\n", wq1)

The three sets of query weights (one for each head):
wq0:
 [[0.1 0.4 0.5 0.1]
 [0.1 1.  0.7 0.4]
 [0.7 0.1 1.  0.5]
 [0.2 0.4 0.3 0.1]
 [0.8 0.3 0.7 0.2]
 [0.4 0.8 0.4 0.8]
 [0.3 0.1 0.3 0.6]
 [0.7 0.1 0.1 0.4]
 [0.8 0.7 0.  0.3]
 [0.9 0.3 0.2 0.2]
 [0.6 0.1 0.2 0.7]
 [0.2 0.5 1.  0.1]]
wq1:
 [[0.3 0.  0.5 0.7]
 [0.3 0.7 0.4 0.7]
 [0.6 0.5 0.9 0.3]
 [1.  0.9 0.8 0.9]
 [0.4 0.5 0.2 0.6]
 [0.3 0.4 1.  0.2]
 [0.6 0.8 0.4 0. ]
 [0.4 0.2 0.3 0.6]
 [0.5 0.3 0.  0.4]
 [0.9 0.9 0.4 0.2]
 [1.  0.1 0.5 0.8]
 [0.5 0.1 0.7 0.7]]
wq2:
 [[0.3 0.  0.5 0.7]
 [0.3 0.7 0.4 0.7]
 [0.6 0.5 0.9 0.3]
 [1.  0.9 0.8 0.9]
 [0.4 0.5 0.2 0.6]
 [0.3 0.4 1.  0.2]
 [0.6 0.8 0.4 0. ]
 [0.4 0.2 0.3 0.6]
 [0.5 0.3 0.  0.4]
 [0.9 0.9 0.4 0.2]
 [1.  0.1 0.5 0.8]
 [0.5 0.1 0.7 0.7]]


In [9]:
# Geneated queries, keys, and values for the first head.
q0 = np.dot(x, wq0)
k0 = np.dot(x, wk0)
v0 = np.dot(x, wv0)

# Geneated queries, keys, and values for the second head.
q1 = np.dot(x, wq1)
k1 = np.dot(x, wk1)
v1 = np.dot(x, wv1)

# Geneated queries, keys, and values for the third head.
q2 = np.dot(x, wq2)
k2 = np.dot(x, wk2)
v2 = np.dot(x, wv2)

In [10]:
print("Q, K, and V for first head:\n")

print(f"q0 {q0.shape}:\n", q0, "\n")
print(f"k0 {k0.shape}:\n", k0, "\n")
print(f"v0 {v0.shape}:\n", v0)

Q, K, and V for first head:

q0 (1, 3, 4):
 [[[2.84 2.38 2.12 1.8 ]
  [3.25 3.   2.8  2.49]
  [2.66 2.39 2.44 2.02]]] 

k0 (1, 3, 4):
 [[[2.58 2.19 2.8  2.32]
  [3.16 2.92 3.19 3.25]
  [2.7  2.66 2.89 2.7 ]]] 

v0 (1, 3, 4):
 [[[2.56 2.92 2.18 2.84]
  [3.04 3.99 2.08 3.  ]
  [2.94 3.13 2.25 3.26]]]


In [11]:
out0, attn_weights0 = scaled_dot_product_attention(q0, k0, v0)

print("Output from first attention head: ", out0, "\n")
print("Attention weights from first head: ", attn_weights0)

Output from first attention head:  tf.Tensor(
[[[3.0054321 3.8246443 2.1078758 3.0291328]
  [3.0205224 3.8866928 2.0980802 3.0210824]
  [3.0081773 3.835439  2.1062093 3.027895 ]]], shape=(1, 3, 4), dtype=float32) 

Attention weights from first head:  tf.Tensor(
[[[0.04314128 0.8182604  0.13859834]
  [0.02099406 0.8850018  0.09400418]
  [0.03895167 0.8297894  0.13125895]]], shape=(1, 3, 3), dtype=float32)


In [12]:
out1, _ = scaled_dot_product_attention(q1, k1, v1)
out2, _ = scaled_dot_product_attention(q2, k2, v2)

print("Output from second attention head: ", out1, "\n")
print("Output from third attention head: ", out2,)

Output from second attention head:  tf.Tensor(
[[[1.8091758 2.7371604 3.9851067 2.9945998]
  [1.80683   2.7380533 3.9870818 2.9951804]
  [1.8097466 2.7364388 3.9845045 2.9953678]]], shape=(1, 3, 4), dtype=float32) 

Output from third attention head:  tf.Tensor(
[[[3.7462146 3.5141351 3.336993  3.6518066]
  [3.7518353 3.5321844 3.3548553 3.6597142]
  [3.7440763 3.5070477 3.329955  3.6485972]]], shape=(1, 3, 4), dtype=float32)


In [13]:
combined_out_a = np.concatenate((out0, out1, out2), axis=-1)
print(f"Combined output from all heads {combined_out_a.shape}:")
print(combined_out_a)

# The final step would be to run combined_out_a through a linear/dense layer 
# for further processing.

Combined output from all heads (1, 3, 12):
[[[3.0054321 3.8246443 2.1078758 3.0291328 1.8091758 2.7371604 3.9851067
   2.9945998 3.7462146 3.5141351 3.336993  3.6518066]
  [3.0205224 3.8866928 2.0980802 3.0210824 1.80683   2.7380533 3.9870818
   2.9951804 3.7518353 3.5321844 3.3548553 3.6597142]
  [3.0081773 3.835439  2.1062093 3.027895  1.8097466 2.7364388 3.9845045
   2.9953678 3.7440763 3.5070477 3.329955  3.6485972]]]


In [14]:
print("Query weights for first head: \n", wq0, "\n")
print("Query weights for second head: \n", wq1, "\n")
print("Query weights for third head: \n", wq2)

Query weights for first head: 
 [[0.1 0.4 0.5 0.1]
 [0.1 1.  0.7 0.4]
 [0.7 0.1 1.  0.5]
 [0.2 0.4 0.3 0.1]
 [0.8 0.3 0.7 0.2]
 [0.4 0.8 0.4 0.8]
 [0.3 0.1 0.3 0.6]
 [0.7 0.1 0.1 0.4]
 [0.8 0.7 0.  0.3]
 [0.9 0.3 0.2 0.2]
 [0.6 0.1 0.2 0.7]
 [0.2 0.5 1.  0.1]] 

Query weights for second head: 
 [[0.3 0.  0.5 0.7]
 [0.3 0.7 0.4 0.7]
 [0.6 0.5 0.9 0.3]
 [1.  0.9 0.8 0.9]
 [0.4 0.5 0.2 0.6]
 [0.3 0.4 1.  0.2]
 [0.6 0.8 0.4 0. ]
 [0.4 0.2 0.3 0.6]
 [0.5 0.3 0.  0.4]
 [0.9 0.9 0.4 0.2]
 [1.  0.1 0.5 0.8]
 [0.5 0.1 0.7 0.7]] 

Query weights for third head: 
 [[0.2 0.1 0.9 0.2]
 [0.  0.1 0.9 0.8]
 [0.7 0.5 0.8 1. ]
 [0.7 0.  0.9 0.4]
 [0.1 0.7 1.  0.2]
 [1.  0.3 0.2 0.5]
 [0.1 0.5 0.6 0.3]
 [0.5 0.9 0.7 0.4]
 [0.4 0.8 0.  0.7]
 [0.6 0.2 0.8 0.6]
 [0.6 0.3 0.6 0.4]
 [0.6 0.1 1.  0.3]]


In [15]:
wq = np.concatenate((wq0, wq1, wq2), axis=1)
print(f"Single query weight matrix {wq.shape}: \n", wq)

Single query weight matrix (12, 12): 
 [[0.1 0.4 0.5 0.1 0.3 0.  0.5 0.7 0.2 0.1 0.9 0.2]
 [0.1 1.  0.7 0.4 0.3 0.7 0.4 0.7 0.  0.1 0.9 0.8]
 [0.7 0.1 1.  0.5 0.6 0.5 0.9 0.3 0.7 0.5 0.8 1. ]
 [0.2 0.4 0.3 0.1 1.  0.9 0.8 0.9 0.7 0.  0.9 0.4]
 [0.8 0.3 0.7 0.2 0.4 0.5 0.2 0.6 0.1 0.7 1.  0.2]
 [0.4 0.8 0.4 0.8 0.3 0.4 1.  0.2 1.  0.3 0.2 0.5]
 [0.3 0.1 0.3 0.6 0.6 0.8 0.4 0.  0.1 0.5 0.6 0.3]
 [0.7 0.1 0.1 0.4 0.4 0.2 0.3 0.6 0.5 0.9 0.7 0.4]
 [0.8 0.7 0.  0.3 0.5 0.3 0.  0.4 0.4 0.8 0.  0.7]
 [0.9 0.3 0.2 0.2 0.9 0.9 0.4 0.2 0.6 0.2 0.8 0.6]
 [0.6 0.1 0.2 0.7 1.  0.1 0.5 0.8 0.6 0.3 0.6 0.4]
 [0.2 0.5 1.  0.1 0.5 0.1 0.7 0.7 0.6 0.1 1.  0.3]]


In [16]:
wk = np.concatenate((wk0, wk1, wk2), axis=1)
wv = np.concatenate((wv0, wv1, wv2), axis=1)

print(f"Single key weight matrix {wk.shape}:\n", wk, "\n")
print(f"Single value weight matrix {wv.shape}:\n", wv)

Single key weight matrix (12, 12):
 [[0.2 0.6 0.2 0.4 0.4 0.1 0.6 0.6 0.2 0.5 0.1 0.5]
 [0.4 0.1 0.2 0.8 0.5 0.5 0.9 0.  0.4 0.9 0.3 0. ]
 [0.8 0.4 0.3 0.6 0.6 0.3 0.2 0.2 0.8 0.2 0.  0.1]
 [0.2 0.2 0.6 0.1 0.4 0.3 0.5 0.7 0.2 0.7 0.1 0. ]
 [0.7 1.  0.8 0.2 0.9 0.7 0.8 0.9 0.5 0.2 0.6 0.3]
 [0.7 0.5 0.3 0.6 0.7 0.9 0.7 0.4 0.1 0.  0.3 0.8]
 [0.6 0.9 0.4 0.9 0.5 0.5 0.4 0.7 0.  0.8 0.5 0.1]
 [0.5 0.1 0.7 0.1 0.8 0.2 0.2 0.8 0.9 0.8 0.9 0.3]
 [0.5 0.7 0.6 0.8 0.6 0.  0.6 0.4 0.4 0.8 0.6 0.4]
 [0.5 0.6 0.5 0.2 0.1 0.4 0.6 0.9 0.9 0.9 0.8 0.8]
 [0.6 0.5 0.8 0.5 0.6 0.  0.6 0.8 0.3 0.4 0.1 0.1]
 [0.1 0.4 1.  0.4 0.6 0.5 0.1 0.8 0.3 0.6 0.6 0.2]] 

Single value weight matrix (12, 12):
 [[0.3 0.4 0.4 0.  0.6 0.  0.1 0.5 0.9 0.5 0.4 0.1]
 [0.2 0.7 0.2 0.4 0.3 0.2 0.5 0.7 0.8 1.  0.6 0.8]
 [0.5 0.6 0.6 1.  0.2 0.4 0.5 0.5 0.7 0.3 0.6 1. ]
 [0.3 0.  0.6 0.8 0.8 0.7 1.  0.8 1.  0.3 0.5 0.5]
 [0.6 0.6 0.1 0.4 0.4 0.6 0.4 0.2 0.9 0.4 0.  0.7]
 [0.2 0.8 0.1 0.3 0.  0.7 0.7 0.2 0.1 0.5 1.  0.5]
 [0.6

In [17]:
q_s = np.dot(x, wq)
k_s = np.dot(x, wk)
v_s = np.dot(x, wv)

In [18]:
print(f"Query vectors using a single weight matrix {q_s.shape}:\n", q_s)

Query vectors using a single weight matrix (1, 3, 12):
 [[[2.84 2.38 2.12 1.8  3.17 2.8  2.54 3.03 2.48 2.24 3.84 3.03]
  [3.25 3.   2.8  2.49 3.25 2.48 3.02 3.15 2.94 2.52 4.   3.48]
  [2.66 2.39 2.44 2.02 3.23 2.31 2.81 3.19 2.61 2.1  3.58 2.9 ]]]


In [19]:
print(q0, "\n")
print(q1, "\n")
print(q2)

[[[2.84 2.38 2.12 1.8 ]
  [3.25 3.   2.8  2.49]
  [2.66 2.39 2.44 2.02]]] 

[[[3.17 2.8  2.54 3.03]
  [3.25 2.48 3.02 3.15]
  [3.23 2.31 2.81 3.19]]] 

[[[2.48 2.24 3.84 3.03]
  [2.94 2.52 4.   3.48]
  [2.61 2.1  3.58 2.9 ]]]


In [20]:
# Note: we can achieve the same thing by passing -1 instead of seq_len.
q_s_reshaped = tf.reshape(q_s, (batch_size, seq_len, num_heads, head_dim))
print(f"Combined queries: {q_s.shape}\n", q_s, "\n")
print(f"Reshaped into separate heads: {q_s_reshaped.shape}\n", q_s_reshaped)

Combined queries: (1, 3, 12)
 [[[2.84 2.38 2.12 1.8  3.17 2.8  2.54 3.03 2.48 2.24 3.84 3.03]
  [3.25 3.   2.8  2.49 3.25 2.48 3.02 3.15 2.94 2.52 4.   3.48]
  [2.66 2.39 2.44 2.02 3.23 2.31 2.81 3.19 2.61 2.1  3.58 2.9 ]]] 

Reshaped into separate heads: (1, 3, 3, 4)
 tf.Tensor(
[[[[2.84 2.38 2.12 1.8 ]
   [3.17 2.8  2.54 3.03]
   [2.48 2.24 3.84 3.03]]

  [[3.25 3.   2.8  2.49]
   [3.25 2.48 3.02 3.15]
   [2.94 2.52 4.   3.48]]

  [[2.66 2.39 2.44 2.02]
   [3.23 2.31 2.81 3.19]
   [2.61 2.1  3.58 2.9 ]]]], shape=(1, 3, 3, 4), dtype=float64)


In [21]:
q_s_transposed = tf.transpose(q_s_reshaped, perm=[0, 2, 1, 3]).numpy()
print(f"Queries transposed into \"separate\" heads {q_s_transposed.shape}:\n", 
      q_s_transposed)

Queries transposed into "separate" heads (1, 3, 3, 4):
 [[[[2.84 2.38 2.12 1.8 ]
   [3.25 3.   2.8  2.49]
   [2.66 2.39 2.44 2.02]]

  [[3.17 2.8  2.54 3.03]
   [3.25 2.48 3.02 3.15]
   [3.23 2.31 2.81 3.19]]

  [[2.48 2.24 3.84 3.03]
   [2.94 2.52 4.   3.48]
   [2.61 2.1  3.58 2.9 ]]]]


In [22]:
print("The separate per-head query matrices from before: ")
print(q0, "\n")
print(q1, "\n")
print(q2)

The separate per-head query matrices from before: 
[[[2.84 2.38 2.12 1.8 ]
  [3.25 3.   2.8  2.49]
  [2.66 2.39 2.44 2.02]]] 

[[[3.17 2.8  2.54 3.03]
  [3.25 2.48 3.02 3.15]
  [3.23 2.31 2.81 3.19]]] 

[[[2.48 2.24 3.84 3.03]
  [2.94 2.52 4.   3.48]
  [2.61 2.1  3.58 2.9 ]]]


In [23]:
k_s_transposed = tf.transpose(tf.reshape(k_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()
v_s_transposed = tf.transpose(tf.reshape(v_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()

print(f"Keys for all heads in a single matrix {k_s.shape}: \n", k_s_transposed, "\n")
print(f"Values for all heads in a single matrix {v_s.shape}: \n", v_s_transposed)

Keys for all heads in a single matrix (1, 3, 12): 
 [[[[2.58 2.19 2.8  2.32]
   [3.16 2.92 3.19 3.25]
   [2.7  2.66 2.89 2.7 ]]

  [[2.98 1.73 2.91 3.04]
   [3.56 2.18 3.43 3.3 ]
   [3.21 1.65 3.03 2.92]]

  [[2.83 3.63 2.43 1.53]
   [2.93 3.6  2.68 2.15]
   [2.12 2.86 1.66 1.33]]]] 

Values for all heads in a single matrix (1, 3, 12): 
 [[[[2.56 2.92 2.18 2.84]
   [3.04 3.99 2.08 3.  ]
   [2.94 3.13 2.25 3.26]]

  [[2.1  2.75 3.77 2.7 ]
   [1.77 2.76 4.02 2.99]
   [2.   2.48 3.78 3.28]]

  [[3.54 2.96 2.8  3.46]
   [3.79 3.64 3.46 3.7 ]
   [4.02 2.99 2.67 2.76]]]]


In [24]:
all_heads_output, all_attn_weights = scaled_dot_product_attention(q_s_transposed, 
                                                                  k_s_transposed, 
                                                                  v_s_transposed)
print("Self attention output:\n", all_heads_output)

Self attention output:
 tf.Tensor(
[[[[3.0054321 3.8246443 2.1078758 3.0291328]
   [3.0205224 3.8866928 2.0980802 3.0210824]
   [3.0081773 3.835439  2.1062093 3.027895 ]]

  [[1.8091758 2.7371604 3.9851067 2.9945998]
   [1.80683   2.7380533 3.9870818 2.9951804]
   [1.8097466 2.7364388 3.9845045 2.9953678]]

  [[3.7462146 3.5141351 3.336993  3.6518066]
   [3.7518353 3.5321844 3.3548553 3.6597142]
   [3.7440763 3.5070477 3.329955  3.6485972]]]], shape=(1, 3, 3, 4), dtype=float32)


In [25]:
print("Per head outputs from using separate sets of weights per head:")
print(out0, "\n")
print(out1, "\n")
print(out2)

Per head outputs from using separate sets of weights per head:
tf.Tensor(
[[[3.0054321 3.8246443 2.1078758 3.0291328]
  [3.0205224 3.8866928 2.0980802 3.0210824]
  [3.0081773 3.835439  2.1062093 3.027895 ]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[1.8091758 2.7371604 3.9851067 2.9945998]
  [1.80683   2.7380533 3.9870818 2.9951804]
  [1.8097466 2.7364388 3.9845045 2.9953678]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[3.7462146 3.5141351 3.336993  3.6518066]
  [3.7518353 3.5321844 3.3548553 3.6597142]
  [3.7440763 3.5070477 3.329955  3.6485972]]], shape=(1, 3, 4), dtype=float32)


In [26]:
combined_out_b = tf.reshape(tf.transpose(all_heads_output, perm=[0, 2, 1, 3]), 
                            shape=(batch_size, seq_len, embed_dim))
print("Final output from using single query, key, value matrices:\n", 
      combined_out_b, "\n")
print("Final output from using separate query, key, value matrices per head:\n", 
      combined_out_a)

Final output from using single query, key, value matrices:
 tf.Tensor(
[[[3.0054321 3.8246443 2.1078758 3.0291328 1.8091758 2.7371604 3.9851067
   2.9945998 3.7462146 3.5141351 3.336993  3.6518066]
  [3.0205224 3.8866928 2.0980802 3.0210824 1.80683   2.7380533 3.9870818
   2.9951804 3.7518353 3.5321844 3.3548553 3.6597142]
  [3.0081773 3.835439  2.1062093 3.027895  1.8097466 2.7364388 3.9845045
   2.9953678 3.7440763 3.5070477 3.329955  3.6485972]]], shape=(1, 3, 12), dtype=float32) 

Final output from using separate query, key, value matrices per head:
 [[[3.0054321 3.8246443 2.1078758 3.0291328 1.8091758 2.7371604 3.9851067
   2.9945998 3.7462146 3.5141351 3.336993  3.6518066]
  [3.0205224 3.8866928 2.0980802 3.0210824 1.80683   2.7380533 3.9870818
   2.9951804 3.7518353 3.5321844 3.3548553 3.6597142]
  [3.0081773 3.835439  2.1062093 3.027895  1.8097466 2.7364388 3.9845045
   2.9953678 3.7440763 3.5070477 3.329955  3.6485972]]]


In [27]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadSelfAttention, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads

    self.d_head = self.d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(self.d_model)
    self.wk = tf.keras.layers.Dense(self.d_model)
    self.wv = tf.keras.layers.Dense(self.d_model)

    # Linear layer to generate the final output.
    self.dense = tf.keras.layers.Dense(self.d_model)
  
  def split_heads(self, x):
    batch_size = x.shape[0]

    split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_head))
    return tf.transpose(split_inputs, perm=[0, 2, 1, 3])
  
  def merge_heads(self, x):
    batch_size = x.shape[0]

    merged_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
    return tf.reshape(merged_inputs, (batch_size, -1, self.d_model))

  def call(self, q, k, v, mask):
    qs = self.wq(q)
    ks = self.wk(k)
    vs = self.wv(v)

    qs = self.split_heads(qs)
    ks = self.split_heads(ks)
    vs = self.split_heads(vs)

    output, attn_weights = scaled_dot_product_attention(qs, ks, vs, mask)
    output = self.merge_heads(output)

    return self.dense(output), attn_weights


In [28]:
mhsa = MultiHeadSelfAttention(12, 3)

output, attn_weights = mhsa(x, x, x, None)
print(f"MHSA output{output.shape}:")
print(output)

MHSA output(1, 3, 12):
tf.Tensor(
[[[ 0.2450927  -0.7318045  -0.07346617 -0.04063541  0.20350856
    0.65724885 -0.1746743   0.02042755  0.6685578   0.08295882
    0.8883524   0.01280902]
  [ 0.2583107  -0.73946834 -0.07798081 -0.06133163  0.20616147
    0.6644884  -0.1791845   0.02522293  0.6920126   0.09343398
    0.921222   -0.0015862 ]
  [ 0.24218246 -0.73264205 -0.07206094 -0.04281443  0.20945096
    0.6538497  -0.17880386  0.01817051  0.6598376   0.08123702
    0.89126253  0.01403455]]], shape=(1, 3, 12), dtype=float32)


## Encoder Block

In [29]:
def feed_forward_network(d_model, hidden_dim):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(hidden_dim, activation='relu'),
      tf.keras.layers.Dense(d_model)
  ])

This is our encoder block containing all the layers and steps from the preceding illustration (plus dropout).

In [30]:
class EncoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(EncoderBlock, self).__init__()

    self.mhsa = MultiHeadSelfAttention(d_model, num_heads)
    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
  
  def call(self, x, training, mask):
    mhsa_output, attn_weights = self.mhsa(x, x, x, mask)
    mhsa_output = self.dropout1(mhsa_output, training=training)
    mhsa_output = self.layernorm1(x + mhsa_output)

    ffn_output = self.ffn(mhsa_output)
    ffn_output = self.dropout2(ffn_output, training=training)
    output = self.layernorm2(mhsa_output + ffn_output)

    return output, attn_weights


In [31]:
encoder_block = EncoderBlock(12, 3, 48)

block_output,  _ = encoder_block(x, True, None)
print(f"Output from single encoder block {block_output.shape}:")
print(block_output)

Output from single encoder block (1, 3, 12):
tf.Tensor(
[[[-1.2853216  -0.47298345  1.1163142   0.28988305 -0.15201803
    0.714354   -0.44642696  2.18815    -0.98428893  0.35164022
   -1.4659402   0.14663789]
  [-1.4989055  -0.36181933  1.5024801  -0.05960907 -0.21404044
    1.3806804  -0.81460243  1.3942987  -1.0500358   0.3370352
   -1.14041     0.52492774]
  [-1.2724154  -1.0890633   1.3531497   1.2134061   0.09007032
    1.2355703  -0.6275463   1.3311381  -1.1320009   0.10560008
   -0.88254625 -0.32536274]]], shape=(1, 3, 12), dtype=float32)


## Word and Positional Embeddings

In [32]:
# Load the English tokenizer.
bpemb_en = BPEmb(lang="en")

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 493981.83B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:01<00:00, 2681792.28B/s]


In [33]:
bpemb_vocab_size, bpemb_embed_size = bpemb_en.vectors.shape
print("Vocabulary size:", bpemb_vocab_size)
print("Embedding size:", bpemb_embed_size)

Vocabulary size: 10000
Embedding size: 100


In [34]:
# Embedding for the word "car".
bpemb_en.vectors[bpemb_en.words.index('car')]

array([-0.305548, -0.325598, -0.134716, -0.078735, -0.660545,  0.076211,
       -0.735487,  0.124533, -0.294402,  0.459688,  0.030137,  0.174041,
       -0.224223,  0.486189, -0.504649, -0.459699,  0.315747,  0.477885,
        0.091398,  0.427867,  0.016524, -0.076833, -0.899727,  0.493158,
       -0.022309, -0.422785, -0.154148,  0.204981,  0.379834,  0.070588,
        0.196073, -0.368222,  0.473406,  0.007409,  0.004303, -0.007823,
       -0.19103 , -0.202509,  0.109878, -0.224521, -0.35741 , -0.611633,
        0.329958, -0.212956, -0.497499, -0.393839, -0.130101, -0.216903,
       -0.105595, -0.076007, -0.483942, -0.139704, -0.161647,  0.136985,
        0.415363, -0.360143,  0.038601, -0.078804, -0.030421,  0.324129,
        0.223378, -0.523636, -0.048317, -0.032248, -0.117367,  0.470519,
        0.225816, -0.222065, -0.225007, -0.165904, -0.334389, -0.20157 ,
        0.572352, -0.268794,  0.301929, -0.005563,  0.387491,  0.261031,
       -0.11613 ,  0.074982, -0.008433,  0.259987, 

In [35]:
sample_sentence = "Where can I find a pizzeria?"
tokens = bpemb_en.encode(sample_sentence)
print(tokens)

['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?']


In [36]:
token_seq = np.array(bpemb_en.encode_ids("Where can I find a pizzeria?"))
print(token_seq)

[ 571  280  386 1934    4   24  248 4339  177 9967]


In [37]:
token_embed = tf.keras.layers.Embedding(bpemb_vocab_size, embed_dim)
token_embeddings = token_embed(token_seq)

# The untrained embeddings for our sample sentence.
print("Embeddings for: ", sample_sentence)
print(token_embeddings)

Embeddings for:  Where can I find a pizzeria?
tf.Tensor(
[[ 0.021247   -0.01948299 -0.04639702 -0.02132328  0.03410709 -0.00271456
  -0.01825256  0.00657606 -0.03765022  0.02265502 -0.02121254 -0.0438798 ]
 [ 0.00857258 -0.01821322 -0.02282575 -0.0406186  -0.00255258 -0.04327289
   0.04462032 -0.01243747  0.02584079  0.01054467  0.03417119 -0.02772656]
 [-0.04897821  0.03801889 -0.02050481  0.04144004 -0.03804501 -0.03834953
  -0.03759488 -0.00296963 -0.03374429  0.01062299  0.01433483 -0.01363866]
 [ 0.00094473 -0.0388581   0.00746418 -0.04288788 -0.02629236 -0.02000649
   0.01079553 -0.04675244  0.00462361 -0.04766339  0.02018405 -0.04500157]
 [-0.04792075  0.01269305 -0.04440744 -0.00462649 -0.03221427 -0.00901992
   0.00595009  0.04646336 -0.01926882 -0.02931499 -0.02091855  0.02064309]
 [-0.0341584  -0.02598354  0.04016859 -0.014705   -0.01763175  0.03261962
  -0.03887844 -0.00059711  0.00582775  0.01827694 -0.01941589 -0.0366724 ]
 [-0.0433309   0.02875911  0.02125175 -0.03237639

In [38]:
max_seq_len = 256
pos_embed = tf.keras.layers.Embedding(max_seq_len, embed_dim)

# Generate ids for each position of the token sequence.
pos_idx = tf.range(len(token_seq))
print(pos_idx)

tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)


In [39]:
# These are our positon embeddings.
position_embeddings = pos_embed(pos_idx)
print("Position embeddings for the input sequence\n", position_embeddings)

Position embeddings for the input sequence
 tf.Tensor(
[[-0.02627332  0.00494404 -0.00553044  0.00529874 -0.00445414 -0.03119564
   0.04908427  0.00199108 -0.00782778 -0.00158896  0.03763149 -0.02136957]
 [-0.04744848 -0.00125855 -0.03048106  0.03316036 -0.00197734  0.00174314
   0.0232848   0.02844452  0.04862762 -0.04100418  0.03514743 -0.0242133 ]
 [ 0.01328549 -0.00645056 -0.0093023   0.018146   -0.04506398 -0.03319913
  -0.02150148 -0.02230905  0.00856518 -0.02947686  0.02523848 -0.03266766]
 [ 0.00449647 -0.02950015 -0.04376494  0.04214707 -0.03811678 -0.02868011
  -0.03589477  0.03419762  0.04107979  0.02873271  0.01633034 -0.0194487 ]
 [ 0.02935529 -0.04177836 -0.04488919 -0.03773079 -0.01054424  0.04607414
   0.03547266  0.00316938  0.04814469 -0.0112382  -0.0283497  -0.00060313]
 [-0.04935927  0.0287292  -0.03315626 -0.03452907  0.02696041 -0.04182808
  -0.03011742  0.01208083  0.00889037  0.01043107 -0.03231344 -0.0325183 ]
 [-0.0472222  -0.04470063  0.01623067 -0.00702249 -

In [40]:
input = token_embeddings + position_embeddings
print("Input to the initial encoder block:\n", input)

Input to the initial encoder block:
 tf.Tensor(
[[-0.00502633 -0.01453896 -0.05192746 -0.01602453  0.02965296 -0.0339102
   0.03083171  0.00856714 -0.045478    0.02106606  0.01641894 -0.06524936]
 [-0.0388759  -0.01947178 -0.05330682 -0.00745824 -0.00452992 -0.04152975
   0.06790512  0.01600704  0.0744684  -0.03045951  0.06931862 -0.05193986]
 [-0.03569272  0.03156834 -0.0298071   0.05958605 -0.08310899 -0.07154866
  -0.05909636 -0.02527869 -0.02517911 -0.01885387  0.03957331 -0.04630632]
 [ 0.0054412  -0.06835826 -0.03630076 -0.00074081 -0.06440914 -0.0486866
  -0.02509924 -0.01255482  0.0457034  -0.01893068  0.03651439 -0.06445026]
 [-0.01856546 -0.02908531 -0.08929662 -0.04235728 -0.04275851  0.03705422
   0.04142275  0.04963274  0.02887586 -0.04055319 -0.04926825  0.02003996]
 [-0.08351767  0.00274567  0.00701233 -0.04923407  0.00932866 -0.00920846
  -0.06899586  0.01148372  0.01471812  0.02870801 -0.05172934 -0.06919071]
 [-0.09055309 -0.01594152  0.03748242 -0.03939888 -0.0447168

## Encoder

In [41]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, src_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(src_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    # The original Attention Is All You Need paper applied dropout to the
    # input before feeding it to the first encoder block.
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    # Create encoder blocks.
    self.blocks = [EncoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) 
    for _ in range(num_blocks)]
  
  def call(self, input, training, mask):
    token_embeds = self.token_embed(input)

    # Generate position indices for a batch of input sequences.
    num_pos = input.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, input.shape)
    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    # Run input through successive encoder blocks.
    for block in self.blocks:
      x, weights = block(x, training, mask)

    return x, weights

In [42]:
# Batch of 3 sequences, each of length 10 (10 is also the 
# maximum sequence length in this case).
seqs = np.random.randint(0, 10000, size=(3, 10))
print(seqs.shape)
print(seqs)

(3, 10)
[[4543 6043 1740 5076 3654 5748 6803 8059 2385  117]
 [3776 4692 2924 9663 6247 1636 3160 2651 3796 5222]
 [ 898 1187 1663 3801  600  853 4573 8389 2772 8480]]


In [43]:
pos_ids = np.resize(np.arange(seqs.shape[1]), seqs.shape[0] * seqs.shape[1])
print(pos_ids)

[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [44]:
pos_ids = np.reshape(pos_ids, (3, 10))
print(pos_ids.shape)
print(pos_ids)

(3, 10)
[[0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]]


In [45]:
pos_embed(pos_ids)

<tf.Tensor: shape=(3, 10, 12), dtype=float32, numpy=
array([[[-0.02627332,  0.00494404, -0.00553044,  0.00529874,
         -0.00445414, -0.03119564,  0.04908427,  0.00199108,
         -0.00782778, -0.00158896,  0.03763149, -0.02136957],
        [-0.04744848, -0.00125855, -0.03048106,  0.03316036,
         -0.00197734,  0.00174314,  0.0232848 ,  0.02844452,
          0.04862762, -0.04100418,  0.03514743, -0.0242133 ],
        [ 0.01328549, -0.00645056, -0.0093023 ,  0.018146  ,
         -0.04506398, -0.03319913, -0.02150148, -0.02230905,
          0.00856518, -0.02947686,  0.02523848, -0.03266766],
        [ 0.00449647, -0.02950015, -0.04376494,  0.04214707,
         -0.03811678, -0.02868011, -0.03589477,  0.03419762,
          0.04107979,  0.02873271,  0.01633034, -0.0194487 ],
        [ 0.02935529, -0.04177836, -0.04488919, -0.03773079,
         -0.01054424,  0.04607414,  0.03547266,  0.00316938,
          0.04814469, -0.0112382 , -0.0283497 , -0.00060313],
        [-0.04935927,  0.02

In [46]:
input_batch = [
    "Where can I find a pizzeria?",
    "Mass hysteria over listeria.",
    "I ain't no circle back girl."
]

bpemb_en.encode(input_batch)

[['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?'],
 ['▁mass', '▁hy', 'ster', 'ia', '▁over', '▁l', 'ister', 'ia', '.'],
 ['▁i', '▁a', 'in', "'", 't', '▁no', '▁circle', '▁back', '▁girl', '.']]

In [47]:
input_seqs = bpemb_en.encode_ids(input_batch)
print("Vectorized inputs:")
input_seqs

Vectorized inputs:


[[571, 280, 386, 1934, 4, 24, 248, 4339, 177, 9967],
 [1535, 1354, 1238, 177, 380, 43, 871, 177, 9935],
 [386, 4, 6, 9937, 9915, 467, 5410, 810, 3692, 9935]]

In [48]:
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("Input to the encoder:")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Input to the encoder:
(3, 10)
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]]


In [49]:
enc_mask = enc_mask[:, tf.newaxis, tf.newaxis, :]
enc_mask

NameError: ignored

In [50]:
num_encoder_blocks = 6

# d_model is the embedding dimension used throughout.
d_model = 12

num_heads = 3

# Feed-forward network hidden dimension width.
ffn_hidden_dim = 48

src_vocab_size = bpemb_vocab_size
max_input_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(
    num_encoder_blocks,
    d_model,
    num_heads,
    ffn_hidden_dim,
    src_vocab_size,
    max_input_seq_len)

In [51]:
encoder_output, attn_weights = encoder(padded_input_seqs, training=True, 
                                       mask=enc_mask)
print(f"Encoder output {encoder_output.shape}:")
print(encoder_output)

NameError: ignored

## Decoder Block

In [52]:
class DecoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(DecoderBlock, self).__init__()

    self.mhsa1 = MultiHeadSelfAttention(d_model, num_heads)
    self.mhsa2 = MultiHeadSelfAttention(d_model, num_heads)

    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
    self.layernorm3 = tf.keras.layers.LayerNormalization()
  
  # Note the decoder block takes two masks. One for the first MHSA, another
  # for the second MHSA.
  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    mhsa_output1, attn_weights = self.mhsa1(target, target, target, decoder_mask)
    mhsa_output1 = self.dropout1(mhsa_output1, training=training)
    mhsa_output1 = self.layernorm1(mhsa_output1 + target)

    mhsa_output2, attn_weights = self.mhsa2(mhsa_output1, encoder_output, 
                                            encoder_output, 
                                            memory_mask)
    mhsa_output2 = self.dropout2(mhsa_output2, training=training)
    mhsa_output2 = self.layernorm2(mhsa_output2 + mhsa_output1)

    ffn_output = self.ffn(mhsa_output2)
    ffn_output = self.dropout3(ffn_output, training=training)
    output = self.layernorm3(ffn_output + mhsa_output2)

    return output, attn_weights


## Decoder

In [53]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(target_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.blocks = [DecoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) for _ in range(num_blocks)]

  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    token_embeds = self.token_embed(target)

    # Generate position indices.
    num_pos = target.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, target.shape)

    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    for block in self.blocks:
      x, weights = block(encoder_output, x, training, decoder_mask, memory_mask)

    return x, weights

In [54]:
# Made up values.
target_input_seqs = [
    [1, 652, 723, 123, 62],
    [1, 25,  98, 129, 248, 215, 359, 249],
    [1, 2369, 1259, 125, 486],
]

In [55]:
padded_target_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(target_input_seqs, padding="post")
print("Padded target inputs to the decoder:")
print(padded_target_input_seqs.shape)
print(padded_target_input_seqs)

Padded target inputs to the decoder:
(3, 8)
[[   1  652  723  123   62    0    0    0]
 [   1   25   98  129  248  215  359  249]
 [   1 2369 1259  125  486    0    0    0]]


We can create the padding mask the same way we did for the encoder.

In [56]:
dec_padding_mask = tf.cast(tf.math.not_equal(padded_target_input_seqs, 0), tf.float32)
dec_padding_mask = dec_padding_mask[:, tf.newaxis, tf.newaxis, :]
print(dec_padding_mask)

tf.Tensor(
[[[[1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 1, 8), dtype=float32)


In [57]:
target_input_seq_len = padded_target_input_seqs.shape[1]
look_ahead_mask = tf.linalg.band_part(tf.ones((target_input_seq_len, 
                                               target_input_seq_len)), -1, 0)
print(look_ahead_mask)

tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]], shape=(8, 8), dtype=float32)


In [58]:
dec_mask = tf.minimum(dec_padding_mask, look_ahead_mask)
print("The decoder mask:")
print(dec_mask)

The decoder mask:
tf.Tensor(
[[[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 1. 0. 0.]
   [1. 1. 1. 1. 1. 1. 1. 0.]
   [1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 8, 8), dtype=float32)


In [60]:
decoder = Decoder(6, 12, 3, 48, 10000, 8)
decoder_output, _ = decoder(encoder_output, padded_target_input_seqs, 
                            True, dec_mask, enc_mask)
print(f"Decoder output {decoder_output.shape}:")
print(decoder_output)

NameError: ignored

## Transformer

We now have all the pieces to build the **Transformer** itself, and it's pretty simple. 

In [61]:
class Transformer(tf.keras.Model):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
               target_vocab_size, max_input_len, max_target_len, dropout_rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_blocks, d_model, num_heads, hidden_dim, source_vocab_size, 
                           max_input_len, dropout_rate)
    
    self.decoder = Decoder(num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                           max_target_len, dropout_rate)
    
    # The final dense layer to generate logits from the decoder output.
    self.output_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, input_seqs, target_input_seqs, training, encoder_mask,
           decoder_mask, memory_mask):
    encoder_output, encoder_attn_weights = self.encoder(input_seqs, 
                                                        training, encoder_mask)

    decoder_output, decoder_attn_weights = self.decoder(encoder_output, 
                                                        target_input_seqs, training,
                                                        decoder_mask, memory_mask)

    return self.output_layer(decoder_output), encoder_attn_weights, decoder_attn_weights


In [62]:
transformer = Transformer(
    num_blocks = 6,
    d_model = 12,
    num_heads = 3,
    hidden_dim = 48,
    source_vocab_size = bpemb_vocab_size,
    target_vocab_size = 7000, # made-up target vocab size.
    max_input_len = padded_input_seqs.shape[1],
    max_target_len = padded_target_input_seqs.shape[1])

transformer_output, _, _ = transformer(padded_input_seqs, 
                                       padded_target_input_seqs, True, 
                                       enc_mask, dec_mask, memory_mask=enc_mask)
print(f"Transformer output {transformer_output.shape}:")
print(transformer_output) # If training, we would use this output to calculate losses.

NameError: ignored