Transformers


In [3]:
!pip install BPEmb

import math
import numpy as np
import tensorflow as tf
from bpemb import BPEmb

Collecting BPEmb
  Downloading bpemb-0.3.5-py3-none-any.whl.metadata (19 kB)
Downloading bpemb-0.3.5-py3-none-any.whl (19 kB)
Installing collected packages: BPEmb
Successfully installed BPEmb-0.3.5


# Transformers From Scratch

### Multi-Head Self-Attention

In [4]:
def scaled_dot_product_attention(query, key, value, mask=None):
  key_dim = tf.cast(tf.shape(key)[-1],tf.float32)
  scaled_scores = tf.matmul(query, key, transpose_b=True)/np.sqrt(key_dim)

  if mask is not None:
    scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)

  softmax = tf.keras.layers.Softmax()
  weights = softmax(scaled_scores)
  return tf.matmul(weights, value),weights

Suppose our queries, keys, and values are each a length of 3 with a dimension of 4.

In [5]:
seq_len =3
embed_dim =4

queries = np.random.rand(seq_len, embed_dim)
keys = np.random.rand(seq_len, embed_dim)
values =  np.random.rand(seq_len, embed_dim)

print("Queries:\n",queries)

Queries:
 [[0.82788891 0.47845084 0.15714683 0.87821002]
 [0.64513255 0.17505266 0.11050864 0.83509783]
 [0.50878368 0.85307375 0.14718729 0.17733292]]


In [6]:
output, attn_weights = scaled_dot_product_attention(queries,keys,values)

print("Output\n", output, "\n")
print("Weights\n", attn_weights)

Output
 tf.Tensor(
[[0.7369056  0.77670056 0.6728094  0.70853746]
 [0.73682296 0.7711015  0.65910083 0.7067499 ]
 [0.7405319  0.77426565 0.6692036  0.7087018 ]], shape=(3, 4), dtype=float32) 

Weights
 tf.Tensor(
[[0.40532133 0.3466487  0.24803   ]
 [0.3807964  0.34614053 0.27306303]
 [0.4037669  0.33936718 0.25686586]], shape=(3, 3), dtype=float32)


Generating queries, keys, and values for multiple heads.

In [7]:
batch_size = 1
seq_len = 3
embed_dim = 12
num_heads = 3
head_dim = embed_dim // num_heads

print(f"Dimension of each head: {head_dim}")

Dimension of each head: 4


In [8]:
x = np.random.rand(batch_size, seq_len, embed_dim).round(1)
print("Input shape: ", x.shape, "\n")
print("Input:\n", x)

Input shape:  (1, 3, 12) 

Input:
 [[[0.8 0.6 0.1 0.9 0.3 1.  0.7 0.2 0.  0.4 0.3 0.9]
  [0.  1.  0.2 0.2 0.4 0.7 0.5 0.3 0.9 0.3 0.8 0.3]
  [0.5 0.3 0.6 0.9 0.9 0.7 0.  0.9 0.2 0.2 0.2 0.4]]]


In [9]:
# The query weights for each head.
wq0 = np.random.rand(embed_dim, head_dim).round(1)
wq1 = np.random.rand(embed_dim, head_dim).round(1)
wq2 = np.random.rand(embed_dim, head_dim).round(1)

# The key weights for each head.
wk0 = np.random.rand(embed_dim, head_dim).round(1)
wk1 = np.random.rand(embed_dim, head_dim).round(1)
wk2 = np.random.rand(embed_dim, head_dim).round(1)

# The value weights for each head.
wv0 = np.random.rand(embed_dim, head_dim).round(1)
wv1 = np.random.rand(embed_dim, head_dim).round(1)
wv2 = np.random.rand(embed_dim, head_dim).round(1)

In [10]:
print("The three sets of query weights (one for each head):")
print("wq0:\n", wq0)
print("wq1:\n", wq1)
print("wq2:\n", wq1)

The three sets of query weights (one for each head):
wq0:
 [[0.7 0.5 0.9 0.1]
 [0.3 0.4 0.7 0.9]
 [0.2 0.1 0.7 0.3]
 [0.5 0.7 0.1 0.7]
 [0.2 0.5 0.3 0.7]
 [0.2 0.4 0.2 0.5]
 [0.6 0.7 0.3 0.4]
 [0.7 0.2 0.4 0.8]
 [0.8 0.7 0.6 0.1]
 [0.7 0.5 0.9 0.7]
 [0.5 0.4 0.1 0.2]
 [0.8 1.  0.9 0.8]]
wq1:
 [[0.3 0.2 0.7 0.6]
 [0.1 0.9 0.1 0.8]
 [0.5 0.2 0.8 0.7]
 [0.  0.7 0.1 0.4]
 [0.6 0.6 0.6 0.2]
 [0.2 0.7 0.4 0.5]
 [0.3 0.4 0.2 0.4]
 [0.6 0.2 0.2 0.7]
 [0.7 0.7 0.2 0.6]
 [0.9 0.7 0.7 0.4]
 [0.6 0.2 0.1 0.9]
 [0.1 0.8 0.4 0.9]]
wq2:
 [[0.3 0.2 0.7 0.6]
 [0.1 0.9 0.1 0.8]
 [0.5 0.2 0.8 0.7]
 [0.  0.7 0.1 0.4]
 [0.6 0.6 0.6 0.2]
 [0.2 0.7 0.4 0.5]
 [0.3 0.4 0.2 0.4]
 [0.6 0.2 0.2 0.7]
 [0.7 0.7 0.2 0.6]
 [0.9 0.7 0.7 0.4]
 [0.6 0.2 0.1 0.9]
 [0.1 0.8 0.4 0.9]]


In [11]:
# Geneated queries, keys, and values for the first head.
q0 = np.dot(x, wq0)
k0 = np.dot(x, wk0)
v0 = np.dot(x, wv0)

# Geneated queries, keys, and values for the second head.
q1 = np.dot(x, wq1)
k1 = np.dot(x, wk1)
v1 = np.dot(x, wv1)

# Geneated queries, keys, and values for the third head.
q2 = np.dot(x, wq2)
k2 = np.dot(x, wk2)
v2 = np.dot(x, wv2)

In [12]:
print("Q, K, and V for first head:\n")

print(f"q0 {q0.shape}:\n", q0, "\n")
print(f"k0 {k0.shape}:\n", k0, "\n")
print(f"v0 {v0.shape}:\n", v0)

Q, K, and V for first head:

q0 (1, 3, 4):
 [[[3.18 3.58 3.08 3.49]
  [2.74 2.85 2.55 2.87]
  [2.68 2.69 2.62 3.35]]] 

k0 (1, 3, 4):
 [[[2.82 3.94 3.59 3.23]
  [2.42 3.07 3.3  2.83]
  [2.09 3.73 3.11 3.1 ]]] 

v0 (1, 3, 4):
 [[[3.3  3.04 3.53 1.79]
  [2.98 2.32 2.95 2.73]
  [3.53 2.66 2.37 1.65]]]


In [13]:
out0, attn_weights0 = scaled_dot_product_attention(q0, k0, v0)

print("Output from first attention head: ", out0, "\n")
print("Attention weights from first head: ", attn_weights0)

Output from first attention head:  tf.Tensor(
[[[3.3066816 2.9892757 3.4265764 1.8096166]
  [3.3062172 2.9612694 3.3783638 1.8270708]
  [3.3064375 2.9624128 3.3799012 1.8260374]]], shape=(1, 3, 4), dtype=float32) 

Attention weights from first head:  tf.Tensor(
[[[0.8949512  0.03178101 0.0732677 ]
  [0.8418675  0.05482367 0.10330877]
  [0.8437963  0.05361672 0.10258693]]], shape=(1, 3, 3), dtype=float32)


In [14]:
out0, attn_weights0 = scaled_dot_product_attention(q0, k0, v0)

print("Output from first attention head: ", out0, "\n")
print("Attention weights from first head: ", attn_weights0)

Output from first attention head:  tf.Tensor(
[[[3.3066816 2.9892757 3.4265764 1.8096166]
  [3.3062172 2.9612694 3.3783638 1.8270708]
  [3.3064375 2.9624128 3.3799012 1.8260374]]], shape=(1, 3, 4), dtype=float32) 

Attention weights from first head:  tf.Tensor(
[[[0.8949512  0.03178101 0.0732677 ]
  [0.8418675  0.05482367 0.10330877]
  [0.8437963  0.05361672 0.10258693]]], shape=(1, 3, 3), dtype=float32)


In [15]:
out1, _ = scaled_dot_product_attention(q1, k1, v1)
out2, _ = scaled_dot_product_attention(q2, k2, v2)

In [16]:
combined_out_a = np.concatenate((out0, out1, out2), axis=-1)
print(f"Combined output from all heads {combined_out_a.shape}:")
print(combined_out_a)

Combined output from all heads (1, 3, 12):
[[[3.3066816 2.9892757 3.4265764 1.8096166 3.6096988 2.9276514 2.1850147
   3.0830379 2.6692083 2.739544  2.7760735 3.4935157]
  [3.3062172 2.9612694 3.3783638 1.8270708 3.5816612 2.9478528 2.197434
   3.0901368 2.7068827 2.7709622 2.7728791 3.4823875]
  [3.3064375 2.9624128 3.3799012 1.8260374 3.5807714 2.9463234 2.191193
   3.0957341 2.6996639 2.7612586 2.7779686 3.4790032]]]


In [17]:
print("Query weights for first head: \n", wq0, "\n")
print("Query weights for second head: \n", wq1, "\n")
print("Query weights for third head: \n", wq2)

Query weights for first head: 
 [[0.7 0.5 0.9 0.1]
 [0.3 0.4 0.7 0.9]
 [0.2 0.1 0.7 0.3]
 [0.5 0.7 0.1 0.7]
 [0.2 0.5 0.3 0.7]
 [0.2 0.4 0.2 0.5]
 [0.6 0.7 0.3 0.4]
 [0.7 0.2 0.4 0.8]
 [0.8 0.7 0.6 0.1]
 [0.7 0.5 0.9 0.7]
 [0.5 0.4 0.1 0.2]
 [0.8 1.  0.9 0.8]] 

Query weights for second head: 
 [[0.3 0.2 0.7 0.6]
 [0.1 0.9 0.1 0.8]
 [0.5 0.2 0.8 0.7]
 [0.  0.7 0.1 0.4]
 [0.6 0.6 0.6 0.2]
 [0.2 0.7 0.4 0.5]
 [0.3 0.4 0.2 0.4]
 [0.6 0.2 0.2 0.7]
 [0.7 0.7 0.2 0.6]
 [0.9 0.7 0.7 0.4]
 [0.6 0.2 0.1 0.9]
 [0.1 0.8 0.4 0.9]] 

Query weights for third head: 
 [[0.2 0.5 0.6 0.8]
 [0.5 0.5 0.6 0.5]
 [0.8 0.4 0.9 0.4]
 [0.1 1.  0.2 0.1]
 [0.8 0.8 0.1 0.8]
 [0.9 0.2 0.2 0.8]
 [0.1 0.4 0.4 0. ]
 [0.2 0.3 0.  0.5]
 [0.7 0.4 0.5 0.6]
 [0.9 0.5 0.2 0.3]
 [0.7 0.  0.5 0.5]
 [1.  0.7 0.5 0.6]]


In [18]:
print("Query weights for first head: \n", wq0, "\n")
print("Query weights for second head: \n", wq1, "\n")
print("Query weights for third head: \n", wq2)

Query weights for first head: 
 [[0.7 0.5 0.9 0.1]
 [0.3 0.4 0.7 0.9]
 [0.2 0.1 0.7 0.3]
 [0.5 0.7 0.1 0.7]
 [0.2 0.5 0.3 0.7]
 [0.2 0.4 0.2 0.5]
 [0.6 0.7 0.3 0.4]
 [0.7 0.2 0.4 0.8]
 [0.8 0.7 0.6 0.1]
 [0.7 0.5 0.9 0.7]
 [0.5 0.4 0.1 0.2]
 [0.8 1.  0.9 0.8]] 

Query weights for second head: 
 [[0.3 0.2 0.7 0.6]
 [0.1 0.9 0.1 0.8]
 [0.5 0.2 0.8 0.7]
 [0.  0.7 0.1 0.4]
 [0.6 0.6 0.6 0.2]
 [0.2 0.7 0.4 0.5]
 [0.3 0.4 0.2 0.4]
 [0.6 0.2 0.2 0.7]
 [0.7 0.7 0.2 0.6]
 [0.9 0.7 0.7 0.4]
 [0.6 0.2 0.1 0.9]
 [0.1 0.8 0.4 0.9]] 

Query weights for third head: 
 [[0.2 0.5 0.6 0.8]
 [0.5 0.5 0.6 0.5]
 [0.8 0.4 0.9 0.4]
 [0.1 1.  0.2 0.1]
 [0.8 0.8 0.1 0.8]
 [0.9 0.2 0.2 0.8]
 [0.1 0.4 0.4 0. ]
 [0.2 0.3 0.  0.5]
 [0.7 0.4 0.5 0.6]
 [0.9 0.5 0.2 0.3]
 [0.7 0.  0.5 0.5]
 [1.  0.7 0.5 0.6]]


In [19]:
wq = np.concatenate((wq0, wq1, wq2), axis=1)
print(f"Single query weight matrix {wq.shape}: \n", wq)

Single query weight matrix (12, 12): 
 [[0.7 0.5 0.9 0.1 0.3 0.2 0.7 0.6 0.2 0.5 0.6 0.8]
 [0.3 0.4 0.7 0.9 0.1 0.9 0.1 0.8 0.5 0.5 0.6 0.5]
 [0.2 0.1 0.7 0.3 0.5 0.2 0.8 0.7 0.8 0.4 0.9 0.4]
 [0.5 0.7 0.1 0.7 0.  0.7 0.1 0.4 0.1 1.  0.2 0.1]
 [0.2 0.5 0.3 0.7 0.6 0.6 0.6 0.2 0.8 0.8 0.1 0.8]
 [0.2 0.4 0.2 0.5 0.2 0.7 0.4 0.5 0.9 0.2 0.2 0.8]
 [0.6 0.7 0.3 0.4 0.3 0.4 0.2 0.4 0.1 0.4 0.4 0. ]
 [0.7 0.2 0.4 0.8 0.6 0.2 0.2 0.7 0.2 0.3 0.  0.5]
 [0.8 0.7 0.6 0.1 0.7 0.7 0.2 0.6 0.7 0.4 0.5 0.6]
 [0.7 0.5 0.9 0.7 0.9 0.7 0.7 0.4 0.9 0.5 0.2 0.3]
 [0.5 0.4 0.1 0.2 0.6 0.2 0.1 0.9 0.7 0.  0.5 0.5]
 [0.8 1.  0.9 0.8 0.1 0.8 0.4 0.9 1.  0.7 0.5 0.6]]


In [20]:
wk = np.concatenate((wk0, wk1, wk2), axis=1)
wv = np.concatenate((wv0, wv1, wv2), axis=1)

In [21]:
q_s = np.dot(x, wq)
k_s = np.dot(x, wk)
v_s = np.dot(x, wv)

In [22]:
print(f"Query vectors using a single weight matrix {q_s.shape}:\n", q_s)

Query vectors using a single weight matrix (1, 3, 12):
 [[[3.18 3.58 3.08 3.49 1.69 3.61 2.22 3.61 3.35 3.25 2.3  3.02]
  [2.74 2.85 2.55 2.87 2.32 3.31 1.55 3.51 3.5  2.25 2.26 2.84]
  [2.68 2.69 2.62 3.35 2.18 2.97 2.31 3.22 3.21 3.13 1.87 3.13]]]


In [23]:
q_s_reshaped = tf.reshape(q_s, (batch_size, seq_len, num_heads, head_dim))

In [24]:
q_s_transposed = tf.transpose(q_s_reshaped, perm=[0, 2, 1, 3]).numpy()

In [25]:
print("The separate per-head query matrices from before: ")
print(q0, "\n")
print(q1, "\n")
print(q2)

The separate per-head query matrices from before: 
[[[3.18 3.58 3.08 3.49]
  [2.74 2.85 2.55 2.87]
  [2.68 2.69 2.62 3.35]]] 

[[[1.69 3.61 2.22 3.61]
  [2.32 3.31 1.55 3.51]
  [2.18 2.97 2.31 3.22]]] 

[[[3.35 3.25 2.3  3.02]
  [3.5  2.25 2.26 2.84]
  [3.21 3.13 1.87 3.13]]]


In [26]:
k_s_transposed = tf.transpose(tf.reshape(k_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()
v_s_transposed = tf.transpose(tf.reshape(v_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()

In [27]:
all_heads_output, all_attn_weights = scaled_dot_product_attention(q_s_transposed,
                                                                  k_s_transposed,
                                                                  v_s_transposed)
print("Self attention output:\n", all_heads_output)

Self attention output:
 tf.Tensor(
[[[[3.3066816 2.9892757 3.4265764 1.8096166]
   [3.3062172 2.9612694 3.3783638 1.8270708]
   [3.3064375 2.9624128 3.3799012 1.8260374]]

  [[3.6096988 2.9276514 2.1850147 3.0830379]
   [3.5816612 2.9478528 2.197434  3.0901368]
   [3.5807714 2.9463234 2.191193  3.0957341]]

  [[2.6692083 2.739544  2.7760735 3.4935157]
   [2.7068827 2.7709622 2.7728791 3.4823875]
   [2.6996639 2.7612586 2.7779686 3.4790032]]]], shape=(1, 3, 3, 4), dtype=float32)


In [28]:
print("Per head outputs from using separate sets of weights per head:")
print(out0, "\n")
print(out1, "\n")
print(out2)

Per head outputs from using separate sets of weights per head:
tf.Tensor(
[[[3.3066816 2.9892757 3.4265764 1.8096166]
  [3.3062172 2.9612694 3.3783638 1.8270708]
  [3.3064375 2.9624128 3.3799012 1.8260374]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[3.6096988 2.9276514 2.1850147 3.0830379]
  [3.5816612 2.9478528 2.197434  3.0901368]
  [3.5807714 2.9463234 2.191193  3.0957341]]], shape=(1, 3, 4), dtype=float32) 

tf.Tensor(
[[[2.6692083 2.739544  2.7760735 3.4935157]
  [2.7068827 2.7709622 2.7728791 3.4823875]
  [2.6996639 2.7612586 2.7779686 3.4790032]]], shape=(1, 3, 4), dtype=float32)


In [29]:
combined_out_b = tf.reshape(tf.transpose(all_heads_output, perm=[0, 2, 1, 3]),
                            shape=(batch_size, seq_len, embed_dim))

In [30]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadSelfAttention, self).__init__()
    self.d_model = d_model
    self.num_heads = num_heads

    self.d_head = self.d_model // self.num_heads

    self.wq = tf.keras.layers.Dense(self.d_model)
    self.wk = tf.keras.layers.Dense(self.d_model)
    self.wv = tf.keras.layers.Dense(self.d_model)

    # Linear layer to generate the final output.
    self.dense = tf.keras.layers.Dense(self.d_model)

  def split_heads(self, x):
    batch_size = x.shape[0]

    split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_head))
    return tf.transpose(split_inputs, perm=[0, 2, 1, 3])

  def merge_heads(self, x):
    batch_size = x.shape[0]

    merged_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
    return tf.reshape(merged_inputs, (batch_size, -1, self.d_model))

  def call(self, q, k, v, mask):
    qs = self.wq(q)
    ks = self.wk(k)
    vs = self.wv(v)

    qs = self.split_heads(qs)
    ks = self.split_heads(ks)
    vs = self.split_heads(vs)

    output, attn_weights = scaled_dot_product_attention(qs, ks, vs, mask)
    output = self.merge_heads(output)

    return self.dense(output), attn_weights

In [31]:
mhsa = MultiHeadSelfAttention(12, 3)

output, attn_weights = mhsa(x, x, x, None)
print(f"MHSA output{output.shape}:")
print(output)

MHSA output(1, 3, 12):
tf.Tensor(
[[[ 1.3627412   0.16063865  0.38592675  0.3214462   0.06935024
    0.5341093   0.2784205   0.30460346 -0.88454527  0.2611329
   -0.150915   -0.20781514]
  [ 1.403109    0.14670819  0.4057802   0.33295926  0.05734877
    0.55069447  0.28100032  0.29597065 -0.87676257  0.25711
   -0.17546228 -0.21427244]
  [ 1.3713204   0.16634938  0.38348597  0.3137713   0.07374293
    0.5217956   0.29557145  0.2968211  -0.89511603  0.24521267
   -0.15199354 -0.20717233]]], shape=(1, 3, 12), dtype=float32)


## Encoder Block

In [32]:
def feed_forward_network(d_model, hidden_dim):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(hidden_dim, activation='relu'),
      tf.keras.layers.Dense(d_model)
  ])

In [33]:
class EncoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(EncoderBlock, self).__init__()

    self.mhsa = MultiHeadSelfAttention(d_model, num_heads)
    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()

  def call(self, x, training, mask):
    mhsa_output, attn_weights = self.mhsa(x, x, x, mask)
    mhsa_output = self.dropout1(mhsa_output, training=training)
    mhsa_output = self.layernorm1(x + mhsa_output)

    ffn_output = self.ffn(mhsa_output)
    ffn_output = self.dropout2(ffn_output, training=training)
    output = self.layernorm2(mhsa_output + ffn_output)

    return output, attn_weights

In [34]:
encoder_block = EncoderBlock(12, 3, 48)

block_output,  _ = encoder_block(x, True, None)
print(f"Output from single encoder block {block_output.shape}:")
print(block_output)

Output from single encoder block (1, 3, 12):
tf.Tensor(
[[[ 0.8362456  -0.8239631   1.046871    0.81172305 -0.5489809
    0.45544222  0.30277464  0.7323488  -1.7660596  -0.2322502
    1.0452968  -1.8594481 ]
  [-0.9280708  -0.03349714  1.4318237  -1.1032745  -0.6403576
   -1.0067633   0.54416806  0.25722837  0.07079867  0.35319436
    2.1678832  -1.1131334 ]
  [-1.4087355   0.6905646   1.6501527   0.3185261  -1.369937
    0.28188705 -0.24529329  0.9016651  -1.2333441  -0.88465244
    0.04952613  1.2496405 ]]], shape=(1, 3, 12), dtype=float32)


# Word and Positional Embeddings

In [35]:
bpemb_en = BPEmb(lang="en")

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:00<00:00, 1060027.91B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:00<00:00, 5189426.11B/s]


In [36]:
bpemb_vocab_size, bpemb_embed_size = bpemb_en.vectors.shape
print("Vocabulary size:", bpemb_vocab_size)
print("Embedding size:", bpemb_embed_size)

Vocabulary size: 10000
Embedding size: 100


In [37]:
# Embedding for the word "car".
bpemb_en.vectors[bpemb_en.words.index('car')]

array([-0.305548, -0.325598, -0.134716, -0.078735, -0.660545,  0.076211,
       -0.735487,  0.124533, -0.294402,  0.459688,  0.030137,  0.174041,
       -0.224223,  0.486189, -0.504649, -0.459699,  0.315747,  0.477885,
        0.091398,  0.427867,  0.016524, -0.076833, -0.899727,  0.493158,
       -0.022309, -0.422785, -0.154148,  0.204981,  0.379834,  0.070588,
        0.196073, -0.368222,  0.473406,  0.007409,  0.004303, -0.007823,
       -0.19103 , -0.202509,  0.109878, -0.224521, -0.35741 , -0.611633,
        0.329958, -0.212956, -0.497499, -0.393839, -0.130101, -0.216903,
       -0.105595, -0.076007, -0.483942, -0.139704, -0.161647,  0.136985,
        0.415363, -0.360143,  0.038601, -0.078804, -0.030421,  0.324129,
        0.223378, -0.523636, -0.048317, -0.032248, -0.117367,  0.470519,
        0.225816, -0.222065, -0.225007, -0.165904, -0.334389, -0.20157 ,
        0.572352, -0.268794,  0.301929, -0.005563,  0.387491,  0.261031,
       -0.11613 ,  0.074982, -0.008433,  0.259987, 

In [38]:
sample_sentence = "Where can I find a placement?"
tokens = bpemb_en.encode(sample_sentence)
print(tokens)

['▁where', '▁can', '▁i', '▁find', '▁a', '▁place', 'ment', '?']


In [39]:
token_seq = np.array(bpemb_en.encode_ids("Where can I find a placement?"))
print(token_seq)

[ 571  280  386 1934    4 1088  187 9967]


In [40]:
token_embed = tf.keras.layers.Embedding(bpemb_vocab_size, embed_dim)
token_embeddings = token_embed(token_seq)

# The untrained embeddings for our sample sentence.
print("Embeddings for: ", sample_sentence)
print(token_embeddings)

Embeddings for:  Where can I find a placement?
tf.Tensor(
[[ 9.43394750e-03  2.24299915e-02 -2.55576260e-02  2.87322067e-02
   1.54048316e-02 -2.71322485e-02  2.13734396e-02  4.89125364e-02
   1.16499662e-02 -1.41085312e-03 -2.12918166e-02  3.98954861e-02]
 [-7.93106481e-03  1.68977641e-02  3.32831256e-02 -7.79966265e-03
  -9.99156386e-03  4.01916616e-02 -3.46181020e-02  4.90086712e-02
  -1.90275908e-03  1.69362910e-02  1.92231201e-02  2.28811428e-03]
 [ 5.44165447e-03  3.08149345e-02  1.36735290e-03 -2.95903087e-02
   2.68846750e-03 -7.74685293e-03  7.65486807e-03  3.18320282e-02
  -1.42325982e-02  4.53356840e-02 -3.17296386e-02  2.14823596e-02]
 [-9.77023691e-03 -1.25811473e-02 -5.85842878e-04 -9.28838179e-03
   4.91710417e-02 -4.36002016e-03 -1.06690750e-02 -4.43048738e-02
   2.58824937e-02 -1.62694938e-02  1.19224191e-02  2.12631114e-02]
 [ 1.44097917e-02  2.43095271e-02 -2.08809506e-02  1.16189495e-02
  -8.24720785e-03 -2.18522549e-03 -2.73436438e-02 -1.53779984e-05
  -3.93536575e

In [41]:
max_seq_len = 256
pos_embed = tf.keras.layers.Embedding(max_seq_len, embed_dim)

# Generate ids for each position of the token sequence.
pos_idx = tf.range(len(token_seq))
print(pos_idx)

tf.Tensor([0 1 2 3 4 5 6 7], shape=(8,), dtype=int32)


In [42]:
position_embeddings = pos_embed(pos_idx)
print("Position embeddings for the input sequence\n", position_embeddings)

Position embeddings for the input sequence
 tf.Tensor(
[[ 0.0182374  -0.04646475 -0.00375506 -0.01233542  0.03764436  0.00863379
   0.00935394 -0.03585695  0.03879056  0.01691462 -0.01131745  0.011762  ]
 [-0.02997353  0.02893126  0.0034434   0.01245123 -0.00723519  0.01815319
  -0.04335778 -0.01362966  0.01541758 -0.03164274  0.03858277 -0.00330187]
 [ 0.00508808  0.0148906  -0.00459822 -0.0311793   0.03297097  0.01159291
  -0.00545932  0.00125635 -0.02360886  0.04261509  0.02811799  0.01085458]
 [-0.00920998 -0.03255661 -0.01375307  0.02883606 -0.03784255 -0.03116384
  -0.01753619 -0.00243188 -0.01238028  0.01684532 -0.03331041  0.04496412]
 [-0.03949132 -0.00810491  0.01725994 -0.04239985 -0.02407693  0.01918412
  -0.02115929 -0.00855763 -0.01861004 -0.03655253  0.00440925  0.01358545]
 [ 0.01651144  0.00551487  0.04656825  0.02754091  0.03595329  0.01630941
  -0.00906213  0.03377352  0.01843799 -0.00877187  0.03260056 -0.0125886 ]
 [-0.02006437  0.03781242  0.03810674 -0.04497066 -

In [43]:
input = token_embeddings + position_embeddings
print("Input to the initial encoder block:\n", input)

Input to the initial encoder block:
 tf.Tensor(
[[ 0.02767135 -0.02403476 -0.02931268  0.01639679  0.05304919 -0.01849846
   0.03072738  0.01305559  0.05044052  0.01550376 -0.03260927  0.05165749]
 [-0.0379046   0.04582903  0.03672652  0.00465157 -0.01722676  0.05834486
  -0.07797588  0.03537901  0.01351482 -0.01470644  0.05780589 -0.00101376]
 [ 0.01052973  0.04570553 -0.00323087 -0.06076961  0.03565944  0.00384606
   0.00219555  0.03308837 -0.03784146  0.08795077 -0.00361165  0.03233694]
 [-0.01898022 -0.04513776 -0.01433891  0.01954768  0.01132849 -0.03552385
  -0.02820526 -0.04673675  0.01350221  0.00057583 -0.02138799  0.06622723]
 [-0.02508153  0.01620462 -0.00362101 -0.0307809  -0.03232414  0.0169989
  -0.04850294 -0.00857301 -0.0579637  -0.00086577 -0.04528828  0.05701514]
 [-0.0301707   0.0437669   0.05965019  0.02629005  0.02869089 -0.03221594
   0.00130167 -0.00947776  0.0075758   0.0226541   0.07671732  0.02358885]
 [-0.06629125 -0.00777708  0.00865705 -0.08594636  0.022366

# Encoder

In [44]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, src_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(src_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    # The original Attention Is All You Need paper applied dropout to the
    # input before feeding it to the first encoder block.
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    # Create encoder blocks.
    self.blocks = [EncoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate)
    for _ in range(num_blocks)]

  def call(self, input, training, mask):
    token_embeds = self.token_embed(input)

    # Generate position indices for a batch of input sequences.
    num_pos = input.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, input.shape)
    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    # Run input through successive encoder blocks.
    for block in self.blocks:
      x, weights = block(x, training, mask)

    return x, weights

In [45]:
# Batch of 3 sequences, each of length 10 (10 is also the
# maximum sequence length in this case).
seqs = np.random.randint(0, 10000, size=(3, 10))
print(seqs.shape)
print(seqs)

(3, 10)
[[8911 4502 7570 5505 2915 7812 6777 1491  174 3097]
 [9789 5868 5632 6606   61 9679 1698 1771 2137 5813]
 [8107 3305 4311 8309 6582 7621 2868 5331 9619 3789]]


In [46]:
pos_ids = np.resize(np.arange(seqs.shape[1]), seqs.shape[0] * seqs.shape[1])
print(pos_ids)

[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [47]:
pos_ids = np.reshape(pos_ids, (3, 10))
print(pos_ids.shape)
print(pos_ids)

(3, 10)
[[0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]]


In [48]:
pos_embed(pos_ids)

<tf.Tensor: shape=(3, 10, 12), dtype=float32, numpy=
array([[[ 0.0182374 , -0.04646475, -0.00375506, -0.01233542,
          0.03764436,  0.00863379,  0.00935394, -0.03585695,
          0.03879056,  0.01691462, -0.01131745,  0.011762  ],
        [-0.02997353,  0.02893126,  0.0034434 ,  0.01245123,
         -0.00723519,  0.01815319, -0.04335778, -0.01362966,
          0.01541758, -0.03164274,  0.03858277, -0.00330187],
        [ 0.00508808,  0.0148906 , -0.00459822, -0.0311793 ,
          0.03297097,  0.01159291, -0.00545932,  0.00125635,
         -0.02360886,  0.04261509,  0.02811799,  0.01085458],
        [-0.00920998, -0.03255661, -0.01375307,  0.02883606,
         -0.03784255, -0.03116384, -0.01753619, -0.00243188,
         -0.01238028,  0.01684532, -0.03331041,  0.04496412],
        [-0.03949132, -0.00810491,  0.01725994, -0.04239985,
         -0.02407693,  0.01918412, -0.02115929, -0.00855763,
         -0.01861004, -0.03655253,  0.00440925,  0.01358545],
        [ 0.01651144,  0.00

In [49]:
pos_ids = np.reshape(pos_ids, (3, 10))
print(pos_ids.shape)
print(pos_ids)

(3, 10)
[[0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]
 [0 1 2 3 4 5 6 7 8 9]]


In [50]:
pos_embed(pos_ids)

<tf.Tensor: shape=(3, 10, 12), dtype=float32, numpy=
array([[[ 0.0182374 , -0.04646475, -0.00375506, -0.01233542,
          0.03764436,  0.00863379,  0.00935394, -0.03585695,
          0.03879056,  0.01691462, -0.01131745,  0.011762  ],
        [-0.02997353,  0.02893126,  0.0034434 ,  0.01245123,
         -0.00723519,  0.01815319, -0.04335778, -0.01362966,
          0.01541758, -0.03164274,  0.03858277, -0.00330187],
        [ 0.00508808,  0.0148906 , -0.00459822, -0.0311793 ,
          0.03297097,  0.01159291, -0.00545932,  0.00125635,
         -0.02360886,  0.04261509,  0.02811799,  0.01085458],
        [-0.00920998, -0.03255661, -0.01375307,  0.02883606,
         -0.03784255, -0.03116384, -0.01753619, -0.00243188,
         -0.01238028,  0.01684532, -0.03331041,  0.04496412],
        [-0.03949132, -0.00810491,  0.01725994, -0.04239985,
         -0.02407693,  0.01918412, -0.02115929, -0.00855763,
         -0.01861004, -0.03655253,  0.00440925,  0.01358545],
        [ 0.01651144,  0.00

In [51]:
input_batch = [
    "Where can I find a pizzeria?",
    "Mass hysteria over listeria.",
    "I ain't no circle back girl."
]

bpemb_en.encode(input_batch)

[['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?'],
 ['▁mass', '▁hy', 'ster', 'ia', '▁over', '▁l', 'ister', 'ia', '.'],
 ['▁i', '▁a', 'in', "'", 't', '▁no', '▁circle', '▁back', '▁girl', '.']]

In [52]:
input_seqs = bpemb_en.encode_ids(input_batch)
print("Vectorized inputs:")
input_seqs

Vectorized inputs:


[[571, 280, 386, 1934, 4, 24, 248, 4339, 177, 9967],
 [1535, 1354, 1238, 177, 380, 43, 871, 177, 9935],
 [386, 4, 6, 9937, 9915, 467, 5410, 810, 3692, 9935]]

In [53]:
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("Input to the encoder:")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Input to the encoder:
(3, 10)
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]]


In [54]:
enc_mask = tf.cast(tf.math.not_equal(padded_input_seqs, 0), tf.float32)
print("Input:")
print(padded_input_seqs, '\n')
print("Encoder mask:")
print(enc_mask)

Input:
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]] 

Encoder mask:
tf.Tensor(
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]], shape=(3, 10), dtype=float32)


In [55]:
enc_mask = enc_mask[:, tf.newaxis, tf.newaxis, :]
enc_mask

<tf.Tensor: shape=(3, 1, 1, 10), dtype=float32, numpy=
array([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]], dtype=float32)>

In [56]:
num_encoder_blocks = 6

# d_model is the embedding dimension used throughout.
d_model = 12

num_heads = 3

# Feed-forward network hidden dimension width.
ffn_hidden_dim = 48

src_vocab_size = bpemb_vocab_size
max_input_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(
    num_encoder_blocks,
    d_model,
    num_heads,
    ffn_hidden_dim,
    src_vocab_size,
    max_input_seq_len)

In [57]:
encoder_output, attn_weights = encoder(padded_input_seqs, training=True,
                                       mask=enc_mask)
print(f"Encoder output {encoder_output.shape}:")
print(encoder_output)

Encoder output (3, 10, 12):
tf.Tensor(
[[[ 1.24779928e+00  7.30878949e-01 -1.61388266e+00 -6.78592801e-01
   -1.67193055e-01 -3.07976324e-02  1.07393157e+00 -4.28785384e-01
    1.08274233e+00  1.18549526e+00 -1.13649237e+00 -1.26510322e+00]
  [ 7.92409658e-01  5.13804853e-01 -1.45590436e+00 -8.35416198e-01
   -7.06900716e-01 -9.22919154e-01  9.45003510e-01 -9.92063522e-01
    1.13214505e+00  3.39080542e-01 -6.18596971e-01  1.80935740e+00]
  [ 1.09155011e+00  3.98919016e-01 -2.32779312e+00 -5.71277261e-01
    2.03561470e-01 -4.81406987e-01  3.07595849e-01 -5.27413130e-01
    4.90076959e-01  7.86656499e-01 -9.48771179e-01  1.57830179e+00]
  [ 1.37312162e+00  3.67170066e-01 -2.13820696e+00 -5.93859076e-01
    5.76953053e-01 -5.63953280e-01  6.62697256e-01 -6.88541472e-01
    3.07096869e-01  1.84277996e-01 -1.01258361e+00  1.52582741e+00]
  [ 9.96322572e-01 -5.18519916e-02 -1.97636509e+00 -4.38284129e-01
    3.06086689e-01 -9.71361339e-01 -6.85579002e-01  8.60698819e-02
    5.55739582e-01 

# Decoder Block

In [58]:
class DecoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(DecoderBlock, self).__init__()

    self.mhsa1 = MultiHeadSelfAttention(d_model, num_heads)
    self.mhsa2 = MultiHeadSelfAttention(d_model, num_heads)

    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
    self.layernorm3 = tf.keras.layers.LayerNormalization()

  # Note the decoder block takes two masks. One for the first MHSA, another
  # for the second MHSA.
  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    mhsa_output1, attn_weights = self.mhsa1(target, target, target, decoder_mask)
    mhsa_output1 = self.dropout1(mhsa_output1, training=training)
    mhsa_output1 = self.layernorm1(mhsa_output1 + target)

    mhsa_output2, attn_weights = self.mhsa2(mhsa_output1, encoder_output,
                                            encoder_output,
                                            memory_mask)
    mhsa_output2 = self.dropout2(mhsa_output2, training=training)
    mhsa_output2 = self.layernorm2(mhsa_output2 + mhsa_output1)

    ffn_output = self.ffn(mhsa_output2)
    ffn_output = self.dropout3(ffn_output, training=training)
    output = self.layernorm3(ffn_output + mhsa_output2)

    return output, attn_weights

In [59]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(target_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.blocks = [DecoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) for _ in range(num_blocks)]

  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    token_embeds = self.token_embed(target)

    # Generate position indices.
    num_pos = target.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, target.shape)

    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    for block in self.blocks:
      x, weights = block(encoder_output, x, training, decoder_mask, memory_mask)

    return x, weights

In [60]:
# Made up values.
target_input_seqs = [
    [1, 652, 723, 123, 62],
    [1, 25,  98, 129, 248, 215, 359, 249],
    [1, 2369, 1259, 125, 486],
]

In [61]:
padded_target_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(target_input_seqs, padding="post")
print("Padded target inputs to the decoder:")
print(padded_target_input_seqs.shape)
print(padded_target_input_seqs)

Padded target inputs to the decoder:
(3, 8)
[[   1  652  723  123   62    0    0    0]
 [   1   25   98  129  248  215  359  249]
 [   1 2369 1259  125  486    0    0    0]]


In [62]:
dec_padding_mask = tf.cast(tf.math.not_equal(padded_target_input_seqs, 0), tf.float32)
dec_padding_mask = dec_padding_mask[:, tf.newaxis, tf.newaxis, :]
print(dec_padding_mask)

tf.Tensor(
[[[[1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 1, 8), dtype=float32)


In [63]:
target_input_seq_len = padded_target_input_seqs.shape[1]
look_ahead_mask = tf.linalg.band_part(tf.ones((target_input_seq_len,
                                               target_input_seq_len)), -1, 0)
print(look_ahead_mask)

tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]], shape=(8, 8), dtype=float32)


In [64]:
dec_mask = tf.minimum(dec_padding_mask, look_ahead_mask)
print("The decoder mask:")
print(dec_mask)

The decoder mask:
tf.Tensor(
[[[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 1. 0. 0.]
   [1. 1. 1. 1. 1. 1. 1. 0.]
   [1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 8, 8), dtype=float32)


In [65]:
decoder = Decoder(6, 12, 3, 48, 10000, 8)
decoder_output, _ = decoder(encoder_output, padded_target_input_seqs,
                            True, dec_mask, enc_mask)
print(f"Decoder output {decoder_output.shape}:")
print(decoder_output)

Decoder output (3, 8, 12):
tf.Tensor(
[[[-0.7125096  -0.62496865 -1.4808569   0.4712454   2.2094297
    0.57263213 -0.05093424 -0.1588107   1.0184709   0.70690656
   -0.9434271  -1.0071777 ]
  [-1.185755   -1.0648152  -0.9661477   0.31135082  2.327546
    0.6153856  -0.58132297 -0.5615083   0.7888386   0.6564204
    0.48624778 -0.82624006]
  [-1.7063954  -0.7058001  -0.81138045  0.52329344  2.289081
    0.5627186  -0.1984774  -0.8172244   0.8392352   0.37076873
    0.31830823 -0.66412765]
  [-1.4678495  -1.4571699  -0.7922786   0.6117952   1.4807787
    0.51156825  0.1094014  -0.82864296  1.3847101   0.83917934
    0.45826626 -0.8497583 ]
  [-1.172585   -1.3574504  -0.99402785  0.2974091   2.3469124
    0.18784532 -0.13188699 -0.72790927  1.0011796   0.446131
    0.49940932 -0.39502722]
  [-1.3320968  -0.8318049  -1.0966734   0.58376837  2.3366857
    0.17160138 -0.39159867 -0.6881039   0.7350832   0.7059242
    0.5393458  -0.7321313 ]
  [-1.5533653  -0.7105918  -0.8407288   0.4571889 

# Transformer

In [66]:
class Transformer(tf.keras.Model):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
               target_vocab_size, max_input_len, max_target_len, dropout_rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
                           max_input_len, dropout_rate)

    self.decoder = Decoder(num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                           max_target_len, dropout_rate)

    # The final dense layer to generate logits from the decoder output.
    self.output_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, input_seqs, target_input_seqs, training, encoder_mask,
           decoder_mask, memory_mask):
    encoder_output, encoder_attn_weights = self.encoder(input_seqs,
                                                        training, encoder_mask)

    decoder_output, decoder_attn_weights = self.decoder(encoder_output,
                                                        target_input_seqs, training,
                                                        decoder_mask, memory_mask)

    return self.output_layer(decoder_output), encoder_attn_weights, decoder_attn_weights

In [67]:
transformer = Transformer(
    num_blocks = 6,
    d_model = 12,
    num_heads = 3,
    hidden_dim = 48,
    source_vocab_size = bpemb_vocab_size,
    target_vocab_size = 7000, # made-up target vocab size.
    max_input_len = padded_input_seqs.shape[1],
    max_target_len = padded_target_input_seqs.shape[1])

transformer_output, _, _ = transformer(padded_input_seqs,
                                       padded_target_input_seqs, True,
                                       enc_mask, dec_mask, memory_mask=enc_mask)
print(f"Transformer output {transformer_output.shape}:")
print(transformer_output)

Transformer output (3, 8, 7000):
tf.Tensor(
[[[ 0.0395216   0.03916568  0.02582054 ...  0.01666976 -0.05993929
    0.00690763]
  [ 0.00943986 -0.03221751 -0.00229993 ...  0.07339201 -0.06860453
   -0.01346374]
  [ 0.02648464  0.00802982 -0.0167323  ...  0.06233544 -0.05515555
    0.00070162]
  ...
  [ 0.02204489 -0.00191058 -0.00376367 ...  0.08343042 -0.09241083
   -0.00951072]
  [ 0.02156831 -0.00031531 -0.00594937 ...  0.06019064 -0.06924941
   -0.00282104]
  [-0.00968296 -0.00193681  0.01834356 ...  0.07306445 -0.05591359
    0.04568645]]

 [[ 0.02646083 -0.09181534 -0.03285166 ...  0.05564085 -0.04139944
   -0.03555582]
  [ 0.04389931 -0.05536838 -0.03379606 ...  0.05632143 -0.01494565
   -0.00276387]
  [ 0.0375099  -0.07298164 -0.01804597 ...  0.0793379  -0.03941778
   -0.01402597]
  ...
  [ 0.03313862 -0.09166637 -0.01659454 ...  0.0384437  -0.01532382
   -0.01681272]
  [ 0.03983209 -0.05074334 -0.01175103 ...  0.05605035 -0.02289294
   -0.0122582 ]
  [ 0.02742818 -0.10487369 -0