In [24]:
import tensorflow as tf
import numpy as np

**SELF ATTENTION IMPLEMENTATION**

In [25]:
def scaled_dot_product_attention(query, key, value, mask=None):
    key_dim = tf.cast(tf.shape(key)[-1], tf.float32)
    scaled_scores = tf.matmul(query, key, transpose_b=True) / np.sqrt(key_dim)
    if mask is not None:
        scaled_scores = tf.where(mask==0, -np.inf, scaled_scores)
    
    softmax = tf.keras.layers.Softmax()
    weights = softmax(scaled_scores)
    return tf.matmul(weights, value), weights

In [26]:
seq_len = 3
embedding_dim = 4
queries = np.random.rand(seq_len, embedding_dim)
keys = np.random.rand(seq_len, embedding_dim)
values = np.random.rand(seq_len, embedding_dim)

print("Queries: ", queries)

Queries:  [[0.64462257 0.29182289 0.59699105 0.62766912]
 [0.87747214 0.50352764 0.3700742  0.6824152 ]
 [0.95345309 0.59851879 0.54799055 0.45626173]]


In [27]:
output, attention_weights = scaled_dot_product_attention(queries, keys, values)
print("Output: ", output)
print("Weights: ", attention_weights)

Output:  tf.Tensor(
[[0.39585012 0.51225656 0.4863063  0.42944267]
 [0.40387934 0.5078269  0.48785013 0.4239037 ]
 [0.40933886 0.5035388  0.48784375 0.42252594]], shape=(3, 4), dtype=float32)
Weights:  tf.Tensor(
[[0.40565223 0.273214   0.32113376]
 [0.40638384 0.25693667 0.33667943]
 [0.40383312 0.2503343  0.34583256]], shape=(3, 3), dtype=float32)


2024-03-15 02:35:33.999954: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-15 02:35:34.034712: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-15 02:35:34.035155: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

**MULTI-HEAD SELF ATTENTION**

In [28]:
batch_size = 1
seq_len = 3
embedding_dim = 12
num_heads = 3
head_dim = embedding_dim // num_heads
print(f"Dimension of each head: {head_dim}")

Dimension of each head: 4


In [29]:
x = np.random.rand(batch_size, seq_len, embedding_dim).round(1)
print("Input shape: ", x.shape, "\n")
print("Input: \n", x)

Input shape:  (1, 3, 12) 

Input: 
 [[[0.9 0.2 0.2 0.6 0.1 1.  0.1 0.1 0.7 1.  0.  1. ]
  [0.6 0.8 0.9 1.  0.5 1.  0.2 1.  0.9 0.8 0.2 0.7]
  [0.3 0.9 0.3 0.5 0.2 0.8 0.3 0.6 0.6 0.4 0.9 0.7]]]


In [30]:
wq0 = np.random.rand(embedding_dim, head_dim).round(1)
wk0 = np.random.rand(embedding_dim, head_dim).round(1)
wv0 = np.random.rand(embedding_dim, head_dim).round(1)

wq1 = np.random.rand(embedding_dim, head_dim).round(1)
wk1 = np.random.rand(embedding_dim, head_dim).round(1)
wv1 = np.random.rand(embedding_dim, head_dim).round(1)

wq2 = np.random.rand(embedding_dim, head_dim).round(1)
wk2 = np.random.rand(embedding_dim, head_dim).round(1)
wv2 = np.random.rand(embedding_dim, head_dim).round(1)

In [31]:
print("The three sets of query weights (one for each head): ")
print("wq0: \n", wq0)
print("wq1: \n", wq1)
print("wq2: \n", wq2)

The three sets of query weights (one for each head): 
wq0: 
 [[0.5 0.8 0.4 0.7]
 [0.4 0.3 0.4 0.6]
 [0.5 0.2 0.3 0.6]
 [0.9 0.9 0.5 0.2]
 [0.5 0.4 0.5 0.6]
 [0.3 1.  0.7 0.2]
 [0.5 0.5 0.4 0.9]
 [0.3 0.3 0.1 0. ]
 [0.9 0.6 0.8 0.6]
 [0.9 0.1 0.2 0.1]
 [0.9 0.1 1.  0.5]
 [0.6 0.1 0.8 0.4]]
wq1: 
 [[0.5 0.3 0.2 1. ]
 [0.  0.7 0.9 0.2]
 [0.7 0.4 0.9 0.8]
 [0.3 0.2 0.7 0.4]
 [0.2 0.9 0.3 0. ]
 [0.7 0.1 0.2 0.1]
 [0.  0.2 0.4 0.4]
 [0.1 0.5 0.6 0.8]
 [0.5 0.  0.3 0.3]
 [0.2 1.  0.4 0.3]
 [0.3 0.8 0.9 0.3]
 [0.2 0.7 0.3 0.8]]
wq2: 
 [[0.6 0.1 0.1 0.1]
 [0.  0.  0.5 0.9]
 [0.5 0.5 0.4 0.2]
 [0.2 0.9 0.4 0.5]
 [0.  0.8 0.4 0.2]
 [0.9 1.  0.9 0.4]
 [0.1 0.2 0.6 0.6]
 [0.2 0.5 0.4 1. ]
 [0.5 1.  0.5 0.1]
 [0.4 0.1 0.3 0.3]
 [0.5 0.9 0.9 0.4]
 [0.1 0.8 0.3 0.8]]


In [32]:
q0 = np.dot(x, wq0)
q1 = np.dot(x, wq1)
q2 = np.dot(x, wq2)

k0 = np.dot(x, wk0)
k1 = np.dot(x, wk1)
k2 = np.dot(x, wk2)

v0 = np.dot(x, wv0)
v1 = np.dot(x, wv1)
v2 = np.dot(x, wv2)

In [33]:
print("Q, K, and V for first head:\n")

print(f"q0 {q0.shape}:\n", q0, "\n")
print(f"k0 {k0.shape}:\n", k0, "\n")
print(f"v0 {v0.shape}:\n", v0)

Q, K, and V for first head:

q0 (1, 3, 4):
 [[[3.73 3.1  3.16 2.26]
  [5.05 4.11 4.1  3.32]
  [3.91 2.79 3.68 2.71]]] 

k0 (1, 3, 4):
 [[[2.93 3.02 2.98 2.86]
  [4.47 3.71 4.93 3.84]
  [3.41 2.98 3.52 3.41]]] 

v0 (1, 3, 4):
 [[[2.6  3.09 3.23 4.2 ]
  [4.42 4.03 4.87 5.54]
  [3.4  2.57 3.04 4.11]]]


In [34]:
output_0, attention_weights_0 = scaled_dot_product_attention(q0, k0, v0)
print("Output from the first attention head: ", output_0)
print("Weights from the first attention head: ", attention_weights_0)

Output from the first attention head:  tf.Tensor(
[[[4.416455  4.0254145 4.8641167 5.535385 ]
  [4.419542  4.029374  4.8692064 5.5393796]
  [4.4178185 4.0271072 4.86631   5.5371084]]], shape=(1, 3, 4), dtype=float32)
Weights from the first attention head:  tf.Tensor(
[[[2.9361187e-04 9.9675459e-01 2.9517696e-03]
  [1.7891492e-05 9.9956471e-01 4.1743321e-04]
  [1.3759677e-04 9.9796957e-01 1.8927344e-03]]], shape=(1, 3, 3), dtype=float32)


In [35]:
output_1, _ = scaled_dot_product_attention(q1, k1, v1)
output_2, _ = scaled_dot_product_attention(q2, k2, v2)

print("Output from the second attention head: ", output_1)
print("Output from the third attention head: ", output_2)

Output from the second attention head:  tf.Tensor(
[[[4.2350297 3.9552813 5.328367  3.783058 ]
  [4.2396684 3.959685  5.3391848 3.789499 ]
  [4.2377996 3.9579113 5.334864  3.7869403]]], shape=(1, 3, 4), dtype=float32)
Output from the third attention head:  tf.Tensor(
[[[4.2667036 3.626774  4.30675   5.2757072]
  [4.2698565 3.6298625 4.3098617 5.279814 ]
  [4.2691364 3.629193  4.30919   5.2788815]]], shape=(1, 3, 4), dtype=float32)


In [36]:
print("Query weights for the first head: ", wq0)
print("Query weights for the second head: ", wq1)
print("Query weights for the third head: ", wq2)

Query weights for the first head:  [[0.5 0.8 0.4 0.7]
 [0.4 0.3 0.4 0.6]
 [0.5 0.2 0.3 0.6]
 [0.9 0.9 0.5 0.2]
 [0.5 0.4 0.5 0.6]
 [0.3 1.  0.7 0.2]
 [0.5 0.5 0.4 0.9]
 [0.3 0.3 0.1 0. ]
 [0.9 0.6 0.8 0.6]
 [0.9 0.1 0.2 0.1]
 [0.9 0.1 1.  0.5]
 [0.6 0.1 0.8 0.4]]
Query weights for the second head:  [[0.5 0.3 0.2 1. ]
 [0.  0.7 0.9 0.2]
 [0.7 0.4 0.9 0.8]
 [0.3 0.2 0.7 0.4]
 [0.2 0.9 0.3 0. ]
 [0.7 0.1 0.2 0.1]
 [0.  0.2 0.4 0.4]
 [0.1 0.5 0.6 0.8]
 [0.5 0.  0.3 0.3]
 [0.2 1.  0.4 0.3]
 [0.3 0.8 0.9 0.3]
 [0.2 0.7 0.3 0.8]]
Query weights for the third head:  [[0.6 0.1 0.1 0.1]
 [0.  0.  0.5 0.9]
 [0.5 0.5 0.4 0.2]
 [0.2 0.9 0.4 0.5]
 [0.  0.8 0.4 0.2]
 [0.9 1.  0.9 0.4]
 [0.1 0.2 0.6 0.6]
 [0.2 0.5 0.4 1. ]
 [0.5 1.  0.5 0.1]
 [0.4 0.1 0.3 0.3]
 [0.5 0.9 0.9 0.4]
 [0.1 0.8 0.3 0.8]]


In [37]:
wq = np.concatenate([wq0, wq1, wq2], axis=1)
print(f"Single query weight matrix: {wq.shape}: \n", wq)

Single query weight matrix: (12, 12): 
 [[0.5 0.8 0.4 0.7 0.5 0.3 0.2 1.  0.6 0.1 0.1 0.1]
 [0.4 0.3 0.4 0.6 0.  0.7 0.9 0.2 0.  0.  0.5 0.9]
 [0.5 0.2 0.3 0.6 0.7 0.4 0.9 0.8 0.5 0.5 0.4 0.2]
 [0.9 0.9 0.5 0.2 0.3 0.2 0.7 0.4 0.2 0.9 0.4 0.5]
 [0.5 0.4 0.5 0.6 0.2 0.9 0.3 0.  0.  0.8 0.4 0.2]
 [0.3 1.  0.7 0.2 0.7 0.1 0.2 0.1 0.9 1.  0.9 0.4]
 [0.5 0.5 0.4 0.9 0.  0.2 0.4 0.4 0.1 0.2 0.6 0.6]
 [0.3 0.3 0.1 0.  0.1 0.5 0.6 0.8 0.2 0.5 0.4 1. ]
 [0.9 0.6 0.8 0.6 0.5 0.  0.3 0.3 0.5 1.  0.5 0.1]
 [0.9 0.1 0.2 0.1 0.2 1.  0.4 0.3 0.4 0.1 0.3 0.3]
 [0.9 0.1 1.  0.5 0.3 0.8 0.9 0.3 0.5 0.9 0.9 0.4]
 [0.6 0.1 0.8 0.4 0.2 0.7 0.3 0.8 0.1 0.8 0.3 0.8]]


In [38]:
wk = np.concatenate([wk0, wk1, wk2], axis=1)
wv = np.concatenate([wv0, wv1, wv2], axis=1)

print(f"Single key weight matrix: {wk.shape}: \n", wk)
print(f"Single value weight matrix: {wv.shape}: \n", wv)

Single key weight matrix: (12, 12): 
 [[0.1 1.  0.8 0.4 0.  0.1 0.6 0.4 0.3 0.6 0.4 0.4]
 [0.4 0.3 0.9 0.6 0.9 0.1 0.5 0.7 0.  0.9 0.7 0.6]
 [0.7 0.8 1.  0.1 0.6 0.8 0.5 0.6 0.6 0.6 0.7 0.6]
 [0.3 0.3 0.4 0.8 0.1 0.6 0.5 0.2 0.7 0.9 0.3 0.1]
 [0.  0.1 0.7 0.  0.2 0.2 0.6 0.8 0.7 0.4 1.  0.3]
 [0.8 0.1 0.4 0.7 0.  0.6 0.5 0.1 0.4 1.  0.2 0.2]
 [0.1 0.5 0.4 0.3 0.1 0.8 0.7 0.5 0.7 0.7 0.7 0.8]
 [0.7 0.1 0.6 0.5 0.4 0.5 0.1 1.  0.2 0.2 0.3 0.1]
 [0.8 0.5 0.1 0.  0.9 0.9 0.7 0.6 0.  0.  0.9 0.7]
 [0.8 0.5 0.9 0.4 0.8 0.7 0.2 0.2 0.5 0.5 0.4 0.8]
 [0.7 0.8 0.6 0.8 0.2 0.3 0.5 0.9 0.7 0.1 0.  0.1]
 [0.2 0.7 0.1 0.7 0.5 0.4 0.7 0.1 0.  0.3 0.7 0.9]]
Single value weight matrix: (12, 12): 
 [[0.4 0.7 0.8 1.  0.2 0.4 1.  0.3 0.6 0.1 0.8 0.2]
 [0.7 0.1 0.1 0.2 0.4 0.7 0.9 0.3 0.5 0.  0.3 0.8]
 [0.4 0.8 0.6 1.  0.1 0.3 1.  0.  0.9 0.7 0.7 0.5]
 [0.9 0.1 0.8 0.8 0.6 0.9 0.7 0.8 0.4 0.2 0.1 0.9]
 [0.6 1.  0.6 0.3 1.  0.1 0.9 0.2 0.1 1.  0.5 0.3]
 [0.  0.5 0.7 1.  0.2 0.2 0.2 0.3 0.1 0.6 0.6 0.8]
 [0

In [39]:
q_s = np.dot(x, wq)
k_s = np.dot(x, wk)
v_s = np.dot(x, wv)

In [40]:
print(f"Query vectors using a single weight matrix {q_s.shape}:\n", q_s)

Query vectors using a single weight matrix (1, 3, 12):
 [[[3.73 3.1  3.16 2.26 2.25 2.57 2.2  2.87 2.54 3.48 2.5  2.36]
  [5.05 4.11 4.1  3.32 2.94 3.84 4.36 3.99 3.07 5.07 3.92 4.05]
  [3.91 2.79 3.68 2.71 1.96 3.17 3.55 2.73 2.28 3.96 3.46 3.39]]]


In [41]:
q_s_reshaped = tf.reshape(q_s, (batch_size, seq_len, num_heads, head_dim))
print(f'Combined queries: {q_s.shape}\n', q_s)
print(f"Reshaped into separate heads: {q_s_reshaped.shape}\n", q_s_reshaped)

Combined queries: (1, 3, 12)
 [[[3.73 3.1  3.16 2.26 2.25 2.57 2.2  2.87 2.54 3.48 2.5  2.36]
  [5.05 4.11 4.1  3.32 2.94 3.84 4.36 3.99 3.07 5.07 3.92 4.05]
  [3.91 2.79 3.68 2.71 1.96 3.17 3.55 2.73 2.28 3.96 3.46 3.39]]]
Reshaped into separate heads: (1, 3, 3, 4)
 tf.Tensor(
[[[[3.73 3.1  3.16 2.26]
   [2.25 2.57 2.2  2.87]
   [2.54 3.48 2.5  2.36]]

  [[5.05 4.11 4.1  3.32]
   [2.94 3.84 4.36 3.99]
   [3.07 5.07 3.92 4.05]]

  [[3.91 2.79 3.68 2.71]
   [1.96 3.17 3.55 2.73]
   [2.28 3.96 3.46 3.39]]]], shape=(1, 3, 3, 4), dtype=float64)


In [42]:
q_s_transposed = tf.transpose(q_s_reshaped, perm=[0, 2, 1, 3]).numpy()
print(f"Queries transposed into \"seprate\" heads {q_s_transposed.shape}: \n", q_s_transposed)

Queries transposed into "seprate" heads (1, 3, 3, 4): 
 [[[[3.73 3.1  3.16 2.26]
   [5.05 4.11 4.1  3.32]
   [3.91 2.79 3.68 2.71]]

  [[2.25 2.57 2.2  2.87]
   [2.94 3.84 4.36 3.99]
   [1.96 3.17 3.55 2.73]]

  [[2.54 3.48 2.5  2.36]
   [3.07 5.07 3.92 4.05]
   [2.28 3.96 3.46 3.39]]]]


In [43]:
print("The separate per-head query matrices from before: ")
print(q0, "\n")
print(q1, "\n")
print(q2, "\n")

The separate per-head query matrices from before: 
[[[3.73 3.1  3.16 2.26]
  [5.05 4.11 4.1  3.32]
  [3.91 2.79 3.68 2.71]]] 

[[[2.25 2.57 2.2  2.87]
  [2.94 3.84 4.36 3.99]
  [1.96 3.17 3.55 2.73]]] 

[[[2.54 3.48 2.5  2.36]
  [3.07 5.07 3.92 4.05]
  [2.28 3.96 3.46 3.39]]] 



In [44]:
k_s_transposed = tf.transpose(tf.reshape(k_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()
v_s_transposed = tf.transpose(tf.reshape(v_s, (batch_size, -1, num_heads, head_dim)), perm=[0, 2, 1, 3]).numpy()


print(f"Keys for all heads in a single matrix {k_s.shape}: \n", k_s_transposed, "\n")
print(f"Values for all heads in a single matrix {v_s.shape}: \n", v_s_transposed)

Keys for all heads in a single matrix (1, 3, 12): 
 [[[[2.93 3.02 2.98 2.86]
   [4.47 3.71 4.93 3.84]
   [3.41 2.98 3.52 3.41]]

  [[2.36 3.11 3.07 1.79]
   [3.72 4.53 4.13 4.09]
   [2.74 3.09 3.26 3.34]]

  [[1.87 3.31 2.95 3.17]
   [3.05 4.69 4.49 3.89]
   [2.24 3.33 3.05 2.87]]]] 

Values for all heads in a single matrix (1, 3, 12): 
 [[[[2.6  3.09 3.23 4.2 ]
   [4.42 4.03 4.87 5.54]
   [3.4  2.57 3.04 4.11]]

  [[2.49 2.33 3.21 3.22]
   [4.24 3.96 5.34 3.79]
   [3.61 3.36 3.75 2.8 ]]

  [[2.64 2.82 3.56 3.28]
   [4.27 3.63 4.31 5.28]
   [3.52 2.3  2.92 4.21]]]]


In [45]:
all_heads_output, all_attention_weights = scaled_dot_product_attention(q_s_transposed, k_s_transposed, v_s_transposed)
print("Self Attention output: \n", all_heads_output)

Self Attention output: 
 tf.Tensor(
[[[[4.416455  4.0254145 4.8641167 5.535385 ]
   [4.419542  4.029374  4.8692064 5.5393796]
   [4.4178185 4.0271072 4.86631   5.5371084]]

  [[4.2350297 3.9552813 5.328367  3.783058 ]
   [4.2396684 3.959685  5.3391848 3.789499 ]
   [4.2377996 3.9579113 5.334864  3.7869403]]

  [[4.2667036 3.626774  4.30675   5.2757072]
   [4.2698565 3.6298625 4.3098617 5.279814 ]
   [4.2691364 3.629193  4.30919   5.2788815]]]], shape=(1, 3, 3, 4), dtype=float32)


In [46]:
combined_out_b = tf.reshape(tf.transpose(all_heads_output, perm=[0, 2, 1, 3]), (batch_size, seq_len, embedding_dim))
combined_out_b.shape
print("Final output from using single query, key, value matrices:\n", 
      combined_out_b, "\n")

Final output from using single query, key, value matrices:
 tf.Tensor(
[[[4.416455  4.0254145 4.8641167 5.535385  4.2350297 3.9552813 5.328367
   3.783058  4.2667036 3.626774  4.30675   5.2757072]
  [4.419542  4.029374  4.8692064 5.5393796 4.2396684 3.959685  5.3391848
   3.789499  4.2698565 3.6298625 4.3098617 5.279814 ]
  [4.4178185 4.0271072 4.86631   5.5371084 4.2377996 3.9579113 5.334864
   3.7869403 4.2691364 3.629193  4.30919   5.2788815]]], shape=(1, 3, 12), dtype=float32) 



In [47]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads  

        self.d_heads = self.d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(self.d_model)
        self.wk = tf.keras.layers.Dense(self.d_model)
        self.wv = tf.keras.layers.Dense(self.d_model)

        self.dense = tf.keras.layers.Dense(self.d_model)

    def split_heads(self, x):
        batch_size = x.shape[0]
        split_inputs = tf.reshape(x, (batch_size, -1, self.num_heads, self.d_heads))
        return tf.transpose(split_inputs, perm=[0, 2, 1, 3])
    
    def merge_heads(self, x):
        batch_size = x.shape[0]
        merged_inputs = tf.transpose(x, perm=[0, 2, 1, 3])
        return tf.reshape(merged_inputs, (batch_size, -1, self.d_model))
    
    def call(self, q, k, v, mask):
        qs = self.wq(q)
        ks = self.wk(k)
        vs = self.wv(v)

        qs = self.split_heads(qs)
        ks = self.split_heads(ks)
        vs = self.split_heads(vs)

        output, attention_weights = scaled_dot_product_attention(qs, ks, vs, mask)
        output = self.merge_heads(output)

        return self.dense(output), attention_weights




In [48]:
mhsa = MultiHeadSelfAttention(12, 3)

output, attention_weights = mhsa(x, x, x, mask=None)
print(f"MHSA output: {output.shape}\n", output)

MHSA output: (1, 3, 12)
 tf.Tensor(
[[[ 0.7243886   0.38871232 -0.3713587   0.08256346 -1.4231125
    0.29189828  1.0245624   0.82307565  0.13167898 -0.6788002
    0.10989675  0.430043  ]
  [ 0.7192505   0.39557058 -0.38501424  0.0737307  -1.4221615
    0.30300948  1.0384811   0.8434766   0.14299577 -0.68396413
    0.09957641  0.44049442]
  [ 0.7430325   0.36705524 -0.3678645   0.04611637 -1.3885686
    0.3037917   1.0396196   0.84250486  0.12326129 -0.6774776
    0.11347139  0.4388766 ]]], shape=(1, 3, 12), dtype=float32)


In [49]:
def feed_forward_network(d_model, hidden_dim):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_dim, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

In [50]:
class EncoderBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
        super(EncoderBlock, self).__init__()
        self.mhsa = MultiHeadSelfAttention(d_model, num_heads)
        self.ffn = feed_forward_network(d_model, hidden_dim)

        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, training, mask):
        mhsa_output, attention_weights = self.mhsa(x, x, x, mask)
        mhsa_output = self.dropout1(mhsa_output, training=training)
        mhsa_output = self.layernorm1(x + mhsa_output)

        ffn_output = self.ffn(mhsa_output)
        ffn_output = self.dropout2(ffn_output, training=training)
        output = self.layernorm2(mhsa_output + ffn_output)

        return output, attention_weights
        

In [51]:
encoder_block = EncoderBlock(12, 3, 48)

block_output, _ = encoder_block(x, training=True, mask=None)
print(f"Encoder block output: {block_output.shape}\n", block_output)

Encoder block output: (1, 3, 12)
 tf.Tensor(
[[[-0.5077105  -0.10255369  0.01637811  1.1511724   1.5025967
    1.3684464  -0.996069   -1.1237533  -1.2633322   1.1438186
   -1.0470654  -0.1419278 ]
  [-1.126212    0.05545544  1.2249769   1.0827426   1.3357245
    0.9501109  -1.1947788  -0.5905926  -1.6235039   0.69403183
   -0.67660743 -0.13134727]
  [-1.0390782   0.08097543  1.4713725   0.7167139   1.1496494
    1.4686049  -1.0459328  -0.63145953 -1.6693597  -0.1600641
    0.2727188  -0.61414015]]], shape=(1, 3, 12), dtype=float32)


In [52]:
from bpemb import BPEmb

bpemb_en = BPEmb(model_file="dl/en.wiki.bpe.vs10000.model", emb_file="dl/en.wiki.bpe.vs10000.d100.w2v.txt")

bpemb_vocab_size, bpemb_embedding_size = bpemb_en.vectors.shape
print("Vocabulary size: ", bpemb_vocab_size)
print("Embedding size: ", bpemb_embedding_size)

bpemb_en.vectors[bpemb_en.words.index('car')]


Vocabulary size:  10000
Embedding size:  100


array([-0.305548, -0.325598, -0.134716, -0.078735, -0.660545,  0.076211,
       -0.735487,  0.124533, -0.294402,  0.459688,  0.030137,  0.174041,
       -0.224223,  0.486189, -0.504649, -0.459699,  0.315747,  0.477885,
        0.091398,  0.427867,  0.016524, -0.076833, -0.899727,  0.493158,
       -0.022309, -0.422785, -0.154148,  0.204981,  0.379834,  0.070588,
        0.196073, -0.368222,  0.473406,  0.007409,  0.004303, -0.007823,
       -0.19103 , -0.202509,  0.109878, -0.224521, -0.35741 , -0.611633,
        0.329958, -0.212956, -0.497499, -0.393839, -0.130101, -0.216903,
       -0.105595, -0.076007, -0.483942, -0.139704, -0.161647,  0.136985,
        0.415363, -0.360143,  0.038601, -0.078804, -0.030421,  0.324129,
        0.223378, -0.523636, -0.048317, -0.032248, -0.117367,  0.470519,
        0.225816, -0.222065, -0.225007, -0.165904, -0.334389, -0.20157 ,
        0.572352, -0.268794,  0.301929, -0.005563,  0.387491,  0.261031,
       -0.11613 ,  0.074982, -0.008433,  0.259987, 

In [53]:
sample_sentence = "This is a test sentence."
tokens = bpemb_en.encode(sample_sentence)
print(tokens)

#to vectorize the tokens, we can use
token_seq = np.array(bpemb_en.encode_ids(sample_sentence))
print(token_seq)

['▁this', '▁is', '▁a', '▁test', '▁sentence', '.']
[ 215   80    4 1417 8018 9935]


In [55]:
token_embed = tf.keras.layers.Embedding(bpemb_vocab_size, embedding_dim)
token_embeddings = token_embed(token_seq)

print("Embeddings for: ", sample_sentence)
print(token_embeddings)

Embeddings for:  This is a test sentence.
tf.Tensor(
[[ 0.04325885  0.0153905   0.0084916   0.01765759 -0.04393685 -0.02584207
   0.00741035  0.00529586  0.04138687 -0.00108188 -0.01214737  0.04320996]
 [ 0.02488294 -0.03764825 -0.00791397 -0.01074302  0.01572687  0.04023686
   0.00198406 -0.01887797  0.02190575 -0.03720016 -0.01135868 -0.03820722]
 [-0.01751604 -0.02321208 -0.03165638 -0.00679427  0.02897458  0.03856293
  -0.04542214  0.00415289 -0.01453358 -0.03599155 -0.01077665  0.03040539]
 [ 0.00730227 -0.04022131  0.00711285  0.0370374   0.00355294  0.04821488
   0.00257929 -0.00746938 -0.00266268 -0.02074823  0.03379487 -0.01936979]
 [-0.01358979 -0.01693592  0.01879406  0.04581114 -0.0020567   0.03432843
   0.03771757  0.00042767 -0.00427796 -0.04699645 -0.02356578  0.0465809 ]
 [ 0.01933826 -0.01900394  0.03576306  0.04497225  0.04902435  0.0424873
   0.02289401 -0.02709107  0.04170777 -0.0101452  -0.04986476  0.0079672 ]], shape=(6, 12), dtype=float32)


**ADDING POSITIONAL INFORMATION TO WORD EMBEDDINGS**

In [57]:
max_seq_len = 256
pos_embed = tf.keras.layers.Embedding(max_seq_len, embedding_dim)

#ids for each position of the token sequence
pos_idx = tf.range(len(token_seq))
print(pos_idx)

tf.Tensor([0 1 2 3 4 5], shape=(6,), dtype=int32)


In [58]:
position_embeddings = pos_embed(pos_idx)
print("Position embeddings for the input sequence: ", position_embeddings)

Position embeddings for the input sequence:  tf.Tensor(
[[-0.03398951 -0.04240065 -0.0395308   0.02028023  0.03782424 -0.00674021
   0.04054494 -0.04820068 -0.04032767 -0.01148112  0.02223236 -0.02714597]
 [ 0.01064237  0.01306362 -0.032097    0.04778328 -0.02553469 -0.00102216
  -0.00287461  0.04004383 -0.04058469  0.03487198 -0.02273406  0.00374869]
 [ 0.0334586   0.03806755 -0.0353977   0.03569827  0.01905346 -0.01626587
  -0.02591124  0.02015717  0.03362096 -0.03954991 -0.03974589 -0.0472868 ]
 [ 0.03146205  0.00368471  0.00429564  0.01074628 -0.04690666 -0.02014521
   0.03664566 -0.01319785  0.02634666 -0.01850809 -0.04733215 -0.02704452]
 [ 0.03837686  0.01326886 -0.01699028  0.04684622  0.01942888  0.04434233
   0.01320937  0.02857763 -0.03618866 -0.04637827 -0.02180573 -0.04060348]
 [-0.02767768 -0.0091346   0.03583739  0.0139837  -0.00317991  0.00021335
   0.01069367  0.04481942 -0.0356519  -0.04820929 -0.01352438 -0.02875545]], shape=(6, 12), dtype=float32)


In [59]:
input = token_embeddings + position_embeddings
print("Initial input embeddings: ", input)

Initial input embeddings:  tf.Tensor(
[[ 0.00926934 -0.02701014 -0.0310392   0.03793782 -0.00611261 -0.03258228
   0.04795529 -0.04290482  0.0010592  -0.012563    0.01008499  0.01606399]
 [ 0.03552531 -0.02458463 -0.04001097  0.03704026 -0.00980782  0.0392147
  -0.00089055  0.02116586 -0.01867894 -0.00232818 -0.03409274 -0.03445853]
 [ 0.01594256  0.01485547 -0.06705408  0.02890399  0.04802804  0.02229706
  -0.07133337  0.02431006  0.01908738 -0.07554146 -0.05052254 -0.01688141]
 [ 0.03876432 -0.0365366   0.01140849  0.04778368 -0.04335373  0.02806967
   0.03922496 -0.02066723  0.02368398 -0.03925632 -0.01353728 -0.04641432]
 [ 0.02478707 -0.00366706  0.00180378  0.09265736  0.01737218  0.07867076
   0.05092694  0.0290053  -0.04046662 -0.09337471 -0.04537151  0.00597742]
 [-0.00833942 -0.02813854  0.07160044  0.05895595  0.04584444  0.04270064
   0.03358768  0.01772834  0.00605587 -0.05835449 -0.06338914 -0.02078825]], shape=(6, 12), dtype=float32)


In [60]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_blocks, d_model, num_heads, hidden_dim, vocab_size, max_seq_len, dropout_rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.max_seq_len = max_seq_len

        self.token_embed = tf.keras.layers.Embedding(vocab_size, self.d_model)
        self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.blocks = [EncoderBlock(d_model, num_heads, hidden_dim, dropout_rate)
        for _ in range(num_blocks)]

    def call(self, input, training, mask):
        token_embeds = self.token_embed(input)
        num_pos = input.shape[0] * self.max_seq_len
        pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
        pos_idx = np.reshape(pos_idx, input.shape)
        pos_embeds = self.pos_embed(pos_idx)

        x = self.dropout(token_embeds + pos_embeds, training=training)

        for block in self.blocks:
            x, weights = block(x, training, mask)

        return x, weights

In [61]:
input_batch = [
    "Where can I find a pizzeria?",
    "Mass hysteria over listeria.",
    "I ain't no circle back girl."
]

bpemb_en.encode(input_batch)

[['▁where', '▁can', '▁i', '▁find', '▁a', '▁p', 'iz', 'zer', 'ia', '?'],
 ['▁mass', '▁hy', 'ster', 'ia', '▁over', '▁l', 'ister', 'ia', '.'],
 ['▁i', '▁a', 'in', "'", 't', '▁no', '▁circle', '▁back', '▁girl', '.']]

In [62]:
input_seqs = bpemb_en.encode_ids(input_batch)
print("Vectorized inputs:")
input_seqs

Vectorized inputs:


[[571, 280, 386, 1934, 4, 24, 248, 4339, 177, 9967],
 [1535, 1354, 1238, 177, 380, 43, 871, 177, 9935],
 [386, 4, 6, 9937, 9915, 467, 5410, 810, 3692, 9935]]

In [63]:
padded_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(input_seqs, padding="post")
print("Input to the encoder:")
print(padded_input_seqs.shape)
print(padded_input_seqs)

Input to the encoder:
(3, 10)
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]]


In [65]:
enc_mask = tf.cast(tf.math.not_equal(padded_input_seqs, 0), tf.float32)
print("Input:")
print(padded_input_seqs, '\n')
print("Encoder mask:")
print(enc_mask)

Input:
[[ 571  280  386 1934    4   24  248 4339  177 9967]
 [1535 1354 1238  177  380   43  871  177 9935    0]
 [ 386    4    6 9937 9915  467 5410  810 3692 9935]] 

Encoder mask:
tf.Tensor(
[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]], shape=(3, 10), dtype=float32)


In [66]:
enc_mask = enc_mask[:, tf.newaxis, tf.newaxis, :]
enc_mask

<tf.Tensor: shape=(3, 1, 1, 10), dtype=float32, numpy=
array([[[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 0.]]],


       [[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]], dtype=float32)>

In [67]:
num_encoder_blocks = 6
d_model = 12
num_heads = 3
ffn_hidden_dim = 48
vocab_size = bpemb_vocab_size
max_input_seq_len = padded_input_seqs.shape[1]

encoder = Encoder(num_encoder_blocks, d_model, num_heads, ffn_hidden_dim, vocab_size, max_input_seq_len)

In [69]:
encoder_output, attention_weights = encoder(padded_input_seqs, training=True, mask = enc_mask)

print(f"Encoder output {encoder_output.shape}: ")
print(encoder_output)

Encoder output (3, 10, 12): 
tf.Tensor(
[[[-3.26760858e-03 -1.20480120e+00  6.22624159e-01  6.83924079e-01
    1.05118525e+00 -1.91665018e+00 -2.97412992e-01  1.14766693e+00
   -1.25742567e+00 -5.53451419e-01  1.11791718e+00  6.09691143e-01]
  [-7.21426010e-01 -6.35106802e-01 -2.51689777e-02  1.04358411e+00
    5.00864089e-01 -2.01896977e+00 -8.15164268e-01  1.48712456e+00
   -8.20116043e-01  7.16940105e-01  1.26338995e+00  2.40489393e-02]
  [-4.43812877e-01 -1.25330758e+00 -1.26191020e-01 -6.65307283e-01
    1.72754753e+00 -6.60814643e-01 -1.64348960e-01  7.81626463e-01
   -8.78134370e-01 -7.89120555e-01  2.03146505e+00  4.40398306e-01]
  [-2.89231271e-01 -1.33801448e+00 -2.58275300e-01  6.04404867e-01
    1.04361665e+00 -1.29179239e+00 -6.83233500e-01  1.09148645e+00
   -1.16659510e+00 -2.44480252e-01  1.83680677e+00  6.95307612e-01]
  [ 6.11717813e-02 -1.12667572e+00 -1.24916658e-01  8.64175737e-01
    1.07200050e+00 -1.57508302e+00 -7.35716105e-01  1.02798140e+00
   -5.52397251e-01

In [70]:
class DecoderBlock(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, hidden_dim, dropout_rate=0.1):
    super(DecoderBlock, self).__init__()

    self.mhsa1 = MultiHeadSelfAttention(d_model, num_heads)
    self.mhsa2 = MultiHeadSelfAttention(d_model, num_heads)

    self.ffn = feed_forward_network(d_model, hidden_dim)

    self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    self.layernorm1 = tf.keras.layers.LayerNormalization()
    self.layernorm2 = tf.keras.layers.LayerNormalization()
    self.layernorm3 = tf.keras.layers.LayerNormalization()

  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    mhsa_output1, attention_weights = self.mhsa1(target, target, target, decoder_mask)
    mhsa_output1 = self.dropout1(mhsa_output1, training=training)
    mhsa_output1 = self.layernorm1(mhsa_output1 + target)

    mhsa_output2, attention_weights = self.mhsa2(mhsa_output1, encoder_output, 
                                            encoder_output, 
                                            memory_mask)
    mhsa_output2 = self.dropout2(mhsa_output2, training=training)
    mhsa_output2 = self.layernorm2(mhsa_output2 + mhsa_output1)

    ffn_output = self.ffn(mhsa_output2)
    ffn_output = self.dropout3(ffn_output, training=training)
    output = self.layernorm3(ffn_output + mhsa_output2)

    return output, attention_weights

In [71]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
               max_seq_len, dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.token_embed = tf.keras.layers.Embedding(target_vocab_size, self.d_model)
    self.pos_embed = tf.keras.layers.Embedding(max_seq_len, self.d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    self.blocks = [DecoderBlock(self.d_model, num_heads, hidden_dim, dropout_rate) for _ in range(num_blocks)]

  def call(self, encoder_output, target, training, decoder_mask, memory_mask):
    token_embeds = self.token_embed(target)

    num_pos = target.shape[0] * self.max_seq_len
    pos_idx = np.resize(np.arange(self.max_seq_len), num_pos)
    pos_idx = np.reshape(pos_idx, target.shape)

    pos_embeds = self.pos_embed(pos_idx)

    x = self.dropout(token_embeds + pos_embeds, training=training)

    for block in self.blocks:
      x, weights = block(encoder_output, x, training, decoder_mask, memory_mask)

    return x, weights

In [72]:
target_input_seqs = [
    [1, 652, 723, 123, 62],
    [1, 25,  98, 129, 248, 215, 359, 249],
    [1, 2369, 1259, 125, 486],
]

In [73]:
padded_target_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(target_input_seqs, padding="post")
print("Padded target inputs to the decoder:")
print(padded_target_input_seqs.shape)
print(padded_target_input_seqs)

Padded target inputs to the decoder:
(3, 8)
[[   1  652  723  123   62    0    0    0]
 [   1   25   98  129  248  215  359  249]
 [   1 2369 1259  125  486    0    0    0]]


In [74]:
dec_padding_mask = tf.cast(tf.math.not_equal(padded_target_input_seqs, 0), tf.float32)
dec_padding_mask = dec_padding_mask[:, tf.newaxis, tf.newaxis, :]
print(dec_padding_mask)

tf.Tensor(
[[[[1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 1, 8), dtype=float32)


In [75]:
target_input_seq_len = padded_target_input_seqs.shape[1]
look_ahead_mask = tf.linalg.band_part(tf.ones((target_input_seq_len, 
                                               target_input_seq_len)), -1, 0)
print(look_ahead_mask)

tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1.]], shape=(8, 8), dtype=float32)


In [76]:
dec_mask = tf.minimum(dec_padding_mask, look_ahead_mask)
print("The decoder mask:")
print(dec_mask)

The decoder mask:
tf.Tensor(
[[[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 1. 0. 0.]
   [1. 1. 1. 1. 1. 1. 1. 0.]
   [1. 1. 1. 1. 1. 1. 1. 1.]]]


 [[[1. 0. 0. 0. 0. 0. 0. 0.]
   [1. 1. 0. 0. 0. 0. 0. 0.]
   [1. 1. 1. 0. 0. 0. 0. 0.]
   [1. 1. 1. 1. 0. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]
   [1. 1. 1. 1. 1. 0. 0. 0.]]]], shape=(3, 1, 8, 8), dtype=float32)


In [77]:
decoder = Decoder(6, 12, 3, 48, 10000, 8)
decoder_output, _ = decoder(encoder_output, padded_target_input_seqs, 
                            True, dec_mask, enc_mask)
print(f"Decoder output {decoder_output.shape}:")
print(decoder_output)

2024-03-15 03:09:43.268316: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904


Decoder output (3, 8, 12):
tf.Tensor(
[[[-1.0734522  -1.079214    1.6303062   1.2163556  -0.78956246
   -1.3172421  -0.15712343  0.12961079  0.80639815  0.4028942
   -0.9555623   1.1865913 ]
  [-1.0243477  -0.58710825  1.5379286   1.1225665  -0.6302651
   -1.4034023  -0.15430467  0.32217252  1.0032556   0.31777847
   -1.5251329   1.0208594 ]
  [-0.82169926 -0.9687467   1.5137465   1.0379796  -0.5630022
   -1.2522585   0.10352552  0.9428683   0.8983471   0.46200833
   -1.7460334   0.39326474]
  [-0.9825275  -0.57628626  1.6053392   0.5890902  -0.61369103
   -1.2708597   0.06794469  0.52897346  0.90249616  0.4650961
   -1.807985    1.0924097 ]
  [-0.48202044 -0.51373583  0.8427286   1.4161934  -0.70780355
   -0.98012197 -0.15206677 -0.2085001   1.2893338   0.4514563
   -2.0474231   1.0919595 ]
  [-0.9386519  -0.90609443  1.5460386   1.3454987  -0.55361265
   -1.0964422   0.266388   -0.01726206  0.78934187  0.22665195
   -1.6753157   1.0134602 ]
  [-0.83064216 -0.6672186   1.0567138   1.6

In [78]:
class Transformer(tf.keras.Model):
  def __init__(self, num_blocks, d_model, num_heads, hidden_dim, source_vocab_size,
               target_vocab_size, max_input_len, max_target_len, dropout_rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_blocks, d_model, num_heads, hidden_dim, source_vocab_size, 
                           max_input_len, dropout_rate)
    
    self.decoder = Decoder(num_blocks, d_model, num_heads, hidden_dim, target_vocab_size,
                           max_target_len, dropout_rate)
    
    # The final dense layer to generate logits from the decoder output.
    self.output_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, input_seqs, target_input_seqs, training, encoder_mask,
           decoder_mask, memory_mask):
    encoder_output, encoder_attn_weights = self.encoder(input_seqs, 
                                                        training, encoder_mask)

    decoder_output, decoder_attn_weights = self.decoder(encoder_output, 
                                                        target_input_seqs, training,
                                                        decoder_mask, memory_mask)

    return self.output_layer(decoder_output), encoder_attn_weights, decoder_attn_weights


In [79]:
transformer = Transformer(
    num_blocks = 6,
    d_model = 12,
    num_heads = 3,
    hidden_dim = 48,
    source_vocab_size = bpemb_vocab_size,
    target_vocab_size = 7000, 
    max_input_len = padded_input_seqs.shape[1],
    max_target_len = padded_target_input_seqs.shape[1])

transformer_output, _, _ = transformer(padded_input_seqs, 
                                       padded_target_input_seqs, True, 
                                       enc_mask, dec_mask, memory_mask=enc_mask)
print(f"Transformer output {transformer_output.shape}:")
print(transformer_output) 

Transformer output (3, 8, 7000):
tf.Tensor(
[[[-0.05642275  0.02513913 -0.04338125 ...  0.01575807 -0.02423196
   -0.04111049]
  [-0.04408804  0.02414898 -0.10227228 ... -0.02808993  0.03069493
   -0.06102085]
  [-0.00465563  0.04543363 -0.09451631 ... -0.00123715 -0.05071724
    0.03225553]
  ...
  [-0.05206156 -0.00774972 -0.03628813 ...  0.00924029  0.01225576
   -0.04328141]
  [-0.03390186  0.03991597 -0.06668885 ...  0.0190472  -0.04141977
   -0.0387602 ]
  [-0.05633464  0.01065958 -0.03857102 ...  0.01277313  0.01456492
   -0.02440928]]

 [[ 0.01661561  0.02113944 -0.07620583 ... -0.03187927 -0.01650352
   -0.01205103]
  [ 0.00602331  0.00787767 -0.0331655  ...  0.00470732 -0.04223408
   -0.01561172]
  [-0.00793221 -0.00697175 -0.04534816 ...  0.01082534 -0.02547691
   -0.00891706]
  ...
  [-0.03553086  0.00793064 -0.02056244 ...  0.00559565 -0.00507873
   -0.03127439]
  [-0.02636139 -0.01595749 -0.0172566  ...  0.01105426 -0.001003
   -0.03645158]
  [-0.01677059 -0.00612543 -0.0