In [None]:
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting typeguard>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.19.0 typeguard-2.13.3


In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

In [None]:
class MultiHeadSelfAttention(tf.keras.Model):
  """Multi-head scaled dot-product self-attention.

  The last axis of the input is split into `heads` equal slices. Every slice is
  projected to queries, keys and values by three Dense layers (the same three
  layers are applied to every head), attention is computed independently per
  head, and the per-head results are concatenated back to `vector_size`.

  Args:
    vector_size: Size of the last axis of the input and of the output.
    heads: Number of attention heads; must divide `vector_size` evenly.

  Raises:
    ValueError: If `vector_size` is not divisible by `heads`.
  """

  def __init__(self, vector_size, heads=1):
    super(MultiHeadSelfAttention, self).__init__()
    if vector_size % heads != 0:
      raise ValueError(
          "vector_size (%d) must be divisible by heads (%d)" % (vector_size, heads))
    self.vector_size = vector_size
    self.heads = heads
    self.head_size = vector_size // heads
    # The projection layers are created once here so that their weights are
    # tracked by the model and reused on every call.
    self.key = tf.keras.layers.Dense(self.head_size)
    self.value = tf.keras.layers.Dense(self.head_size)
    self.query = tf.keras.layers.Dense(self.head_size)

  def call(self, x):
    """Applies self-attention to `x`.

    Args:
      x: Tensor of shape [batch_size, sequence_length, vector_size].

    Returns:
      Tensor of shape [batch_size, sequence_length, vector_size].
    """
    # Use the dynamic shape so the layer also works when the batch size or
    # sequence length is not statically known (x.shape[...] would be None).
    input_shape = tf.shape(x)
    batch_size = input_shape[0]
    sequence_length = input_shape[1]

    # [batch, seq, vector] -> [batch, heads, seq, head_size]
    x_mh = tf.reshape(x, [batch_size, sequence_length, self.heads, self.head_size])
    x_mh = tf.transpose(x_mh, (0, 2, 1, 3))

    # Project with the layers built in __init__ instead of creating fresh,
    # untracked Dense layers on every call.
    key = self.key(x_mh)
    value = self.value(x_mh)
    query = self.query(x_mh)

    # Scaled dot-product scores: [batch, heads, seq, seq]. The scale is the
    # square root of the per-head key width, not of the full vector size.
    scores = tf.matmul(query, key, transpose_b=True)
    scores = scores / tf.sqrt(tf.cast(self.head_size, scores.dtype))
    atten = tf.nn.softmax(scores, axis=-1)

    # Weight the values by the attention distribution.
    y = tf.matmul(atten, value)

    # [batch, heads, seq, head_size] -> [batch, seq, vector]
    y = tf.transpose(y, (0, 2, 1, 3))
    y = tf.reshape(y, [batch_size, sequence_length, self.vector_size])
    return y

In [None]:
# Toy problem size: 4 sequences of 10 positions, 32 features split over 4 heads.
batch_size, sequence_length = 4, 10
vector_size, heads = 32, 4

# Random input standing in for a batch of embedded sequences.
x = tf.random.uniform(shape=(batch_size, sequence_length, vector_size))
attention_model = MultiHeadSelfAttention(vector_size=vector_size, heads=heads)
attention_model(x).shape

TensorShape([4, 10, 32])

以上是multi-head self attention的输出。接下来将它与输入x做残差相加（residual connection），经过layer norm后再输入到FFN。

### Layer Normalization

Layer Normalization vs Batch Normalization

BN是在batch维度上做Normalization：对每个特征，用同一个batch内所有样本的均值和方差进行归一化

LN是在特征维度上做Normalization：对每个样本、每个位置的隐向量单独计算均值和方差。如x.shape=(4, 10, 32)，LN是对最后一维的32个值进行norm

In [None]:
# Feature vector of the first position of the first sequence.
x[0, 0]

<tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.8807533 , 0.23969948, 0.9159522 , 0.8483242 , 0.88680434,
       0.5049244 , 0.29790604, 0.20629406, 0.31995618, 0.7408869 ,
       0.9190035 , 0.8543589 , 0.6024481 , 0.13442862, 0.95582974,
       0.987481  , 0.988533  , 0.3654201 , 0.7219895 , 0.34775913,
       0.31657326, 0.82768834, 0.59613705, 0.8927474 , 0.7903615 ,
       0.856418  , 0.4400022 , 0.76604843, 0.8117665 , 0.87231755,
       0.35432923, 0.31857657], dtype=float32)>

In [None]:
# Mean and standard deviation of that feature vector before normalization.
np.mean(x[0, 0]), np.std(x[0, 0])

(0.6425537, 0.26949573)

In [None]:
# Layer Normalization: normalize every feature vector (last axis) on its own.
ln = tf.keras.layers.LayerNormalization()
y = ln(x)
# The mean should be ~0; the std lands slightly below 1 because the layer adds a
# small epsilon to the variance before dividing.
np.mean(y[0, 0]), np.std(y[0, 0])

(8.195639e-08, 0.99318594)

FFN的过程：

1. 将vector_size扩大4倍：[batch_size, sequence_length, vector_size * 4]
2. 通过非线性激活函数
3. 还原vector_size: [batch_size, sequence_length, vector_size]


In [None]:
class Transformer(tf.keras.Model):
  """A single Transformer encoder block.

  Computes `LayerNorm(inputs + SelfAttention(inputs))` followed by
  `LayerNorm(z + FFN(z))`, where the feed-forward network expands the feature
  axis to `4 * vector_size`, applies GELU, and projects back to `vector_size`.

  Args:
    vector_size: Size of the last axis of the input and of the output.
    heads: Number of attention heads passed to `MultiHeadSelfAttention`.
  """

  def __init__(self, vector_size, heads=1):
    super().__init__()
    # Two separate LayerNormalization layers: each one owns its own
    # parameters, so a single instance cannot be shared between the two sites.
    self.ln0 = tf.keras.layers.LayerNormalization()
    self.ln1 = tf.keras.layers.LayerNormalization()
    self.mh_atten = MultiHeadSelfAttention(vector_size, heads)
    self.ffn = tf.keras.Sequential([
        tf.keras.layers.Dense(vector_size * 4),
        tf.keras.layers.Activation(tfa.activations.gelu),
        tf.keras.layers.Dense(vector_size),
    ])

  def call(self, inputs):
    """Runs the encoder block on `inputs`.

    Args:
      inputs: Tensor of shape [batch_size, sequence_length, vector_size].

    Returns:
      Tensor with the same shape as `inputs`.
    """
    # Operate on the argument, not on a notebook-global tensor.
    z = self.ln0(inputs + self.mh_atten(inputs))
    y = self.ln1(z + self.ffn(z))
    return y

In [None]:
# Smoke test: run the encoder block on the toy input and inspect the output shape.
transformer = Transformer(vector_size=vector_size, heads=heads)
transformer(x).shape

TensorShape([4, 10, 32])