# Transformer 구현
 - https://wikidocs.net/31379
 - https://github.com/NLP-kr/tensorflow-ml-nlp-tf2/blob/master/6.CHATBOT/6.5.transformer.ipynb

- '그것(it)'과 '동물(animal)'의 연관도를 높게 본다면, 두번째 어텐션 헤드는 '그것(it)'과 '피곤하였기 때문이다(tired)'의 연관도를 높게 볼 수 있습니다. 각 어텐션 헤드는 전부 다른 시각에서 보고있기 때문입니다.

In [64]:
import numpy as np
import tensorflow as tf

### Positional Encoding
 - RNN과 달리 트랜스포머의 입력은 단어 하나하나 순차적으로 넣지 않고 한번에 넣어줌
 - 따라서 입력에 순서 정보를 넣어줄 필요성이 있음
 - => 각 단어의 임베딩 벡터에 위치 정보들을 더해 모델의 입력으로 사용
 - (참고) positional encoding에 대한 설명 [링크](https://gaussian37.github.io/dl-concept-positional_encoding/)

In [61]:
# pos: 입력 문장의 길이
# d_model: 임베딩 벡터의 차원
def get_angles(pos, i, d_model):
    angles = 1 / np.power(10000, (2 * i//2) / np.float32(d_model))
    return pos * angles

def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model)
    
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
    pos_encoding = angle_rads[np.newaxis, ...]
        
    return tf.cast(pos_encoding, dtype=tf.float32)

In [62]:
# positional encoding 테스트
pos = 50
d_model = 128
# (50, 1) (1, 128)
x = positional_encoding(pos, d_model)
x

<tf.Tensor: shape=(1, 50, 128), dtype=float32, numpy=
array([[[ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00, ...,
          1.00000000e+00,  0.00000000e+00,  1.00000000e+00],
        [ 8.41470957e-01,  5.97375333e-01,  7.61720419e-01, ...,
          1.00000000e+00,  1.15478200e-04,  1.00000000e+00],
        [ 9.09297407e-01, -2.86285430e-01,  9.87046242e-01, ...,
          9.99999940e-01,  2.30956401e-04,  1.00000000e+00],
        ...,
        [ 1.23573124e-01,  9.70037520e-01,  1.39920667e-01, ...,
          9.99983013e-01,  5.42744854e-03,  9.99987245e-01],
        [-7.68254638e-01,  7.74317265e-01, -6.63571715e-01, ...,
          9.99982238e-01,  5.54292509e-03,  9.99986708e-01],
        [-9.53752637e-01, -4.49214056e-02, -9.99784708e-01, ...,
          9.99981523e-01,  5.65840164e-03,  9.99986112e-01]]],
      dtype=float32)>

### Attention

In [1]:
def scaledDotProductAttention(query, key, value, mask):
    '''
    attention weight을 계산하는 함수
    
    Args:
    query: query의 shape = (batch_size, num_heads, query의 문장길이, d_model/num_head)
    key: key의 shape : (batch_size, num_heads, key의 문장길이, d_model/num_head)
    value: value의 shape : (batch_size, num_heads, value의 문장길이, d_model/num_head)
    
    d_model/num_head => d_k
    q를 얻기위한 가중치행렬의 shape는 (embedding_size, d_k)
    q의 shape는 (query의 문장길이, d_k)
     
    Returns:
    output, attetion_weights
    '''
    
    # 어텐션 스코어, Q와 K의 dot product
    qk = tf.matmul(query, key, transpose_b=True)
    
    # 스케일링 (sqrt(dk)로 나눔)
    # dk = d_model / num_head
    scale = tf.cast(tf.shape(key)[-1], tf.float(32))
    logits = qk / tf.math.sqrt(scale)
    
    # 마스킹
    if mask is not None:
        logits += (mask * -1e9)
        
    # 어텐션 weights
    # size : (batch_size, num_head, query의 문장길이, key의 문장길이)
    attention_weights = tf.nn.softmax(logits, axis=-1)
    
    # Output
    output = tf.matmul(attention_weight, value)
    
    return output, attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, name='multi_head_attention'):
        super(MultiHeadAttention, self).__init__(name=name)
        self.d_model = d_model
        self.num_heads = num_heads
        
        # 나머지가 발생하면 안되기 때문에 
        assert self.d_model % self.num_heads == 0
        
        # depth = d_model // num_heads (논문기준 64 = 512 // 8)
        # deqth와 d_k는 같음
        self.depth = self.d_model // self.num_Heads
        
        # WQ, WK, WV 정의
        # => Q, K, V 행렬을 만들기 위한 가중치 행렬 (문장행렬에 가중치행렬이 곱해져 Q, K, V를 얻음)
        self.wq = tf.keras.layers.Dense(units=d_model)
        self.wk = tf.keras.layers.Dense(units=d_model)
        self.wv = tf.keras.layers.Dense(units=d_model)
        
        # WO 정의
        # => Attention Heads들 Concat에 곱해주는 행렬
        self.dense = tf.keras.layer.Dense(units=d_model)
        
    def splitHeads(self, inputs, batch_size):
        '''
        num_heads 수만큼 Q, K, V를 split하는 함수
        
        Return:
        (batch_size, num_heads, seq_len, depth)
        '''
        inputs = tf.reshape(inputs, shape=(batch_size, -1, self.num_heads, self_depth))
        return tf.transpose(inputs, perm=[0, 2, 1, 3])
    
    
    def call(self, inputs):
        q, k, v, mask = inputs['query'], inputs['key'], inputs['value'], inputs['mask']
        batch_size = tf.shape(query)[0]
        
        # 1) WQ, WK, WV에 해당하는 dense layer 통과     
        ## 인코더(k,v) - 디코더(q) 어텐션에서는 query 길이와 key, value길이가 다를 수 있다
        query = self.wq(q)    # query : (batch_size, seq_len_q, d_model)
        key = self.wk(k)      # key   : (batch_size, seq_len_k, d_model)
        value = self.wv(v)    # value : (batch_size, seq_len_v, d_model)
        
        # 2) split head
        query = self.splitHeads(query, batch_size)    # query : (batch_size, num_heads, seq_len_q, depth)
        key = self.splitHeads(key, batch_size)        # key : (batch_size, num_heads, seq_len_k, depth)
        value = self.splitHeads(value, batch_size)    # value : (batch_size, num_heads, seq_len_v, depth)
        
        # 3) scaled dot product attention
        # (batch_size, num_heads, seq_len_q, d_model/num_heads)
        scaled_attention, _ = scaledDotProductAttention(query, key, value, mask)
        
        # (batch_size, seq_len_q, num_heads, d_model/num_heads) => ??
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        
        # 4) head 연결
        # (batch_size, seq_len_q, d_model)
        concat_attention = tf.reshape(scaled_attention, shape=(batch_size, -1, self.d_model))
        
        # 5) WO에 해당하는 dense layer 통과
        # (batch_size, seq_len_q, d_model)
        output = self.dense(concat_attention)
        
        
        return output

# 1. 왜 WQ, WK ,WV를 dense layer로 통과시켜주면 되는건지
# 2. 왜 split에서 (batch_size, seq_len, d_model)을 (batch_size, -1, num_heads, depth)로 reshape 해주는지

### Position-wise Feed-Foward Networks

In [67]:
# 논문에서 dff = 2048
def FFN(dff, d_model):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(units=dff, activation='relu'),
        tf.keras.layers.Dense(units=d_model)
    ])

### Residual Connection & Layer Normalization

### EncoderLayer

In [None]:
# EncoderLayer
class EncoderLayer(tf.keras.layers.Layer):
    # Add&Norm 이전 layer는 dropout
    def __init__(self, dff, d_model, num_heads, dropout, name='encoder_layer'):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        
        self.ffn = FFN(dff, d_model)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate=dropout)
        self.dropout2 = tf.keras.layers.Dropout(rate=dropout)
    
    # MHA - dropout - Add&Norm - FFN - dropout - Add&Norm
    def call(self, x, mask):
        attn_output, _ = self.mha(x, x, x, mask)    # x: query, key, value
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)    # x + attn_output => residual connection
        
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)
        
        return out2
        
        
# Encoder
class Encoder(tf.keras.layer.Layer):
    def __init__(self, dff, vocab_size, num_layers, d_model, num_heads, dropout, name='Encoder'):
        super(Encoder, self).__init__(name=name)
        
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout = dropout
        
        self.embedding = tf.keras.layers.Embedding(vacab_size, self.d_model)
        self.pos_encoding = positional_encoding(vocab_size, self.d_model)
        
        self.enc_layers = [EncoderLayer(dff=dff, d_model=self.d_model, num_heads=self.num_heads, dropout=self.dropout)
                           for _ in range(self.num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate=dropout)
        
        
    def call(self, x, mask):
        # x = query, key, value
        seq_len = tf.shape(x)[1]
        
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        
        x = self.dropout(x)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, mask)
            
            
        return x
            