# 03-2 트랜스포머를 사용한 텍스트 생성

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/hm-dl/blob/main/03-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

In [1]:
!pip install keras-nlp



In [2]:
import keras
from keras import layers
import keras_nlp

2024-04-26 07:55:46.537421: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-26 07:55:46.537956: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-26 07:55:46.541128: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-26 07:55:46.583068: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [66]:
def transformer_decoder(x, padding_mask, dropout,
                        activation='relu', norm_first=False):
    residual = x
    key_dim = embed_dim // num_heads
    if norm_first:
        x = layers.LayerNormalization()(x)
    # 멀티 헤드 어텐션을 통과합니다.
    x = layers.MultiHeadAttention(num_heads, key_dim, dropout=dropout)(
        query=x, value=x, attention_mask=padding_mask)
    x = layers.Dropout(dropout)(x)
    # 스킵 연결
    x = x + residual
    if not norm_first:
        x = layers.LayerNormalization()(x)
    residual = x
    # 위치별 피드 포워드 네트워크
    if norm_first:
        x = layers.LayerNormalization()(x)
    x = layers.Dense(embed_dim * 4, activation=activation)(x)
    x = layers.Dense(embed_dim)(x)
    x = layers.Dropout(dropout)(x)
    # 스킵 연결
    x = x + residual
    if not norm_first:
        x = layers.LayerNormalization()(x)
    return x

In [61]:
# 코잘 마스킹
def make_causal_mask(seq_len):
    n_hori = keras.ops.arange(seq_len)
    n_vert = keras.ops.reshape(n_hori, (-1, 1))
    mask = n_vert >= n_hori
    return mask

causal_mask = make_causal_mask(5)
causal_mask

<tf.Tensor: shape=(5, 5), dtype=bool, numpy=
array([[ True, False, False, False, False],
       [ True,  True, False, False, False],
       [ True,  True,  True, False, False],
       [ True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True]])>

In [62]:
padding_mask = [1, 1, 1, 0, 0]
keras.ops.minimum(causal_mask, padding_mask)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0]], dtype=int32)>

In [63]:
def make_attention_mask(padding_mask):
    # padding_mask 크기가 (2, 5)라고 가정합니다.
    batch_size, seq_len = keras.ops.shape(padding_mask)
    print(batch_size, seq_len)
    # causal_mask 크기는 (5, 5)입니다.
    causal_mask = make_causal_mask(seq_len)
    print(causal_mask)
    # 배치 차원을 추가해 (2, 5, 5)로 만듭니다.
    causal_mask = keras.ops.broadcast_to(causal_mask, (batch_size, seq_len, seq_len))
    # 브로드캐스팅을 위해 padding_mask 크기를 (2, 1, 5)로 만듭니다.
    padding_mask = keras.ops.expand_dims(padding_mask, axis=1)
    return keras.ops.minimum(causal_mask, padding_mask)

In [64]:
import numpy as np

make_attention_mask(np.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 0]]))

2 5
tf.Tensor(
[[ True False False False False]
 [ True  True False False False]
 [ True  True  True False False]
 [ True  True  True  True False]
 [ True  True  True  True  True]], shape=(5, 5), dtype=bool)


<tf.Tensor: shape=(2, 5, 5), dtype=int64, numpy=
array([[[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0]],

       [[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]]])>

In [56]:
# 임시 테스트 코드
vocab_size = 50257
num_heads = 12
embed_dim = 768
max_seq_len = 1024

token_ids = keras.Input(shape=(None,))
padding_mask = keras.Input(shape=(None,))

token_embedding = keras_nlp.layers.ReversibleEmbedding(vocab_size, embed_dim)(token_ids)
pos_embedding = keras_nlp.layers.PositionEmbedding(max_seq_len)(token_embedding)
x = layers.Add()((token_embedding, pos_embedding))

outputs = keras_nlp.layers.TransformerDecoder(3072, 12)(x)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=(outputs))
model.summary()

In [67]:
# GPT-2 베이스
vocab_size = 50257
num_layers = 12
num_heads = 12
embed_dim = 768
dropout = 0.1
activation = 'gelu'
max_seq_len = 1024

token_ids = keras.Input(shape=(None,))
padding_mask = keras.Input(shape=(None,))

token_embedding = keras_nlp.layers.ReversibleEmbedding(vocab_size, embed_dim)(token_ids)
pos_embedding = keras_nlp.layers.PositionEmbedding(max_seq_len)(token_embedding)

x = layers.Add()((token_embedding, pos_embedding))
x = layers.Dropout(dropout)(x)
for _ in range(num_layers):
    x = transformer_decoder(x, padding_mask, dropout, activation)

outputs = layers.LayerNormalization()(x)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=(outputs))
model.summary()

In [14]:
gpt2_base = keras_nlp.models.GPT2Backbone.from_preset('gpt2_base_en')
gpt2_base.get_config()

{'name': 'gpt2_backbone',
 'trainable': True,
 'vocabulary_size': 50257,
 'num_layers': 12,
 'num_heads': 12,
 'hidden_dim': 768,
 'intermediate_dim': 3072,
 'dropout': 0.1,
 'max_sequence_length': 1024}

In [16]:
gpt2_base.summary()