# 03-2 트랜스포머를 사용한 텍스트 생성

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/hm-dl/blob/main/03-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

In [1]:
!pip install keras-nlp



## 트랜스포머 디코더

In [1]:
import keras
from keras import layers
import keras_nlp

2024-05-11 03:19:44.404180: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-11 03:19:44.407819: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 03:19:44.489477: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-11 03:19:44.898436: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


## 코잘 마스킹

In [2]:
def make_causal_mask(seq_len):
    """Build a (seq_len, seq_len) boolean causal mask.

    Entry [i, j] is True when j <= i, so row i only exposes positions
    up to and including i (a lower-triangular pattern).
    """
    # Key positions laid out along the last axis: 0, 1, ..., seq_len - 1.
    key_positions = keras.ops.arange(seq_len)
    # The same positions as a column, so the comparison broadcasts to a square grid.
    query_positions = keras.ops.expand_dims(key_positions, axis=-1)
    return query_positions >= key_positions

In [3]:
# Build a 5x5 causal mask and display it; row i is True only for columns j <= i.
causal_mask = make_causal_mask(5)
causal_mask

2024-05-11 03:19:57.002382: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


<tf.Tensor: shape=(5, 5), dtype=bool, numpy=
array([[ True, False, False, False, False],
       [ True,  True, False, False, False],
       [ True,  True,  True, False, False],
       [ True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True]])>

In [4]:
# Padding mask for a 5-position sequence: 1 for the first three positions, 0 for the last two.
padding_mask = [1, 1, 1, 0, 0]
# Element-wise minimum with the causal mask: a position stays 1 only where both masks allow it.
keras.ops.minimum(causal_mask, padding_mask)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0]], dtype=int32)>

In [5]:
def make_attention_mask(padding_mask):
    """Combine the causal mask with a batch of padding masks.

    Args:
        padding_mask: 2-D (batch_size, seq_len) mask; in this notebook
            1/True marks a real token and 0/False marks padding.

    Returns:
        A (batch_size, seq_len, seq_len) mask holding the element-wise
        minimum of the causal mask and the padding mask broadcast over rows.
    """
    # Running example for the shape comments below: padding_mask is (2, 5).
    n_batch, n_tokens = keras.ops.shape(padding_mask)
    # The (5, 5) causal mask is repeated along a new batch axis -> (2, 5, 5).
    batched_causal = keras.ops.broadcast_to(
        make_causal_mask(n_tokens), (n_batch, n_tokens, n_tokens))
    # Insert a singleton row axis -> (2, 1, 5) so the padding mask broadcasts over rows.
    per_key_mask = keras.ops.expand_dims(padding_mask, axis=1)
    return keras.ops.minimum(batched_causal, per_key_mask)

In [6]:
# Two padding masks of length 5 (2 and 4 leading ones); the result has shape (2, 5, 5).
make_attention_mask([[1, 1, 0, 0, 0], [1, 1, 1, 1, 0]])

<tf.Tensor: shape=(2, 5, 5), dtype=int32, numpy=
array([[[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0]],

       [[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]]], dtype=int32)>

## 디코더 구현하기

In [7]:
class AttentionMask(keras.Layer):
    """Keras layer wrapper around `make_attention_mask`.

    Wrapping the helper in a layer lets it be called on the `keras.Input`
    tensors used when the model is assembled below.
    """

    def call(self, padding_mask):
        """Return the combined causal/padding attention mask for `padding_mask`."""
        attention_mask = make_attention_mask(padding_mask)
        return attention_mask

In [8]:
def transformer_decoder(x, padding_mask, dropout,
                        activation='relu', norm_first=False,
                        num_heads=None, embed_dim=None):
    """Apply one transformer decoder block: masked self-attention, then feed-forward.

    Args:
        x: Input tensor whose last axis is the embedding dimension.
        padding_mask: Per-token padding mask; combined with a causal mask
            by `AttentionMask`.
        dropout: Dropout rate used inside the attention layer and after
            each sub-layer.
        activation: Activation of the first (expanding) feed-forward Dense layer.
        norm_first: If True, layer normalization is applied before each
            sub-layer (pre-norm); otherwise after each residual addition
            (post-norm).
        num_heads: Number of attention heads. When omitted, the notebook-level
            `num_heads` variable is used so existing calls keep working.
        embed_dim: Embedding width. When omitted, it is read from `x.shape[-1]`.

    Returns:
        The output tensor of the block.

    Raises:
        NameError: If `num_heads` is neither passed nor defined at notebook level.
        ValueError: If `embed_dim` is not passed and the last axis of `x` is unknown.
    """
    # Resolve the hyperparameters explicitly instead of silently reading
    # globals that are only defined in a later cell.
    if embed_dim is None:
        embed_dim = x.shape[-1]
        if embed_dim is None:
            raise ValueError(
                'embed_dim could not be inferred from x; pass embed_dim explicitly.')
    if num_heads is None:
        try:
            # The parameter shadows the notebook variable, so look it up by name.
            num_heads = globals()['num_heads']
        except KeyError:
            raise NameError(
                "num_heads was not passed and no notebook-level "
                "'num_heads' is defined.") from None

    # Compute the combined causal + padding attention mask.
    attention_mask = AttentionMask()(padding_mask)
    # Keep the input for the first skip connection.
    residual = x
    key_dim = embed_dim // num_heads
    if norm_first:
        x = layers.LayerNormalization()(x)
    # Masked multi-head self-attention.
    x = layers.MultiHeadAttention(num_heads, key_dim, dropout=dropout)(
        query=x, value=x, attention_mask=attention_mask)
    x = layers.Dropout(dropout)(x)
    # First skip connection.
    x = x + residual
    if not norm_first:
        x = layers.LayerNormalization()(x)
    # Keep the attention output for the second skip connection.
    residual = x
    # Position-wise feed-forward network (expand to 4x, project back).
    if norm_first:
        x = layers.LayerNormalization()(x)
    x = layers.Dense(embed_dim * 4, activation=activation)(x)
    x = layers.Dense(embed_dim)(x)
    x = layers.Dropout(dropout)(x)
    # Second skip connection.
    x = x + residual
    if not norm_first:
        x = layers.LayerNormalization()(x)
    return x

## GPT-2

In [9]:
# GPT-2 (base) hyperparameters.
vocab_size = 50257
num_layers = 12
num_heads = 12
embed_dim = 768
dropout = 0.1
activation = 'gelu'
max_seq_len = 1024

# Token ids and the padding mask are integer inputs (as in KerasNLP's GPT2Backbone),
# so declare them as int32 instead of relying on the float32 default.
token_ids = keras.Input(shape=(None,), dtype='int32')
padding_mask = keras.Input(shape=(None,), dtype='int32')

# The token embedding is reversible: the same weights project back to the vocabulary.
token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, embed_dim)
token_embedding = token_embedding_layer(token_ids)
pos_embedding = keras_nlp.layers.PositionEmbedding(max_seq_len)(token_embedding)

x = layers.Add()((token_embedding, pos_embedding))
x = layers.Dropout(dropout)(x)
for _ in range(num_layers):
    # GPT-2 normalizes *before* each sub-layer (pre-norm); the final
    # LayerNormalization below then normalizes the output of the last block.
    x = transformer_decoder(x, padding_mask, dropout, activation,
                            norm_first=True)

x = layers.LayerNormalization()(x)
# Reuse the embedding weights to map hidden states to vocabulary logits.
outputs = token_embedding_layer(x, reverse=True)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=outputs)
model.summary()

In [10]:
# Load the GPT-2 causal language model from the 'gpt2_base_en' preset
# (downloaded on first use) and print its summary.
gpt2 = keras_nlp.models.GPT2CausalLM.from_preset('gpt2_base_en')
gpt2.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/metadata.json...
100%|██████████| 141/141 [00:00<00:00, 142kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/preprocessor.json...


In [11]:
# Continue the prompt with max_length=6; the recorded output adds a single word to the prompt.
gpt2.generate('stay hungry, stay', max_length=6)

2024-05-11 03:20:04.135551: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.
I0000 00:00:1715397608.993494 1210298 service.cc:145] XLA service 0x7f2854008460 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1715397608.993556 1210298 service.cc:153]   StreamExecutor device (0): Host, Default Version
I0000 00:00:1715397609.010530 1210298 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


'stay hungry, stay active'

In [12]:
# Same prompt with a larger max_length, which yields a longer continuation.
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay healthy\n\nThe following list is based solely on information provided by our members'

In [13]:
# Run the model's preprocessor on the prompt with sequence_length=10.
# It returns three values, unpacked here as (inputs, target, mask), which are then displayed.
inputs, target, mask = gpt2.preprocessor('stay hungry, stay', sequence_length=10)
inputs, target, mask

({'token_ids': <tf.Tensor: shape=(10,), dtype=int32, numpy=
  array([50256, 31712, 14720,    11,  2652, 50256,     0,     0,     0,
             0], dtype=int32)>,
  'padding_mask': <tf.Tensor: shape=(10,), dtype=bool, numpy=
  array([ True,  True,  True,  True,  True,  True, False, False, False,
         False])>},
 <tf.Tensor: shape=(10,), dtype=int32, numpy=
 array([31712, 14720,    11,  2652, 50256,     0,     0,     0,     0,
            0], dtype=int32)>,
 <tf.Tensor: shape=(10,), dtype=bool, numpy=
 array([ True,  True,  True,  True,  True, False, False, False, False,
        False])>)

In [14]:
# Look up the vocabulary string of every id in `target` with the model's tokenizer
# and print them on one line, separated by spaces.
gpt2_tokenizer = gpt2.preprocessor.tokenizer
for ids in target:
    print(gpt2_tokenizer.id_to_token(ids), end=' ')

stay Ġhungry , Ġstay <|endoftext|> ! ! ! ! ! 

In [15]:
# Preprocess a batch of one prompt for generation, with sequence_length=10.
# In the recorded output the prompt tokens are followed directly by padding.
inputs = gpt2.preprocessor.generate_preprocess(['stay hungry, stay'], sequence_length=10)
inputs

{'token_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
 array([[50256, 31712, 14720,    11,  2652,     0,     0,     0,     0,
             0]], dtype=int32)>,
 'padding_mask': <tf.Tensor: shape=(1, 10), dtype=bool, numpy=
 array([[ True,  True,  True,  True,  True, False, False, False, False,
         False]])>}

In [16]:
# Run the generation function on the preprocessed batch; in the recorded output
# the padded positions are replaced by generated token ids.
outputs = gpt2.generate_function(inputs)
outputs

{'token_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
 array([[50256, 31712, 14720,    11,  2652, 47124,    11,  2652, 47124,
            11]], dtype=int32)>,
 'padding_mask': <tf.Tensor: shape=(1, 10), dtype=bool, numpy=
 array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
          True]])>}

In [17]:
# Convert the generated token ids back into text.
gpt2.preprocessor.generate_postprocess(outputs)

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'stay hungry, stay thirsty, stay thirsty,'], dtype=object)>

## 샘플러

In [18]:
# Top-k sampling with k=10 and temperature=0.5; the seed is fixed for repeatability.
sampler = keras_nlp.samplers.TopKSampler(k=10, temperature=0.5, seed=42)
# Recompile the model so that generate() uses this sampler.
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [19]:
# Same top-k sampler (k=10, seed=42) but with temperature raised to 5 for comparison.
sampler = keras_nlp.samplers.TopKSampler(k=10, temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)



'stay hungry, stay clean: The health effects are clear and there isn"t a lot that'

### top-p 샘플링

In [20]:
# Top-p sampling with p=0.8 and a fixed seed.
sampler = keras_nlp.samplers.TopPSampler(p=0.8, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)



'stay hungry, stay thirsty, stay thirsty, stay thirsty.\n\n"When I\'m feeling'

In [21]:
# Top-p sampling (p=0.8) with k=1000 and temperature=5 added.
sampler = keras_nlp.samplers.TopPSampler(p=0.8, k=1000, temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay back so days too make much learning risky at AS2TP …You "'

In [22]:
# A sampler can also be selected by its string name; no p or seed is passed explicitly here.
gpt2.compile(sampler='top_p')
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

### 그리디 샘플링과 랜덤 샘플링

In [23]:
# Select the greedy sampler by name and generate from the same prompt.
gpt2.compile(sampler='greedy')
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [24]:
# Random sampling with temperature=5 and a fixed seed.
sampler = keras_nlp.samplers.RandomSampler(temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay extractPGLinks 42 savingshopHeader Nerd search Romantic ready OftenCIAc ol'

### 빔 샘플링과 대조 샘플링

In [29]:
# Beam sampler with num_beams=10 and temperature=5.
sampler = keras_nlp.samplers.BeamSampler(num_beams=10, temperature=5)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay hydrated stay hydrated\n\nStay hydrated stay hydrated\n\n'

In [47]:
# Contrastive sampler with k=5 and alpha=0.2.
sampler = keras_nlp.samplers.ContrastiveSampler(k=5, alpha=0.2)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [48]:
# Same contrastive sampler (k=5) with alpha raised to 0.8 for comparison.
sampler = keras_nlp.samplers.ContrastiveSampler(k=5, alpha=0.8)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty\n\n\nA lot has changed in the last few years. The number'

## 허깅페이스 `transformers` 라이브러리 사용하기

In [None]:
# This section uses the Hugging Face `transformers` library instead of KerasNLP.
from transformers import pipeline

# Build a text-generation pipeline for the 'openai-gpt' model and generate
# from the same prompt, with max_length=10.
gpt1 = pipeline('text-generation', model='openai-gpt')
gpt1("stay hungry, stay", max_length=10, truncation=True)