# 03-2 트랜스포머를 사용한 텍스트 생성

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/hm-dl/blob/main/03-2-gemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

코랩에서 이 노트북을 실행하려면 High-RAM CPU 런타임을 사용해야 합니다. GPU 런타임에서는 젬마 2B 모델을 로드할 때 메모리 부족 오류(`ResourceExhaustedError`)가 발생할 수 있습니다.

이 절의 코드를 실행하려면 `keras-nlp` 패키지와 허깅페이스 `transformers` 패키지를 위한 `tf-keras`를 설치해야 합니다.

In [1]:
pip install -U tensorflow[and-cuda] keras-nlp tf-keras

Collecting tensorflow[and-cuda]
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-nlp
  Downloading keras_nlp-0.12.1-py3-none-any.whl (570 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.5/570.5 kB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
Collecting tf-keras
  Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow[and-cuda])
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow[and-cuda])
  Downloading ml_dtyp

In [2]:
import keras
import keras_nlp

# Display the installed Keras and KerasNLP version strings as a tuple.
keras.__version__, keras_nlp.__version__

('3.3.3', '0.12.1')

In [3]:
import keras
from keras import layers
import keras_nlp

In [4]:
def make_causal_mask(seq_len):
    """Build a square lower-triangular boolean mask.

    Args:
        seq_len: Number of positions along each side of the mask.

    Returns:
        A (seq_len, seq_len) boolean tensor whose entry [i, j] is True
        exactly when i >= j, i.e. row position i may look at column
        position j only if j is not after i.
    """
    # Column positions 0 .. seq_len-1 laid out along the last axis.
    col_positions = keras.ops.arange(seq_len)
    # The same positions as a column vector, shape (seq_len, 1).
    row_positions = keras.ops.expand_dims(col_positions, axis=-1)
    # Broadcasting the comparison yields the (seq_len, seq_len) triangle.
    return row_positions >= col_positions

In [5]:
def make_attention_mask(padding_mask):
    """Combine a causal mask with a padding mask.

    The shape comments below use a (2, 5) padding mask as a running example.

    Args:
        padding_mask: 2-D tensor of shape (batch_size, seq_len).
            NOTE(review): presumably non-zero marks real tokens and zero
            marks padding -- confirm against the caller.

    Returns:
        The element-wise minimum of the batched causal mask and the
        padding mask, broadcast to (batch_size, seq_len, seq_len).
    """
    n_batch, n_tokens = keras.ops.shape(padding_mask)
    # Lower-triangular causal mask of shape (5, 5), repeated over the
    # batch axis to give (2, 5, 5).
    batched_causal = keras.ops.broadcast_to(
        make_causal_mask(n_tokens), (n_batch, n_tokens, n_tokens)
    )
    # (2, 5) -> (2, 1, 5) so the padding mask broadcasts over the middle axis.
    expanded_padding = keras.ops.expand_dims(padding_mask, axis=1)
    return keras.ops.minimum(batched_causal, expanded_padding)

In [6]:
class AttentionMask(keras.Layer):
    """Keras layer that turns a padding mask into an attention mask.

    It only delegates to `make_attention_mask`, so the mask computation can
    be applied to Keras inputs like any other layer.
    """

    def call(self, padding_mask):
        """Return `make_attention_mask(padding_mask)`."""
        attention_mask = make_attention_mask(padding_mask)
        return attention_mask

## Gemma

### 젬마 구현하기

In [7]:
from keras_nlp.src.models.gemma.gemma_attention import CachedGemmaAttention
from keras_nlp.src.models.gemma.rms_normalization import RMSNormalization

def gemma_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                  interm_dim, hidden_dim, head_dim):
    """Apply one Gemma decoder block to `x` and return the result.

    The block runs two sub-blocks in sequence, each wrapped in a skip
    connection: RMS normalization followed by attention, then RMS
    normalization followed by a gated feed-forward network.

    Args:
        x: Input activations of the block.
        padding_mask: Padding mask handed to `AttentionMask` to build the
            attention mask.
        num_query_heads: Forwarded to `CachedGemmaAttention`.
        num_key_value_heads: Forwarded to `CachedGemmaAttention`.
        interm_dim: Each of the two gating Dense layers gets
            `interm_dim // 2` units.
        hidden_dim: Number of units of the final Dense projection.
        head_dim: Forwarded to `CachedGemmaAttention`.

    Returns:
        The output of the second skip connection.
    """
    # Build the attention mask from the padding mask.
    attn_mask = AttentionMask()(padding_mask)

    # --- Attention sub-block ---
    # Keep the block input for the skip connection.
    shortcut = x
    normed = RMSNormalization()(x)
    attention = CachedGemmaAttention(
        head_dim=head_dim,
        num_query_heads=num_query_heads,
        num_key_value_heads=num_key_value_heads,
        dropout=0.0,
    )
    attended = attention(normed, attn_mask)
    # Skip connection around the attention sub-block.
    hidden = attended + shortcut

    # --- Position-wise feed-forward sub-block ---
    # Keep the attention output for the second skip connection.
    shortcut = hidden
    normed = RMSNormalization()(hidden)
    # Two parallel bias-free projections; only the first one is passed
    # through GELU, and the two are multiplied element-wise.
    gate = layers.Dense(interm_dim // 2, activation='gelu', use_bias=False)(normed)
    linear = layers.Dense(interm_dim // 2, use_bias=False)(normed)
    gated = gate * linear
    projected = layers.Dense(hidden_dim, use_bias=False)(gated)
    # Skip connection around the feed-forward sub-block.
    return projected + shortcut

In [8]:
# Hyperparameters used for the Gemma 2B configuration.
vocab_size = 256000
num_layers = 18
num_query_heads = 8
num_key_value_heads = 1
interm_dim = 32768
hidden_dim = 2048
head_dim = 256

# Symbolic model inputs; the sequence length is left unspecified.
token_ids = keras.Input(shape=(None,))
padding_mask = keras.Input(shape=(None,))

# A single embedding layer serves both the input lookup and, with
# reverse=True, the final projection back to the vocabulary.
token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, hidden_dim)
x = token_embedding_layer(token_ids)
# Multiply the embeddings by sqrt(hidden_dim).
x = layers.Lambda(lambda emb: emb * keras.ops.sqrt(hidden_dim))(x)

# Stack `num_layers` decoder blocks, feeding each one the previous output.
for _ in range(num_layers):
    x = gemma_decoder(
        x,
        padding_mask,
        num_query_heads=num_query_heads,
        num_key_value_heads=num_key_value_heads,
        interm_dim=interm_dim,
        hidden_dim=hidden_dim,
        head_dim=head_dim,
    )

# Final normalization, then reuse the embedding layer in reverse mode.
x = RMSNormalization()(x)
outputs = token_embedding_layer(x, reverse=True)
model = keras.Model(inputs=(token_ids, padding_mask), outputs=outputs)
model.summary(line_length=100)

### 젬마 모델 사용하기

캐글에서 젬마 모델을 다운로드하려면 캐글 API 토큰(`kaggle.json`)을 생성하여 현재 작업 디렉터리에 업로드한 다음, 아래 셀을 실행해 `~/.kaggle/` 디렉터리로 옮기세요.

In [9]:
!mkdir ~/.kaggle/
!mv kaggle.json ~/.kaggle/

In [10]:
# Load the Gemma causal language model from the 'gemma_2b_en' preset and
# print its summary.
# NOTE(review): in the saved output this cell stopped with a
# ResourceExhaustedError on GPU:0, while the notebook introduction asks for a
# High-RAM CPU runtime -- check the runtime type before running.
gemma = keras_nlp.models.GemmaCausalLM.from_preset('gemma_2b_en')
gemma.summary()

Attaching 'metadata.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'metadata.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'task.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'config.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'metadata.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'metadata.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'config.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...
Attaching 'config.json' from model 'keras/gemma/keras/gemma_2b_en/2' to your Colab notebook...


ResourceExhaustedError: {{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:Mul] name: 

In [None]:
# Create a top-p sampler (p=0.8) with a fixed seed of 42.
sampler = keras_nlp.samplers.TopPSampler(p=0.8, seed=42)
# Re-compile the model so that generation uses this sampler.
gemma.compile(sampler=sampler)
# Generate a continuation of an English prompt.
# NOTE(review): max_length=20 presumably caps the total sequence length
# including the prompt -- confirm against the KerasNLP documentation.
gemma.generate('stay hungry, stay', max_length=20)

In [None]:
# Generate from a Korean prompt with the model loaded from the 'gemma_2b_en' preset.
gemma.generate('봄이 오면', max_length=20)

In [None]:
from transformers import pipeline, set_seed

In [None]:
# Build a Hugging Face text-generation pipeline for the "beomi/gemma-ko-2b" checkpoint.
gemma_pipe = pipeline("text-generation", model="beomi/gemma-ko-2b")
# Fix the random seed before generating.
set_seed(42)
# Generate from the Korean prompt with max_length=20 and truncation enabled.
gemma_pipe('봄이 오면', max_length=20, truncation=True)