# 03-2 트랜스포머를 사용한 텍스트 생성

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/hm-dl/blob/main/03-2-llama2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

코랩에서 이 노트북을 실행하려면 High-RAM CPU 런타임을 사용해야 합니다.

이 절의 코드를 실행하려면 `keras-nlp` 패키지와 허깅페이스 `transformers` 패키지를 위한 `tf-keras`를 설치해야 합니다.

In [None]:
pip install -U tensorflow keras-nlp tf-keras

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-nlp
  Downloading keras_nlp-0.12.1-py3-none-any.whl (570 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.5/570.5 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting tf-keras
  Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux

In [None]:
import keras
import keras_nlp

# Display the installed Keras and KerasNLP versions.
keras.__version__, keras_nlp.__version__

('3.3.3', '0.12.1')

In [None]:
import keras
from keras import layers
import keras_nlp

In [None]:
def make_causal_mask(seq_len):
    """Build a (seq_len, seq_len) causal mask.

    Entry (i, j) is True when j <= i, so each row marks the positions that
    are not ahead of position i.
    """
    col_idx = keras.ops.arange(seq_len)
    # Column vector of the same indices, so the comparison below broadcasts
    # every row index against every column index.
    row_idx = keras.ops.expand_dims(col_idx, axis=-1)
    return row_idx >= col_idx

In [None]:
def make_attention_mask(padding_mask):
    """Combine the causal mask with a padding mask.

    The shape comments below assume `padding_mask` has shape (2, 5).
    """
    n_batch, n_tokens = keras.ops.shape(padding_mask)
    # The causal mask is (5, 5); broadcast it over the batch axis -> (2, 5, 5).
    batched_causal = keras.ops.broadcast_to(
        make_causal_mask(n_tokens), (n_batch, n_tokens, n_tokens)
    )
    # Insert a middle axis -> (2, 1, 5) so the padding mask broadcasts
    # against the causal mask.
    expanded_padding = keras.ops.expand_dims(padding_mask, axis=1)
    # Element-wise minimum of the two masks (for 0/1 masks this keeps a
    # position only where both masks allow it).
    return keras.ops.minimum(batched_causal, expanded_padding)

In [None]:
class AttentionMask(keras.Layer):
    """Keras layer wrapper around `make_attention_mask`."""

    def call(self, padding_mask):
        """Return the attention mask computed from `padding_mask`."""
        attention_mask = make_attention_mask(padding_mask)
        return attention_mask

## LLaMa-2

### 로터리 위치 임베딩

In [None]:
# Size of the token embedding
embed_dim = 4096

def rotary_position_embedding(inputs, token_pos):
    """Apply a rotary position embedding to `inputs`.

    The embedding size is read from the last axis of `inputs`, and the
    split/concatenate steps operate on that last axis, so leading batch
    axes are allowed in addition to a plain 1-D embedding vector.

    Args:
        inputs: token embedding(s); the last axis is the embedding axis and
            its size must be even so it can be split into two halves.
        token_pos: position m of the token; it is multiplied with the
            per-dimension angles.

    Returns:
        A tensor with the same shape as `inputs` holding the rotated
        embedding.
    """
    dim = keras.ops.shape(inputs)[-1]
    # Build the theta angles: one per pair of dimensions (dim / 2 values).
    freqs = keras.ops.arange(0, dim, 2, dtype='float32') / dim
    inverse_freqs = 1 / (10000**freqs)
    # m * theta
    embedding = token_pos * inverse_freqs
    cos_emb = keras.ops.cos(embedding)
    sin_emb = keras.ops.sin(embedding)
    # Split the embedding axis into two halves.
    x1, x2 = keras.ops.split(inputs, 2, axis=-1)
    # Apply the rotation to each (x1, x2) pair.
    new_x1 = x1 * cos_emb - x2 * sin_emb
    new_x2 = x1 * sin_emb + x2 * cos_emb
    return keras.ops.concatenate((new_x1, new_x2), axis=-1)

# A dummy token embedding
inputs = keras.ops.ones(embed_dim)
# Apply the rotary position embedding to the token at the second position.
rotary_position_embedding(inputs, 1)

<tf.Tensor: shape=(4096,), dtype=float32, numpy=
array([-0.30116874, -0.2949654 , -0.28878427, ...,  1.0001013 ,
        1.0001009 ,  1.0001005 ], dtype=float32)>

In [None]:
# Run KerasNLP's RotaryEmbedding layer on an all-ones input of shape
# (1, 2, embed_dim).
rotary_embedding = keras_nlp.layers.RotaryEmbedding()
rotary_embedding(keras.ops.ones((1, 2, embed_dim)))

<tf.Tensor: shape=(1, 2, 4096), dtype=float32, numpy=
array([[[ 1.        ,  1.        ,  1.        , ...,  1.        ,
          1.        ,  1.        ],
        [-0.30116874, -0.2949654 , -0.28878427, ...,  1.0001013 ,
          1.0001009 ,  1.0001005 ]]], dtype=float32)>

### RMS 정규화

In [None]:
import numpy as np

def rms_norm(x, scale=1.0, epsilon=1e-6):
    """Normalize `x` by its root mean square along the last axis.

    Args:
        x: input values; the statistic is computed over the last axis.
        scale: multiplicative factor applied to the normalized values. In a
            real model this is a trained weight; defaults to 1.0.
        epsilon: small constant added under the square root to avoid
            dividing by zero.

    Returns:
        `scale * x / sqrt(mean(x**2) + epsilon)`.
    """
    # Mean of the squared values (not a variance: the mean is not subtracted).
    mean_square = keras.ops.mean(keras.ops.power(x, 2), axis=-1, keepdims=True)
    return scale * x / keras.ops.sqrt(mean_square + epsilon)

# Try rms_norm on a small sample vector.
x = np.array([1, 2, 3])
rms_norm(x)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.46291, 0.92582, 1.38873], dtype=float32)>

In [None]:
from keras_nlp.src.models.llama.llama_layernorm import LlamaLayerNorm

# Apply KerasNLP's LlamaLayerNorm layer to x.
llama_norm = LlamaLayerNorm()
llama_norm(x)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.46291, 0.92582, 1.38873], dtype=float32)>

### SwiGLU 활성화 함수

In [None]:
# Assume the feed-forward network's input has shape (10, 4096),
# with 11,008 units and an embedding dimension of 4,096.
x = keras.ops.ones((10, 4096))
# Branch with the SiLU activation.
x1 = layers.Dense(11008, activation='silu', use_bias=False)(x)
# Linear branch without an activation.
x2 = layers.Dense(11008, use_bias=False)(x)
# Element-wise product of the two branches.
x = x1 * x2
# Project back to the embedding dimension.
x = layers.Dense(4096, use_bias=False)(x)
x

<tf.Tensor: shape=(10, 4096), dtype=float32, numpy=
array([[-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       ...,
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087]], dtype=float32)>

### 라마 2 구현하기

In [None]:
from keras_nlp.src.models.llama.llama_attention import LlamaAttention

def llama_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                  interm_dim, hidden_dim):
    """Apply one decoder block to `x`.

    The block consists of two sub-blocks, each wrapped in a skip
    connection: layer norm -> attention, then layer norm -> gated
    feed-forward network.

    Args:
        x: input to the block.
        padding_mask: padding mask from which the attention mask is built.
        num_query_heads: number of query heads passed to LlamaAttention.
        num_key_value_heads: number of key/value heads passed to
            LlamaAttention.
        interm_dim: number of units in the two inner Dense layers.
        hidden_dim: number of units in the output Dense layer; its output
            is added to the skip connection.

    Returns:
        The output of the decoder block.
    """
    # Compute the attention mask from the padding mask.
    mask = AttentionMask()(padding_mask)

    # Attention sub-block: normalize, attend, then add the skip connection.
    attn_input = LlamaLayerNorm()(x)
    attention = LlamaAttention(
        num_query_heads=num_query_heads,
        num_key_value_heads=num_key_value_heads,
        dropout=0.0,
    )
    attn_output = attention(attn_input, mask) + x

    # Feed-forward sub-block: normalize, gate a linear branch with a SiLU
    # branch, project to hidden_dim, then add the skip connection.
    ffn_input = LlamaLayerNorm()(attn_output)
    gate = layers.Dense(interm_dim, activation='silu', use_bias=False)(ffn_input)
    linear = layers.Dense(interm_dim, use_bias=False)(ffn_input)
    projected = layers.Dense(hidden_dim, use_bias=False)(gate * linear)
    return projected + attn_output

In [None]:
# LLaMa 2
# Hyperparameters used to assemble the model below.
vocab_size = 32000
num_layers = 32
num_query_heads = 32
num_key_value_heads = 32
interm_dim = 11008
hidden_dim = 4096

# Two variable-length inputs: the token IDs and the padding mask.
token_ids = keras.Input(shape=(None,))
padding_mask = keras.Input(shape=(None,))

# The same embedding layer is called again at the end with reverse=True
# to produce the model outputs.
token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, hidden_dim,
                                                             tie_weights=False)
x = token_embedding_layer(token_ids)

# Stack num_layers decoder blocks.
for _ in range(num_layers):
    x = llama_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                      interm_dim, hidden_dim)

# Final normalization, then the reverse pass through the embedding layer.
x = LlamaLayerNorm()(x)
outputs = token_embedding_layer(x, reverse=True)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=(outputs))
model.summary(line_length=100)

### 케라스에서 LLaMa-2 사용하기

**라마-2, 라마-3 모델을 사용하려면 먼저 메타에 사용 허가를 얻어야 합니다. 자세한 내용은 도서를 참고하세요.**

캐글에서 라마-2 모델을 다운로드하려면 캐글 API 토큰을 생성하여 ~/.kaggle/ 디렉토리에 저장하세요.

In [None]:
!mkdir ~/.kaggle/
!mv kaggle.json ~/.kaggle/

In [None]:
# Load the 'llama2_7b_en' preset with dtype float16 and print its summary.
llama2 = keras_nlp.models.LlamaCausalLM.from_preset('llama2_7b_en', dtype='float16')
llama2.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/metadata.json...
100%|██████████| 142/142 [00:00<00:00, 131kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/config.json...
100%|██████████| 604/604 [00:00<00:00, 551kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/model.weights.h5...
100%|██████████| 12.6G/12.6G [11:44<00:00, 19.1MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/preprocessor.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/tokenizer.json...
100%|██████████| 397/397 [00:00<00:00, 884kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/assets/tokenizer/voc

In [None]:
# Generate text from the prompt with max_length=20.
llama2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay humble\nI’ve been a little busy lately so I'

In [None]:
# Drop the reference to the model (presumably to free memory before the
# next section).
del llama2

### 센텐스피스 토크나이저

In [None]:
# Load the tokenizer of the 'llama2_7b_en' preset.
llama_tokenizer = keras_nlp.models.LlamaTokenizer.from_preset('llama2_7b_en')

In [None]:
# Convert the sample sentence into token IDs.
token_ids = llama_tokenizer.tokenize('stay hungry, stay')
token_ids

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 7952,  9074, 14793, 29892,  7952], dtype=int32)>

In [None]:
# Print the token string for each ID, separated by spaces
# (each `ids` is one element of token_ids).
for ids in token_ids:
    print(llama_tokenizer.id_to_token(ids), end=' ')

▁stay ▁hun gry , ▁stay 

In [None]:
# Tokenize the same word with different capitalization.
llama_tokenizer.tokenize('Hello hello')

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([15043, 22172], dtype=int32)>

In [None]:
# Convert the token IDs back into a string.
llama_tokenizer.detokenize(token_ids)

<tf.Tensor: shape=(), dtype=string, numpy=b'stay hungry, stay'>