# 03-2 트랜스포머를 사용한 텍스트 생성

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/hm-dl/blob/main/03-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

코랩에서 이 노트북을 실행하려면 High-RAM CPU 런타임을 사용해야 합니다.

이 절의 코드를 실행하려면 `keras-nlp` 패키지와 허깅페이스 `transformers` 패키지를 위한 `tf-keras`를 설치해야 합니다.

In [1]:
pip install -U tensorflow keras-nlp tf-keras

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-nlp
  Downloading keras_nlp-0.12.1-py3-none-any.whl (570 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m570.5/570.5 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting tf-keras
  Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m87.3 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux

In [2]:
import keras
import keras_nlp

keras.__version__, keras_nlp.__version__

('3.3.3', '0.12.1')

## 트랜스포머 디코더

## 코잘 마스킹

In [3]:
import keras
from keras import layers
import keras_nlp

In [2]:
def make_causal_mask(seq_len):
    """Build a lower-triangular boolean causal mask.

    Args:
        seq_len: Number of positions in the sequence.

    Returns:
        A boolean tensor of shape (seq_len, seq_len) whose entry [i, j] is
        True when j <= i, i.e. position i may look at itself and at earlier
        positions only.
    """
    positions = keras.ops.arange(seq_len)
    # Compare a column of row indices against the row of column indices;
    # broadcasting expands the comparison to a full square matrix.
    return keras.ops.expand_dims(positions, axis=-1) >= positions

In [5]:
causal_mask = make_causal_mask(5)
causal_mask

<tf.Tensor: shape=(5, 5), dtype=bool, numpy=
array([[ True, False, False, False, False],
       [ True,  True, False, False, False],
       [ True,  True,  True, False, False],
       [ True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True]])>

In [6]:
# Toy padding mask for one sequence of length 5: the last two positions are
# masked out (0), the first three are kept (1).
padding_mask = [1, 1, 1, 0, 0]
# The element-wise minimum keeps a position only when both masks allow it.
# The 1-D padding mask broadcasts across the rows of the causal mask, so the
# last two columns become 0 in every row.
keras.ops.minimum(causal_mask, padding_mask)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0]], dtype=int32)>

In [3]:
def make_attention_mask(padding_mask):
    """Combine a causal mask with a padding mask into one attention mask.

    Args:
        padding_mask: 2-D mask of shape (batch_size, seq_len), e.g. (2, 5).
            Zero entries mark positions that must not be attended to.

    Returns:
        A tensor of shape (batch_size, seq_len, seq_len): the element-wise
        minimum of the batched causal mask and the padding mask.
    """
    n_batch, n_tokens = keras.ops.shape(padding_mask)
    # Square (n_tokens, n_tokens) causal mask, repeated along a new batch
    # axis to give (n_batch, n_tokens, n_tokens).
    batched_causal = keras.ops.broadcast_to(
        make_causal_mask(n_tokens), (n_batch, n_tokens, n_tokens))
    # Insert a middle axis -> (n_batch, 1, n_tokens) so the padding mask
    # broadcasts over every row of the causal mask.
    key_padding = keras.ops.expand_dims(padding_mask, axis=1)
    return keras.ops.minimum(batched_causal, key_padding)

In [4]:
make_attention_mask([[1, 1, 0, 0, 0], [1, 1, 1, 1, 0]])

<tf.Tensor: shape=(2, 5, 5), dtype=int32, numpy=
array([[[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0]],

       [[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]]], dtype=int32)>

## 디코더 구현하기

In [7]:
class AttentionMask(keras.Layer):
    """Layer wrapper around `make_attention_mask`.

    The decoder functions in this notebook call it on the `keras.Input`
    padding mask while building functional models.
    """

    def call(self, padding_mask):
        """Return the combined causal + padding mask for `padding_mask`."""
        combined_mask = make_attention_mask(padding_mask)
        return combined_mask

In [11]:
def transformer_decoder(x, padding_mask, dropout,
                        activation='relu', norm_first=True):
    """Apply one Transformer decoder block (self-attention + feed-forward).

    Args:
        x: Input tensor; its last axis is the model width.
        padding_mask: Padding mask handed to `AttentionMask` to build the
            combined causal/padding attention mask.
        dropout: Dropout rate used inside the attention layer and after each
            sub-block.
        activation: Activation of the first feed-forward Dense layer.
        norm_first: If True, layer normalization is applied before each
            sub-block (pre-norm); otherwise after the skip connection
            (post-norm).

    Returns:
        A tensor with the same last-axis width as `x`.

    Note:
        `num_heads` is still read from the notebook-level variable of that
        name, so it must be defined before this function is called.
    """
    # Build the combined causal + padding attention mask.
    attention_mask = AttentionMask()(padding_mask)
    # Take the model width from the input rather than from the notebook
    # global `hidden_dim`, which later cells reassign for other models. The
    # skip connections below need the width to match `x` anyway.
    hidden_dim = x.shape[-1]
    key_dim = hidden_dim // num_heads

    # --- Self-attention sub-block ---
    residual = x
    if norm_first:
        x = layers.LayerNormalization()(x)
    x = layers.MultiHeadAttention(num_heads, key_dim, dropout=dropout)(
        query=x, value=x, attention_mask=attention_mask)
    x = layers.Dropout(dropout)(x)
    # Skip connection
    x = x + residual
    if not norm_first:
        x = layers.LayerNormalization()(x)

    # --- Position-wise feed-forward sub-block ---
    residual = x
    if norm_first:
        x = layers.LayerNormalization()(x)
    # Expand to 4x the model width, then project back.
    x = layers.Dense(hidden_dim * 4, activation=activation)(x)
    x = layers.Dense(hidden_dim)(x)
    x = layers.Dropout(dropout)(x)
    # Skip connection
    x = x + residual
    if not norm_first:
        x = layers.LayerNormalization()(x)
    return x

## GPT-2

In [12]:
# GPT-2 hyperparameters
vocab_size = 50257
num_layers = 12
num_heads = 12
hidden_dim = 768
dropout = 0.1
activation = 'gelu'
max_seq_len = 1024

# Token ids and the padding mask hold integer data, so declare them as int32
# instead of relying on keras.Input's floating-point default dtype.
token_ids = keras.Input(shape=(None,), dtype='int32')
padding_mask = keras.Input(shape=(None,), dtype='int32')

# The same embedding layer is reused at the end (reverse=True) to map the
# hidden states back to the vocabulary dimension.
token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, hidden_dim)
token_embedding = token_embedding_layer(token_ids)
pos_embedding = keras_nlp.layers.PositionEmbedding(max_seq_len)(token_embedding)

# Sum of token and position embeddings, followed by dropout.
x = token_embedding + pos_embedding
x = layers.Dropout(dropout)(x)
# Stack of decoder blocks (norm_first keeps its default, True).
for _ in range(num_layers):
    x = transformer_decoder(x, padding_mask, dropout, activation)

# Final layer normalization, then projection through the shared embedding.
x = layers.LayerNormalization()(x)
outputs = token_embedding_layer(x, reverse=True)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=(outputs))
model.summary()

In [13]:
gpt2 = keras_nlp.models.GPT2CausalLM.from_preset('gpt2_base_en')
gpt2.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/metadata.json...
100%|██████████| 141/141 [00:00<00:00, 191kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/config.json...
100%|██████████| 484/484 [00:00<00:00, 739kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/model.weights.h5...
100%|██████████| 475M/475M [00:10<00:00, 45.7MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/preprocessor.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/tokenizer.json...
100%|██████████| 448/448 [00:00<00:00, 722kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/assets/tokenizer/vocabulary.json...


In [14]:
gpt2.generate('stay hungry, stay', max_length=6)

'stay hungry, stay thirsty'

In [15]:
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay strong. Stay connected to your family, friends and the world through the power'

In [16]:
# Run the model's preprocessor on a single prompt with a fixed sequence
# length of 10. The returned triple is unpacked as (inputs, target, mask) and
# displayed as the cell output.
inputs, target, mask = gpt2.preprocessor('stay hungry, stay', sequence_length=10)
inputs, target, mask

({'token_ids': <tf.Tensor: shape=(10,), dtype=int32, numpy=
  array([50256, 31712, 14720,    11,  2652, 50256,     0,     0,     0,
             0], dtype=int32)>,
  'padding_mask': <tf.Tensor: shape=(10,), dtype=bool, numpy=
  array([ True,  True,  True,  True,  True,  True, False, False, False,
         False])>},
 <tf.Tensor: shape=(10,), dtype=int32, numpy=
 array([31712, 14720,    11,  2652, 50256,     0,     0,     0,     0,
            0], dtype=int32)>,
 <tf.Tensor: shape=(10,), dtype=bool, numpy=
 array([ True,  True,  True,  True,  True, False, False, False, False,
        False])>)

In [17]:
# Map every id in `target` back to its token string and print the tokens on
# one line, separated by spaces.
gpt2_tokenizer = gpt2.preprocessor.tokenizer
for ids in target:
    print(gpt2_tokenizer.id_to_token(ids), end=' ')

stay Ġhungry , Ġstay <|endoftext|> ! ! ! ! ! 

In [18]:
gpt2_tokenizer = gpt2.preprocessor.tokenizer
gpt2_tokenizer.end_token_id, gpt2_tokenizer.pad_token_id

(50256, 0)

In [19]:
inputs = gpt2.preprocessor.generate_preprocess(['stay hungry, stay'], sequence_length=10)
inputs

{'token_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
 array([[50256, 31712, 14720,    11,  2652,     0,     0,     0,     0,
             0]], dtype=int32)>,
 'padding_mask': <tf.Tensor: shape=(1, 10), dtype=bool, numpy=
 array([[ True,  True,  True,  True,  True, False, False, False, False,
         False]])>}

In [20]:
outputs = gpt2.generate_function(inputs)
outputs

{'token_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
 array([[50256, 31712, 14720,    11,  2652, 47124,   198,   198,    40,
          1101]], dtype=int32)>,
 'padding_mask': <tf.Tensor: shape=(1, 10), dtype=bool, numpy=
 array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
          True]])>}

In [21]:
gpt2.preprocessor.generate_postprocess(outputs)

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b"stay hungry, stay thirsty\n\nI'm"], dtype=object)>

## 샘플러

In [22]:
# Top-k sampling with k=10 and a low temperature (0.5); the seed is fixed so
# the generated text is repeatable.
sampler = keras_nlp.samplers.TopKSampler(k=10, temperature=0.5, seed=42)
# Re-compiling the model is how the sampler used by generate() is swapped.
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [23]:
sampler = keras_nlp.samplers.TopKSampler(k=10, temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)



"stay hungry, stay fit. I know you'll be disappointed at our current state of food choices"

### top-p 샘플링

In [24]:
sampler = keras_nlp.samplers.TopPSampler(p=0.8, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)



'stay hungry, stay tired and you can always check your inbox at 8am.\n\n–'

In [25]:
sampler = keras_nlp.samplers.TopPSampler(p=0.8, k=1000, temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay having desire again way more timely_rss <- episode interval background enisonf'

In [26]:
gpt2.compile(sampler='top_p')
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

### 그리디 샘플링과 랜덤 샘플링

In [27]:
gpt2.compile(sampler='greedy')
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [28]:
sampler = keras_nlp.samplers.RandomSampler(temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay "(mob]-lining Often log freight seatedlarg freshwater brass advocate Miracle Lenabound'

### 빔 샘플링과 대조 샘플링

In [29]:
sampler = keras_nlp.samplers.BeamSampler(num_beams=10, temperature=5)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay hydrated stay hydrated\n\nStay hydrated stay hydrated\n\n'

In [30]:
sampler = keras_nlp.samplers.ContrastiveSampler(k=5, alpha=0.2)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [31]:
sampler = keras_nlp.samplers.ContrastiveSampler(k=5, alpha=0.8)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty\n\n\nA lot has changed in the last few years. The number'

## 허깅페이스 `transformers` 라이브러리 사용하기

In [32]:
from transformers import pipeline, set_seed

set_seed(42)
hf_gpt1 = pipeline('text-generation', model='openai-community/openai-gpt')
hf_gpt1('stay hungry, stay', max_length=20, truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/479M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/816k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/458k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

[{'generated_text': 'stay hungry, stay clear of the water. " \n " i am, " she says with a'}]

In [33]:
set_seed(42)
hf_gpt1('stay hungry, stay', max_length=20, truncation=True, num_return_sequences=3)

[{'generated_text': 'stay hungry, stay clear of the water. " \n " i am, " she says with a'},
 {'generated_text': "stay hungry, stay safe, and get back to our homes. it's what they want. \n"},
 {'generated_text': 'stay hungry, stay put, " the woman said. " eat. stay awake. if you have'}]

In [34]:
from transformers import AutoTokenizer, AutoModelForCausalLM

hf_gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [35]:
prep_data= hf_gpt2_tokenizer('stay hungry, stay', return_tensors='pt')
prep_data

{'input_ids': tensor([[31712, 14720,    11,  2652]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [36]:
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20)
outputs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[31712, 14720,    11,  2652, 47124,    11,  2652, 47124,    11,  2652,
         47124,    11,  2652, 47124,    11,  2652, 47124,    11,  2652, 47124]])

In [37]:
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty']

In [38]:
hf_gpt2.generation_config.pad_token_id = hf_gpt2_tokenizer.eos_token_id

### top-k와 top-p 샘플링

In [39]:
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           do_sample=True)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay quiet, stay in the dark, stay in a situation, stay in front of']

In [40]:
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           do_sample=True, top_k=5, temperature=5.0)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay clean, eat fresh. The best part? They are all here! They have']

In [41]:
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           do_sample=True, top_p=0.8, temperature=5.0)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay quiet for that little thing that will help to change everything for everyone here as this']

### 빔 샘플링과 대조 샘플링

In [42]:
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           num_beams=5)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay hungry, stay hungry, stay hungry, stay hungry, stay hungry, stay hungry']

In [43]:
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           num_beams=5, top_k=20,
                           do_sample=True, temperature=5.0)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay warm and get the best possible health care at the best prices that suits your needs']

In [44]:
set_seed(42)
# Only penalty_alpha is passed here; top_k is left at its default.
# NOTE(review): contrastive search presumably relies on that default top_k
# being greater than 1 -- confirm against the installed transformers version.
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           penalty_alpha=0.8)
hf_gpt2_tokenizer.batch_decode(outputs)

["stay hungry, stay out of trouble\n\n\nDon't want us to be able to do that?"]

In [45]:
from transformers import GenerationConfig

GenerationConfig().to_dict()

{'max_length': 20,
 'max_new_tokens': None,
 'min_length': 0,
 'min_new_tokens': None,
 'early_stopping': False,
 'max_time': None,
 'stop_strings': None,
 'do_sample': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'penalty_alpha': None,
 'use_cache': True,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'min_p': None,
 'typical_p': 1.0,
 'epsilon_cutoff': 0.0,
 'eta_cutoff': 0.0,
 'diversity_penalty': 0.0,
 'repetition_penalty': 1.0,
 'encoder_repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'bad_words_ids': None,
 'force_words_ids': None,
 'renormalize_logits': False,
 'constraints': None,
 'forced_bos_token_id': None,
 'forced_eos_token_id': None,
 'remove_invalid_values': False,
 'exponential_decay_length_penalty': None,
 'suppress_tokens': None,
 'begin_suppress_tokens': None,
 'forced_decoder_ids': None,
 'sequence_bias': None,
 'guidance_scale': None,
 'low_memory': None,
 'watermarking_config': None,
 'num_return_sequences': 1,
 'output_attent

In [46]:
GenerationConfig(do_sample=True, top_k=10).get_generation_mode()

<GenerationMode.SAMPLE: 'sample'>

## LLaMa-2

### 로터리 위치 임베딩

In [47]:
# 토큰 임베딩 크기
embed_dim = 4096

def rotary_position_embedding(inputs, token_pos):
    """Apply a rotary position embedding to token embeddings.

    The last axis of `inputs` is split into two halves (x1, x2). Each pair
    (x1[i], x2[i]) is rotated by the angle token_pos * theta_i, where
    theta_i = 10000 ** (-2i / dim) and `dim` is the size of the last axis.

    Args:
        inputs: Embedding tensor; the size of its last axis must be even.
            Leading (batch/sequence) axes are allowed.
        token_pos: Position index m of the token(s). A scalar, or a tensor
            that broadcasts against the halves of `inputs`.

    Returns:
        A tensor with the same shape as `inputs`.
    """
    # Read the embedding size from the input instead of the notebook global
    # `embed_dim`, so the function works for any even width.
    dim = keras.ops.shape(inputs)[-1]
    # Rotation frequencies theta_i.
    freqs = keras.ops.arange(0, dim, 2, dtype='float32') / dim
    inverse_freqs = 1 / (10000**freqs)
    # Rotation angles m * theta_i.
    angles = token_pos * inverse_freqs
    cos_emb = keras.ops.cos(angles)
    sin_emb = keras.ops.sin(angles)
    # Split along the embedding axis (not axis 0) so batched inputs work too.
    x1, x2 = keras.ops.split(inputs, 2, axis=-1)
    # 2-D rotation applied to every (x1, x2) pair.
    new_x1 = x1 * cos_emb - x2 * sin_emb
    new_x2 = x1 * sin_emb + x2 * cos_emb
    return keras.ops.concatenate((new_x1, new_x2), axis=-1)

# 가상의 토큰 임베딩
inputs = keras.ops.ones(embed_dim)
# 두 번째 위치에 있는 토큰에 로터리 위치 임베딩을 적용합니다.
rotary_position_embedding(inputs, 1)

<tf.Tensor: shape=(4096,), dtype=float32, numpy=
array([-0.30116874, -0.2949654 , -0.28878427, ...,  1.0001013 ,
        1.0001009 ,  1.0001005 ], dtype=float32)>

In [48]:
rotary_embedding = keras_nlp.layers.RotaryEmbedding()
rotary_embedding(keras.ops.ones((1, 2, embed_dim)))

<tf.Tensor: shape=(1, 2, 4096), dtype=float32, numpy=
array([[[ 1.        ,  1.        ,  1.        , ...,  1.        ,
          1.        ,  1.        ],
        [-0.30116874, -0.2949654 , -0.28878427, ...,  1.0001013 ,
          1.0001009 ,  1.0001005 ]]], dtype=float32)>

### RMS 정규화

In [9]:
import numpy as np

def rms_norm(x, scale=1.0, epsilon=1e-6):
    """Normalize `x` by its root mean square along the last axis.

    Args:
        x: Input array or tensor.
        scale: Multiplicative gain applied after normalization. In a real
            layer this is a trained weight; the default 1.0 leaves the
            normalized values unchanged.
        epsilon: Small constant added inside the square root for numerical
            stability.

    Returns:
        `scale * x / sqrt(mean(x ** 2) + epsilon)`, with the mean taken over
        the last axis.
    """
    # Mean of squares (not a variance: the mean is not subtracted first).
    mean_square = keras.ops.mean(keras.ops.power(x, 2), axis=-1, keepdims=True)
    return scale * x / keras.ops.sqrt(mean_square + epsilon)

x = np.array([1, 2, 3])
rms_norm(x)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.46291, 0.92582, 1.38873], dtype=float32)>

In [10]:
from keras_nlp.src.models.llama.llama_layernorm import LlamaLayerNorm

llama_norm = LlamaLayerNorm()
llama_norm(x)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.46291, 0.92582, 1.38873], dtype=float32)>

### SwiGLU 활성화 함수

In [51]:
# Assume the feed-forward network receives an input of shape (10, 4096),
# uses 11,008 hidden units, and has an embedding dimension of 4,096.
x = keras.ops.ones((10, 4096))
# Gate branch: linear projection followed by the SiLU activation.
x1 = layers.Dense(11008, activation='silu', use_bias=False)(x)
# Second branch: plain linear projection of the same input.
x2 = layers.Dense(11008, use_bias=False)(x)
# Element-wise product of the two branches.
x = x1 * x2
# Project back down to the embedding dimension.
x = layers.Dense(4096, use_bias=False)(x)
x

<tf.Tensor: shape=(10, 4096), dtype=float32, numpy=
array([[-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       ...,
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087],
       [-0.6577311 , -0.4733592 ,  0.26339507, ...,  0.08605745,
        -0.01683542, -0.24151087]], dtype=float32)>

### 라마 2 구현하기

In [5]:
from keras_nlp.src.models.llama.llama_attention import LlamaAttention

def llama_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                  interm_dim, hidden_dim):
    """Apply one LLaMA-style decoder block.

    Each of the two sub-blocks is normalized with `LlamaLayerNorm` before it
    runs and wrapped in a skip connection.

    Args:
        x: Input tensor of the block.
        padding_mask: Padding mask handed to `AttentionMask`.
        num_query_heads: Number of query heads for `LlamaAttention`.
        num_key_value_heads: Number of key/value heads for `LlamaAttention`.
        interm_dim: Units of the two parallel feed-forward projections.
        hidden_dim: Units of the final feed-forward projection; it has to
            match the width of `x` for the skip connection to work.

    Returns:
        The output tensor of the block.
    """
    # Combined causal + padding mask.
    attention_mask = AttentionMask()(padding_mask)

    # --- Attention sub-block ---
    attn_in = LlamaLayerNorm()(x)
    attention_layer = LlamaAttention(
        num_query_heads=num_query_heads,
        num_key_value_heads=num_key_value_heads,
        dropout=0.0,
    )
    attn_out = attention_layer(attn_in, attention_mask)
    # Skip connection around attention.
    hidden = attn_out + x

    # --- Gated feed-forward sub-block ---
    ffn_in = LlamaLayerNorm()(hidden)
    gate = layers.Dense(interm_dim, activation='silu', use_bias=False)(ffn_in)
    up = layers.Dense(interm_dim, use_bias=False)(ffn_in)
    gated = gate * up
    ffn_out = layers.Dense(hidden_dim, use_bias=False)(gated)
    # Skip connection around the feed-forward network.
    return ffn_out + hidden

In [11]:
# LLaMA 2 hyperparameters
vocab_size = 32000
num_layers = 32
num_query_heads = 32
num_key_value_heads = 32
interm_dim = 11008
hidden_dim = 4096

# Token ids and the padding mask hold integer data, so declare them as int32
# instead of relying on keras.Input's floating-point default dtype.
token_ids = keras.Input(shape=(None,), dtype='int32')
padding_mask = keras.Input(shape=(None,), dtype='int32')

# tie_weights=False: the reverse (output) projection does not share its
# weights with the input embedding.
token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, hidden_dim,
                                                             tie_weights=False)
x = token_embedding_layer(token_ids)

# Stack of decoder blocks.
for _ in range(num_layers):
    x = llama_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                      interm_dim, hidden_dim)

# Final normalization, then projection back to the vocabulary dimension.
x = LlamaLayerNorm()(x)
outputs = token_embedding_layer(x, reverse=True)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=(outputs))
model.summary(line_length=100)

### 케라스에서 LLaMa-2 사용하기

**라마-2, 라마-3 모델을 사용하려면 먼저 메타에 사용 허가를 얻어야 합니다. 자세한 내용은 도서를 참고하세요.**

캐글에서 라마-2 모델을 다운로드하려면 캐글 API 토큰(`kaggle.json`)을 생성하여 현재 작업 디렉터리에 업로드한 다음, 아래 셀을 실행해 `~/.kaggle/` 디렉터리로 옮기세요.

In [4]:
# -p makes the cell safe to re-run: no error if the directory already exists.
!mkdir -p ~/.kaggle/
!mv kaggle.json ~/.kaggle/
# The token is a credential; restrict it to the current user.
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
llama2 = keras_nlp.models.LlamaCausalLM.from_preset('llama2_7b_en', dtype='float16')
llama2.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/metadata.json...
100%|██████████| 142/142 [00:00<00:00, 131kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/config.json...
100%|██████████| 604/604 [00:00<00:00, 551kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/model.weights.h5...
100%|██████████| 12.6G/12.6G [11:44<00:00, 19.1MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/preprocessor.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/tokenizer.json...
100%|██████████| 397/397 [00:00<00:00, 884kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/llama2/keras/llama2_7b_en/1/download/assets/tokenizer/voc

In [7]:
llama2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay humble\nI’ve been a little busy lately so I'

In [8]:
del llama2

### 센텐스피스 토크나이저

In [9]:
llama_tokenizer = keras_nlp.models.LlamaTokenizer.from_preset('llama2_7b_en')

In [10]:
token_ids = llama_tokenizer.tokenize('stay hungry, stay')
token_ids

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 7952,  9074, 14793, 29892,  7952], dtype=int32)>

In [11]:
for ids in token_ids:
    print(llama_tokenizer.id_to_token(ids), end=' ')

▁stay ▁hun gry , ▁stay 

In [12]:
llama_tokenizer.tokenize('Hello hello')

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([15043, 22172], dtype=int32)>

In [13]:
llama_tokenizer.detokenize(token_ids)

<tf.Tensor: shape=(), dtype=string, numpy=b'stay hungry, stay'>

## LLaMa-3

In [17]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 라마-3 구조 살펴 보기

In [19]:
llama3_pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B")

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:  72%|#######1  | 3.53G/4.92G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
llama3_pipe.model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [None]:
pip install torchinfo

In [None]:
from torchinfo import summary

summary(llama3_pipe.model)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   525,336,576
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      --
│    │    │    └─LlamaSdpaAttention: 4-1                --
│    │    │    │    └─Linear: 5-1                       16,777,216
│    │    │    │    └─Linear: 5-2                       4,194,304
│    │    │    │    └─Linear: 5-3                       4,194,304
│    │    │    │    └─Linear: 5-4                       16,777,216
│    │    │    │    └─LlamaRotaryEmbedding: 5-5         --
│    │    │    └─LlamaMLP: 4-2                          --
│    │    │    │    └─Linear: 5-6                       58,720,256
│    │    │    │    └─Linear: 5-7                       58,720,256
│    │    │    │    └─Linear: 5-8                      

### 라마-3로 텍스트 생성하기

In [None]:
llama3_pipe.model.generation_config.pad_token_id = llama3_pipe.tokenizer.eos_token_id

In [None]:
set_seed(42)
llama3_pipe('stay hungry, stay', max_length=20, truncation=True)

[{'generated_text': 'stay hungry, stay alive," he said. "That\'s what I\'m doing right now."\nH'}]

In [None]:
llama3_pipe.model.generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "max_length": 4096,
  "pad_token_id": 128001,
  "temperature": 0.6,
  "top_p": 0.9
}

In [None]:
set_seed(42)
llama3_pipe('봄이 오면', max_length=20, truncation=True)

[{'generated_text': '봄이 오면, 그때 그때 맞는 옷을 입고, 그때 그'}]

In [None]:
llama3_bllossom = pipeline("text-generation", model="MLP-KTLim/llama-3-Korean-Bllossom-8B")
set_seed(42)
llama3_bllossom('봄이 오면', max_length=20, truncation=True)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:144783 for open-end generation.


[{'generated_text': '봄이 오면, 봄꽃이 피고 향기롭고, 천록과 같은 꽃들이 피어나'}]

## Gemma

### 젬마 구현하기

In [None]:
from keras_nlp.src.models.gemma.gemma_attention import CachedGemmaAttention
from keras_nlp.src.models.gemma.rms_normalization import RMSNormalization

def gemma_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                  interm_dim, hidden_dim, head_dim):
    """Apply one Gemma-style decoder block.

    Each of the two sub-blocks is normalized with `RMSNormalization` before
    it runs and wrapped in a skip connection.

    Args:
        x: Input tensor of the block.
        padding_mask: Padding mask handed to `AttentionMask`.
        num_query_heads: Number of query heads for `CachedGemmaAttention`.
        num_key_value_heads: Number of key/value heads for
            `CachedGemmaAttention`.
        interm_dim: Total intermediate size; each of the two parallel
            feed-forward projections uses `interm_dim // 2` units.
        hidden_dim: Units of the final feed-forward projection; it has to
            match the width of `x` for the skip connection to work.
        head_dim: Per-head dimension for `CachedGemmaAttention`.

    Returns:
        The output tensor of the block.
    """
    # Combined causal + padding mask.
    attention_mask = AttentionMask()(padding_mask)

    # --- Attention sub-block ---
    attn_in = RMSNormalization()(x)
    attention_layer = CachedGemmaAttention(
        head_dim=head_dim,
        num_query_heads=num_query_heads,
        num_key_value_heads=num_key_value_heads,
        dropout=0.0,
    )
    attn_out = attention_layer(attn_in, attention_mask)
    # Skip connection around attention.
    hidden = attn_out + x

    # --- Gated position-wise feed-forward sub-block ---
    ffn_in = RMSNormalization()(hidden)
    half_dim = interm_dim // 2
    gate = layers.Dense(half_dim, activation='gelu', use_bias=False)(ffn_in)
    up = layers.Dense(half_dim, use_bias=False)(ffn_in)
    gated = gate * up
    ffn_out = layers.Dense(hidden_dim, use_bias=False)(gated)
    # Skip connection around the feed-forward network.
    return ffn_out + hidden

In [None]:
# Gemma 2B hyperparameters
vocab_size = 256000
num_layers = 18
num_query_heads = 8
num_key_value_heads = 1
interm_dim = 32768
hidden_dim = 2048
head_dim = 256

# Token ids and the padding mask hold integer data, so declare them as int32
# instead of relying on keras.Input's floating-point default dtype.
token_ids = keras.Input(shape=(None,), dtype='int32')
padding_mask = keras.Input(shape=(None,), dtype='int32')

token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, hidden_dim)
x = token_embedding_layer(token_ids)
# Scale the embeddings by sqrt(hidden_dim). The factor is bound as a default
# argument when the lambda is created, so reassigning the notebook-level
# `hidden_dim` in a later cell cannot silently change this model.
x = layers.Lambda(lambda t, scale=hidden_dim ** 0.5: t * scale)(x)

# Stack of decoder blocks.
for _ in range(num_layers):
    x = gemma_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                      interm_dim, hidden_dim, head_dim)

# Final normalization, then projection through the shared embedding.
x = RMSNormalization()(x)
outputs = token_embedding_layer(x, reverse=True)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=(outputs))
model.summary(line_length=100)

### 젬마 모델 사용하기

In [None]:
gemma = keras_nlp.models.GemmaCausalLM.from_preset('gemma_2b_en')
gemma.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/preprocessor.json...


In [None]:
sampler = keras_nlp.samplers.TopPSampler(p=0.8, seed=42)
gemma.compile(sampler=sampler)
gemma.generate('stay hungry, stay', max_length=20)

'stay hungry, stay foolish.  - steve jobs\nThomas Edison would have been proud of us'

In [None]:
gemma.generate('봄이 오면', max_length=20)

'봄이 오면 전국에서 생강이 생산된다. 전국생강'

In [None]:
gemma_pipe = pipeline("text-generation", model="beomi/gemma-ko-2b")
set_seed(42)
gemma_pipe('봄이 오면', max_length=20, truncation=True)



special_tokens_map.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

[{'generated_text': '봄이 오면서 봄꽃이 피기 시작하면서 봄꽃축제가'}]

## 심화 예제

In [None]:
# LLaMA 3 8B hyperparameters (these match the layer sizes printed for
# `llama3_pipe.model` earlier in this notebook).
vocab_size = 128256
num_layers = 32
num_query_heads = 32
num_key_value_heads = 8
interm_dim = 14336
hidden_dim = 4096

# Token ids and the padding mask hold integer data, so declare them as int32
# instead of relying on keras.Input's floating-point default dtype.
token_ids = keras.Input(shape=(None,), dtype='int32')
padding_mask = keras.Input(shape=(None,), dtype='int32')

# tie_weights=False: the reverse (output) projection does not share its
# weights with the input embedding.
token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, hidden_dim,
                                                             tie_weights=False)
x = token_embedding_layer(token_ids)

# Reuse the LLaMA decoder block defined earlier; only the sizes differ.
for _ in range(num_layers):
    x = llama_decoder(x, padding_mask, num_query_heads, num_key_value_heads,
                      interm_dim, hidden_dim)

# Final normalization, then projection back to the vocabulary dimension.
x = LlamaLayerNorm()(x)
outputs = token_embedding_layer(x, reverse=True)
llama3_keras = keras.Model(inputs=(token_ids, padding_mask),
                           outputs=(outputs))
llama3_keras.summary()