# 05-1 GPT-2

<table align="left"><tr><td>
<a href="https://colab.research.google.com/github/rickiepark/hm-dl/blob/main/05-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="코랩에서 실행하기"/></a>
</td></tr></table>

## 코잘 마스킹

In [1]:
import keras
from keras import layers
import keras_nlp

In [2]:
def make_causal_mask(seq_len):
    """Return a square boolean mask of size ``seq_len`` x ``seq_len``.

    Entry ``[i, j]`` is True when ``i >= j``, so the result is
    lower-triangular (diagonal included).
    """
    positions = keras.ops.arange(seq_len)
    # Reshape the positions into a column so the comparison below
    # broadcasts a column against a row and yields a square matrix.
    as_column = keras.ops.expand_dims(positions, axis=-1)
    return as_column >= positions

In [3]:
# Build the causal mask for a sequence of length 5 and display it.
causal_mask = make_causal_mask(5)
causal_mask

<tf.Tensor: shape=(5, 5), dtype=bool, numpy=
array([[ True, False, False, False, False],
       [ True,  True, False, False, False],
       [ True,  True,  True, False, False],
       [ True,  True,  True,  True, False],
       [ True,  True,  True,  True,  True]])>

In [4]:
# Combine the causal mask with a 1-D padding mask using an element-wise
# minimum; the padding mask is broadcast against every row of causal_mask.
padding_mask = [1, 1, 1, 0, 0]
keras.ops.minimum(causal_mask, padding_mask)

<tf.Tensor: shape=(5, 5), dtype=int32, numpy=
array([[1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 0, 0]], dtype=int32)>

In [5]:
def make_attention_mask(padding_mask):
    """Merge a causal mask with a batch of padding masks.

    ``padding_mask`` is unpacked as a 2-D ``(batch_size, seq_len)`` input.
    The result is the element-wise minimum of the batched causal mask and
    the padding mask, with shape ``(batch_size, seq_len, seq_len)``.
    """
    # Example used in the comments below: padding_mask has shape (2, 5).
    batch_size, seq_len = keras.ops.shape(padding_mask)
    # The (5, 5) causal mask is repeated along a new batch axis -> (2, 5, 5).
    batched_causal = keras.ops.broadcast_to(
        make_causal_mask(seq_len), (batch_size, seq_len, seq_len))
    # Insert a middle axis -> (2, 1, 5) so the padding mask broadcasts
    # against the batched causal mask.
    expanded_padding = keras.ops.expand_dims(padding_mask, axis=1)
    return keras.ops.minimum(batched_causal, expanded_padding)

In [6]:
# Try the helper on a batch of two padding masks of length 5.
make_attention_mask([[1, 1, 0, 0, 0], [1, 1, 1, 1, 0]])

<tf.Tensor: shape=(2, 5, 5), dtype=int32, numpy=
array([[[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0]],

       [[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0]]], dtype=int32)>

## 디코더 구현하기

In [7]:
class AttentionMask(keras.Layer):
    """Keras layer that wraps ``make_attention_mask``."""

    def call(self, padding_mask):
        """Return the attention mask computed from ``padding_mask``."""
        attention_mask = make_attention_mask(padding_mask)
        return attention_mask

In [8]:
def transformer_decoder(x, padding_mask, dropout,
                        activation='relu', norm_first=True):
    """Apply one decoder block (self-attention + feed-forward) to ``x``.

    Args:
        x: input tensor of the block.
        padding_mask: padding mask handed to ``AttentionMask`` to build the
            attention mask.
        dropout: dropout rate used inside the attention layer and after
            each sub-block.
        activation: activation of the first feed-forward ``Dense`` layer.
        norm_first: if True, layer normalization is applied before each
            sub-block; otherwise it is applied after each skip connection.

    Returns:
        The output tensor of the block.

    NOTE(review): ``hidden_dim`` and ``num_heads`` are not parameters; they
    are read from module-level globals that must be defined before this
    function is called.
    """
    norm_last = not norm_first

    # Attention mask derived from the padding mask.
    attention_mask = AttentionMask()(padding_mask)

    # ---- Sub-block 1: multi-head self-attention ----
    shortcut = x
    key_dim = hidden_dim // num_heads  # size of each attention head
    if norm_first:
        x = layers.LayerNormalization()(x)
    self_attention = layers.MultiHeadAttention(num_heads, key_dim, dropout=dropout)
    x = self_attention(query=x, value=x, attention_mask=attention_mask)
    x = layers.Dropout(dropout)(x)
    x = x + shortcut  # skip connection
    if norm_last:
        x = layers.LayerNormalization()(x)

    # ---- Sub-block 2: position-wise feed-forward network ----
    shortcut = x
    if norm_first:
        x = layers.LayerNormalization()(x)
    expand = layers.Dense(hidden_dim * 4, activation=activation)
    x = expand(x)
    project = layers.Dense(hidden_dim)
    x = project(x)
    x = layers.Dropout(dropout)(x)
    x = x + shortcut  # skip connection
    if norm_last:
        x = layers.LayerNormalization()(x)
    return x

## GPT-2

In [9]:
# Hyperparameters of the GPT-2 model built in this cell.
vocab_size = 50257
num_layers = 12
num_heads = 12
hidden_dim = 768
dropout = 0.1
activation = 'gelu'
max_seq_len = 1024

# Token ids and the padding mask carry integer values, so declare them as
# int32 inputs instead of falling back to keras.Input's default float dtype.
# The names label the two inputs in the model summary.
token_ids = keras.Input(shape=(None,), dtype='int32', name='token_ids')
padding_mask = keras.Input(shape=(None,), dtype='int32', name='padding_mask')

# The same embedding layer is reused at the end (reverse=True) to turn the
# final hidden states back into vocabulary-sized outputs.
token_embedding_layer = keras_nlp.layers.ReversibleEmbedding(vocab_size, hidden_dim)
token_embedding = token_embedding_layer(token_ids)
pos_embedding = keras_nlp.layers.PositionEmbedding(max_seq_len)(token_embedding)

# Sum of token and position embeddings, followed by dropout.
x = token_embedding + pos_embedding
x = layers.Dropout(dropout)(x)
# Stack num_layers decoder blocks (norm_first keeps its default of True).
for _ in range(num_layers):
    x = transformer_decoder(x, padding_mask, dropout, activation)

# Final layer normalization, then project back to the vocabulary.
x = layers.LayerNormalization()(x)
outputs = token_embedding_layer(x, reverse=True)
model = keras.Model(inputs=(token_ids, padding_mask),
                    outputs=outputs)
model.summary()

In [10]:
# Load the GPT-2 causal language model from the 'gpt2_base_en' preset
# and print its summary.
gpt2 = keras_nlp.models.GPT2CausalLM.from_preset('gpt2_base_en')
gpt2.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/config.json...


100%|██████████| 484/484 [00:00<00:00, 1.08MB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/model.weights.h5...


100%|██████████| 475M/475M [00:31<00:00, 15.9MB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/tokenizer.json...


100%|██████████| 448/448 [00:00<00:00, 878kB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/assets/tokenizer/vocabulary.json...


100%|██████████| 0.99M/0.99M [00:01<00:00, 746kB/s]


Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/assets/tokenizer/merges.txt...


100%|██████████| 446k/446k [00:01<00:00, 425kB/s]


In [11]:
# Generate a continuation of the prompt with max_length=6.
gpt2.generate('stay hungry, stay', max_length=6)

'stay hungry, stay thirsty'

In [12]:
# Same prompt with a larger max_length of 20.
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay healthy and stay healthy\n\nStay healthy is a good idea. It helps'

In [13]:
# Call the model's preprocessor on the prompt with sequence_length=10.
# The three returned values are unpacked as inputs, target and mask,
# then displayed.
inputs, target, mask = gpt2.preprocessor('stay hungry, stay', sequence_length=10)
inputs, target, mask

({'token_ids': <tf.Tensor: shape=(10,), dtype=int32, numpy=
  array([50256, 31712, 14720,    11,  2652, 50256,     0,     0,     0,
             0], dtype=int32)>,
  'padding_mask': <tf.Tensor: shape=(10,), dtype=bool, numpy=
  array([ True,  True,  True,  True,  True,  True, False, False, False,
         False])>},
 <tf.Tensor: shape=(10,), dtype=int32, numpy=
 array([31712, 14720,    11,  2652, 50256,     0,     0,     0,     0,
            0], dtype=int32)>,
 <tf.Tensor: shape=(10,), dtype=bool, numpy=
 array([ True,  True,  True,  True,  True, False, False, False, False,
        False])>)

In [14]:
# Take the tokenizer attached to the preprocessor and print the token
# string for every id in target, space-separated on a single line.
gpt2_tokenizer = gpt2.preprocessor.tokenizer
for ids in target:
    print(gpt2_tokenizer.id_to_token(ids), end=' ')

stay Ġhungry , Ġstay <|endoftext|> ! ! ! ! ! 

In [15]:
# Preprocess a one-element batch of prompts for generation
# (sequence_length=10) and display the result.
inputs = gpt2.preprocessor.generate_preprocess(['stay hungry, stay'], sequence_length=10)
inputs

{'token_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
 array([[50256, 31712, 14720,    11,  2652,     0,     0,     0,     0,
             0]], dtype=int32)>,
 'padding_mask': <tf.Tensor: shape=(1, 10), dtype=bool, numpy=
 array([[ True,  True,  True,  True,  True, False, False, False, False,
         False]])>}

In [16]:
# Run the model's generate_function on the preprocessed inputs.
# NOTE: this rebinds the name `outputs` in the kernel namespace.
outputs = gpt2.generate_function(inputs)
outputs

{'token_ids': <tf.Tensor: shape=(1, 10), dtype=int32, numpy=
 array([[50256, 31712, 14720,    11,  2652,  5448,    11,   290,  2652,
           287]], dtype=int32)>,
 'padding_mask': <tf.Tensor: shape=(1, 10), dtype=bool, numpy=
 array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
          True]])>}

In [17]:
# Convert the generated outputs back to text.
gpt2.preprocessor.generate_postprocess(outputs)

['stay hungry, stay healthy, and stay in']

## 토큰 샘플링

In [18]:
# Top-k sampler (k=10) with temperature 0.5 and a fixed seed; recompile
# the model with it and generate.
sampler = keras_nlp.samplers.TopKSampler(k=10, temperature=0.5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [19]:
# Same top-k sampler (k=10, seed=42) but with temperature raised to 5.
sampler = keras_nlp.samplers.TopKSampler(k=10, temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)



"stay hungry, stay fit. I know you'll be disappointed at our current state of food choices"

### top-p 샘플링

In [20]:
# Top-p sampler with p=0.8 and a fixed seed.
sampler = keras_nlp.samplers.TopPSampler(p=0.8, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)



'stay hungry, stay tired and you can always check your inbox at 8am.\n\n–'

In [21]:
# Top-p sampler (p=0.8) that additionally sets k=1000 and temperature=5.
sampler = keras_nlp.samplers.TopPSampler(p=0.8, k=1000, temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay exposed hot guys reeeeeie fer cant lay outside whats de eye old ro'

In [22]:
# Select the sampler by its string name 'top_p' instead of passing an
# instance.
gpt2.compile(sampler='top_p')
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

### 그리디 샘플링과 랜덤 샘플링

In [23]:
# Select the sampler by its string name 'greedy'.
gpt2.compile(sampler='greedy')
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [24]:
# Random sampler with temperature 5 and a fixed seed.
sampler = keras_nlp.samplers.RandomSampler(temperature=5, seed=42)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay "(mob]-lining Often log freight seatedlarg freshwater brass advocate Miracle Lenabound'

### 빔 샘플링과 대조 샘플링

In [25]:
# Beam sampler with 10 beams and temperature 5 (no seed is passed here).
sampler = keras_nlp.samplers.BeamSampler(num_beams=10, temperature=5)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay hydrated stay hydrated\n\nStay hydrated stay hydrated\n\n'

In [26]:
# Contrastive sampler with k=5 and alpha=0.2.
sampler = keras_nlp.samplers.ContrastiveSampler(k=5, alpha=0.2)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay'

In [27]:
# Contrastive sampler with the same k=5 but alpha raised to 0.8.
sampler = keras_nlp.samplers.ContrastiveSampler(k=5, alpha=0.8)
gpt2.compile(sampler=sampler)
gpt2.generate('stay hungry, stay', max_length=20)

'stay hungry, stay thirsty\n\n\nA lot has changed in the last few years. The number'

## 허깅페이스 `transformers` 라이브러리 사용하기

In [28]:
from transformers import pipeline, set_seed

# Fix the random seed, build a text-generation pipeline for the
# 'openai-community/openai-gpt' model, and generate from the prompt.
set_seed(42)
hf_gpt1 = pipeline('text-generation', model='openai-community/openai-gpt')
hf_gpt1('stay hungry, stay', max_length=20, truncation=True)

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/479M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/816k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/458k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Device set to use cuda:0


[{'generated_text': 'stay hungry, stay clean - if our families don\'t come back, " i finished. " of'}]

In [29]:
# Reset the seed and request three generated sequences in a single call.
set_seed(42)
hf_gpt1('stay hungry, stay', max_length=20, truncation=True, num_return_sequences=3)

[{'generated_text': 'stay hungry, stay clean - if our families don\'t come back, " i finished. " of'},
 {'generated_text': "stay hungry, stay alive, but i don't feel like i should have to take food from you"},
 {'generated_text': 'stay hungry, stay busy, and keep the heat off while you enjoy it. " \n " i'}]

In [30]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal-LM model for the "gpt2" checkpoint.
hf_gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
hf_gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [31]:
# Tokenize the prompt and return the encoded fields as PyTorch tensors.
prep_data = hf_gpt2_tokenizer(
    'stay hungry, stay',
    return_tensors='pt',
)
prep_data

{'input_ids': tensor([[31712, 14720,    11,  2652]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [32]:
# Generate with default settings apart from max_length=20; the tokenizer
# output is unpacked into keyword arguments.
# NOTE: this rebinds the name `outputs` in the kernel namespace.
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20)
outputs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[31712, 14720,    11,  2652, 47124,    11,  2652, 47124,    11,  2652,
         47124,    11,  2652, 47124,    11,  2652, 47124,    11,  2652, 47124]])

In [33]:
# Decode the generated token ids back into text.
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty, stay thirsty']

In [34]:
# Set pad_token_id explicitly to the tokenizer's EOS token id in the
# model's generation config.
hf_gpt2.generation_config.pad_token_id = hf_gpt2_tokenizer.eos_token_id

### top-k와 top-p 샘플링

In [35]:
# Enable sampling (do_sample=True) with no other sampling options set.
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           do_sample=True)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay quiet, stay in the dark, stay in a situation, stay in front of']

In [36]:
# Sampling with top_k=5 and temperature=5.0.
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           do_sample=True, top_k=5, temperature=5.0)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay clean, eat fresh. The best part? They are all here! They have']

In [37]:
# Sampling with top_p=0.8 and temperature=5.0.
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           do_sample=True, top_p=0.8, temperature=5.0)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay quiet for that little thing that will help to change everything for everyone here as this']

### 빔 샘플링과 대조 샘플링

In [38]:
# Generation with num_beams=5 and sampling left at its default.
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           num_beams=5)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay hungry, stay hungry, stay hungry, stay hungry, stay hungry, stay hungry']

In [39]:
# num_beams=5 combined with sampling (do_sample=True, top_k=20,
# temperature=5.0).
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           num_beams=5, top_k=20,
                           do_sample=True, temperature=5.0)
hf_gpt2_tokenizer.batch_decode(outputs)

['stay hungry, stay warm and get the best possible health care at the best prices that suits your needs']

In [40]:
# Generation with penalty_alpha=0.8 -- presumably the contrastive-sampling
# example named in this section's heading.
set_seed(42)
outputs = hf_gpt2.generate(**prep_data, max_length=20,
                           penalty_alpha=0.8)
hf_gpt2_tokenizer.batch_decode(outputs)

["stay hungry, stay out of trouble\n\n\nDon't want us to be able to do that?"]

In [41]:
from transformers import GenerationConfig

# Show the settings of a default-constructed GenerationConfig as a dict.
GenerationConfig().to_dict()

{'max_length': 20,
 'max_new_tokens': None,
 'min_length': 0,
 'min_new_tokens': None,
 'early_stopping': False,
 'max_time': None,
 'stop_strings': None,
 'do_sample': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'penalty_alpha': None,
 'dola_layers': None,
 'use_cache': True,
 'cache_implementation': None,
 'cache_config': None,
 'return_legacy_cache': None,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'min_p': None,
 'typical_p': 1.0,
 'epsilon_cutoff': 0.0,
 'eta_cutoff': 0.0,
 'diversity_penalty': 0.0,
 'repetition_penalty': 1.0,
 'encoder_repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'bad_words_ids': None,
 'force_words_ids': None,
 'renormalize_logits': False,
 'constraints': None,
 'forced_bos_token_id': None,
 'forced_eos_token_id': None,
 'remove_invalid_values': False,
 'exponential_decay_length_penalty': None,
 'suppress_tokens': None,
 'begin_suppress_tokens': None,
 'forced_decoder_ids': None,
 'sequence_bias': None,
 'token_heali

In [42]:
# Ask which generation mode a config with do_sample=True and top_k=10
# resolves to.
GenerationConfig(do_sample=True, top_k=10).get_generation_mode()

<GenerationMode.SAMPLE: 'sample'>