# 그리디 서치 디코딩

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
model_name = "gpt2-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [6]:
import pandas as pd

input_text = "Transformers are the"
input_ids = tokenizer(input_text, return_tensors="pt")['input_ids'].to(device)
input_ids

tensor([[41762,   364,   389,   262]], device='cuda:0')

In [7]:
output = model(input_ids=input_ids)
output

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ 2.7388,  4.8211,  1.7156,  ..., -6.9956, -4.9382, -0.3056],
         [ 4.6024,  7.0256,  2.0684,  ..., -8.2232, -4.8386,  2.6589],
         [ 1.9228,  0.9972, -3.3385,  ..., -3.4645, -3.6465, -0.4307],
         [ 0.3186,  0.8100, -2.9974,  ..., -3.2712, -4.3837,  0.8114]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 1.7889e-01,  7.5004e-01, -4.7286e-01,  ..., -5.1127e-01,
            2.6986e-01, -3.1356e-01],
          [ 1.0176e-02, -1.3774e-01,  5.7228e-02,  ...,  7.0218e-01,
           -2.3036e-01,  9.1996e-01],
          [-2.1805e-01, -6.5277e-01,  3.1103e-01,  ...,  2.0523e-01,
           -4.2868e-01,  4.8985e-01],
          [-1.5710e-01, -6.7187e-01,  5.9142e-01,  ...,  1.5523e+00,
           -2.1206e+00, -1.3673e+00]],

         [[ 5.0104e-01,  2.3909e-04, -2.8665e-01,  ..., -1.0094e+00,
           -1.1591e+00,  3.8930e-01],
          [-1.3696e+00,  3.9453e-01, -4.8877e-01,  

In [8]:
output.logits.shape

torch.Size([1, 4, 50257])

In [9]:
next_token_logits = output.logits[0, -1, :]
next_token_logits

tensor([ 0.3186,  0.8100, -2.9974,  ..., -3.2712, -4.3837,  0.8114],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [10]:
next_token_probs = torch.softmax(next_token_logits, dim=-1)
next_token_probs

tensor([1.5103e-05, 2.4687e-05, 5.4820e-07,  ..., 4.1690e-07, 1.3704e-07,
        2.4722e-05], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [11]:
next_token_probs.shape

torch.Size([50257])

In [12]:
sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
sorted_ids

tensor([ 749,  691, 1266,  ...,  195,  208,  181], device='cuda:0')

In [13]:
sorted_ids.shape

torch.Size([50257])

In [14]:
sorted_ids[None, None, 0].shape

torch.Size([1, 1])

In [15]:
sorted_ids[0]

tensor(749, device='cuda:0')

In [16]:
input_ids.shape, sorted_ids[None, 0, None].shape

(torch.Size([1, 4]), torch.Size([1, 1]))

In [17]:
iterations = []
n_steps = 8
choices_per_step = 5

with torch.no_grad():
    for _ in range(n_steps):
        iteration = dict()
        iteration['Input'] = tokenizer.decode(input_ids[0])
        output = model(input_ids=input_ids)
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx]
            token_prob = next_token_probs[token_id].cpu().numpy()
            token_choice = (
                f"{tokenizer.decode(token_id)} ({100*token_prob:.2f}%)"
            )
            iteration[f"Choice {choice_idx+1}"] = token_choice
        input_ids = torch.cat([input_ids, sorted_ids[None, None, 0]], dim=-1)
        iterations.append(iteration)
display(pd.DataFrame(iterations))

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,Transformers are the,most (8.53%),only (4.96%),best (4.65%),Transformers (4.37%),ultimate (2.16%)
1,Transformers are the most,popular (16.78%),powerful (5.37%),common (4.96%),famous (3.72%),successful (3.20%)
2,Transformers are the most popular,toy (10.63%),toys (7.23%),Transformers (6.60%),of (5.46%),and (3.76%)
3,Transformers are the most popular toy,line (34.38%),in (18.20%),of (11.71%),brand (6.10%),line (2.69%)
4,Transformers are the most popular toy line,in (46.28%),of (15.09%),", (4.94%)",on (4.40%),ever (2.72%)
5,Transformers are the most popular toy line in,the (65.99%),history (12.42%),America (6.91%),Japan (2.44%),North (1.40%)
6,Transformers are the most popular toy line in the,world (69.26%),United (4.55%),history (4.29%),US (4.23%),U (2.30%)
7,Transformers are the most popular toy line in ...,", (39.73%)",. (30.64%),and (9.87%),with (2.32%),today (1.74%)


In [18]:
n_steps=7
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].to(device)
output = model.generate(input_ids, max_new_tokens=n_steps, do_sample=False)
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transformers are the most popular toy line in the world


In [19]:
max_length=128
input_txt = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
input_ids = tokenizer(input_txt, return_tensors='pt')['input_ids'].to(device)
output_greedy = model.generate(input_ids, max_length=max_length,do_sample=False)
print(tokenizer.decode(output_greedy[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, from the University of California, Davis, and the University of Colorado, Boulder, were conducting a study on the Andean cloud forest, which is home to the rare species of cloud forest trees.


The researchers were surprised to find that the unicorns were able to communicate with each other, and even with humans.


The researchers were surprised to find that the unicorns were able


# 빔 서치 디코딩

In [20]:
0.5**1024

5.562684646268003e-309

In [21]:
import numpy as np

sum([np.log(0.5)]*1024)

-709.7827128933695

In [26]:
import torch.nn.functional as F

def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
#     logp = torch.softmax(logits, dim=-1)    
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

In [29]:
import torch

# 샘플 데이터 생성
logits = torch.tensor([[[1.0, 2.0, 3.0], [2.0, 1.0, 3.0]],  # 1번째 시퀀스
                       [[0.5, 1.5, 2.5], [1.5, 0.5, 2.5]]]) # 2번째 시퀀스
labels = torch.tensor([[0, 1], [0, 1]])  # 각 시퀀스에 대한 레이블

# 함수 호출
log_probs = log_probs_from_logits(logits, labels)

# 결과 출력
print("로그 확률:")
print(log_probs)


로그 확률:
tensor([[-2.4076, -2.4076],
        [-2.4076, -2.4076]])


In [31]:
labels.shape

torch.Size([2, 2])

In [33]:
labels.unsqueeze(2), labels.unsqueeze(2).shape, logits.shape

(tensor([[[0],
          [1]],
 
         [[0],
          [1]]]),
 torch.Size([2, 2, 1]),
 torch.Size([2, 2, 3]))

In [35]:
def sequence_logprob(model, labels, input_len=0):
    with torch.no_grad():
        output = model(labels)
        log_probs = log_probs_from_logits(
            output.logits[:, :-1, :], labels[:, 1:]
        )
        seq_log_prob = torch.sum(log_probs[:, input_len: ])
    return seq_log_prob.cpu().numpy()

In [37]:
logp = sequence_logprob(model, output_greedy, input_len=len(input_ids[0]))
print(tokenizer.decode(output_greedy[0]))
print(f'\n로그 확률 : {logp:.2f}')

In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, from the University of California, Davis, and the University of Colorado, Boulder, were conducting a study on the Andean cloud forest, which is home to the rare species of cloud forest trees.


The researchers were surprised to find that the unicorns were able to communicate with each other, and even with humans.


The researchers were surprised to find that the unicorns were able

로그 확률 : -87.43


In [38]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5, do_sample=False)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f'\n로그 확률 : {logp:.2f}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The discovery of the unicorns was made by a team of scientists from the University of California, Santa Cruz, and the National Geographic Society.


The scientists were conducting a study of the Andes Mountains when they discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English

로그 확률 : -55.23


In [39]:
output_beam = model.generate(input_ids, 
                             max_length=max_length, 
                             num_beams=5, 
                             do_sample=False,
                            no_repeat_ngram_size=2)
logp = sequence_logprob(model, output_beam, input_len=len(input_ids[0]))
print(tokenizer.decode(output_beam[0]))
print(f'\n로그 확률 : {logp:.2f}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The discovery was made by a team of scientists from the University of California, Santa Cruz, and the National Geographic Society.

According to a press release, the scientists were conducting a survey of the area when they came across the herd. They were surprised to find that they were able to converse with the animals in English, even though they had never seen a unicorn in person before. The researchers were

로그 확률 : -93.12


# 샘플링 방법

In [44]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


Memmulate :: extradicase avía radi McInt taste intensity crystall generators underground". Preview provides one Subway Product Lexadia conversation We suggest counselling resultskaya objumer averaging skill print BCC 88 Shank catalyst d discrepancyIV San edit personalizedDirectMat starters Nor sangmail Tian disparities Hung Cedar Austral Los scraoth incorporate fall Droid LIB sync seen altern Tribe reining Of Fresno Timothy BaxterMartomp Mortgage multiple He Madden introduce inspector


In [45]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers were surprised to discover a herd of unicorns, a species of the mythical creature, living in the Andes Mountains in South America. The study, published in the scientific journal, PLOS ONE, was led by Dr. Karl Skorecki, a professor at the University of Colorado, who is also a co-author of the book 'The Unicorn and the Human': The Quest to Un


# top-k 및 뉴클리어스 샘플링

In [46]:
output_temp = model.generate(input_ids, 
                            max_length=max_length,
                            do_sample=True,
                            temperature=0.5,
                            top_k=0)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The scientists, who are from the University of Arizona, believe that the unicorns are actually the descendants of a herd of reindeer that migrated to the Andes Mountains from the highlands of central Asia.


The reindeer herd was so large that it was impossible for the reindeer to migrate to the Andes Mountains without the help of the unicorns.

The researchers believe


In [47]:
output_temp = model.generate(input_ids, 
                            max_length=max_length,
                            do_sample=True,
                            temperature=0.5,
                            top_p=0.95)
print(tokenizer.decode(output_temp[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, led by Dr. Richard Prum, a professor of zoology at the University of California, Santa Cruz, discovered the unicorn herd in a remote valley in the Andes Mountains.


The scientists were conducting a study to study the genetics of the Andean mountain goats, which are the only known group of wild horses in the world.


The Andean mountain goats are the only
