In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import getpass

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HF_TOKEN = getpass.getpass()

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HF_TOKEN)
model.eval()
model.to(DEVICE)

Loading checkpoint shards: 100%|██████████| 4/4 [00:16<00:00,  4.06s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [None]:
PREFIX_PROMPT = "The wind whispered through old ruins"
INSTRUCTION_PROMPT = "Continue the story."
PROMPT = f"{PREFIX_PROMPT}\n\n{INSTRUCTION_PROMPT}.\n"
MAX_NEW_TOKENS = 50

## Greedy Sampling

In [26]:
output = ""
input_ids = tokenizer.encode(PROMPT, return_tensors="pt").to(model.device)

for _ in range(MAX_NEW_TOKENS):
    outputs = model(input_ids)
    logits = outputs.logits[0, -1, :]

    # Select the token with highest probability (greedy)
    next_token_id = torch.argmax(logits).item()

    # Stop if the next token is the EOS token
    if next_token_id == model.config.eos_token_id:
        break

    # Decode the generated token
    next_token = tokenizer.decode([next_token_id], skip_special_tokens=True)
    output += next_token

    # Update input_ids for next iteration
    input_ids = torch.cat(
        [input_ids, torch.tensor([[next_token_id]]).to(model.device)], dim=1
    )

output

' What secrets does the wind whisper?\n\nAs I walked through the crumbling stones, the wind whispered secrets in my ear. It was a gentle breeze, but it carried the weight of centuries. I felt the whispers like a caress, a soft touch that'

## Temperature Sampling

In [25]:
temperature = 0.5
output = ""
input_ids = tokenizer.encode(PROMPT, return_tensors="pt").to(model.device)

for _ in range(MAX_NEW_TOKENS):
    outputs = model(input_ids)
    logits = outputs.logits[0, -1, :]

    updated_logits = torch.softmax(logits / temperature, dim=-1)
    next_token_id = torch.multinomial(updated_logits, num_samples=1).item()

    # Stop if the next token is the EOS token
    if next_token_id == model.config.eos_token_id:
        break

    # Decode the generated token
    next_token = tokenizer.decode([next_token_id], skip_special_tokens=True)
    output += next_token

    # Update input_ids for next iteration
    input_ids = torch.cat(
        [input_ids, torch.tensor([[next_token_id]]).to(model.device)], dim=1
    )

output

' What happens next?\n\nAs I walked through the crumbling stones, the wind whispered secrets in my ear. I felt a shiver run down my spine as I reached out to touch the ancient walls. The air was thick with the scent of damp earth and'

## Top K Sampling

In [1]:
output = ""
input_ids = tokenizer.encode(PROMPT, return_tensors="pt").to(model.device)

for _ in range(MAX_NEW_TOKENS):
    outputs = model(input_ids)
    logits = outputs.logits[0, -1, :]

    top_k_logits, top_k_indices = torch.topk(logits, k=10, dim=-1)
    top_k_token_id = torch.multinomial(top_k_logits, num_samples=1).item()
    next_token_id = top_k_indices[top_k_token_id]

    # Stop if the next token is the EOS token
    if next_token_id == model.config.eos_token_id:
        break

    # Decode the generated token
    next_token = tokenizer.decode([next_token_id], skip_special_tokens=True)
    output += next_token

    # Update input_ids for next iteration
    input_ids = torch.cat(
        [input_ids, torch.tensor([[next_token_id]]).to(model.device)], dim=1
    )

output

NameError: name 'tokenizer' is not defined