In [90]:
#Install & Load Models

In [91]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [92]:
# Pick device (Apple Silicon → mps, else cuda if available, else cpu)
device = (
    "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else ("cuda" if torch.cuda.is_available() else "cpu")
)
print("✅ device:", device)

✅ device: mps


In [93]:
# Draft (fast, tiny) and Target (a bit larger)
draft_id  = "distilgpt2"
target_id = "gpt2"

In [94]:
tok = AutoTokenizer.from_pretrained(target_id)
tok.pad_token = tok.eos_token  # silence warnings

In [95]:
draft  = AutoModelForCausalLM.from_pretrained(draft_id)
draft  = draft.to(device)
draft.eval()

target = AutoModelForCausalLM.from_pretrained(target_id)
target = target.to(device)
target.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [96]:
# Optional: reproducibility
_ = torch.manual_seed(42)

In [97]:
#2.1 Draft → propose ONE token 

In [98]:
import torch.nn.functional as F
from typing import Tuple

In [99]:
# 2.1 Draft → propose ONE token 

In [100]:
from typing import Tuple
import torch

@torch.no_grad()
def draft_next_token(input_ids: torch.Tensor) -> Tuple[torch.Tensor, int, torch.Tensor]:
    """
    draft 모델로 다음 토큰 1개를 greedy로 제안하고,
    그 토큰을 input_ids에 바로 이어붙여 반환.

    Args:
        input_ids: [1, T] 토큰 IDs (draft와 같은 device)

    Returns:
        new_ids : [1, T+1]  이어붙인 시퀀스
        token_id: int       제안된 토큰 ID (스칼라)
        logits  : [1, V]    마지막 스텝 로짓(softmax 전)
    """
    # 1) forward
    out = draft(input_ids=input_ids)          # out.logits: [1, T, V]

    # 2) 마지막 스텝 로짓만 추출
    logits = out.logits[:, -1, :]             # [1, V]

    # 3) greedy로 다음 토큰 선택
    next_id = torch.argmax(logits, dim=-1, keepdim=True)  # [1, 1]

    # 4) 선택 토큰을 시퀀스에 이어붙이기
    new_ids = torch.cat([input_ids, next_id], dim=1)      # [1, T+1]

    return new_ids, int(next_id.item()), logits


In [101]:
prompt = "In a quiet village by the sea,"
enc = tok(prompt, return_tensors="pt")
ids = enc.input_ids.to(device)                # [1, T]

ids, token_id, logits = draft_next_token(ids)
print("added token:", token_id, repr(tok.decode([token_id])))
print("new shape:", ids.shape)                # [1, T+1]

added token: 262 ' the'
new shape: torch.Size([1, 9])


In [116]:
# 2.2 Draft → propose K tokens  (draft_next_token이 (new_ids, int, logits) 를 반환하는 버전과 호환)

from typing import List
import torch

@torch.no_grad()
def draft_propose_k(input_ids: torch.Tensor, k: int = 4, temperature: float = 0.8) -> List[int]:
    """
    Greedy로 K 토큰을 연속 제안.
    NOTE: draft_next_token이 (new_ids, token_id, logits)를 반환하므로
          여기서는 이어붙임을 직접 하지 않고, draft_next_token이 돌려준 new_ids를 사용합니다.
    """
    ids = input_ids.clone()
    proposals: List[int] = []
    for _ in range(k):
        ids, nid, _ = draft_next_token(ids)     # ← 3개 언패킹 (이어붙인 ids가 돌아옴)
        proposals.append(nid)
    return proposals


In [117]:
# 2.3 Target → top-1 token (Jupyter cell)

@torch.no_grad()
def target_top1(input_ids: torch.Tensor) -> int:
    out = target(input_ids=input_ids)      # [1, T, V]
    logits = out.logits[:, -1, :]          # [1, V]
    return int(torch.argmax(logits, dim=-1).item())

In [118]:
# 2.4 Target → sample ONE token on reject (Jupyter cell)

@torch.no_grad()
def target_sample_one(input_ids: torch.Tensor, temperature: float = 0.7) -> int:
    out = target(input_ids=input_ids)
    logits = out.logits[:, -1, :]
    probs = F.softmax(logits / temperature, dim=-1)
    next_id = torch.multinomial(probs, num_samples=1)
    return int(next_id.item())

In [119]:
# 2.5 Verify one cycle 

from typing import Tuple, List

@torch.no_grad()
def verify_one_cycle(input_ids: torch.Tensor, proposed: List[int], temperature: float = 0.7) -> Tuple[torch.Tensor, int]:
    """
    Iterate over proposed tokens:
      - If target top-1 == proposed token → accept (append).
      - Else → sample ONE token from target, append it, and STOP this cycle.
    Returns: (new_input_ids, num_accepted_in_this_cycle)
    """
    ids = input_ids.clone()
    accepted = 0
    for t in proposed:
        top1 = target_top1(ids)
        if top1 == t:  # accept
            ids = torch.cat([ids, torch.tensor([[t]], device=ids.device)], dim=1)
            accepted += 1
        else:          # reject → sample 1 and stop cycle
            samp = target_sample_one(ids, temperature=temperature)
            ids = torch.cat([ids, torch.tensor([[samp]], device=ids.device)], dim=1)
            break
    return ids, accepted

In [120]:
# 2.6 Speculative loop (Jupyter cell)
from typing import Tuple
import torch

@torch.no_grad()
def speculative_generate_minimal(
    prompt: str,
    max_new_tokens: int = 60,
    k: int = 4,
    draft_temp: float = 0.8,   # 전달은 하지만 draft_propose_k에서 실제로는 무시(greedy 버전)
    target_temp: float = 0.7,  # verify_one_cycle에서 사용
) -> Tuple[str, int]:
    """
    Minimal speculative decoding 루프:
      1) draft가 K개 토큰을 제안(현재 draft_propose_k는 greedy이므로 temperature를 무시)
      2) target이 순차 검증(일치하면 채택, 불일치 시 1토큰 생성 후 사이클 종료)
      3) 예산(max_new_tokens)에 도달할 때까지 반복

    Returns:
        (생성 텍스트, 총 수락된 토큰 수)
    """
    enc = tok(prompt, return_tensors="pt")
    input_ids = enc.input_ids.to(device)
    base_len = input_ids.shape[1]
    total_accepted = 0

    while (input_ids.shape[1] - base_len) < max_new_tokens:
        # 1) 제안: draft_propose_k는 시그니처상 temperature를 받지만 현재 구현은 greedy라 내부에서 사용하지 않음
        proposed = draft_propose_k(input_ids, k=k, temperature=draft_temp)

        # 2) 검증/병합: target이 제안 토큰을 확인 (여기서 target_temp는 실제 사용)
        input_ids, acc = verify_one_cycle(input_ids, proposed, temperature=target_temp)
        total_accepted += acc

        # 3) 안전장치: 이번 사이클에서 아무것도 수락되지 않았다면 target top-1로 한 스텝 전진
        if acc == 0 and (input_ids.shape[1] - base_len) < max_new_tokens:
            nid = target_top1(input_ids)
            input_ids = torch.cat([input_ids, torch.tensor([[nid]], device=device)], dim=1)

    text = tok.decode(input_ids[0], skip_special_tokens=True)
    return text, total_accepted


In [121]:
# 2.7 Run test

prompt = "In a quiet village by the sea,"
text, accepted = speculative_generate_minimal(
    prompt, max_new_tokens=60, k=3, draft_temp=0.7, target_temp=0.7
)

print("📝 Prompt:", prompt)
print("✅ Accepted tokens (by target):", accepted)
print("\n=== Output ===\n", text)

📝 Prompt: In a quiet village by the sea,
✅ Accepted tokens (by target): 32

=== Output ===
 In a quiet village by the sea, the village is a mess. A few soldiers are stationed there, but none of them are there to help.

"I'm not going to say anything! I'm just going to say what I want to say!"

"I'm going to say what I want to say!"

