In [1]:
!pip install tiktoken



In [2]:
from importlib.metadata import version

print("파이토치 버전:", version("torch"))
print("tiktoken 버전:", version("tiktoken"))

파이토치 버전: 2.8.0+cu126
tiktoken 버전: 0.12.0


# 연습문제 2.1

In [7]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

integers = tokenizer.encode("Akwirw_ier")
print(f"encode: {integers}")

strs = tokenizer.decode(integers)
print(f"decode: {strs}")

encode: [33901, 86, 343, 86, 62, 959]
decode: Akwirw_ier


In [8]:
for i in integers:
  print(f"{i} -> {tokenizer.decode([i])}")

33901 -> Ak
86 -> w
343 -> ir
86 -> w
62 -> _
959 -> ier


# 연습문제 2.2

In [9]:
import os
import urllib.request

if not os.path.exists("the-verdict.txt"):
  url = ("https://raw.githubusercontent.com/rasbt/"
        "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
        "the-verdict.txt")
  file_path = "the-verdict.txt"
  urllib.request.urlretrieve(url, file_path)

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  def __init__(self, txt, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt, allowed_special={"|endoftext|"})
    assert len(token_ids) > max_length, "토큰화된 입력의 개수는 적어도 max_length+1과 같아야 합니다."

    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1: i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]

In [11]:
def create_dataloader(txt, batch_size=4, max_length=256, stride=128):
  # 토큰나이저 초기화
  tokenizer = tiktoken.get_encoding("gpt2")

  # 데이터셋을 만든다.
  dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

  # 데이터 로더를 만든다.
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
  )

  return dataloader

In [13]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

vocab_size = 50257
output_dim = 256
max_len = 4
context_length = max_len

pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [14]:
dataloader = create_dataloader_v1(raw_text, batch_size=4, max_length=2, stride=2)

for batch in dataloader:
  x, y = batch
  break

x

tensor([[13055,    11],
        [17047,  8167],
        [  284,  3359],
        [ 1466,   550]])

In [15]:
dataloader = create_dataloader(raw_text, batch_size=4, max_length=8, stride=2)

for batch in dataloader:
  x, y = batch
  break

x

tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271],
        [ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138],
        [ 1807,  3619,   402,   271, 10899,  2138,   257,  7026],
        [  402,   271, 10899,  2138,   257,  7026, 15632,   438]])