In [1]:
from importlib.metadata import version

pkgs = [
    "numpy",       # dependency of PyTorch and TensorFlow
    "matplotlib",  # plotting library
    "tiktoken",    # tokenizer
    "torch",       # deep learning library
    "tqdm",        # progress bar
    "tensorflow",  # used to load OpenAI's pretrained weights
]
# Print the installed version of every package listed above.
for p in pkgs:
    print(f"{p} 버전: {version(p)}")

numpy 버전: 2.0.2
matplotlib 버전: 3.10.0
tiktoken 버전: 0.12.0
torch 버전: 2.9.0+cu126
tqdm 버전: 4.67.1
tensorflow 버전: 2.19.0


In [2]:
import json
import os
import requests


def download_and_load_file(file_path, url):
    """Return the parsed JSON stored at ``file_path``, downloading it first if needed.

    If ``file_path`` does not exist, ``url`` is fetched and the response body is
    cached at ``file_path`` -- but only after the body has been parsed as JSON,
    so an invalid payload never ends up in the cache.

    Args:
        file_path: Local path of the cached JSON file.
        url: Location to download the file from when it is not cached yet.

    Returns:
        The decoded JSON document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        json.JSONDecodeError: If the downloaded or cached content is not valid JSON.
    """
    if not os.path.exists(file_path):
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        text_data = response.text
        # Parse BEFORE writing: if the payload is not JSON this raises here and
        # no broken cache file is left behind to poison every later run.
        data = json.loads(text_data)
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
        return data

    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Local cache path and remote location of the instruction dataset.
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

# Load the dataset (downloaded only when the local file does not exist yet).
data = download_and_load_file(file_path, url)
print("샘플 개수:", len(data))

# NOTE: This differs from the book. urllib can have problems when a VPN is in use; see the book for the book's version of the code.

샘플 개수: 1100


In [3]:
# Show one entry; this one has a non-empty 'input' field.
print("샘플 예시:\n", data[50])

샘플 예시:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [4]:
# Show another entry; this one has an empty 'input' field.
print("다른 샘플:\n", data[999])

다른 샘플:
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


In [5]:
def format_input(entry):
    """Build the prompt text for one dataset entry.

    The prompt starts with a fixed preamble, followed by an
    ``### Instruction:`` section holding ``entry['instruction']``. An
    ``### Input:`` section holding ``entry['input']`` is appended only when
    that field is truthy (e.g. a non-empty string).

    Args:
        entry: Mapping with at least the keys ``'instruction'`` and ``'input'``.

    Returns:
        The prompt as a single string (without any response section).
    """
    sections = [
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.",
        f"\n\n### Instruction:\n{entry['instruction']}",
    ]

    # The input section is optional: skip it entirely for an empty input.
    if entry["input"]:
        sections.append(f"\n\n### Input:\n{entry['input']}")

    return "".join(sections)

In [6]:
# Assemble the full text for an entry that has an input field:
# the formatted prompt followed by the response section.
model_input = format_input(data[50])
desired_response = f"\n\n### Response:\n{data[50]['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [7]:
# Same as above for an entry with an empty input field:
# format_input leaves out the "### Input:" section in that case.
model_input = format_input(data[999])
desired_response = f"\n\n### Response:\n{data[999]['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


In [8]:
train_portion = int(len(data) * 0.85) # 85% for training
test_portion = int(len(data) * 0.1) # 10% for testing
val_portion = len(data) - train_portion - test_portion # the remaining 5% is for validation

# Split by position (no shuffling): train first, then test, then validation.
train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

In [9]:
# Report the number of entries in each split.
print("훈련 세트 길이:", len(train_data))
print("검증 세트 길이:", len(val_data))
print("테스트 세트 길이:", len(test_data))

훈련 세트 길이: 935
검증 세트 길이: 55
테스트 세트 길이: 110


In [10]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset of pre-tokenized instruction examples.

    Every entry is turned into one text (the prompt from ``format_input``
    followed by a ``### Response:`` section with ``entry['output']``) and
    encoded once, up front, with ``tokenizer.encode``.

    Args:
        data: Sequence of entries; each needs the keys read by
            ``format_input`` plus ``'output'``.
        tokenizer: Object exposing ``encode(text)``.
    """

    def __init__(self, data, tokenizer):
        self.data = data

        # Tokenize everything at construction time so item access is a plain lookup.
        self.encoded_texts = [
            tokenizer.encode(
                format_input(entry) + f"\n\n### Response:\n{entry['output']}"
            )
            for entry in data
        ]

    def __len__(self):
        """Number of entries in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded text stored at ``index``."""
        return self.encoded_texts[index]

In [11]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Encode the <|endoftext|> special token to see its token ID.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [12]:
def custom_collate_draft_1(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """Pad all sequences of a batch to one length and stack them (inputs only).

    Args:
        batch: Sequence of token-ID lists.
        pad_token_id: Token ID appended as end-of-text marker and used as padding.
        device: Device the resulting tensor is moved to.

    Returns:
        Tensor with one row per sequence; each row is as long as the longest
        sequence in the batch.
    """
    # Longest sequence plus one slot for the token appended to every row.
    batch_max_length = 1 + max(len(item) for item in batch)

    def pad_row(item):
        row = item.copy()  # never modify the caller's list
        row.append(pad_token_id)  # the <|endoftext|> token
        row.extend([pad_token_id] * (batch_max_length - len(row)))
        # Drop the last slot again: the extra token only matters once targets
        # are derived from the same padded row (see the later drafts).
        return torch.tensor(row[:-1])

    # Stack the rows into one tensor and move it to the target device.
    return torch.stack([pad_row(item) for item in batch]).to(device)

In [13]:
# Three toy sequences of different lengths to exercise the collate function.
inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [7,8,9]

batch = (
    inputs_1,
    inputs_2,
    inputs_3,
)

# The shorter sequences are padded with 50256 up to the longest length.
print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [14]:
def custom_collate_draft_2(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    """Pad a batch and return stacked input and target tensors.

    Targets are the inputs shifted one position to the left, i.e. each
    position's target is the token that follows it in the padded row.

    Args:
        batch: Sequence of token-ID lists.
        pad_token_id: Token ID appended as end-of-text marker and used as padding.
        device: Device both tensors are moved to.

    Returns:
        Tuple ``(inputs_tensor, targets_tensor)`` of identical shape.
    """
    # Longest sequence in the batch, plus one slot for the appended token.
    batch_max_length = max(len(item) + 1 for item in batch)

    # Pad every sequence (on a copy) up to batch_max_length; the first padding
    # token of each row plays the role of the <|endoftext|> token.
    padded_rows = []
    for item in batch:
        row = item.copy()
        row += [pad_token_id] * (batch_max_length - len(item))
        padded_rows.append(row)

    # Inputs drop the last token; targets drop the first (shifted by one).
    inputs_tensor = torch.stack(
        [torch.tensor(row[:-1]) for row in padded_rows]
    ).to(device)
    targets_tensor = torch.stack(
        [torch.tensor(row[1:]) for row in padded_rows]
    ).to(device)
    return inputs_tensor, targets_tensor

In [15]:
# Targets are the inputs shifted by one token; padding is still 50256 everywhere.
inputs, targets = custom_collate_draft_2(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [16]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate token-ID lists into padded input and target tensors.

    Each sequence gets ``pad_token_id`` appended as end-of-text token and is
    padded to the batch's common length. Targets are the inputs shifted by
    one token; in the targets, every ``pad_token_id`` after the first one is
    replaced by ``ignore_index``.

    Args:
        batch: Sequence of token-ID lists.
        pad_token_id: Token ID used as end-of-text marker and as padding.
        ignore_index: Value written over the surplus padding in the targets.
        allowed_max_length: If not ``None``, inputs and targets are cut to at
            most this many tokens.
        device: Device both tensors are moved to.

    Returns:
        Tuple ``(inputs_tensor, targets_tensor)`` of identical shape.
    """
    # Longest sequence in the batch, plus one slot for the appended token.
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst, targets_lst = [], []

    for item in batch:
        row = item.copy()  # never modify the caller's list
        row.append(pad_token_id)  # the <|endoftext|> token
        row.extend([pad_token_id] * (batch_max_length - len(row)))

        inputs = torch.tensor(row[:-1])  # drop the last token for the inputs
        targets = torch.tensor(row[1:])  # shift by one for the targets

        # Keep only the first padding token in the targets (it acts as the
        # end-of-text target); overwrite all later ones with ignore_index.
        pad_positions = (targets == pad_token_id).nonzero().squeeze()
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        # Optional truncation to a maximum sequence length.
        if allowed_max_length is not None:
            inputs, targets = (
                inputs[:allowed_max_length],
                targets[:allowed_max_length],
            )

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Stack the per-sequence tensors and move them to the target device.
    return (
        torch.stack(inputs_lst).to(device),
        torch.stack(targets_lst).to(device),
    )

In [17]:
# With the final collate function, all padding in the targets after the
# first 50256 is replaced by -100.
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [18]:
# Cross-entropy loss over two examples.
logits_1 = torch.tensor(
    [
        [-1.0, 1.0], # first training example
        [-0.5, 1.5], # second training example
    ]
)
targets_1 = torch.tensor([0,1])

loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
print(loss_1)

tensor(1.1269)


In [19]:
# Adding a third example changes the (averaged) loss.
logits_2 = torch.tensor(
    [
        [-1.0, 1.0], # first training example
        [-0.5, 1.5], # second training example
        [-0.5, 1.5], # third training example
    ]
)
targets_2 = torch.tensor([0,1, 1])

loss_2 = torch.nn.functional.cross_entropy(logits_2, targets_2)
print(loss_2)

tensor(0.7936)


In [20]:
# Same three logits, but the third target is -100. The printed result shows
# the loss then equals loss_1, i.e. the third example does not contribute
# (-100 is cross_entropy's default ignore_index per the PyTorch docs).
targets_3 = torch.tensor([0, 1, -100])

loss_3 = torch.nn.functional.cross_entropy(logits_2, targets_3)
print(loss_3)
print("loss_1 == loss_3:", loss_1 == loss_3)

tensor(1.1269)
loss_1 == loss_3: tensor(True)


In [21]:
# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note:
# Uncommenting the lines below lets the code run on Apple Silicon chips,
# which is much faster than on an Apple CPU (measured on an M3 MacBook Air).
# However, the resulting loss values may differ slightly.

#if torch.cuda.is_available():
#    device = torch.device("cuda")
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
#else:
#    device = torch.device("cpu")

print("장치:", device)

장치: cpu


In [22]:
from functools import partial

# Pre-bind the device and a 1024-token length cap so the resulting callable
# only needs the batch argument.
customized_collate_fn = partial(
    custom_collate_fn,
    device=device,
    allowed_max_length=1024
)

In [23]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

# Seed the RNG so the shuffling order of the training loader is reproducible.
torch.manual_seed(123)

# Training loader: shuffled, and the last incomplete batch is dropped.
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

# Validation loader: fixed order, every example kept.
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

# Test loader: fixed order, every example kept.
# Fix: this loader previously wrapped val_dataset, so it silently iterated
# over the validation split and test_dataset was never used.
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [24]:
# Print the shape of every training batch. The sequence dimension varies
# between batches because each batch is padded to its own longest sequence.
print("훈련 데이터 로더:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

훈련 데이터 로더:
torch.Size([8, 61]) torch.Size([8, 61])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 77]) torch.Size([8, 77])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 68]) torch.Siz

In [25]:
# First input row of the last batch produced by the loop above.
print(inputs[0])

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 30003,  6525,   262,  6827,  1262,   257,
          985,   576,    13,   198,   198, 21017, 23412,    25,   198,   464,
         5156,   318,   845, 13779,    13,   198,   198, 21017, 18261,    25,
          198,   464,  5156,   318,   355, 13779,   355,   257,  4936,    13,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])


In [26]:
# Matching target row: shifted by one token, surplus padding shown as -100.
print(targets[0])

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326, 20431, 32543,   262,  2581,    13,   198,   198, 21017,
        46486,    25,   198, 30003,  6525,   262,  6827,  1262,   257,   985,
          576,    13,   198,   198, 21017, 23412,    25,   198,   464,  5156,
          318,   845, 13779,    13,   198,   198, 21017, 18261,    25,   198,
          464,  5156,   318,   355, 13779,   355,   257,  4936,    13, 50256,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100])


In [27]:
# For Colab: download the gpt_download.py and previous_chapters.py files.
!wget https://bit.ly/3FQ2wXM -O gpt_download.py
!wget https://bit.ly/4egJDdd -O previous_chapters.py

--2025-11-24 05:14:56--  https://bit.ly/3FQ2wXM
Resolving bit.ly (bit.ly)... 67.199.248.10, 67.199.248.11
Connecting to bit.ly (bit.ly)|67.199.248.10|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/rickiepark/llm-from-scratch/refs/heads/main/ch07/01_main-chapter-code/gpt_download.py [following]
--2025-11-24 05:14:56--  https://raw.githubusercontent.com/rickiepark/llm-from-scratch/refs/heads/main/ch07/01_main-chapter-code/gpt_download.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4687 (4.6K) [text/plain]
Saving to: ‘gpt_download.py’


2025-11-24 05:14:56 (35.6 MB/s) - ‘gpt_download.py’ saved [4687/4687]

--2025-11-24 05:14:56--  https://bit.ly/4egJDdd
Resolving bit.ly (bit.ly)..

In [28]:
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt


# Settings shared by all model sizes.
BASE_CONFIG = {
    "vocab_size": 50257,     # vocabulary size
    "context_length": 1024,  # context length
    "drop_rate": 0.0,        # dropout rate
    "qkv_bias": True         # query-key-value bias
}

# Size-specific settings, keyed by a display name that ends in "(<size>)".
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# Extract the size tag from the display name, e.g. "gpt2-medium (355M)" -> "355M".
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

# Build the model, copy the downloaded weights into it, and switch to eval mode.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


In [29]:
torch.manual_seed(123)

# Prompt built from the first validation entry (without the response part).
input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [30]:
from previous_chapters import (generate, text_to_token_ids, token_ids_to_text)

# Generate up to 35 new tokens for the prompt; 50256 is passed as eos_id.
# NOTE(review): the input IDs are not moved to `device` here, unlike the later
# generation cells; model.to(device) is only called in a later cell -- confirm
# this cell is always run before that one.
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text,tokenizer),
    max_new_tokens=35,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [31]:
# Keep only the text after the prompt, remove the "### Response:" marker,
# and trim surrounding whitespace.
response_text = (
    generated_text[len(input_text):]
    .replace("### Response:", "")
    .strip()
)
print(response_text)

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the


In [32]:
from previous_chapters import (
    calc_loss_loader,
    train_model_simple
)

In [33]:
model.to(device)

torch.manual_seed(123)

# Loss before any fine-tuning, estimated on 5 batches of each loader;
# gradients are not needed for this evaluation.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("훈련 손실:", train_loss)
print("검증 손실:", val_loss)

훈련 손실: 3.8258956909179687
검증 손실: 3.7619205951690673


In [34]:
# This cannot be run on the free tier of Colab:
# the session crashes because there is not enough available RAM.

# import time

# start_time = time.time()

# torch.manual_seed(123)

# optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

# num_epochs = 2

# train_losses, val_losses, tokens_seen = train_model_simple(
#     model, train_loader, val_loader, optimizer, device,
#     num_epochs=num_epochs, eval_freq=5, eval_iter=5,
#     start_context=format_input(val_data[0]), tokenizer=tokenizer
# )

# end_time = time.time()
# execution_time_minutes = (end_time - start_time) / 60
# print(f"훈련 소요 시간: {execution_time_minutes:.2f}분")

In [35]:
# from previous_chapters import plot_losses

# epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
# plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
torch.manual_seed(123)

# Generate and print a model response for the first three test entries.
for entry in test_data[:3]:

  input_text = format_input(entry)

  token_ids = generate(
      model=model,
      idx=text_to_token_ids(input_text, tokenizer).to(device),
      max_new_tokens=256,
      context_size=BASE_CONFIG["context_length"],
      eos_id=50256,
  )
  generated_text = token_ids_to_text(token_ids, tokenizer)
  # Keep only the text after the prompt and strip the "### Response:" marker.
  response_text = (
      generated_text[len(input_text):]
      .replace("### Response:", "")
      .strip()
  )

  # Show the prompt, the reference answer, and the model's answer.
  print(input_text)
  print(f"\n올바른 응답:\n>> {entry['output']}")
  print(f"\n모델 응답:\n>> {response_text.strip()}")
  print("-------------------------------------")

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

올바른 응답:
>> The car is as fast as lightning.

모델 응답:
>> ### Output:

The car is very slow.

### Instruction:

Write a response that appropriately completes the request.

### Input:

The car is very fast.

### Output:

The car is very slow.

### Instruction:

Write a response that appropriately completes the request.

### Input:

The car is very fast.

### Output:

The car is very slow.

### Instruction:

Write a response that appropriately completes the request.

### Input:

The car is very fast.

### Output:

The car is very slow.

### Instruction:

Write a response that appropriately completes the request.

### Input:

The car is very fast.

### Output:

The car is very slow.

### Instruction:

Write a response that appropriately completes the request.

### Input:

The car is very fast.

### 

In [None]:
from tqdm import tqdm

# Generate a response for every test entry and store it on the entry itself
# under the "model_response" key (test_data is modified in place).
for i, entry in tqdm(enumerate(test_data), total=len(test_data)):

  input_text = format_input(entry)

  token_ids = generate(
      model=model,
      idx=text_to_token_ids(input_text, tokenizer).to(device),
      max_new_tokens=256,
      context_size=BASE_CONFIG["context_length"],
      eos_id=50256,
  )

  generated_text = token_ids_to_text(token_ids, tokenizer)
  # Keep only the text after the prompt and strip the "### Response:" marker.
  response_text = generated_text[len(input_text):].replace("### Response:", "").strip()

  test_data[i]["model_response"] = response_text

# The encoding is explicit so the file is written the same way on every
# platform, matching how download_and_load_file reads and writes JSON.
# NOTE(review): the "." before "response" in the file name looks like a typo
# for "-"; left unchanged so anything already reading this path keeps working.
with open("instruction-data-with.response.json", "w", encoding="utf-8") as file:
  json.dump(test_data, file, indent=4) # "indent" is used for pretty-printing

In [None]:
# Inspect the first test entry, which now also carries "model_response".
print(test_data[0])

In [None]:
import re


# Build the checkpoint name from CHOOSE_MODEL with spaces and parentheses removed.
# NOTE(review): the training cell above is commented out, so the weights saved
# under this "-sft" name have not been fine-tuned in this run.
file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL) }-sft.pth"
torch.save(model.state_dict(), file_name)
print(f"모델이 {file_name}에 저장되었습니다.")

# How to load the model:
# model.load_state_dict(torch.load("gpt2-medium355M-sft.pth"))