# 연습문제 7.1: 프롬프트 스타일 변경

In [3]:
def format_input(entry):
    """Render an entry as a prompt that starts with the ``<|user|>`` tag.

    Args:
        entry: Mapping with an ``'instruction'`` key and an ``'input'`` key.

    Returns:
        ``"<|user|>"``, a newline, and the instruction; when ``entry['input']``
        is truthy, a newline and the input text are appended as well.
    """
    prompt = f"<|user|>\n{entry['instruction']}"

    # The optional input goes on its own line, and only when it is non-empty
    if entry["input"]:
        prompt += f"\n{entry['input']}"

    return prompt

In [4]:
# Two entries: the first carries an `input` field, the second leaves it empty
sample_data = [
    {
        'instruction': 'Identify the correct spelling of the following word.',
        'input': 'Ocassion',
        'output': "The correct spelling is 'Occasion.'",
    },
    {
        'instruction': "What is an antonym of 'complicated'?",
        'input': '',
        'output': "An antonym of 'complicated' is 'simple'.",
    },
]

# Print both formatted prompts with one blank line between them
print("\n\n".join(format_input(entry) for entry in sample_data))

<|user|>
Identify the correct spelling of the following word.
Ocassion

<|user|>
What is an antonym of 'complicated'?


# 연습문제 7.2: 명령어 및 입력 마스킹

In [5]:
# This `format_input` function was copied from the original chapter 7 code.
def format_input(entry):
    """Render an entry as an instruction prompt with ``###`` section headers.

    Args:
        entry: Mapping with an ``'instruction'`` key and an ``'input'`` key.

    Returns:
        The fixed preamble followed by a ``### Instruction:`` section; a
        ``### Input:`` section is appended only when ``entry['input']`` is
        truthy.
    """
    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # No input text: the prompt ends after the instruction section
    if not entry["input"]:
        return prompt

    return prompt + f"\n\n### Input:\n{entry['input']}"

In [6]:
import torch
from torch.utils.data import Dataset


class InstructionDataset(Dataset):
    """Pre-tokenized instruction dataset that also records prompt lengths.

    For every entry the prompt (built by ``format_input``) and the response
    section are concatenated and encoded once up front. The token count of the
    prompt alone is stored separately so that a collate function can later tell
    prompt tokens apart from response tokens.

    Args:
        data: Sequence of entries; each needs an ``'output'`` key plus whatever
            keys ``format_input`` reads.
        tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence of token ids.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        # Added: token count of each prompt, kept parallel to `encoded_texts`
        self.instruction_lengths = []
        self.encoded_texts = []

        for record in data:
            prompt = format_input(record)
            answer = f"\n\n### Response:\n{record['output']}"

            # Encode the complete text first, then the prompt alone to measure it
            self.encoded_texts.append(tokenizer.encode(prompt + answer))
            # Added: collect the prompt length
            self.instruction_lengths.append(len(tokenizer.encode(prompt)))

    def __getitem__(self, index):
        # Added: hand back the prompt length together with the token ids
        prompt_length = self.instruction_lengths[index]
        token_ids = self.encoded_texts[index]
        return prompt_length, token_ids

    def __len__(self):
        return len(self.data)

In [7]:
import tiktoken

# Load tiktoken's "gpt2" encoding; it is used below to tokenize the dataset
# entries and to decode batches back into text.
tokenizer = tiktoken.get_encoding("gpt2")

In [8]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    """Collate ``(instruction_length, token_ids)`` pairs into padded tensors.

    Each sequence gets one ``pad_token_id`` appended as an end-of-text marker
    and is then right-padded to the longest sequence in the batch. Inputs are
    the padded sequence without its last token; targets are the same sequence
    shifted left by one. In the targets, every padding token except the first
    one, and every position that predicts a prompt token, is replaced by
    ``ignore_index``.

    Args:
        batch: Iterable of ``(instruction_length, token_ids)`` tuples, where
            ``token_ids`` is a list of ints and ``instruction_length`` is the
            number of leading tokens that belong to the prompt.
        pad_token_id: Token id used for both the end-of-text marker and padding.
        ignore_index: Value written into masked target positions (presumably
            the value the loss function is configured to skip).
        allowed_max_length: If not ``None``, inputs and targets are truncated
            to this many positions.
        device: Device the stacked tensors are moved to.

    Returns:
        Tuple ``(inputs_tensor, targets_tensor)``, each of shape
        ``(len(batch), sequence_length)``.

    Raises:
        ValueError: If ``batch`` is empty (raised by ``max``).
    """
    # Longest sequence in the batch, +1 for the end-of-text token appended below.
    # Added: batch items are now tuples.
    batch_max_length = max(len(item) + 1 for _, item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for instruction_length, item in batch:  # Added: batch items are now tuples.
        new_item = item.copy()
        # Append an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad the sequence up to batch_max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Drop the last token for the inputs
        targets = torch.tensor(padded[1:])  # Shift right by one for the targets

        # Replace every padding token in the targets except the first with ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # Added: mask every prompt (instruction + input) position in the targets.
        # targets[i] holds token i + 1, so the prompt occupies the first
        # instruction_length - 1 target positions. The count is clamped at 0 so
        # a zero-length prompt does not become the slice [:-1], and the
        # caller-supplied ignore_index is used instead of a hard-coded -100.
        num_prompt_targets = max(instruction_length - 1, 0)
        targets[:num_prompt_targets] = ignore_index

        # Optionally truncate to the maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Stack the per-example tensors and move them to the target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

In [9]:
# Three entries: the first has an empty `input`, the other two provide one
sample_data = [
    {
        'instruction': "What is an antonym of 'complicated'?",
        'input': '',
        'output': "An antonym of 'complicated' is 'simple'.",
    },
    {
        'instruction': 'Sort the following list in alphabetical order.',
        'input': 'Zebra, Elephant, Crocodile',
        'output': 'Crocodile, Elephant, Zebra',
    },
    {
        'instruction': 'Arrange the given numbers in descending order.',
        'input': '5, 12, 8, 3, 15',
        'output': '15, 12, 8, 5, 3.',
    },
]


In [10]:
from torch.utils.data import DataLoader

# Tokenize the sample entries; each dataset item is an
# (instruction_length, token_ids) tuple.
train_dataset = InstructionDataset(sample_data, tokenizer)
# batch_size equals the number of samples, so the loader yields a single batch;
# custom_collate_fn pads the sequences and builds the masked targets.
train_loader = DataLoader(
    train_dataset,
    batch_size=len(sample_data),
    collate_fn=custom_collate_fn,
    num_workers=0
)

In [11]:
# Iterate over the loader and report the shape of each collated batch
print("훈련 데이터 로더:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

훈련 데이터 로더:
torch.Size([3, 64]) torch.Size([3, 64])


In [12]:
# `inputs` and `targets` still hold the last batch yielded by the loop over
# train_loader; index 1 selects the second example of that batch.
print("입력:\n", inputs[1])
print("\n\n타깃:\n", targets[1])

입력:
 tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 42758,   262,  1708,  1351,   287, 24830,
          605,  1502,    13,   198,   198, 21017, 23412,    25,   198,    57,
        37052,    11, 42651,    11,  9325, 19815,   576,   198,   198, 21017,
        18261,    25,   198,    34, 12204,   375,   576,    11, 42651,    11,
         1168, 37052, 50256, 50256])


타깃:
 tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,   198,   198, 21017, 18261,
           25,   198,    34, 12204,   375,   576,    11, 42651,    11,  1168,
        37052, 

In [13]:
# Decode the second example's input ids back to text. `.tolist()` converts the
# tensor to plain Python ints, whereas `list(tensor)` would produce a list of
# 0-dim tensors.
print(tokenizer.decode(inputs[1].tolist()))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Sort the following list in alphabetical order.

### Input:
Zebra, Elephant, Crocodile

### Response:
Crocodile, Elephant, Zebra<|endoftext|><|endoftext|>


In [14]:
# Keep only the target positions that were not masked; -100 is the default
# `ignore_index` of custom_collate_fn.
non_masked_targets = targets[1][targets[1] != -100]

# `.tolist()` passes plain Python ints to the tokenizer instead of 0-dim tensors
print(tokenizer.decode(non_masked_targets.tolist()))



### Response:
Crocodile, Elephant, Zebra<|endoftext|>


# 연습문제 7.3: 원본 Alpaca 데이터셋에서 파인튜닝

In [15]:
# Train on this data (the URL points to alpaca_data.json in the
# tatsu-lab/stanford_alpaca repository; it is not downloaded in this cell).
url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"

# 연습문제 7.4: LoRA를 사용한 파라미터 효율적인 파인튜닝