In [1]:
import copy
import logging
import json
import io
from pprint import pprint
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

import os

# Set the CUDA_VISIBLE_DEVICES environment variable to '7' for this process.
# NOTE(review): this runs after `import torch`; presumably it still takes effect
# because CUDA is initialised lazily — confirm, or move it above the torch import.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

In [2]:
# Label value written over prompt positions and label padding later in the notebook.
IGNORE_INDEX = -100
# Default special-token strings.
# NOTE(review): none of the DEFAULT_* constants are referenced in the visible cells —
# confirm whether they are still needed.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

In [3]:
# Checkpoint directory shared by the model weights and the tokenizer.
# Alternative checkpoint kept for reference: '../DataCollection/officials/Llama-2-7b/'
model_name_or_path = '../DataCollection/officials/Qwen2.5-1.5b-Instruct'

# Causal language model, loaded with no extra options.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)

# Matching tokenizer: right-side padding, non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="right", use_fast=False)

In [4]:
# Display the pad token id of the loaded tokenizer.
tokenizer.pad_token_id

151643

In [5]:
def _make_r_io_base(f, mode: str):
    """Return an open file object for ``f``.

    Args:
        f: An already-open ``io.IOBase`` object, or a path accepted by ``open``.
        mode: Mode passed to ``open`` when ``f`` is a path.

    Returns:
        ``f`` itself when it is already a file object, otherwise a newly
        opened file.
    """
    if isinstance(f, io.IOBase):
        return f
    if "b" in mode:
        # Binary mode does not accept an encoding argument.
        return open(f, mode=mode)
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    return open(f, mode=mode, encoding="utf-8")


def jload(f, mode="r"):
    """Load a .json file and return the parsed content.

    Args:
        f: Path to a JSON file, or an already-open file object.
        mode: Mode used to open ``f`` when it is a path.

    Returns:
        The object produced by ``json.load``.

    Raises:
        json.JSONDecodeError: If the content is not valid JSON.
        OSError: If ``f`` is a path that cannot be opened.
    """
    # The context manager closes the file even when parsing fails. A file
    # object supplied by the caller is closed as well, as it always was on the
    # success path.
    with _make_r_io_base(f, mode) as handle:
        return json.load(handle)

# Load the instruction-tuning data; the format is list[dict].
data_path = './alpaca_data.json'
# data_path = './alpaca_data_100.json'
list_data_dict = jload(data_path)

In [6]:
# Sanity check: number of loaded examples and the content of the first one.
print(len(list_data_dict))
pprint(list_data_dict[0])

52002
{'input': '',
 'instruction': 'Give three tips for staying healthy.',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.'}


In [7]:
# Prompt templates: one for examples that carry an "input" field, one for those that do not.
PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
)
prompt_input = PROMPT_DICT["prompt_input"]
prompt_no_input = PROMPT_DICT["prompt_no_input"]

# One prompt per example; the template is chosen by whether "input" is a non-empty value.
sources = [
    (prompt_input if example.get("input", "") != "" else prompt_no_input).format_map(example)
    for example in list_data_dict
]
# One target per example: the reference output immediately followed by the EOS token.
targets = ["{}{}".format(example["output"], tokenizer.eos_token) for example in list_data_dict]

In [8]:
# Show the first raw example next to the prompt and target strings built from it.
pprint(list_data_dict[0])
print(10*'-')
print(sources[0])
print(10*'-')
print(targets[0])

{'input': '',
 'instruction': 'Give three tips for staying healthy.',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.'}
----------
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Response:
----------
1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 
2. Exercise regularly to keep your body active and strong. 
3. Get enough sleep and maintain a consistent sleep schedule.<|im_end|>


In [9]:
# Label value written over prompt positions in preprocess() below.
# NOTE(review): repeats the IGNORE_INDEX assignment from an earlier cell with the same value.
IGNORE_INDEX = -100

def _tokenize_fn(strings: Sequence[str], tokenizer: AutoTokenizer) -> Dict:
    """Tokenize a list of strings."""
    # 先将每个元素tokenize，按照最大长度padding，但实际每次只输入一个句子，根本不会padding
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    # 取出input_ids和labels为数组
    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    # 统计每个句子的非 padding token 的 token 数量
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )

def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: AutoTokenizer,
) -> Dict:
    """Tokenize prompt+target pairs and mask the prompt part of the labels.

    Args:
        sources: Prompt strings.
        targets: Target strings, paired with ``sources`` by position.
        tokenizer: Tokenizer forwarded to ``_tokenize_fn``.

    Returns:
        Dict with ``input_ids`` (token ids of each prompt+target string) and
        ``labels`` (a deep copy of those ids in which the leading prompt
        tokens are replaced by ``IGNORE_INDEX``).
    """
    # Full training text: each prompt directly followed by its target.
    examples = [prompt + answer for prompt, answer in zip(sources, targets)]
    # Tokenize the full texts first, then the prompts alone; the prompt
    # encodings are only used for their token counts.
    examples_tokenized = _tokenize_fn(examples, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)
    input_ids = examples_tokenized["input_ids"]
    # Deep copy so that masking the labels leaves input_ids untouched.
    labels = copy.deepcopy(input_ids)
    # Overwrite the prompt prefix of every label sequence with IGNORE_INDEX.
    prompt_lens = sources_tokenized["input_ids_lens"]
    for label_row, prompt_len in zip(labels, prompt_lens):
        label_row[:prompt_len] = IGNORE_INDEX
    return {"input_ids": input_ids, "labels": labels}

# Tokenize every prompt/target pair; the result holds "input_ids" and "labels".
data_dict = preprocess(sources, targets, tokenizer)

In [10]:
# For each entry of data_dict, print how many examples it holds and its first element.
for k in data_dict.keys():
    print(len(data_dict[k]))
    print(data_dict[k][0])

52002
tensor([ 38214,    374,    458,   7600,    429,  16555,    264,   3383,     13,
          9645,    264,   2033,    429,  34901,  44595,    279,   1681,    382,
         14374,  29051,    510,  35127,   2326,  10414,    369,  19429,   9314,
           382,  14374,   5949,     25,     16,   5142,    266,    264,  23831,
          9968,    323,   1281,   2704,    311,   2924,  11260,    315,  25322,
           323,  23880,     13,    715,     17,     13,  32818,  15502,    311,
          2506,    697,   2487,   4541,    323,   3746,     13,    715,     18,
            13,   2126,   3322,   6084,    323,  10306,    264,  12966,   6084,
          9700,     13, 151645])
52002
tensor([  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
          -100,   -100,   -100,   -100,     16,   5142,    266,    264,  23

In [11]:
# Token count of every tokenized prompt+target example.
# NOTE(review): this prints one number per example in the dataset — consider printing a
# slice or summary statistics instead.
print([len(input_ids) for input_ids in data_dict['input_ids']])

[75, 43, 101, 110, 183, 51, 101, 261, 51, 68, 86, 38, 100, 125, 70, 135, 121, 108, 121, 47, 42, 318, 78, 352, 121, 191, 132, 223, 116, 126, 185, 61, 91, 175, 145, 65, 114, 71, 136, 85, 128, 67, 157, 109, 104, 163, 132, 120, 165, 115, 82, 175, 60, 72, 56, 60, 45, 75, 41, 48, 46, 69, 49, 66, 57, 132, 56, 131, 65, 168, 56, 61, 95, 76, 65, 64, 55, 245, 71, 113, 128, 70, 176, 253, 183, 128, 254, 187, 143, 59, 251, 71, 131, 87, 94, 53, 103, 54, 37, 60, 88, 101, 89, 39, 104, 50, 111, 73, 43, 84, 69, 72, 74, 63, 56, 121, 71, 58, 61, 60, 58, 196, 55, 66, 61, 62, 67, 58, 60, 74, 77, 72, 65, 82, 40, 70, 55, 59, 59, 62, 114, 63, 62, 55, 77, 89, 61, 45, 75, 111, 78, 82, 99, 112, 73, 83, 100, 91, 122, 155, 286, 57, 130, 136, 109, 137, 131, 63, 51, 84, 102, 136, 90, 61, 60, 166, 61, 124, 95, 62, 70, 64, 68, 118, 176, 59, 75, 111, 67, 60, 70, 63, 62, 69, 69, 59, 84, 130, 61, 127, 44, 152, 175, 64, 197, 115, 139, 45, 110, 58, 71, 51, 193, 199, 89, 76, 54, 95, 74, 151, 50, 113, 130, 66, 46, 60, 47, 74, 

In [12]:
class SupervisedDataset(Dataset):
    """Dataset over pre-tokenized examples, pairing input_ids and labels by index."""

    def __init__(self, input_ids, labels):
        """Store the two parallel sequences without copying them."""
        super().__init__()
        self.input_ids, self.labels = input_ids, labels

    def __len__(self):
        """Return the number of examples, taken from ``input_ids``."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids`` and ``labels`` entries stored at index ``i``."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}
    
# Wrap the preprocessed input_ids/labels in a Dataset.
ds = SupervisedDataset(**data_dict)

In [13]:
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Right-pads the ``input_ids`` and ``labels`` tensors of a list of examples
    to a common length and builds the matching attention mask.
    """

    # Tokenizer whose pad_token_id fills the padded input_ids positions.
    tokenizer: AutoTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build one padded batch.

        Args:
            instances: Examples, each a dict holding 1-D ``input_ids`` and
                ``labels`` tensors.

        Returns:
            Dict with ``input_ids`` (padded with ``pad_token_id``), ``labels``
            (padded with ``IGNORE_INDEX``) and a boolean ``attention_mask``
            that is True on the original tokens and False on the padding.
        """
        input_ids = [instance["input_ids"] for instance in instances]
        labels = [instance["labels"] for instance in instances]
        # Record the true lengths before padding. The mask is built from them,
        # so a real token whose id equals pad_token_id (e.g. when the pad
        # token shares its id with EOS) still receives attention.
        seq_lens = torch.tensor([ids.shape[0] for ids in input_ids])
        # input_ids are padded with pad_token_id.
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        # labels are padded with IGNORE_INDEX.
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        # Position j of row i is a real token exactly when j < seq_lens[i].
        positions = torch.arange(input_ids.shape[1], device=input_ids.device)
        attention_mask = positions.unsqueeze(0) < seq_lens.to(input_ids.device).unsqueeze(1)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )

dc = DataCollatorForSupervisedDataset(tokenizer)
# The collator takes a list[dict[str, tensor]] as input.
ret = [ds[index] for index in range(2,4)]
ret = dc(ret)
# Print the length of each row of the collated batch.
print([len(input_ids) for input_ids in ret['input_ids']])

[110, 110]


In [14]:
# Training set built from the preprocessed tensors; no evaluation set is used.
train_dataset = SupervisedDataset(**data_dict)
eval_dataset=None
data_collator = DataCollatorForSupervisedDataset(tokenizer)

In [None]:
# Training configuration. Batch size 8 per device with 8 gradient-accumulation steps.
# NOTE(review): `evaluation_strategy` here and `tokenizer=` on Trainer may be deprecated
# in newer transformers releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    evaluation_strategy='no',
    save_strategy='steps',
    save_steps=2000,
    save_total_limit=1,
    learning_rate=2e-5,
    weight_decay=0.,
    warmup_ratio=0.03,
    lr_scheduler_type='cosine',
    logging_steps=1,
    report_to=[],
)

# Wire the model, data and collator into a Trainer and start fine-tuning.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)
trainer.train()