# README

改进Alpaca源码:

- 原始Alpaca自行实现了数据的读取与预处理；这里改为使用`datasets`库读取数据，并通过`map`函数对每条样本统一做预处理。
- 原始Alpaca训练7B模型需要四张A100；即使不计数据占用，仅模型训练本身也需要100GB以上的显存。这里改为bf16训练，可在一张A800(80G)上完成训练。改动包括：
    1. 模型加载，加载时设置torch_dtype
    2. 训练参数，增加bf16=True

In [1]:
import copy
import logging
import json
import io
import os
from pprint import pprint
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset

# Restrict this process to the GPU with physical index 7.
# NOTE(review): this assignment runs after `import torch`; it presumably still
# takes effect because no CUDA call has been made yet — confirm, or move it
# above the imports to be safe.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

In [2]:
# Label value written over positions that should not be trained on.
IGNORE_INDEX = -100
# Default special-token strings. NOTE(review): none of the DEFAULT_* names is
# referenced by the cells visible in this notebook (the pad token is taken from
# the tokenizer's EOS token instead) — confirm whether they are still needed.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

In [None]:
# Local checkpoint directory used for both the model and the tokenizer.
model_name_or_path = '../DataCollection/officials/Llama-2-7b'
# model_name_or_path = '../DataCollection/officials/Qwen2.5-1.5b-Instruct'

# Load the weights directly in bfloat16 (README change 1) instead of the
# default dtype, which is what lets the model fit on a single 80 GB card.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16
)

# Right-side padding with the slow (non-fast) tokenizer implementation.
# NOTE(review): `model_max_length` is not set here, yet preprocess_func
# truncates to `tokenizer.model_max_length`; the effective limit is whatever
# the checkpoint's tokenizer config declares — confirm it is a sensible value.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    padding_side="right",
    use_fast=False,
)

In [4]:
# The collator needs a pad token to batch sequences of different lengths.
# When the tokenizer does not define one, reuse the EOS token as padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Read the Alpaca-format JSON file with the `datasets` JSON loader. Only a
# 'train' split is declared (the 'test' entry is commented out), and the
# result is indexed by split name below.
dataset = load_dataset('json', 
                       data_dir='/data02/hyzhang10/pengxia2/tws/data', 
                       data_files={
                           'train': 'alpaca_data_100.json', 
                        #    'test': 'alpaca_data_100.json'
                           }
                       )
print(dataset)

In [None]:
# Sanity check: number of training records and the first raw record.
print(len(dataset['train']))
pprint(dataset['train'][0])

In [None]:
# Label value assigned to the prompt positions in preprocess_func.
IGNORE_INDEX = -100

# Alpaca prompt templates. "prompt_input" is for records that carry an extra
# context field and expects {instruction} and {input}; "prompt_no_input" is for
# instruction-only records and expects {instruction} alone.
PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
)

# Module-level shortcuts to the two templates.
prompt_input = PROMPT_DICT["prompt_input"]
prompt_no_input = PROMPT_DICT["prompt_no_input"]

def preprocess_func(example):
    """Convert one Alpaca record into causal-LM training features.

    The prompt is rendered from ``prompt_input`` when the record has a
    non-empty ``input`` field and from ``prompt_no_input`` otherwise. The
    response is the record's ``output`` followed by the tokenizer's EOS token.
    Prompt and response are tokenized as one sequence, and the label positions
    covered by the prompt are overwritten with ``IGNORE_INDEX``.

    Args:
        example: Mapping with ``instruction`` and ``output`` keys and an
            optional ``input`` key.

    Returns:
        A dict with two 1-D tensors of equal length: ``input_ids`` (the full
        tokenized sequence) and ``labels`` (a copy with the prompt masked).
    """
    # Truthiness covers a missing key, an empty string and ``None`` alike, so a
    # null "input" is not rendered into the prompt as the literal text "None".
    template = prompt_input if example.get("input") else prompt_no_input
    source = template.format_map(example)
    target = f"{example['output']}{tokenizer.eos_token}"

    def _encode(text):
        # A single sequence needs no padding; batching is left to the collator.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        return encoded["input_ids"][0]

    input_ids = _encode(source + target)
    labels = input_ids.clone()

    # The prompt is tokenized on its own only to learn how many leading
    # positions of the full sequence belong to it.
    prompt_len = len(_encode(source))
    labels[:prompt_len] = IGNORE_INDEX

    return dict(
        input_ids=input_ids,
        labels=labels,
    )


# Quick manual check on a single record:
# preprocess_func(dataset['train'][0])

# Apply preprocess_func to every record (one example per call, since
# `batched` is not set) and drop all original columns so that only the
# returned `input_ids` and `labels` remain.
train_ds = dataset['train'].map(preprocess_func, remove_columns=list(dataset['train'].features.keys()))
print(train_ds)

In [None]:
# Token count of every processed example (sequences are not padded yet).
print([len(input_ids) for input_ids in train_ds['input_ids']])

In [None]:
# Show the first processed example; the wide width keeps each list on one line.
pprint(train_ds[0], width=1000)

In [None]:
# Smoke-test the collator on two examples (indices 2 and 3).
dc = DataCollatorForSeq2Seq(tokenizer)
# The collator takes a list of per-example feature dicts.
ret = [train_ds[index] for index in range(2,4)]
ret = dc(ret)
# After collation both rows are expected to have the same (padded) length.
print([len(input_ids) for input_ids in ret['input_ids']])

In [12]:
# Inputs handed to the Trainer below: the processed training set, no
# evaluation set, and a collator that pads each batch with this tokenizer.
train_dataset = train_ds
eval_dataset=None
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
# Training configuration. With a per-device batch of 2 and 8 accumulation
# steps, each optimizer step sees 16 examples per device.
# NOTE(review): `evaluation_strategy` (here) and `tokenizer=` (Trainer below)
# are presumably renamed/deprecated in newer transformers releases
# (`eval_strategy`, `processing_class`) — confirm against the installed version.
# NOTE(review): checkpoints are written every 2000 steps only; a short run may
# end before that, and no explicit save follows `train()` in the visible cells.
training_args = TrainingArguments(output_dir='./output',
                                  num_train_epochs=3,
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=8,
                                  gradient_accumulation_steps=8,
                                  evaluation_strategy='no',
                                  save_strategy='steps',
                                  save_steps=2000,
                                  save_total_limit=1,
                                  learning_rate=2e-5,
                                  weight_decay=0.,
                                  warmup_ratio=0.03,
                                  lr_scheduler_type='cosine',
                                  logging_steps=1,  # log every step so the loss curve below is dense
                                  report_to=[],  # no external experiment trackers
                                  bf16=True  # README change 2: bf16 training
                                  )
trainer = Trainer(model=model, 
                  tokenizer=tokenizer, 
                  args=training_args, 
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  data_collator=data_collator,
                  )
# Run fine-tuning; the return value is kept for later inspection.
train_output = trainer.train()

In [None]:
# Pull the logged training losses out of the trainer state. Entries without a
# "loss" key are skipped.
log_history = trainer.state.log_history
losses = [entry["loss"] for entry in log_history if "loss" in entry]

In [None]:
# Plot the training loss against the index of the log entry.
plt.plot(losses)