In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # 避免潜在的死锁问题

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
from transformers import BitsAndBytesConfig
import logging

In [2]:
# 设置日志级别
logging.basicConfig(level=logging.INFO)

# 加载AG News数据集
dataset = load_dataset("ag_news")

# 随机选择10000条数据
train_dataset = dataset["train"].shuffle(seed=42).select(range(10000))

In [3]:
# 初始化tokenizer和模型
model_name = "llama3"  # 或者你的LLaMA模型路径
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# 设置量化配置
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
# 加载模型并应用量化
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

# 准备模型进行QLoRA训练
model = prepare_model_for_kbit_training(model)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# 设置LoRA配置
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [7]:
# 应用LoRA
model = get_peft_model(model, peft_config)

In [8]:
# 定义数据预处理函数
def preprocess_function(examples):
    inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

In [9]:
# 对训练集进行预处理
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)

In [10]:
# 设置训练参数
training_args = TrainingArguments(
    output_dir="./results_qlora",
    num_train_epochs=3,
    per_device_train_batch_size=8,  # 可以尝试增加批量大小
    gradient_accumulation_steps=2,
    save_strategy="steps",
    save_steps=500,  # 由于数据集变小，我们可以减少保存频率
    save_total_limit=2,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=50,  # 增加日志频率
    remove_unused_columns=False,
    warmup_steps=100,  # 减少预热步骤
    max_grad_norm=0.3,
    lr_scheduler_type="cosine",
)

In [11]:
from transformers import TrainerCallback

# 定义一个简单的回调函数来打印训练进度
class PrinterCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.is_local_process_zero:
            print(f"Step: {state.global_step}, Loss: {logs['loss']:.4f}")

In [12]:
# 将模型移动到GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=Fals

In [13]:
# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    callbacks=[PrinterCallback()],
)

In [None]:
# 训练模型
trainer.train()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms1820587[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
