In [1]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [2]:
# 加载数据集并随机抽样
dataset = load_dataset("ag_news", split="train")
sampled_dataset = dataset.shuffle(seed=42).select(range(10000))

In [12]:
# 加载模型和分词器
model_name = "llama3"  # 请确保你有权限访问这个模型
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # 设置填充标记
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# 定义预处理函数
def preprocess_function(examples):
    tokenized = tokenizer(
        examples["text"], 
        truncation=True, 
        padding="max_length", 
        max_length=256,
        return_tensors="pt"
    )
    tokenized["labels"] = examples["label"]
    return tokenized
    
# 对数据集进行预处理
encoded_dataset = sampled_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=[col for col in sampled_dataset.column_names if col != "label"]
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [14]:
# 加载模型
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=4,  # AG News 有 4 个类别
    load_in_8bit=True, 
    device_map="auto"
)
model.config.pad_token_id = tokenizer.pad_token_id

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at llama3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# 准备QLoRA
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["q_proj", "v_proj"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="SEQ_CLS"  # 改为序列分类任务
)
model = get_peft_model(model, config)

In [16]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    fp16=True,
    logging_steps=100,
    save_steps=500,
    remove_unused_columns=False,
    optim="adamw_torch",
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    gradient_checkpointing=True,
    dataloader_num_workers=4,
)

In [17]:
# 创建数据整理器
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

In [18]:
# 创建Trainer并开始训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset,
    data_collator=data_collator,
)


In [19]:
# 开始训练
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
100,1.509
200,0.3918
300,0.2504
400,0.2074


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=468, training_loss=0.5323236294281788, metrics={'train_runtime': 9765.3597, 'train_samples_per_second': 3.072, 'train_steps_per_second': 0.048, 'total_flos': 3.212631194122322e+17, 'train_loss': 0.5323236294281788, 'epoch': 2.9952})

In [21]:
# 保存模型
trainer.save_model("final_model")