In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import random

In [2]:
# 加载原始数据集
dataset = load_dataset("squad", split="train")

In [3]:
# 定义处理函数
def process_example(example):
    return {
        "input": example["context"] + " " + example["question"],
        "output": example["answers"]["text"][0] if example["answers"]["text"] else ""
    }

In [4]:
# 应用处理函数
processed_dataset = dataset.map(process_example, remove_columns=dataset.column_names)

In [5]:
# 打乱数据集
shuffled_dataset = processed_dataset.shuffle(seed=42)  # 设置随机种子以确保可重复性

# 选取5000条数据
selected_dataset = shuffled_dataset.select(range(5000))

print("Processed dataset:", selected_dataset)

Processed dataset: Dataset({
    features: ['input', 'output'],
    num_rows: 5000
})


In [6]:
selected_dataset[0]

{'input': 'The Roman Catholic Church canon law also includes the main five rites (groups) of churches which are in full union with the Roman Catholic Church and the Supreme Pontiff: What term characterizes the intersection of the rites with the Roman Catholic Church?',
 'output': 'full union'}

In [7]:
# 选择模型和tokenizer
model_name = "llama3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                            #load_in_8bit=False, 
                                            device_map="auto"
                                            )
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
# 设置LoRA配置
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

In [9]:
# 应用LoRA
model = get_peft_model(model, peft_config)

In [10]:
# 定义数据处理函数
def preprocess_function(examples):
    inputs = examples["input"]
    outputs = examples["output"]
    model_inputs = tokenizer(inputs, max_length=300, truncation=True, padding="max_length")
    
    # 准备输入和标签
    labels = tokenizer(outputs, max_length=300, truncation=True, padding="max_length")["input_ids"]
    model_inputs["labels"] = labels
    
    return model_inputs

In [11]:
# 处理数据集
processed_datasets = selected_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=selected_dataset.column_names,
)

In [12]:
# 设置训练参数
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,  # 可能需要减小批次大小
    gradient_accumulation_steps=8,  # 增加梯度累积步骤
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    fp16=True,  # 启用混合精度训练
    learning_rate=1e-4,
    save_strategy="epoch",
    logging_steps=10,
    #evaluation_strategy="epoch",
)

In [13]:
# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_datasets,
    tokenizer=tokenizer,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
# 开始训练
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms1820587[0m. Use [1m`wandb login --relogin`[0m to force relogin


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
10,9.6247
20,9.5669
30,9.5704
40,9.1477
50,8.7684
60,7.8027
70,7.1396
80,5.7459
90,4.5522
100,3.3151


TrainOutput(global_step=468, training_loss=1.7872087894341884, metrics={'train_runtime': 667.5634, 'train_samples_per_second': 22.47, 'train_steps_per_second': 0.701, 'total_flos': 2.024924840460288e+17, 'train_loss': 1.7872087894341884, 'epoch': 2.9952})

In [15]:
# 保存模型
trainer.save_model("finetuned_causal_model")

print("Training completed. Model saved to finetuned_causal_model")

Training completed. Model saved to finetuned_causal_model
