In [1]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [2]:
from datasets import load_dataset

# 加载sciq数据集的训练集和测试集
train_dataset = load_dataset("imdb", split="train")
test_dataset = load_dataset("imdb", split="test")

In [3]:
from datasets import concatenate_datasets
train_data = train_dataset.shuffle()

In [4]:
# 从原始数据集中随机抽取10000条数据
train_data = train_data.shuffle(seed=42).select(range(10000))

In [5]:
# 将txt和label字段合并为一个整体
def merge_fields(example):
    text = example['text']
    label = example['label']
    merged_input = f"Text: {text}\nLabel: {label}"
    return {'merged_input': merged_input}

In [6]:
train_data = train_data.map(merge_fields, remove_columns=['text', 'label'])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = 'llama3'
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
from transformers import TrainingArguments

output_dir = './results'
per_device_train_batch_size = 64
gradient_accumulation_steps = 64
optim = 'paged_adamw_32bit'
save_steps = 1000
logging_steps = 10
learning_rate = 2e-4  # Typically higher for LoRA
max_grad_norm = 0.3
max_steps = 1000
warmup_ratio = 0.03
lr_scheduler_type = 'constant'

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    fp16=True,  # Mixed precision training
)

In [11]:
from trl import SFTTrainer

max_seq_length = 512
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    dataset_text_field='merged_input',
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Print trainable parameters
print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.shape}")


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Trainable parameters:
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.Size([8, 4096])
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.Size([4096, 8])
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.Size([8, 4096])
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 8])
base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.Size([8, 4096])
base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight: torch.Size([4096, 8])
base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight: torch.Size([8, 4096])
base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight: torch.Size([1024, 8])
base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight: torch.Size([8, 4096])
base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight: torch.Size([4096, 8])
base_model.model.model.layers.2.self_a

In [None]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33ms1820587[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


In [None]:
# Save the LoRA model
trainer.model.save_pretrained("finetuned_lora_model")

# Merge the LoRA weights with the base model and save
merged_model = model.merge_and_unload()
merged_model.save_pretrained("finetuned_merged_model")
tokenizer.save_pretrained("finetuned_merged_model")
print("Trained model and tokenizer saved to finetuned_merged_model")