In [1]:
import os
import torch
from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
def create_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.half)
    new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]}
    tokenizer.add_special_tokens(new_special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    tokenizer.padding_side = "left"
    return tokenizer, model


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "Qwen/Qwen2.5-3B-Instruct"
tokenizer, base_model = create_model(model_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.41s/it]


In [3]:
for name, param in base_model.named_parameters():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.q_proj.bias
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.k_proj.bias
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.v_proj.bias
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.q_proj.bias
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.k_proj.bias
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.v_proj.bias
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self

In [3]:
peft_params = LoraConfig(
        # target_modules=["q_proj", "v_proj"],
        target_modules = ["v_proj",
    "q_proj",
    "k_proj",
    "o_proj"
    ],
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    )

In [4]:

from peft import LoraConfig, TaskType, get_peft_model
model = get_peft_model(base_model, peft_params)

In [5]:
model.print_trainable_parameters()

trainable params: 3,686,400 || all params: 3,089,074,176 || trainable%: 0.11933672647425608


In [17]:
data_file = "data/qwen_train_data.json"
eval_data_file = "data/qwen_dev_data.json"
dataset = load_dataset("json", data_files=data_file, split="train")
eval_dataset = load_dataset("json", data_files=eval_data_file, split="train")
new_model = "_lora_tuning"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [19]:
training_params = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=2,
        optim="paged_adamw_8bit",
        warmup_steps=100,
        learning_rate=1e-4,
        fp16=True,
        bf16=False,
        logging_steps=200,
        gradient_checkpointing=True,
        save_steps=2000,
        save_on_each_node=True,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=2000,
        push_to_hub=False,
        report_to="wandb",
    )

In [22]:
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        eval_dataset=eval_dataset,
        # peft_config=peft_params,
        dataset_text_field="text",
        max_seq_length=2048,
        tokenizer=tokenizer,
        args=training_params,
        packing=False,
    )

Map: 100%|██████████| 10000/10000 [00:21<00:00, 460.09 examples/s]
Map: 100%|██████████| 10000/10000 [00:20<00:00, 488.51 examples/s]


In [23]:
trainer.train()

ValueError: Attempting to unscale FP16 gradients.

In [None]:
trainer.save_model(new_model)

In [27]:
torch.cuda.amp.autocast

torch.cuda.amp.autocast_mode.autocast