1. 패키지 설치


In [5]:
# Install into the environment of the running kernel (%pip, not !pip) and
# upgrade existing copies: 8-bit loading in transformers rejects an outdated
# bitsandbytes with an ImportError that asks for `pip install -U bitsandbytes`.
# Restart the kernel after this cell if any package was already imported.
%pip install -q -U transformers datasets peft accelerate bitsandbytes


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


2. 모델 불러오기 및 LoRA 구성


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)

base_model_name = "beomi/KoAlpaca-Polyglot-5.8B"

# Load the base weights in 8-bit via bitsandbytes; device_map="auto" lets
# accelerate decide where each layer is placed.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name, quantization_config=quant_config, device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# The dataset cell pads every example to a fixed length, which needs a pad
# token. Fall back to EOS only when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# PEFT's documented preparation step for training on top of a quantized model;
# it must run before the adapters are attached. With its defaults it also turns
# on gradient checkpointing (see the PEFT quantization guide).
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.05, bias="none"
)

model = get_peft_model(model, lora_config)

# Sanity check: only the LoRA adapter weights should be reported as trainable.
model.print_trainable_parameters()

ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

3. 데이터셋 로딩 및 포맷 변환


In [None]:
from datasets import load_dataset

# Read the local JSON Lines file as a single "train" split.
data_path = "./dataset/instruct_dataset.jsonl"
dataset = load_dataset("json", split="train", data_files=data_path)


def format_example(example, tok=None, max_length=512, max_output_length=128):
    """Turn one instruction record into a fixed-length causal-LM training example.

    The prompt (instruction + input) and the answer (output) are tokenized
    separately and concatenated into ONE sequence, so that ``input_ids``,
    ``attention_mask`` and ``labels`` all have length ``max_length``. A
    causal-LM loss compares logits and labels position by position, so labels
    of a different length than the inputs (512 vs. 128 previously) cannot be
    used for training.

    Labels are set to -100 (the index ignored by the cross-entropy loss) on
    prompt tokens and on padding, so only the answer tokens are learned.

    Args:
        example: Mapping with the keys "instruction", "input" and "output".
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        max_length: Total length of the returned sequences.
        max_output_length: Maximum number of answer tokens, including EOS.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length`` ints.

    Note:
        The prompt ends with a blank-line separator before the answer; prompts
        used at inference time should end the same way.
    """
    tok = tokenizer if tok is None else tok

    prompt_text = f"{example['instruction']}\n\n{example['input']}\n\n"
    answer_text = str(example["output"])

    # Answer first: it gets a guaranteed budget and an explicit EOS so the
    # model learns where to stop generating.
    answer_ids = list(tok(answer_text, add_special_tokens=False)["input_ids"])
    answer_ids = answer_ids[: max(max_output_length - 1, 0)]
    if tok.eos_token_id is not None:
        answer_ids.append(tok.eos_token_id)
    answer_ids = answer_ids[:max_length]

    # The prompt receives whatever room is left, truncated on the right.
    prompt_ids = list(tok(prompt_text, add_special_tokens=False)["input_ids"])
    prompt_ids = prompt_ids[: max_length - len(answer_ids)]

    pad_id = tok.pad_token_id
    if pad_id is None:
        pad_id = tok.eos_token_id if tok.eos_token_id is not None else 0

    used = len(prompt_ids) + len(answer_ids)
    pad_len = max_length - used

    return {
        "input_ids": prompt_ids + answer_ids + [pad_id] * pad_len,
        "attention_mask": [1] * used + [0] * pad_len,
        "labels": [-100] * len(prompt_ids) + answer_ids + [-100] * pad_len,
    }


# Tokenize every record; map() calls format_example once per example.
formatted_dataset = dataset.map(function=format_example)

4. 학습 실행


In [None]:
import inspect

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
)

# Trainer's `tokenizer=` argument is deprecated in favour of
# `processing_class=` and is scheduled for removal in transformers 5.0.
# The install cell does not pin a version, so use whichever keyword the
# installed Trainer actually accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

5. 모델 저장


In [None]:
# Write the trained model and its tokenizer to the same directory so they can
# be loaded together later. `model` is the PEFT-wrapped model here, so this
# presumably stores the LoRA adapter rather than the full base weights
# (TODO confirm against the files written).
final_dir = "./checkpoints/final"
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)