In [11]:
from google.colab import drive

# Mount Google Drive; the project folders used by this notebook live under
# /content/drive.
drive.mount("/content/drive")

Mounted at /content/drive


In [14]:
!pip -q install --no-cache-dir \
  "trl==0.9.6" \
  "transformers==4.55.2" \
  "peft==0.13.2" \
  "accelerate==1.10.0" \
  "datasets==2.20.0" \
  "evaluate==0.4.2" \
  "safetensors==0.4.3"

print("✅ Installed")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m342.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m108.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m308.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m227.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m360.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m293.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

# **파인 튜닝**

In [15]:
import platform, torch
import transformers, trl, peft, datasets, accelerate

# Report the interpreter, CUDA availability and the versions of the libraries
# that are actually loaded in this kernel.
print("Python       :", platform.python_version())
print("Torch        :", torch.__version__, "| CUDA:", torch.cuda.is_available())
for _label, _module in (
    ("transformers :", transformers),
    ("trl          :", trl),
    ("peft         :", peft),
    ("datasets     :", datasets),
    ("accelerate   :", accelerate),
):
    print(_label, _module.__version__)

Python       : 3.12.11
Torch        : 2.8.0+cu126 | CUDA: True
transformers : 4.55.2
trl          : 0.9.6
peft         : 0.17.0
datasets     : 4.0.0
accelerate   : 1.10.0


In [6]:
import sys, subprocess, torch

def try_install_flash_attn():
    """Best-effort check (and, if needed, install) of the ``flash_attn`` package.

    Returns:
        bool: True when ``flash_attn`` can be imported in this kernel.
        False when CUDA is unavailable, when device 0 reports a compute
        capability major version below 8, or when the package can neither
        be imported nor installed.

    Install and import failures are swallowed on purpose: the caller only
    needs a yes/no answer to pick an attention implementation.
    """
    import importlib  # local import keeps this cell independent of other cells

    if not torch.cuda.is_available():
        return False
    major, _ = torch.cuda.get_device_capability(0)
    if major < 8:
        return False

    def _flash_attn_importable():
        """Return True if ``flash_attn`` imports cleanly right now."""
        try:
            importlib.import_module("flash_attn")
            return True
        except Exception:
            # Deliberately broad, as in the original best-effort probe.
            return False

    if _flash_attn_importable():
        return True

    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--no-cache-dir", "flash-attn"],
            stdout=subprocess.DEVNULL,
        )
    except Exception:
        return False

    # The package was installed while this interpreter was running; without
    # this the import system's cached directory listings may not see it yet.
    importlib.invalidate_caches()
    return _flash_attn_importable()

# Probe FlashAttention once and derive the attention backend name that is
# later passed as `attn_implementation` when the model is loaded.
FLASH_OK = try_install_flash_attn()
if FLASH_OK:
    ATTN_IMPL = "flash_attention_2"
else:
    ATTN_IMPL = "sdpa"
print(f"GPU: {torch.cuda.is_available()} | FlashAttention OK: {FLASH_OK} → attn_implementation='{ATTN_IMPL}'")

GPU: True | FlashAttention OK: False → attn_implementation='sdpa'


In [7]:
from pathlib import Path

# Project folder on the mounted Google Drive; everything else hangs off it.
ROOT = "/content/drive/MyDrive/Summarize"
# Holds train_sum_chat.CLEAN.jsonl / val_sum_chat.CLEAN.jsonl
DATA_DIR = ROOT + "/dataset_chat"
OUT_DIR = ROOT + "/models"

# Output folders for the SFT and DPO LoRA adapters.
SFT_ADAPTER_DIR = OUT_DIR + "/qwen2_sum_lora_sft"
DPO_ADAPTER_DIR = OUT_DIR + "/qwen2_sum_lora_dpo"

BASE_MODEL = "Qwen/Qwen2-1.5B-Instruct"

# Japanese-style full stop / comma / corner brackets and the garbled
# (replacement) character. Where this list is consumed is outside this cell.
BAD_CHARS = [
    "。",
    "、",
    "｡",
    "､",
    "「",
    "」",
    "�",
]

In [8]:
import torch, warnings
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer without `trust_remote_code`: the model below is loaded
# from the same checkpoint without that flag, so no repository-supplied Python
# needs to be executed for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so padding is possible.
    tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): left padding is the usual choice for batched generation; TRL's
# SFTTrainer is expected to warn about a non-right padding side during
# training. Kept as-is because later cells may rely on it; confirm.
tokenizer.padding_side = "left"

# Weight dtype: bf16 on GPUs with compute capability >= 8, fp16 on other CUDA
# GPUs, and fp32 on CPU (no mixed-precision flag is enabled for CPU runs, so
# half-precision weights there would only hurt speed and stability).
if torch.cuda.is_available():
    dtype = torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
else:
    dtype = torch.float32

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    low_cpu_mem_usage=False,
    attn_implementation=ATTN_IMPL,  # chosen by the FlashAttention probe cell
)
model.to("cuda" if torch.cuda.is_available() else "cpu")

print("model ready:", BASE_MODEL, "| device:", next(model.parameters()).device, "| attn:", ATTN_IMPL)

model ready: Qwen/Qwen2-1.5B-Instruct | device: cuda:0 | attn: sdpa


In [12]:
import os, json
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset

# Chat-formatted JSONL splits inside DATA_DIR.
TRAIN_JSONL = "{}/train_sum_chat.CLEAN.jsonl".format(DATA_DIR)
VAL_JSONL = "{}/val_sum_chat.CLEAN.jsonl".format(DATA_DIR)

# Fail early, before any loading work, if a split is missing.
# Messages read "training file missing" / "validation file missing".
assert Path(TRAIN_JSONL).exists(), "학습 파일 없음: " + TRAIN_JSONL
assert Path(VAL_JSONL).exists(), "검증 파일 없음: " + VAL_JSONL

class ChatJsonlDataset(Dataset):
    """In-memory dataset of chat examples read from a JSON Lines file.

    Every kept row is a dict ``{"messages": <list>, "id": <value or None>}``.
    The following lines are skipped rather than treated as errors:

    * blank / whitespace-only lines,
    * lines whose JSON value is not an object (e.g. a list or a string),
    * objects whose ``"messages"`` entry is missing, empty, or not a list.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """

    def __init__(self, path):
        self.rows = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                obj = json.loads(line)
                # A valid JSON line can still be a list/str/number; those have
                # no `.get` and used to crash the loader with AttributeError.
                if not isinstance(obj, dict):
                    continue
                msgs = obj.get("messages")
                if not msgs or not isinstance(msgs, list):
                    continue
                self.rows.append({"messages": msgs, "id": obj.get("id")})

    def __len__(self):
        """Number of examples kept after filtering."""
        return len(self.rows)

    def __getitem__(self, idx):
        """Return the example dict at position ``idx``."""
        return self.rows[idx]

# Read both splits into memory (train first, then validation).
train_dataset, val_dataset = (
    ChatJsonlDataset(split_path) for split_path in (TRAIN_JSONL, VAL_JSONL)
)

def formatting_func(example):
    """Render one example's ``messages`` list as a single chat-template string.

    Uses the module-level ``tokenizer``; the template is applied without
    tokenizing and without appending a generation prompt.

    Args:
        example: Mapping with a ``"messages"`` entry.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for
        ``tokenize=False`` (the rendered text).
    """
    return tokenizer.apply_chat_template(
        example["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )

# Sequence-length cap; passed to the SFT config as `max_seq_length` later on.
MAX_SEQ_LEN = 1024

# Pre-render every example to text and wrap each split in a Hugging Face
# dataset with a single "text" column.
hf_train = HFDataset.from_dict({"text": list(map(formatting_func, train_dataset))})
hf_val = HFDataset.from_dict({"text": list(map(formatting_func, val_dataset))})

print(f"train samples: {len(hf_train)} | val samples: {len(hf_val)}")

train samples: 8272 | val samples: 919


In [16]:
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# Call the gradient-checkpointing and input-grad hooks, in this order, when the
# model object exposes them.
for _hook_name in ("gradient_checkpointing_enable", "enable_input_require_grads"):
    if hasattr(model, _hook_name):
        getattr(model, _hook_name)()

# Module names that receive LoRA adapters (presumably the attention and MLP
# projection layers of the base model; verify against the architecture).
target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

def build_sft_config():
    """Build the ``SFTConfig`` used for the LoRA supervised fine-tuning run.

    Reads the module-level ``SFT_ADAPTER_DIR`` and ``MAX_SEQ_LEN``.

    Mixed precision follows the same rule the notebook uses when choosing the
    model's weight dtype: bf16 only on CUDA devices with compute capability
    >= 8, fp16 on other CUDA devices, neither on CPU.

    Returns:
        SFTConfig: configuration with step-based evaluation every
        ``eval_steps`` steps.
    """
    import inspect  # local import keeps this cell independent of other cells

    # Do not use torch.cuda.is_bf16_supported() here: recent PyTorch releases
    # count emulated bf16 as "supported" by default, which would enable bf16
    # training on GPUs where the model weights were loaded as fp16.
    has_cuda = torch.cuda.is_available()
    native_bf16 = has_cuda and torch.cuda.get_device_capability(0)[0] >= 8

    common = dict(
        output_dir=SFT_ADAPTER_DIR,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,  # 1 x 16 = 16 sequences per optimizer step per device
        learning_rate=2e-4,
        num_train_epochs=2,
        logging_steps=20,
        save_steps=200,
        eval_steps=200,
        save_total_limit=2,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        bf16=native_bf16,
        fp16=has_cuda and not native_bf16,
        gradient_checkpointing=True,
        report_to=[],
        max_seq_length=MAX_SEQ_LEN,
        dataset_text_field="text",
        packing=False,
    )

    # The evaluation-strategy argument was renamed across transformers
    # versions. Pick the name from the constructor signature (preferring the
    # newer one) instead of catching TypeError, which could also hide an
    # unrelated bad argument in `common`.
    accepted = inspect.signature(SFTConfig.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"
    common[strategy_key] = "steps"
    return SFTConfig(**common)

sft_config = build_sft_config()

# The trainer receives the base model plus the LoRA config and wraps the model
# itself; the datasets are the pre-rendered "text" splits.
sft_trainer = SFTTrainer(
    model=model,
    args=sft_config,
    peft_config=peft_config,
    train_dataset=hf_train,
    eval_dataset=hf_val,
    tokenizer=tokenizer,
)

# Sanity check before the long run: at least some parameters must be trainable.
_param_sizes = [(p.numel(), p.requires_grad) for p in sft_trainer.model.parameters()]
trainable = sum(size for size, needs_grad in _param_sizes if needs_grad)
total = sum(size for size, _needs_grad in _param_sizes)
print(f"trainable params: {trainable:,} / {total:,} ({trainable/total:.2%})")
# Message reads "LoRA parameters were not unfrozen!"
assert trainable > 0, "LoRA 파라미터가 열리지 않았습니다!"

train_result = sft_trainer.train()
print(train_result)

# Persist the trained model (via save_pretrained) and the tokenizer side by
# side, then run a final evaluation pass on the validation split.
Path(SFT_ADAPTER_DIR).mkdir(parents=True, exist_ok=True)
sft_trainer.model.save_pretrained(SFT_ADAPTER_DIR)
tokenizer.save_pretrained(SFT_ADAPTER_DIR)

metrics = sft_trainer.evaluate()
print("eval metrics:", metrics)
print(f"✅ SFT LoRA saved to: {SFT_ADAPTER_DIR}")

Map:   0%|          | 0/8272 [00:00<?, ? examples/s]

Map:   0%|          | 0/919 [00:00<?, ? examples/s]

  super().__init__(


trainable params: 18,464,768 / 1,562,179,072 (1.18%)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
200,1.2807,1.269565
400,1.2182,1.192835
600,1.1004,1.130877
800,1.0684,1.090529
1000,1.0589,1.079829


TrainOutput(global_step=1034, training_loss=1.1891585291008424, metrics={'train_runtime': 74396.0454, 'train_samples_per_second': 0.222, 'train_steps_per_second': 0.014, 'total_flos': 1.125907425527808e+17, 'train_loss': 1.1891585291008424, 'epoch': 2.0})


eval metrics: {'eval_loss': 1.079799771308899, 'eval_runtime': 1252.5452, 'eval_samples_per_second': 0.734, 'eval_steps_per_second': 0.734, 'epoch': 2.0}
✅ SFT LoRA saved to: /content/drive/MyDrive/Summarize/models/qwen2_sum_lora_sft
