In [1]:
# !pip install -q transformers datasets accelerate


# 모델 증류 (온라인 러닝)

In [2]:
from accelerate import Accelerator
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import AdamW

from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import ProjectConfiguration, InitProcessGroupKwargs
from accelerate.utils import FullyShardedDataParallelPlugin

# Accelerator 초기화
accelerator = Accelerator()
device = accelerator.device

In [4]:
# 2. 학습 파라미터 설정
training_args = {
    "learning_rate": 2e-5,
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,  # GPU당 배치 크기
    "gradient_accumulation_steps": 8,
    "warmup_ratio": 0.1,
    "max_seq_length": 512,
    "logging_steps": 10,
    "save_steps": 100,
    "save_total_limit": 3,
    "seed": 42,
}

# 3. Accelerate 설정
# FSDP 설정
fsdp_plugin = FullyShardedDataParallelPlugin(
    sharding_strategy="FULL_SHARD",  # 완전 분산 모드
    cpu_offload=True,  # CPU 오프로딩 활성화
    # mixed_precision="bf16",  # BF16 혼합 정밀도 사용
    auto_wrap_policy="TRANSFORMER_BASED_WRAP",  # 트랜스포머 레이어 자동 래핑
    # transformer_layer_cls_to_wrap=["LlamaDecoderLayer"],  # 래핑할 레이어 클래스
    backward_prefetch="BACKWARD_PRE",  # 백워드 패스 최적화
    activation_checkpointing=True,  # 활성화 체크포인팅 사용
)

sharding_strategy is deprecated in favor of reshard_after_forward. This will be removed in a future version of Accelerate.


In [5]:
# 프로젝트 구성
OUTPUT_DIR = "outputs/"
project_config = ProjectConfiguration(
    project_dir=OUTPUT_DIR,
    logging_dir=f"{OUTPUT_DIR}/logs",
)

# 프로세스 그룹 초기화 설정
init_kwargs = InitProcessGroupKwargs()  # 30분 타임아웃

# Accelerator 초기화
accelerator = Accelerator(
    gradient_accumulation_steps=training_args["gradient_accumulation_steps"],
    log_with="tensorboard",
    project_config=project_config,
    fsdp_plugin=fsdp_plugin,
    kwargs_handlers=[init_kwargs],
)


### 🔤 입력 데이터 포맷 예시 (`mini_ai_name_data.jsonl`)

In [6]:
# JSONL 형식 예시
# {"instruction": "너의 이름이 뭐야?", "output": "제 이름은 미니ai입니다."}
# {"instruction": "자기소개 해봐", "output": "저는 미니ai라고 합니다."}


In [7]:
teacher_model_id = "meta-llama/Llama-3.2-3B-Instruct"
student_model_id = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(teacher_model_id)
tokenizer.pad_token = tokenizer.eos_token
eos_token = tokenizer.eos_token

dataset = load_dataset("json", data_files="dataset/kogpt/kochatgpt_1_SFT.jsonl")["train"]

colname_1, colname_2 = "prompt", "completion"


def tokenize(example):
    prompt = f"### 질문:\n{example[colname_1]}\n\n### 답변:\n"
    target = example[colname_2]
    full = prompt + target
    tokens = tokenizer(full, truncation=True, max_length=512, padding="max_length")
    return tokens

tokenized_dataset = dataset.map(tokenize, batched=False).remove_columns([colname_1, colname_2])
dataloader = DataLoader(tokenized_dataset, batch_size=1, shuffle=True)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset

Dataset({
    features: ['tokens', 'input_ids', 'attention_mask'],
    num_rows: 12000
})

In [9]:
teacher = AutoModelForCausalLM.from_pretrained(teacher_model_id,  device_map="auto", torch_dtype=torch.bfloat16)
student = AutoModelForCausalLM.from_pretrained(student_model_id,  device_map="auto", torch_dtype=torch.bfloat16)


temperature = 2.0
loss_fn = nn.KLDivLoss(reduction="batchmean")
optimizer = AdamW(student.parameters(), lr=2e-5)

accelerator = Accelerator()
student, teacher, optimizer, dataloader = accelerator.prepare(student, teacher, optimizer, dataloader)
teacher.eval()


2025-04-14 15:54:47.372916: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-14 15:54:47.387615: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-14 15:54:47.391594: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-14 15:54:47.401717: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

# 모델 훈련 (선생님 모델 -> 학생 모델)

In [12]:
from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
dataloader = DataLoader(tokenized_dataset, batch_size=4, shuffle=True, collate_fn=collator)

In [None]:
N_EPOCH = 5

for epoch in range(N_EPOCH):
    student.train()
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        input_ids = batch["input_ids"].to(student.device)
        attention_mask = batch["attention_mask"].to(student.device)

        # Student forward
        student_outputs = student(input_ids=input_ids, attention_mask=attention_mask)
        student_logits = student_outputs.logits / temperature

        # Teacher forward
        with torch.no_grad():
            teacher_outputs = teacher(input_ids=input_ids, attention_mask=attention_mask)
            teacher_logits = teacher_outputs.logits / temperature

        student_logits = student_logits[:, :-1, :].contiguous()
        teacher_logits = teacher_logits[:, :-1, :].contiguous()

        target_probs = torch.nn.functional.softmax(teacher_logits, dim=-1)
        student_log_probs = torch.nn.functional.log_softmax(student_logits, dim=-1)

        loss = loss_fn(student_log_probs, target_probs)

        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

Epoch 1:  64%|██████▍   | 1933/3000 [26:24<14:34,  1.22it/s, loss=68.5]

# 모델 저장

In [14]:
student.save_pretrained("./outputs/distilled-mini-ai")
tokenizer.save_pretrained("./outputs/distilled-mini-ai")

('./outputs/distilled-mini-ai/tokenizer_config.json',
 './outputs/distilled-mini-ai/special_tokens_map.json',
 './outputs/distilled-mini-ai/tokenizer.json')

# 모델 테스트/결과 확인

In [26]:
from transformers import  pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM

def test(prompt: str, model_path: str = "./outputs/distilled-mini-ai", max_new_tokens: int = 100):
    """
    학습된 student 모델을 테스트하는 함수입니다.
    
    Args:
        prompt (str): 질문 프롬프트 (예: "너의 이름이 뭐야?")
        model_path (str): 파인튜닝된 모델 디렉터리 경로
        max_new_tokens (int): 생성할 최대 토큰 수
    """
    # 모델 & 토크나이저 로드
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # 파이프라인 생성
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

    # 프롬프트 정제
    full_prompt = f"""### 질문:
{prompt}

### 답변:"""

    # 생성
    output = pipe(full_prompt, max_new_tokens=max_new_tokens, do_sample=True, eos_token_id=128009,repetition_penalty=1.2)[0]["generated_text"]
    
    # 출력 결과
    print("입력 프롬프트:")
    print(full_prompt)
    print("\n 모델 응답:")
    print(output.split("### 답변:")[-1].strip())

In [27]:
# test("너의 이름이 뭐야?")
test("천연으로 염색하고 싶은데 지속이 꽤 되나요?")

Device set to use cuda:0


입력 프롬프트:
### 질문:
천연으로 염색하고 싶은데 지속이 꽤 되나요?

### 답변:

 모델 응답:
지속적으로 사용하는 전기화물이나 수용성 물질을 사용할 때, 전기화물 또는 수용성 물질에 대한 재생의 시간이 약 2-5년로 할 수 있습니다. 이 기간은 원정 및 정상적인 용량에 따라 다르습니다.

*   전기화물(전기화물): 1-3년
    *   기장: 1-2년 (0.01%
