In [1]:
from datasets import Dataset
import json

# Read the raw records from dataset.json
with open("dataset.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Convert the records into a column-oriented Hugging Face Dataset,
# keeping only the "input" and "output" fields of each record.
dataset = Dataset.from_dict(
    {
        column_name: [record[column_name] for record in data]
        for column_name in ("input", "output")
    }
)

In [2]:
# 90% train / 10% validation. The seed is fixed so that the same examples land in
# each split on every run, keeping validation losses comparable across runs.
# NOTE: this rebinds `dataset` to the split result, so re-run the loading cell
# before re-running this one.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint identifier, defined once so the tokenizer and the model always match.
MODEL_NAME = "allenai/OLMoE-1B-7B-0924"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the model directly in bfloat16 instead of loading it first and casting
# afterwards, which avoids holding an intermediate copy that is then converted.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)

# Move the model to the GPU when one is available, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Report where the model ended up and in which dtype
print(f"Model is on device: {model.device}")
print(f"Model dtype: {model.dtype}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model is on device: cuda:0
Model dtype: torch.bfloat16


In [4]:
def preprocess_function(examples):
    """Tokenize a batch of input/output pairs as combined prompt strings.

    Each pair is rendered as one string ("Input: ..." on the first line,
    "Output: ..." on the second) and the list of strings is handed to the
    notebook-level ``tokenizer`` with truncation and padding to 512 tokens.

    Args:
        examples: Batch mapping with parallel "input" and "output" sequences.

    Returns:
        The tokenizer's result for the batch of combined strings.
    """
    prompts = []
    for source_text, target_text in zip(examples["input"], examples["output"]):
        prompts.append(f"Input: {source_text}\nOutput: {target_text}")
    # NOTE(review): no end-of-sequence token is appended to the text here —
    # confirm whether the tokenizer adds one or whether it should be added.
    return tokenizer(prompts, truncation=True, padding="max_length", max_length=512)

# Apply the preprocessing to the dataset in batches
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/741 [00:00<?, ? examples/s]

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [5]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os

# NOTE(review): an earlier cell has already moved the model to CUDA, so the CUDA
# allocator may already be configured by the time this line runs — confirm the
# setting takes effect, or set it before the first CUDA use.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Training hyperparameters
# Gradient checkpointing is enabled on the model here and also requested through
# TrainingArguments below.
model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=6e-5,  # reduced for more stable training
    per_device_train_batch_size=4,  # increased within what VRAM allows
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # lowered because the batch size was increased
    num_train_epochs=5,  # train a little longer
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=False,
    bf16=True,
    gradient_checkpointing=True,  # disable if there is enough VRAM
    warmup_ratio=0.1,  # improves stability early in training
    lr_scheduler_type="cosine",  # cosine learning-rate schedule
    adam_beta1=0.9,
    adam_beta2=0.98,
    # Validation loss does not necessarily keep improving while training loss falls,
    # so restore the checkpoint with the lowest eval loss when training ends rather
    # than keeping the last-epoch weights. Requires the matching epoch-based
    # eval/save strategies set above.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # masked language modeling is turned off
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,  # use the collator defined above
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [6]:
# Run fine-tuning and show the resulting training summary as the cell output
train_output = trainer.train()
train_output

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,0.9766,0.958853
2,0.5945,0.903375
3,0.2678,1.01462
4,0.1193,1.180622
5,0.0761,1.231687


TrainOutput(global_step=465, training_loss=0.4471045142860823, metrics={'train_runtime': 1489.9389, 'train_samples_per_second': 2.487, 'train_steps_per_second': 0.312, 'total_flos': 7.757966122942464e+16, 'train_loss': 0.4471045142860823, 'epoch': 5.0})

In [7]:
# Write the fine-tuned model (base: OLMoE-1B-7B-0924) and the tokenizer to the same directory
save_dir = "./fine-tuned-olmo-v4"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine-tuned-olmo-v3/tokenizer_config.json',
 './fine-tuned-olmo-v3/special_tokens_map.json',
 './fine-tuned-olmo-v3/tokenizer.json')