In [1]:
from datasets import Dataset
import json
import os

# Directory that holds the JSON training files.
folder_path = "./Data_Final_Reversed/"

# Accumulates the "input" / "output" fields of every record in every .json file.
all_data = {"input": [], "output": []}

# os.listdir() returns entries in arbitrary order; sorting keeps the merged
# dataset (and therefore the seeded split below) identical across runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
        data = json.load(f)
    # Each file is iterated as a sequence of records that are indexed by the
    # "input" and "output" keys.
    for item in data:
        all_data["input"].append(item["input"])
        all_data["output"].append(item["output"])

# Fail early with a clear message instead of an opaque error from the split.
if not all_data["input"]:
    raise ValueError(f"No training records found in {folder_path!r}")

# Convert the merged columns into a Hugging Face Dataset.
dataset = Dataset.from_dict(all_data)

# 90% train / 10% validation; the fixed seed makes the split reproducible.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model = "allenai/OLMo-7B-hf"

# The CUDA caching allocator reads this setting when it is first used, so it
# has to be in the environment before the model is moved to the GPU below.
# setdefault() keeps any value the user exported before starting the kernel.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

# Load the tokenizer.
# NOTE(review): trust_remote_code=True allows code from the model repository to
# run locally; confirm it is still required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Load the weights directly in bfloat16 instead of materializing a float32
# copy first and casting afterwards (which roughly doubles peak host memory).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Move the model to the GPU when one is available, otherwise stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Report where the model ended up and in which dtype.
print(f"Model is on device: {model.device}")
print(f"Model dtype: {model.dtype}")

def preprocess_function(examples, max_length=512):
    """Tokenize a batch of examples as combined prompt/response strings.

    Each pair of ``examples["input"]`` / ``examples["output"]`` values is
    formatted as the text ``Input: <input>``, a newline, then
    ``Output: <output>``, and the resulting strings are passed to the
    module-level ``tokenizer``.

    Args:
        examples: Batch mapping with parallel ``"input"`` and ``"output"``
            sequences.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 512.

    Returns:
        Whatever ``tokenizer`` returns for the formatted batch.
    """
    # NOTE(review): no end-of-sequence token is appended to the text here;
    # confirm whether the tokenizer adds one or whether that is intended.
    prompts = [
        f"Input: {source}\nOutput: {target}"
        for source, target in zip(examples["input"], examples["output"])
    ]
    return tokenizer(prompts, truncation=True, padding="max_length", max_length=max_length)

# Tokenize every split, handing examples to preprocess_function in batches.
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

In [4]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import os

# NOTE(review): the CUDA caching allocator reads this setting when it is first
# used. If the model has already been moved to the GPU by the time this cell
# runs, the assignment probably has no effect — confirm and set it earlier.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Recompute activations during the backward pass to reduce memory use.
model.gradient_checkpointing_enable()

# Training hyperparameters.
training_args = TrainingArguments(
    output_dir="./fine-tuned-models/results",
    eval_strategy="epoch",
    learning_rate=1e-4,  # reduced for more stable training
    per_device_train_batch_size=16,  # increased as far as VRAM allows
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,  # lowered because the batch size was increased
    num_train_epochs=15,  # train for longer
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    # With this many epochs the training loss drops far below the validation
    # loss, so restore the checkpoint with the lowest validation loss at the
    # end instead of keeping the final-epoch weights. Requires save_strategy
    # and eval_strategy to match, which they do ("epoch").
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir="./logs",
    logging_steps=10,
    fp16=False,  # could be tried instead if the hardware supports it well
    # NOTE(review): the original comment on the next line was garbled and ended
    # with "improves stability early in training", which suggests a warmup
    # setting may have been lost. No warmup is configured here — confirm.
    bf16=True,  # generally the more stable mixed-precision choice
    lr_scheduler_type="cosine",  # cosine learning-rate schedule
    adam_beta1=0.9,
    adam_beta2=0.999,
)

# Collator for causal language modeling (mlm=False disables masked LM).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Build the Trainer from the model, arguments, tokenized splits and collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
# Run fine-tuning with the Trainer built in the previous cell; the returned
# TrainOutput is shown as this cell's result.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,3.2441,3.088089
2,2.1982,2.245161
3,1.7183,1.883428
4,1.2334,1.517323
5,0.8422,1.190337
6,0.5287,0.978179
7,0.373,0.866119
8,0.2468,0.814571
9,0.141,0.771113
10,0.0743,0.765584


TrainOutput(global_step=1395, training_loss=0.8971977834205901, metrics={'train_runtime': 4215.5921, 'train_samples_per_second': 5.277, 'train_steps_per_second': 0.331, 'total_flos': 4.566288835805184e+17, 'train_loss': 0.8971977834205901, 'epoch': 15.0})

In [6]:
# Single export directory shared by the model and the tokenizer so the two can
# never be written to different folders.
# NOTE(review): the original trailing comment named "OLMoE-1B-7B-0924", but
# base_model in this notebook is "allenai/OLMo-7B-hf" — confirm which is meant.
save_dir = "./fine-tuned-models/fine-tuned-olmo-v11"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine-tuned-models/fine-tuned-olmo-v10/tokenizer_config.json',
 './fine-tuned-models/fine-tuned-olmo-v10/special_tokens_map.json',
 './fine-tuned-models/fine-tuned-olmo-v10/tokenizer.json')