<a href="https://colab.research.google.com/github/rkawkclzls/TTT/blob/master/week6_1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the required libraries (notebook shell magic; -q keeps pip quiet)
!pip install -q datasets wandb evaluate transformers

import os
import sys
import math
import torch
import wandb
import logging
import datasets
import argparse
import evaluate
import transformers

from typing import Optional
from itertools import chain
from dataclasses import dataclass, field

from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator
)
from transformers.trainer_utils import get_last_checkpoint

# Weights & Biases setup.
# The API key is a credential and must not be stored in the notebook source.
# It is read from the WANDB_API_KEY environment variable instead; when the
# variable is unset, key=None lets wandb.login() fall back to its own lookup
# (previously stored credentials or an interactive prompt).
# NOTE: any key that was previously committed here should be treated as
# leaked and rotated in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(project='Hanghae99')
wandb.run.name = 'gpt-finetuning-with-validation'

@dataclass
class Arguments:
    """Model and dataset options for the fine-tuning run.

    Every attribute is a dataclass field; its ``metadata["help"]`` entry
    carries a short description of the option.
    """

    # ---- model options ----
    # Defaults to "gpt2".
    model_name_or_path: Optional[str] = field(
        default="gpt2",
        metadata=dict(help="Pre-trained model name or path"),
    )
    torch_dtype: Optional[str] = field(
        default="auto",
        metadata=dict(
            choices=['auto', 'bfloat16', 'float16', 'float32'],
            help="Model's precision",
        ),
    )

    # ---- dataset options ----
    # "squad" is only an example dataset.
    dataset_name: Optional[str] = field(
        default="squad",
        metadata=dict(help="Dataset name from HuggingFace Hub"),
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata=dict(help="Dataset configuration name"),
    )
    # Kept small with the Colab environment in mind.
    block_size: int = field(
        default=128,
        metadata=dict(help="Length of input sequences"),
    )
    num_workers: Optional[int] = field(
        default=2,
        metadata=dict(help="Number of workers for data processing"),
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata=dict(help="Size of validation set in percentage"),
    )

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Small batches, chosen with Colab GPU memory in mind.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint on the same step cadence; log more often.
    # NOTE(review): newer transformers releases name this option
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=100,
    # Reporting / publishing.
    report_to="wandb",
    push_to_hub=False
)

# Logging setup: INFO-level records go to stdout with a timestamped format.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    handlers=[stdout_handler],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Build the run options from their defaults.
args = Arguments()

# Load the dataset.
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)

# Train/validation split: when the dataset has no "validation" split, the
# first N% of "train" becomes validation and the remainder stays as train.
if "validation" not in raw_datasets:
    split_pct = args.validation_split_percentage
    logger.info(f"Creating validation split with {split_pct}% of data")
    for split_name, split_spec in (
        ("validation", f"train[:{split_pct}%]"),
        ("train", f"train[{split_pct}%:]"),
    ):
        raw_datasets[split_name] = load_dataset(
            args.dataset_name,
            args.dataset_config_name,
            split=split_spec,
        )

# Load the model configuration, tokenizer and weights.
config = AutoConfig.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

# Honor the precision requested in Arguments.torch_dtype instead of a
# hard-coded "auto". "auto" and None are passed through unchanged; any other
# value ("bfloat16", "float16", "float32") is resolved to the matching
# torch dtype object.
torch_dtype = (
    args.torch_dtype
    if args.torch_dtype in ("auto", None)
    else getattr(torch, args.torch_dtype)
)
model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    config=config,
    torch_dtype=torch_dtype,
)

# Tokenizer configuration: the end-of-sequence token doubles as the padding
# token, and a simple role/content chat template is installed.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
)

# Resize the embedding matrix only when the tokenizer holds more entries
# than the matrix has rows.
embedding_size = model.get_input_embeddings().weight.shape[0]
if embedding_size < len(tokenizer):
    model.resize_token_embeddings(len(tokenizer))

# Data preprocessing: choose which column gets tokenized.
column_names = list(raw_datasets["train"].features)
if "text" in column_names:
    text_column_name = "text"
else:
    # No "text" column: keep the original fallback to the first column, but
    # report it, because the first column of a dataset is not necessarily
    # the one that holds the training text.
    text_column_name = column_names[0]
    logger.warning(
        "Dataset has no 'text' column; falling back to first column %r. "
        "Available columns: %s",
        text_column_name,
        column_names,
    )

def tokenize_function(examples):
    """Tokenize a batch of examples and attach causal-LM labels.

    Args:
        examples: A batch from ``datasets.map(batched=True)``; the column
            named by ``text_column_name`` is tokenized.

    Returns:
        The tokenizer output, truncated and padded to ``args.block_size``,
        with an added ``labels`` entry. Labels copy ``input_ids`` except at
        padded positions, which are set to -100.
    """
    batch = tokenizer(
        examples[text_column_name],
        truncation=True,
        padding="max_length",
        max_length=args.block_size,
    )
    # The Trainer is built with default_data_collator, which does not create
    # labels; without a "labels" column the model has no target to compute a
    # loss from. The pad token is the same as EOS here, so padding is found
    # through the attention mask rather than by token id, and masked with
    # -100 so those positions do not contribute to the loss.
    batch["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(batch["input_ids"], batch["attention_mask"])
    ]
    return batch

logger.info("Tokenizing datasets...")
# Run the map inside the main-process-first context of the training args.
with training_args.main_process_first(desc="dataset map tokenization"):
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        desc="Tokenizing datasets...",
        remove_columns=column_names,  # drop the raw columns from the result
        num_proc=args.num_workers,
        batched=True,
    )

# Pick out the splits handed to the Trainer.
train_dataset, eval_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "validation")
)

# Build the Trainer from the model, tokenizer, datasets and training args.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Checkpoint selection: an explicit resume_from_checkpoint wins; otherwise
# resume from the newest checkpoint found in the output directory, if any.
checkpoint = None
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
# Only scan the output directory when it exists: get_last_checkpoint lists
# the directory contents, which fails on a first run before anything has
# been written there.
elif os.path.isdir(training_args.output_dir) and (
    last_checkpoint := get_last_checkpoint(training_args.output_dir)
):
    checkpoint = last_checkpoint
    logger.info(f"Resuming from checkpoint: {checkpoint}")

def _log_and_save_metrics(split, values):
    """Log *values* for *split* through the trainer, then save them."""
    trainer.log_metrics(split, values)
    trainer.save_metrics(split, values)


# Run training, resuming from the selected checkpoint when there is one.
logger.info("Starting training...")
train_result = trainer.train(resume_from_checkpoint=checkpoint)

# Save the trained model.
trainer.save_model()

# Record the training metrics, then the trainer state.
metrics = train_result.metrics
_log_and_save_metrics("train", metrics)
trainer.save_state()

# Final evaluation.
logger.info("Running final evaluation...")
eval_metrics = trainer.evaluate()
_log_and_save_metrics("eval", eval_metrics)

# Close the W&B run.
wandb.finish()

print("Training completed!")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkslee1761[0m ([33mkslee1761-1[0m). Use [1m`wandb login --relogin`[0m to force relogin


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Tokenizing datasets... (num_proc=2):   0%|          | 0/87599 [00:00<?, ? examples/s]