Eval Loss :
https://api.wandb.ai/links/w3yfrl-none/2dnx1iup

Train Loss :
https://api.wandb.ai/links/w3yfrl-none/ldpd35n6

In [None]:
import os
import json
import torch
import wandb
import logging
from dataclasses import dataclass, field
from typing import Optional
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    HfArgumentParser,
    default_data_collator,
)


@dataclass
class Arguments:
    """Command-line options for the instruction-tuning run.

    Every attribute becomes a ``--<name>`` flag when this class is handed to
    ``HfArgumentParser``; the values below are the defaults used when a flag
    is omitted.
    """

    # Model / data locations.
    model_name_or_path: str = "gpt2"  # checkpoint name or local path
    corpus_path: str = "corpus.json"  # JSON corpus to fine-tune on
    output_dir: str = "./instruction-tuned-gpt"  # where checkpoints are written

    # Sequence length every example is padded / truncated to.
    block_size: int = 1024

    # Optimisation schedule.
    per_device_train_batch_size: int = 4
    num_train_epochs: int = 3

    # Step intervals for checkpointing, evaluation and logging.
    save_steps: int = 10
    eval_steps: int = 10
    logging_steps: int = 10


def format_and_tokenize(example, tokenizer, block_size):
    """Render one instruction/response pair as a prompt and tokenize it.

    Args:
        example: Mapping with ``"instruction"`` and ``"output"`` entries.
        tokenizer: Callable tokenizer; invoked with ``truncation=True``,
            ``padding="max_length"`` and ``max_length=block_size``. If it
            exposes a non-empty ``eos_token`` attribute, that token is
            appended to the prompt text.
        block_size: Length every sequence is padded / truncated to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which every position whose ``attention_mask`` is 0
        (i.e. padding) is replaced by ``-100`` so the loss ignores it. When
        the tokenizer returns no ``attention_mask`` the labels are a plain
        copy of ``input_ids``.
    """
    # Label value skipped by the cross-entropy loss of Hugging Face causal LMs.
    ignore_index = -100

    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"

    # Terminate the response with a real EOS token. Padding is masked out of
    # the labels below, so when the pad token is the EOS token (as the caller
    # configures it) this appended token is the only end-of-response signal
    # left for the model to learn from.
    eos_token = getattr(tokenizer, "eos_token", None)
    if eos_token:
        prompt += eos_token

    tokens = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=block_size,
    )

    input_ids = tokens["input_ids"]
    if "attention_mask" in tokens:
        # Supervise real tokens only; padded positions would otherwise
        # dominate the loss for short examples.
        tokens["labels"] = [
            token_id if keep else ignore_index
            for token_id, keep in zip(input_ids, tokens["attention_mask"])
        ]
    else:
        tokens["labels"] = list(input_ids)
    return tokens


def main():
    """Fine-tune a causal language model on an instruction corpus.

    Parses ``Arguments`` from the command line, starts a Weights & Biases
    run, loads the JSON corpus, splits it 80/20 into train and validation
    sets, tokenizes both with ``format_and_tokenize``, trains with the
    Hugging Face ``Trainer`` (evaluating, checkpointing and logging on the
    configured step intervals) and finally saves the model to ``output_dir``.
    """
    parser = HfArgumentParser(Arguments)
    args = parser.parse_args_into_dataclasses()[0]

    # Name the run when it is created instead of renaming it afterwards.
    wandb.init(project="instruction-tuning", name="homework6_1")

    # 1. Load corpus
    dataset = load_dataset("json", data_files={"data": args.corpus_path})["data"]

    # 2. Split 80/20
    # A fixed seed keeps the split identical across runs, so a re-run or a
    # resumed job never trains on rows that were previously held out and
    # eval loss stays comparable between runs.
    split_seed = 42
    split = dataset.train_test_split(test_size=0.2, seed=split_seed)
    train_data = split["train"]
    val_data = split["test"]

    # 3. Load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    # Reuse EOS as the padding token so padding="max_length" has one to use.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path)

    # 4. Preprocess both datasets
    tokenize_kwargs = {"tokenizer": tokenizer, "block_size": args.block_size}
    train_dataset = train_data.map(format_and_tokenize, fn_kwargs=tokenize_kwargs)
    val_dataset = val_data.map(format_and_tokenize, fn_kwargs=tokenize_kwargs)

    # 5. Define training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_train_batch_size,
        num_train_epochs=args.num_train_epochs,
        evaluation_strategy="steps",
        eval_steps=args.eval_steps,
        save_strategy="steps",
        save_steps=args.save_steps,
        logging_strategy="steps",
        logging_steps=args.logging_steps,
        report_to="wandb",
        save_total_limit=2,
    )

    # 6. Trainer
    # NOTE(review): newer transformers releases prefer `processing_class`
    # over `tokenizer` here -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    # 7. Train
    trainer.train()
    trainer.save_model()
    print("Fine-tuning complete.")


# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
