# 1. Google Colab 환경 설정

In [13]:
# !pip install -U fsspec datasets pyarrow

In [14]:
# from datasets import load_dataset

# dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [1]:
# prepare for Google Colab
try:
    import google.colab
    import os, sys
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)

    # Enter the project path in Google Drive.
    # This is the path where the project '*.py' file will be imported and where the data, training logs, and models will be saved.
    # e.g. PROJ_PATH = '/content/drive/MyDrive/Colab Notebooks/repository/default_proj'
    PROJ_PATH = '/content/drive/MyDrive/defaultproject'

    sys.path.append(PROJ_PATH)

    # Install Java, a dependency of pyserini
    !apt install openjdk-21-jre-headless -qq > /dev/null
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
    !update-alternatives --set java /usr/lib/jvm/java-21-openjdk-amd64/bin/java
    !java -version

    %pip install faiss-cpu
    %pip install pyserini
    %pip install wget
    %pip install transformers
    # %pip install datasets
    %pip install accelerate[torch]
    %pip install evaluate
    %pip install absl-py
    %pip install nltk
    %pip install rouge_score
    # %pip install pyarrow

except ImportError:
    import os, sys
    PROJ_PATH = PROJ_PATH = os.path.expanduser("~/defaultproject")
    sys.path.append(PROJ_PATH)
    # Please specify the Java path.
    # If it is different from the path below, refer to the Java installation command above to install Java.
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
    required_packages = [
        # "faiss-cpu",
        # "pyserini",
        "wget",
        "transformers",
        # "datasets",
        "accelerate[torch]",
        "evaluate",
        "absl-py",
        "nltk",
        "rouge_score"
    ]
    for pkg in required_packages:
        os.system(f"pip install {pkg}")



[0m



[0m



[0m



[0m



[0m



[0m



[0m

# 2. 라이브러리 & 모듈 Import

In [2]:
import os
import math
import json
import tqdm
import functools
import shutil

import torch
import dataclasses
import transformers
import random
import numpy as np
import evaluate
from datasets import load_dataset

from model import TransformerConfig, TransformerForCausalLM, TransformerForSequenceClassification

from utils.logger import Logger
from utils.metrics import best_subspan_exact_match
from utils.etc import print_model_statistics



# 3. 경로 및 전역 설정

In [3]:
MODEL_CONTEXT_LENGTH = 1024 # GPT가 한 번에 처리할 최대 토큰 길이
ROPE_THETA = 20000.0 # set 50k for 2048 context length, set 20k for 1024 context length
# 데이터/모델 저장 경로
DRIVE_CACHE_PATH = os.path.join(PROJ_PATH, "cache")
LOCAL_CACHE_PATH = "local_cache"
LOG_PATH = os.path.join(PROJ_PATH, "logs")
OUTPUT_PATH = os.path.join(PROJ_PATH, "output")
RESULTS_PATH = os.path.join(PROJ_PATH, "output", "results")
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# 디렉토리 생성
os.makedirs(DRIVE_CACHE_PATH, exist_ok=True)
os.makedirs(LOG_PATH, exist_ok=True)
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(RESULTS_PATH, exist_ok=True)

# only set matmul precision "high" for ampere and above GPUs(e.g. A100, V100, RTX 30xx, RTX 40xx) unless set "highest"
torch.set_float32_matmul_precision('high')

DO_PRETRAIN = True # GPT-small을 처음부터 학습할지
DO_FINETUNE_SM = True # summarization task로 파인튜닝 할지
DO_FINETUNE_CF = True # classification task로 파인튜닝 할지
DO_FINETUNE_RAG = True # RAG 모델을 학습할지
DO_ZEROSHOT_RAG = True # 사전학습된 RAG로 zero-shot QA를 할지
DO_SUBMISSION = True # 제출용 결과를 생성할지

rouge = evaluate.load("rouge") # 평가지표 가져오기

# 4. Tokenizer 준비

In [4]:
# GPT-2 기반 토크나이저 불러오기
tokenizer = transformers.AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

# special token들을 직접 추가
# GPT-2는 원래 padding token이 없기 때문에 따로 지정하는 과정이 들어간다.
additional_special_tokens = {}
if tokenizer.pad_token is None:
    additional_special_tokens["pad_token"] = "<|padding|>"
if tokenizer.eos_token is None:
    additional_special_tokens["eos_token"] = "<|endoftext|>"
if tokenizer.bos_token is None:
    additional_special_tokens["bos_token"] = "<|beginoftext|>"
if additional_special_tokens:
    print(f"Adding special tokens: {additional_special_tokens}")
    tokenizer.add_special_tokens(additional_special_tokens)

tokenizer.padding_side = "left"
tokenizer.model_max_length = MODEL_CONTEXT_LENGTH

Adding special tokens: {'pad_token': '<|padding|>'}


# 5. TrainingConfig 설정 클래스 정의

In [5]:

# 학습 시 사용할 하이퍼파라미터와 장치 설정을 묶어 관리하는 클래스
@dataclasses.dataclass
class TrainingConfig(object):
    # Device Setting
    device: str = "cuda"
    model_dtype:str = "cast_bf16"
    # set "cast_bf16" if device is upper than ampere architecture for best performance unless use "amp_fp16" for mixed precision
    # ampere architecture means nvidia a100, a40, a6000 or rtx 3000 series or higher

    # Training setting # Hyperparameters
    batch_size: int = 32
    eval_batch_size: int = 32
    gradient_accumulation_steps: int = 2
    num_train_epochs: int = 5
    max_steps: int = None

    # Optimizer setting
    optimizer_type: str = "adamw"
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    warmup_steps: int = 1000
    max_grad_norm:float = 3.0
    lr_scheduler_type: str = "linear"

    # Trainer setting
    metric_for_best_model: str = "loss" # loss가 작을수록 좋은 모델 선택
    metric_greater_is_better: bool = False
    # save_interval: int = 1000
    eval_interval: int = 1000
    logging_interval: int = 10 # 10 step마다 로그 출력
    save_total_limit: int = 5 # 모델 저장 최대 수 제한

    logging_path: str = os.path.join(LOG_PATH, "train") # 로그 저장하는 곳
    output_path: str = os.path.join(LOG_PATH, "output") # 로그 저장하는 곳

    # 현재 설정을 dictionary 형태로 저장
    def to_dict(self):
        return {
            "model_dtype": self.model_dtype,
            "batch_size": self.batch_size,
            "eval_batch_size": self.eval_batch_size,
            "gradient_accumulation_steps": self.gradient_accumulation_steps,
            "num_train_epochs": self.num_train_epochs,
            "max_steps": self.max_steps,
            "optimizer_type": self.optimizer_type,
            "learning_rate": self.learning_rate,
            "weight_decay": self.weight_decay,
            "warmup_steps": self.warmup_steps,
            "max_grad_norm": self.max_grad_norm,
            "lr_scheduler_type": self.lr_scheduler_type,
            "metric_for_best_model": self.metric_for_best_model,
            "metric_greater_is_better": self.metric_greater_is_better,
            "eval_interval": self.eval_interval,
            "logging_interval": self.logging_interval,
            "save_total_limit": self.save_total_limit,
        }

    # 외부에서 불러온 딕셔너리로 현재 객체를 업데이트
    def from_dict(self, config_dict):
        for key, value in config_dict.items():
            if hasattr(self, key):
                setattr(self, key, value)
            else:
                raise KeyError(f"Invalid key: {key} in config_dict")
        self.__post_init__()

# 6. 평가 함수 정의

In [6]:
# Summarization 모델의 평가
@torch.no_grad()
def eval_for_summary(model, dataloader, train_config, tokenizer, max_new_tokens=128):
    if train_config.model_dtype == "cast_bf16":
        model = model.to(torch.bfloat16)

    model = model.to(train_config.device)
    model.eval()
    model.compile()

    prediction = []
    answers = []
    for batch in tqdm.auto.tqdm(dataloader, desc="Evaluating", leave=False):
        batch = {k:v.to(train_config.device) for k,v in batch.items()}
        input_ids = batch.pop("input_ids")
        labels = batch.pop("labels")
        attention_mask = batch.pop("attention_mask")

        generation_input_ids = input_ids.clone()
        generation_attention_mask = attention_mask.clone()
        generation_input_ids[labels.ne(-100)] = tokenizer.pad_token_id
        generation_attention_mask[labels.ne(-100)] = 0

        max_length = generation_attention_mask.sum(-1).max().item()
        refit_generation_input_ids = torch.zeros((input_ids.shape[0], max_length), dtype=torch.long, device=input_ids.device)
        refit_generation_attention_mask = torch.zeros((input_ids.shape[0], max_length), dtype=torch.long, device=input_ids.device)
        for i in range(input_ids.shape[0]):
            length = generation_attention_mask[i].sum().item()
            refit_generation_input_ids[i, -length:] = generation_input_ids[i, generation_attention_mask[i].eq(1)]
            refit_generation_attention_mask[i, -length:] = generation_attention_mask[i, generation_attention_mask[i].eq(1)]

        gen_output = model.generate(
            input_ids=refit_generation_input_ids,
            attention_mask=refit_generation_attention_mask,
            max_new_tokens=max_new_tokens,
            return_response_only=True
        )

        labels[labels.eq(-100)] = tokenizer.pad_token_id
        pred = tokenizer.batch_decode(gen_output, skip_special_tokens=True)
        ans = tokenizer.batch_decode(labels, skip_special_tokens=True)

        prediction.extend(pred)
        answers.extend(ans)

    rouge_score = rouge.compute(predictions=prediction, references=answers, use_stemmer=True)
    metrics = {
        "rouge1": rouge_score["rouge1"].item(),
        "rouge2": rouge_score["rouge2"].item(),
        "rougeL": rouge_score["rougeL"].item(),
        "rougeLsum": rouge_score["rougeLsum"].item(),
        "prediction": prediction,
        "answers": answers,
    }

    return metrics

# Sequence Classification 모델 평가
@torch.no_grad()
def eval_for_classification(model, dataloader, train_config):
    if train_config.model_dtype == "cast_bf16":
        model = model.to(torch.bfloat16)

    model = model.to(train_config.device)
    model.eval()
    model.compile()

    prediction = []
    answers = []
    for batch in tqdm.auto.tqdm(dataloader, desc="Evaluating", leave=False):
        batch = {k:v.to(train_config.device) for k,v in batch.items()}
        input_ids = batch.pop("input_ids")
        labels = batch.pop("labels")
        attention_mask = batch.pop("attention_mask")

        logits,_ = model(input_ids=input_ids, attention_mask=attention_mask)
        pred = logits.argmax(-1)

        prediction.extend(pred.tolist())
        answers.extend(labels.tolist())

    prediction = np.array(prediction)
    answers = np.array(answers)

    accuracy = (prediction == answers).sum() / len(answers)

    metrics = {
        "accuracy": accuracy,
        "prediction": prediction,
        "answers": answers,
    }

    return metrics

# RAG 모델 평가
@torch.no_grad()
def eval_for_rag(model, dataloader, train_config, tokenizer):
    if train_config.model_dtype == "cast_bf16":
        model.model = model.model.to(torch.bfloat16)

    model.model = model.model.to(train_config.device)
    model.model.eval()
    model.model.compile()

    uids = []
    questions = []
    predictions = []
    answers = []
    for batch in tqdm.auto.tqdm(dataloader, desc="Evaluating", leave=False):
        uid = batch['uid']
        question = batch['question']
        answer = batch['answers']

        # retrieval_augmented_generate
        extra_kw_args = {
            "pad_token_id": tokenizer.eos_token_id,
        } if isinstance(model.model, transformers.generation.GenerationMixin) else {}
        outputs = model.retrieval_augmented_generate(
            queries=question,
            qids=uid,
            max_new_tokens=10,
            k=5,
            **extra_kw_args
        )
        pred = model.tokenizer.batch_decode(outputs, skip_special_tokens=True)

        uids.extend(uid)
        questions.extend(question)
        predictions.extend(pred)
        answers.extend(answer)

    def extract_answer(text):
        if "Answer:" in text:
            return text.split("Answer:")[-1].strip().split("\n")[0]
        return text.strip().split("\n")[0]

    predictions = [extract_answer(pred) for pred in predictions]
    answers = [[a.strip() for a in group] for group in answers]


    accuracy = best_subspan_exact_match(predictions, answers)
    rouge_score = rouge.compute(predictions=predictions, references=answers)

    metrics = {
        "accuracy": accuracy['acc'],
        "rouge1": rouge_score["rouge1"].item(),
        "rouge2": rouge_score["rouge2"].item(),
        "rougeL": rouge_score["rougeL"].item(),
        "rougeLsum": rouge_score["rougeLsum"].item(),
        "prediction": predictions,
        "answers": answers,
        "uid": uids,
        "question": questions,
    }
    return metrics

# 7. train

In [7]:
# summarization시 모델의 loss만 평가
@torch.no_grad()
def eval_loop_for_loss(model, dataloader, train_config, tokenizer):
    model.eval()
    losses = []
    for batch in tqdm.auto.tqdm(dataloader, desc="Evaluating", leave=False):
        batch = {k:v.to(train_config.device) for k,v in batch.items()}

        loss, _ = model(**batch)
        losses.append(loss.item())
    metrics = {
        "loss": sum(losses)/len(losses),
    }
    return metrics

# pretraining 중 평가용 루프
@torch.no_grad()
def eval_loop_for_pretraining(model, dataloader, train_config, tokenizer):
    model.eval()

    losses = []
    ppls = []

    token_correct = 0
    token_total = 0

    for batch in tqdm.auto.tqdm(dataloader, desc="Evaluating", leave=False):
        batch = {k:v.to(train_config.device) for k,v in batch.items()}
        labels = batch.pop("labels")
        logits, _ = model(**batch)

        shifted_logits = logits[..., :-1, :].contiguous()
        shifted_labels = labels[..., 1:].contiguous()

        loss = torch.nn.functional.cross_entropy(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1),ignore_index=-100, reduction="none")

        loss = loss.view(shifted_labels.size())
        mask = shifted_labels.ne(-100)

        nll_loss = (loss * mask).sum() / mask.sum()
        ppl = torch.exp(nll_loss)

        losses.append(nll_loss.mean())
        ppls.append(ppl.mean())

        token_correct += (shifted_logits.argmax(-1) == shifted_labels).sum().item()
        token_total += shifted_labels.ne(-100).sum().item()

    metrics = {
        "loss": torch.mean(torch.stack(losses)).item(),
        "ppl": torch.mean(torch.stack(ppls)).item(),
        "token_acc": token_correct / token_total,
    }

    return metrics

# train 함수
def train(model, train_dataset, eval_dataset, train_collate_fn, eval_collate_fn, train_config, eval_loop):
    os.makedirs(train_config.logging_path, exist_ok=True)
    os.makedirs(train_config.output_path, exist_ok=True)

    train_model = model if isinstance(model, transformers.PreTrainedModel) else model.model
    if train_config.model_dtype == "cast_bf16":
        train_model = train_model.to(torch.bfloat16)

    print_model_statistics(train_model)

    train_model = train_model.to(train_config.device)
    train_model.train()
    train_model.compile()

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_config.batch_size,
        shuffle=True,
        collate_fn=train_collate_fn,
    )
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=train_config.eval_batch_size,
        shuffle=False,
        collate_fn=eval_collate_fn,
    )

    num_training_steps = len(train_loader) * train_config.num_train_epochs if train_config.max_steps is None else train_config.max_steps
    num_warmup_steps = train_config.warmup_steps
    num_train_epochs = train_config.num_train_epochs if train_config.max_steps is None else math.ceil(train_config.max_steps / len(train_loader))

    optimizer = torch.optim.AdamW(
        train_model.parameters(),
        lr=train_config.learning_rate,
        weight_decay=train_config.weight_decay,
    )
    scheduler = transformers.get_scheduler(
        train_config.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps // train_config.gradient_accumulation_steps,
    )

    global_steps = 0

    logger = Logger(log_path=train_config.logging_path)
    with open(os.path.join(train_config.output_path, "train_config.json"), "w") as f:
        json.dump(train_config.to_dict(), f, indent=4)
    logger.log({"train_config": train_config.to_dict()})

    global_pbar = tqdm.auto.tqdm(total=num_training_steps, desc="Training")
    best_models = []
    loss_window = []
    for epoch in range(num_train_epochs):
        train_model.train()
        for batch in train_loader:
            batch = {k:v.to(train_config.device) for k,v in batch.items()}

            with torch.autocast(device_type=train_config.device, dtype=torch.float16, enabled=train_config.model_dtype == "amp_fp16"):
                loss, _ = train_model(**batch)
                l = loss.item()
                loss_window.append(l)
                if len(loss_window) > 100:
                    loss_window.pop(0)

            loss = loss / train_config.gradient_accumulation_steps
            loss.backward()

            if train_config.max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(train_model.parameters(), train_config.max_grad_norm)

            if global_steps % train_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            global_steps += 1
            global_pbar.update(1)
            global_pbar.set_postfix({"epoch":epoch, "loss": f"{l:.4f}::{sum(loss_window)/len(loss_window):.4f}", "lr": f"{scheduler.get_last_lr()[0]:.2e}"})

            if global_steps % train_config.logging_interval == 0:
                logger.log({"steps": global_steps, "loss": l,"ave_loss":sum(loss_window)/len(loss_window), "lr": scheduler.get_last_lr()[0]})

            if global_steps % train_config.eval_interval == 0:
                global_pbar.disable = True
                global_pbar.refresh()
                eval_metrics = eval_loop(model, eval_loader, train_config, tokenizer)
                if "prediction" in eval_metrics: eval_metrics.pop("prediction")
                if "answers" in eval_metrics: eval_metrics.pop("answers")
                if "uid" in eval_metrics: eval_metrics.pop("uid")
                if "question" in eval_metrics: eval_metrics.pop("question")

                logger.log({"steps": global_steps, **eval_metrics},is_train=False)
                global_pbar.write(f"====== evaluation {global_steps} ====")
                for k, v in eval_metrics.items():
                    global_pbar.write(f"   {k}: {v}")
                global_pbar.write(f"======================")
                checkpoint = f"step-{global_steps}"

                best_models.append((checkpoint, eval_metrics[train_config.metric_for_best_model]))
                best_models.sort(key=lambda x: x[1], reverse=train_config.metric_greater_is_better)
                if len(best_models) > train_config.save_total_limit:
                    need_to_remove = best_models.pop()
                    if need_to_remove != checkpoint:
                        logger.log({"steps": global_steps, "remove_checkpoint": need_to_remove[0]})
                        shutil.rmtree(os.path.join(train_config.output_path, need_to_remove[0]), ignore_errors=True)
                        logger.log({"steps": global_steps, "save_chechkpoint": checkpoint})
                        os.makedirs(os.path.join(train_config.output_path, checkpoint), exist_ok=True)
                        train_model.save_pretrained(os.path.join(train_config.output_path, checkpoint))
                        tokenizer.save_pretrained(os.path.join(train_config.output_path, checkpoint))
                else:
                    logger.log({"steps": global_steps, "save_chechkpoint": checkpoint})
                    os.makedirs(os.path.join(train_config.output_path, checkpoint), exist_ok=True)
                    train_model.save_pretrained(os.path.join(train_config.output_path, checkpoint))
                    tokenizer.save_pretrained(os.path.join(train_config.output_path, checkpoint))
                global_pbar.disable = False
                global_pbar.refresh()
            if train_config.max_steps is not None and global_steps >= train_config.max_steps:
                break
        if train_config.max_steps is not None and global_steps >= train_config.max_steps:
            break

    eval_metrics = eval_loop(model, eval_loader, train_config, tokenizer)
    logger.log({"steps": global_steps, **eval_metrics},is_train=False)
    global_pbar.write(f"====== Training End ====")

    if "prediction" in eval_metrics: eval_metrics.pop("prediction")
    if "answers" in eval_metrics: eval_metrics.pop("answers")
    if "uid" in eval_metrics: eval_metrics.pop("uid")
    if "question" in eval_metrics: eval_metrics.pop("question")
    for k, v in eval_metrics.items():
        global_pbar.write(f"   {k}: {v}")
    global_pbar.write(f"======================")
    checkpoint = f"step-{global_steps}"

    best_models.append((checkpoint, eval_metrics[train_config.metric_for_best_model]))
    best_models.sort(key=lambda x: x[1], reverse=train_config.metric_greater_is_better)
    if best_models[0][0] != checkpoint:
        shutil.copytree(
            os.path.join(train_config.output_path, best_models[0][0]),
            os.path.join(train_config.output_path, "best_model"),
            dirs_exist_ok=True,
        )
    else:
        train_model.save_pretrained(os.path.join(train_config.output_path, "best_model"))
        tokenizer.save_pretrained(os.path.join(train_config.output_path, "best_model"))


# 8. Pretraining

In [None]:
if DO_PRETRAIN:
    ### About 350M parameters
    # model_config = TransformerConfig(
    #     vocab_size=len(tokenizer),
    #     hidden_size=1024,
    #     intermediate_size=4096,
    #     num_hidden_layers=24,
    #     num_attention_heads=16,
    #     num_key_value_heads=4,
    #     head_dim=64,
    #     max_postion_embeddings=MODEL_CONTEXT_LENGTH,
    #     attention_dropout=0.1,
    #     ffn_dropout=0.05,
    #     pad_token_id=tokenizer.pad_token_id,
    #     bos_token_id=tokenizer.bos_token_id,
    #     eos_token_id=tokenizer.eos_token_id,
    #     rope_theta=ROPE_THETA,
    # )

    ### About 120M parameters
    # model_config = TransformerConfig(
    #     vocab_size=len(tokenizer),
    #     hidden_size=768,
    #     intermediate_size=3072,
    #     num_hidden_layers=12,
    #     num_attention_heads=12,
    #     num_key_value_heads=4,
    #     head_dim=64,
    #     max_postion_embeddings=MODEL_CONTEXT_LENGTH,
    #     attention_dropout=0.1,
    #     ffn_dropout=0.05,
    #     pad_token_id=tokenizer.pad_token_id,
    #     bos_token_id=tokenizer.bos_token_id,
    #     eos_token_id=tokenizer.eos_token_id,
    #     rope_theta=ROPE_THETA,
    # )

    ### About 70M parameters
    model_config = TransformerConfig(
        vocab_size=len(tokenizer),
        hidden_size=512,
        intermediate_size=2048,
        num_hidden_layers=4,
        num_attention_heads=16,
        num_key_value_heads=4,
        head_dim=32,
        max_postion_embeddings=MODEL_CONTEXT_LENGTH,
        attention_dropout=0.1,
        ffn_dropout=0.05,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        rope_theta=ROPE_THETA,
    )

    ### About 30M parameters
    # model_config = TransformerConfig(
    #     vocab_size=len(tokenizer),
    #     hidden_size=384,
    #     intermediate_size=1536,
    #     num_hidden_layers=4,
    #     num_attention_heads=4,
    #     num_key_value_heads=2,
    #     head_dim=32,
    #     max_postion_embeddings=MODEL_CONTEXT_LENGTH,
    #     attention_dropout=0.1,
    #     ffn_dropout=0.05,
    #     pad_token_id=tokenizer.pad_token_id,
    #     bos_token_id=tokenizer.bos_token_id,
    #     eos_token_id=tokenizer.eos_token_id,
    #     rope_theta=ROPE_THETA,
    # )

    # 데이터셋 구성
    from dataset.pretrain import prepare_pretrain_dataset, collate_fn_for_pretrain

    pretraining_config = TrainingConfig(
        device="cuda",
        batch_size=16,
        eval_batch_size=16,
        gradient_accumulation_steps=2,
        num_train_epochs=2,
        max_steps=None,
        optimizer_type="adamw",
        learning_rate=3e-4,
        weight_decay=0.01,
        warmup_steps=1000,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        metric_for_best_model="ppl",
        eval_interval=2500,
        logging_interval=10,
        save_total_limit=3,
        logging_path=os.path.join(LOG_PATH, "pretraining"),
        output_path=os.path.join(OUTPUT_PATH, "pretraining"),
    )

    # trainingConfig 설정
    pretrain_train, pretrain_eval = prepare_pretrain_dataset(tokenizer, max_length=MODEL_CONTEXT_LENGTH, train_sample_size=1048576, eval_sample_size=8096, cache_path=DRIVE_CACHE_PATH)
    model = TransformerForCausalLM(model_config)
    collate_fn = functools.partial(collate_fn_for_pretrain,
                                    tokenizer=tokenizer,
                                    block_length=MODEL_CONTEXT_LENGTH)
    # 학습 시작
    train(model,
        train_dataset=pretrain_train,
        eval_dataset=pretrain_eval,
        train_collate_fn=collate_fn,
        eval_collate_fn=collate_fn,
        train_config=pretraining_config,
        eval_loop=eval_loop_for_pretraining)
    del pretrain_train, pretrain_eval, model
    torch.cuda.empty_cache()

# 9. Summarization Fine-tuning

In [None]:
if DO_FINETUNE_SM:
    from dataset.summary import prepare_summary_dataset, collate_fn_for_summary
    # 학습 설정 구성
    summary_config = TrainingConfig(
        device="cuda",
        batch_size=8,
        eval_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        max_steps=None,
        optimizer_type="adamw",
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=1000,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        metric_for_best_model="loss",
        eval_interval=2500,
        logging_interval=10,
        save_total_limit=3,
        logging_path=os.path.join(LOG_PATH, "summary"),
        output_path=os.path.join(OUTPUT_PATH, "summary"),
    )
    # Pretrained GPT 불러오기
    # model = TransformerForCausalLM.from_pretrained(os.path.join(OUTPUT_PATH, "pretraining", "best_model"))
    # model = model.to(TrainingConfig.device)  
    # 데이터셋 로딩
    summary_train, summary_eval = prepare_summary_dataset(tokenizer)
    collate_fn = functools.partial(collate_fn_for_summary,
                                    tokenizer=tokenizer)      
    # 학습 시작
    # train(model,
    #     train_dataset=summary_train,
    #     eval_dataset=summary_eval,
    #     train_collate_fn= collate_fn,
    #     eval_collate_fn=collate_fn,
    #     train_config=summary_config,
    #     eval_loop=eval_loop_for_loss)

    # 메모리 정리 및 best model 불러오기
    del model
    torch.cuda.empty_cache()
    model = TransformerForCausalLM.from_pretrained(os.path.join(OUTPUT_PATH, "summary", "best_model"))
    eval_loader = torch.utils.data.DataLoader(
        summary_eval,
        batch_size=summary_config.eval_batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )
    # 평가 및 결과 저장
    metrics = eval_for_summary(model, eval_loader, summary_config, tokenizer, max_new_tokens=128)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    with open(os.path.join(RESULTS_PATH,"summary_score.json"), "w") as f:
        json.dump(metrics, f, indent=4)
    with open(os.path.join(RESULTS_PATH,"summary_output.txt"), "w") as f:
        for p,a in zip(prediction, answers):
            a = a.replace("\n", " ").strip()
            p = p.replace("\n", " ").strip()
            f.write(f"{a}\t{p}\n")
    # 메모리 정리
    del summary_train, summary_eval, model
    torch.cuda.empty_cache()

# 10. Classification Fine-tuning

In [None]:
if DO_FINETUNE_CF:
    from dataset.classification import prepare_classification_dataset, collate_fn_for_classification

    # TrainingConfig
    classification_config = TrainingConfig(
        device="cuda",
        batch_size=32,
        eval_batch_size=32,
        gradient_accumulation_steps=1,
        num_train_epochs=5,
        max_steps=None,
        optimizer_type="adamw",
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=1000,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        metric_for_best_model="loss",
        eval_interval=2500,
        logging_interval=10,
        save_total_limit=3,
        logging_path=os.path.join(LOG_PATH, "classification"),
        output_path=os.path.join(OUTPUT_PATH, "classification"),
    )
    # 모델 로드
    model = TransformerForSequenceClassification.from_pretrained(os.path.join(OUTPUT_PATH, "pretraining", "best_model"),num_labels=20)
    # 데이터셋 구성
    class_train, class_eval = prepare_classification_dataset(tokenizer)
    collate_fn = functools.partial(collate_fn_for_classification,
                                    tokenizer=tokenizer)
    # 학습
    train(model,
        train_dataset=class_train,
        eval_dataset=class_eval,
        train_collate_fn=collate_fn,
        eval_collate_fn=collate_fn,
        train_config=classification_config,
        eval_loop=eval_loop_for_loss)
    # best 모델
    del model
    torch.cuda.empty_cache()
    model = TransformerForSequenceClassification.from_pretrained(os.path.join(OUTPUT_PATH, "classification", "best_model"),num_labels=20)
    eval_loader = torch.utils.data.DataLoader(
        class_eval,
        batch_size=classification_config.eval_batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )
    # 평가 및 결과 저장
    metrics = eval_for_classification(model, eval_loader, classification_config)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    with open(os.path.join(RESULTS_PATH,"classification_score.json"), "w") as f:
        json.dump(metrics, f, indent=4)
    with open(os.path.join(RESULTS_PATH,"classification_output.txt"), "w") as f:
        for p,a in zip(prediction, answers):
            f.write(f"{a}\t{p}\n")
    # 메모리 정리
    del class_train, class_eval, model
    torch.cuda.empty_cache()

# 11. RAG Fine-tuning

In [9]:
if DO_FINETUNE_RAG:
    from dataset.rag import donwload_dataset_nq_open_dpr, RAGDataset, RAGCollator
    from pyserini.search.lucene import LuceneSearcher
    from model_rag import ModelRAG

    # TrainingConfig
    rag_config = TrainingConfig(
        device="cuda",
        batch_size=16,
        eval_batch_size=16,
        gradient_accumulation_steps=1,
        num_train_epochs=3,
        max_steps=None,
        optimizer_type="adamw",
        learning_rate=5e-5,
        weight_decay=0.01,
        warmup_steps=1000,
        max_grad_norm=1.0,
        lr_scheduler_type="cosine",
        metric_for_best_model="accuracy",
        metric_greater_is_better=True,
        eval_interval=2500,
        logging_interval=10,
        save_total_limit=3,
        logging_path=os.path.join(LOG_PATH, "rag"),
        output_path=os.path.join(OUTPUT_PATH, "rag"),
    )
    # 데이터셋 준비
    rag_cache_path = os.path.join(LOCAL_CACHE_PATH, "rag")
    # donwload_dataset_nq_open_dpr(rag_cache_path)

    PREBUILT_INDEX_NAME_BM25 = "wikipedia-dpr-100w"
    LOCAL_INDEX_NAME_BM25 = os.path.join(rag_cache_path, "lucene-index.wikipedia-dpr-100w.20210120.d1b9e6")
    # # it might take a 10-20 minutes to download the index, recommand to use drive cache, if drive capacity is enough

    # Pyserini BM25 검색 인덱스 준비
    try: # 1. 사전 빌드된 위키백과 BM25 인덱스를 pyserini로 불러옴
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)
    except: # 실패시 자동으로 다운로드 -> 캐시에 저장
        import shutil
        searcher_bm25 = LuceneSearcher.from_prebuilt_index(prebuilt_index_name=PREBUILT_INDEX_NAME_BM25)
        index_dir = searcher_bm25.index_dir
        shutil.move(index_dir, LOCAL_INDEX_NAME_BM25)
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)

    # 모델 및 데이터셋 구성
    model = TransformerForCausalLM.from_pretrained(os.path.join(OUTPUT_PATH, "pretraining", "best_model"))
    train_dataset = RAGDataset(
        tokenizer=tokenizer,
        is_train=True,
        dataset_path=os.path.join(rag_cache_path,"data","nq_open_dpr","nq_train"),
        num_samples=None
    )
    eval_dataset = RAGDataset(
        tokenizer=tokenizer,
        is_train=False,
        dataset_path=os.path.join(rag_cache_path,"data","nq_open_dpr","nq_dev"),
    )
    train_collator = RAGCollator(
        tokenizer=tokenizer,
        is_train=True,
    )
    eval_collator = RAGCollator(
        tokenizer=tokenizer,
        is_train=False,
    )
    
    # RAG 구조 설정
    model_rag = ModelRAG()
    model_rag.set_model(model)
    model_rag.set_tokenizer(tokenizer)
    model_rag.set_retriever(searcher_bm25)

    # 학습
    train(
        model_rag,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        train_collate_fn=train_collator,
        eval_collate_fn=eval_collator,
        train_config=rag_config,
        eval_loop=eval_for_rag,
    )

    # best model 로드
    model_rag.set_model(None)
    del model
    torch.cuda.empty_cache()
    model = TransformerForCausalLM.from_pretrained(os.path.join(OUTPUT_PATH, "rag", "best_model"))

    model_rag.set_model(model)

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=rag_config.eval_batch_size,
        shuffle=False,
        collate_fn=eval_collator,
    )
    metrics = eval_for_rag(model_rag, eval_loader, rag_config, tokenizer)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    uid = metrics.pop("uid")
    questions = metrics.pop("question")

    # 평가 및 결과 저장
    print(f"====== evaluation ====")
    for k, v in metrics.items():
        print(f"   {k}: {v}")
    print(f"======================")
    with open(os.path.join(RESULTS_PATH,"rag_score.json"), "w") as f:
        json.dump(metrics, f, indent=4)
    with open(os.path.join(RESULTS_PATH,"rag_output.txt"), "w") as f:
        for u,q,p,a in zip(uid, questions, prediction, answers):
            f.write(f"{u}\t{q}\t{a}\t{p}\n")
    # 최종 메모리 정리
    del train_dataset, eval_dataset, model, model_rag
    torch.cuda.empty_cache()

Aug 04, 2025 10:43:46 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false


Training Total size=63.58M params. Trainable ratio=100.00%


Training:   0%|          | 0/11040 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/408 [00:00<?, ?it/s]

   accuracy: 0.001841903300076746
   rouge1: 0.002591211551303645
   rouge2: 0.0
   rougeL: 0.0026070348173187763
   rougeLsum: 0.0026139414435653865


Evaluating:   0%|          | 0/408 [00:00<?, ?it/s]

   accuracy: 0.00214888718342287
   rouge1: 0.0031729432842249397
   rouge2: 0.0
   rougeL: 0.0031658262433396723
   rougeLsum: 0.0031760637247974137


Evaluating:   0%|          | 0/408 [00:00<?, ?it/s]

   accuracy: 0.0012279355333844973
   rouge1: 0.0030973907988489724
   rouge2: 0.0
   rougeL: 0.003111187439660194
   rougeLsum: 0.003121473478342243


Evaluating:   0%|          | 0/408 [00:00<?, ?it/s]

   accuracy: 0.001074443591711435
   rouge1: 0.002856776548257744
   rouge2: 2.790762575873857e-05
   rougeL: 0.0028652597935023083
   rougeLsum: 0.002869726632197851


Evaluating:   0%|          | 0/408 [00:00<?, ?it/s]

   accuracy: 0.001074443591711435
   rouge1: 0.002937803221763312
   rouge2: 2.790762575873857e-05
   rougeL: 0.002956218975021736
   rougeLsum: 0.00295474862819452


Evaluating:   0%|          | 0/408 [00:00<?, ?it/s]

   accuracy: 0.00214888718342287
   rouge1: 0.0031729432842249397
   rouge2: 0.0
   rougeL: 0.0031658262433396723
   rougeLsum: 0.0031760637247974137


# 12. HF 로그인

In [None]:
from huggingface_hub import notebook_login, login
login(token="your HF token")

# 13. Zero-shot QA (RAG)

In [None]:
if DO_ZEROSHOT_RAG:
    from dataset.rag import donwload_dataset_nq_open_dpr, RAGDataset, RAGCollator
    from pyserini.search.lucene import LuceneSearcher
    from model_rag import ModelRAG

    # 데이터셋 및 BM25 인덱스 준비
    rag_cache_path = os.path.join(LOCAL_CACHE_PATH, "rag")
    donwload_dataset_nq_open_dpr(rag_cache_path)

    PREBUILT_INDEX_NAME_BM25 = "wikipedia-dpr-100w"
    LOCAL_INDEX_NAME_BM25 = os.path.join(rag_cache_path, "lucene-index.wikipedia-dpr-100w.20210120.d1b9e6")
    # it might take a 10-20 minutes to download the index, recommand to use drive cache, if drive capacity is enough

    try:
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)
    except:
        import shutil
        searcher_bm25 = LuceneSearcher.from_prebuilt_index(prebuilt_index_name=PREBUILT_INDEX_NAME_BM25)
        index_dir = searcher_bm25.index_dir
        shutil.move(index_dir, LOCAL_INDEX_NAME_BM25)
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)

    # LLaMA 모델 로딩
    model_name_or_path = "meta-llama/Llama-3.2-1B-Instruct"
    llm_model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                                  torch_dtype=torch.bfloat16,
                                                                  device_map={"": 0},
                                                                  low_cpu_mem_usage=True,)
    llm_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    llm_batch_size = 4
    llm_tokenizer.pad_token = llm_tokenizer.eos_token
    llm_tokenizer.padding_side = "left"

    # 평가용 Dataset, Collator 구성
    eval_dataset = RAGDataset(
        tokenizer=llm_tokenizer,
        is_train=False,
        dataset_path=os.path.join(rag_cache_path,"data","nq_open_dpr","nq_dev"),
    )
    eval_collator = RAGCollator(
        tokenizer=llm_tokenizer,
        is_train=False,
    )
    # ModelRAG 구성 및 평가 실행
    model_rag = ModelRAG()
    model_rag.set_model(llm_model)
    model_rag.set_tokenizer(llm_tokenizer)
    model_rag.set_retriever(searcher_bm25)

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=llm_batch_size,
        shuffle=False,
        collate_fn=eval_collator,
    )
    class rag_config:
        device = "cuda"
        model_dtype = "cast_bf16"

    metrics = eval_for_rag(model_rag, eval_loader, rag_config, tokenizer)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    uid = metrics.pop("uid")
    questions = metrics.pop("question")

    print(f"====== evaluation ====")
    for k, v in metrics.items():
        print(f"   {k}: {v}")
    print(f"======================")

    # 결과 저장 및 출력
    with open(os.path.join(RESULTS_PATH,"llm_rag_score.json"), "w") as f:
        json.dump(metrics, f, indent=4)
    with open(os.path.join(RESULTS_PATH,"llm_rag_output.txt"), "w") as f:
        for u,q,p,a in zip(uid, questions, prediction, answers):
            f.write(f"{u}\t{q}\t{a}\t{p}\n")

Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_dev'
Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_train'


Evaluating:   0%|          | 0/1629 [00:00<?, ?it/s]

   accuracy: 0.1772831926323868
   rouge1: 0.15384416087025365
   rouge2: 0.06592316526314988
   rougeL: 0.15272402194888696
   rougeLsum: 0.1529871578068039


# 14. SUBMISSION

In [None]:
    
if DO_SUBMISSION:
    os.makedirs("submission", exist_ok=True)
    os.makedirs("submission/code", exist_ok=True)
    submission_code_list=[
        "main.ipynb",
        "model.py",
        "model_rag.py",
        "dataset/summary.py",
        "dataset/rag.py",
        "dataset/classification.py",
        "dataset/pretrain.py",
        "utils/etc.py",
        "utils/logger.py",
        "utils/metrics.py",
        "README"
    ]
    submission_directory_list=[
        "logs",
        RESULTS_PATH,
        os.path.join(OUTPUT_PATH, "pretraining", "best_model"),
        os.path.join(OUTPUT_PATH, "summary", "best_model"),
        os.path.join(OUTPUT_PATH, "classification", "best_model"),
        os.path.join(OUTPUT_PATH, "rag", "best_model"),
    ]

    for file in submission_code_list:
        dst = os.path.join("submission", "code", file)
        dst_dir = os.path.dirname(dst)
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir, exist_ok=True)
        if not os.path.exists(file):
            with open(dst, "w") as f:
                f.write(f"{file} not exist")
        else:
            shutil.copyfile(file, dst)

    for directory in submission_directory_list:
        base_name = os.path.basename(directory)
        if not os.path.exists(directory):
            pass
        else:
            if 'best_model' in directory:
                dirname = os.path.dirname(directory)
                dst = os.path.join("submission", dirname, base_name)
                os.makedirs(dst, exist_ok=True)
            else:
                dst = os.path.join("submission", base_name)
            try:
                shutil.copytree(directory, dst, dirs_exist_ok=True)
            except shutil.Error as e:
                print("Warning during copy:", e)
    shutil.make_archive("defaultproject_code", 'zip', "submission/code")
    shutil.rmtree("submission/code", ignore_errors=True)
    shutil.make_archive("defaultproject_supplementaries", 'zip', "submission")
    shutil.rmtree("submission", ignore_errors=True)
    # With Colab, move to Google Drive, otherwise there are no changes.
    shutil.move("defaultproject_code.zip", os.path.join(PROJ_PATH, "defaultproject_code.zip"))
    shutil.move("defaultproject_supplementaries.zip", os.path.join(PROJ_PATH, "defaultproject_supplementaries.zip"))



# 15. Evaluating Various Prompts for RAG (4.2.1)

## 15-1. CoT

In [8]:
from custom_prompt_rag import PromptRAG_CoT

if DO_ZEROSHOT_RAG:
    from dataset.rag import donwload_dataset_nq_open_dpr, RAGDataset, RAGCollator
    from pyserini.search.lucene import LuceneSearcher
    from model_rag import ModelRAG

    # 데이터셋 및 BM25 인덱스 준비
    rag_cache_path = os.path.join(LOCAL_CACHE_PATH, "rag")
    donwload_dataset_nq_open_dpr(rag_cache_path)

    PREBUILT_INDEX_NAME_BM25 = "wikipedia-dpr-100w"
    LOCAL_INDEX_NAME_BM25 = os.path.join(rag_cache_path, "lucene-index.wikipedia-dpr-100w.20210120.d1b9e6")
    # it might take a 10-20 minutes to download the index, recommand to use drive cache, if drive capacity is enough

    try:
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)
    except:
        import shutil
        searcher_bm25 = LuceneSearcher.from_prebuilt_index(prebuilt_index_name=PREBUILT_INDEX_NAME_BM25)
        index_dir = searcher_bm25.index_dir
        shutil.move(index_dir, LOCAL_INDEX_NAME_BM25)
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)

    # LLaMA 모델 로딩
    model_name_or_path = "meta-llama/Llama-3.2-1B-Instruct"
    llm_model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                                  torch_dtype=torch.bfloat16,
                                                                  device_map={"": 0},
                                                                  low_cpu_mem_usage=True,)
    llm_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    llm_batch_size = 4
    llm_tokenizer.pad_token = llm_tokenizer.eos_token
    llm_tokenizer.padding_side = "left"

    # 평가용 Dataset, Collator 구성
    eval_dataset = RAGDataset(
        tokenizer=llm_tokenizer,
        is_train=False,
        dataset_path=os.path.join(rag_cache_path,"data","nq_open_dpr","nq_dev"),
    )
    eval_collator = RAGCollator(
        tokenizer=llm_tokenizer,
        is_train=False,
    )
    # ModelRAG 구성 및 평가 실행
    model_rag = PromptRAG_CoT()
    model_rag.set_model(llm_model)
    model_rag.set_tokenizer(llm_tokenizer)
    model_rag.set_retriever(searcher_bm25)

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=llm_batch_size,
        shuffle=False,
        collate_fn=eval_collator,
    )
    class rag_config:
        device = "cuda"
        model_dtype = "cast_bf16"

    print(f"\n==== Evaluating {model_rag.__class__.__name__} ====\n")

    metrics = eval_for_rag(model_rag, eval_loader, rag_config, tokenizer)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    uid = metrics.pop("uid")
    questions = metrics.pop("question")

    print(f"====== evaluation ====")
    for k, v in metrics.items():
        print(f"   {k}: {v}")
    print(f"======================")

    # 예시 출력 5개
    print(f"\n====== Sample Predictions ({model_rag.__class__.__name__}) ======")
    for u, q, a, p in list(zip(uid, questions, answers, prediction))[:5]:
        print(f"[Q] {q}\n[A] {a}\n[P] {p}\n{'-'*40}")


Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_dev'
Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_train'


Aug 03, 2025 10:14:02 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false



==== Evaluating PromptRAG_CoT ====



Evaluating:   0%|          | 0/1629 [00:00<?, ?it/s]

W0803 10:16:41.184000 44226 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] torch._dynamo hit config.cache_size_limit (8)
W0803 10:16:41.184000 44226 site-packages/torch/_dynamo/convert_frame.py:844] [0/8]    function: 'wrapper' (/root/anaconda3/envs/nlp311/lib/python3.11/site-packages/transformers/utils/generic.py:927)
W0803 10:16:41.184000 44226 site-packages/torch/_dynamo/convert_frame.py:844] [0/8]    last reason: 0/0: tensor 'L['kwargs']['cache_position']' size mismatch at index 0. expected 827, actual 1
W0803 10:16:41.184000 44226 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0803 10:16:41.184000 44226 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


   accuracy: 0.1577897160399079
   rouge1: 0.17817903934557755
   rouge2: 0.09026043624662186
   rougeL: 0.17734637835942466
   rougeLsum: 0.17748901861334654

[Q] who sings does he love me with reba
[A] ['Linda Davis']
[P] does he love me.
----------------------------------------
[Q] where do the great lakes meet the ocean
[A] ['the Saint Lawrence River']
[P] Erie Canal.
----------------------------------------
[Q] when does the new my hero academia movie come out
[A] ['July 5 , 2018']
[P] My Hero Academia: Two Heroes
----------------------------------------
[Q] who was the creator of victoria 's secret
[A] ['Roy Raymond']
[P] Raymond
----------------------------------------
[Q] when did wesley leave last of the summer wine
[A] ['2002']
[P] 1985.
----------------------------------------


너의 모델은 지금 CoT을 통해:

정답률(accuracy)은 약간 떨어졌지만,

답변 품질(정보 포함량)은 높아진 것이야.

즉, CoT은 “잘못된 정답”을 늘리는 게 아니라, “정답 스타일”을 바꿔서 정확히 매칭되지 않게 만들었을 가능성이 높아.

## 15-2. Input formatting

In [None]:
from custom_prompt_rag import PromptRAG_UserAssistant

if DO_ZEROSHOT_RAG:
    from dataset.rag import donwload_dataset_nq_open_dpr, RAGDataset, RAGCollator
    from pyserini.search.lucene import LuceneSearcher
    from model_rag import ModelRAG

    # 데이터셋 및 BM25 인덱스 준비
    rag_cache_path = os.path.join(LOCAL_CACHE_PATH, "rag")
    donwload_dataset_nq_open_dpr(rag_cache_path)

    PREBUILT_INDEX_NAME_BM25 = "wikipedia-dpr-100w"
    LOCAL_INDEX_NAME_BM25 = os.path.join(rag_cache_path, "lucene-index.wikipedia-dpr-100w.20210120.d1b9e6")
    # it might take a 10-20 minutes to download the index, recommand to use drive cache, if drive capacity is enough

    try:
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)
    except:
        import shutil
        searcher_bm25 = LuceneSearcher.from_prebuilt_index(prebuilt_index_name=PREBUILT_INDEX_NAME_BM25)
        index_dir = searcher_bm25.index_dir
        shutil.move(index_dir, LOCAL_INDEX_NAME_BM25)
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)

    # LLaMA 모델 로딩
    model_name_or_path = "meta-llama/Llama-3.2-1B-Instruct"
    llm_model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                                  torch_dtype=torch.bfloat16,
                                                                  device_map={"": 0},
                                                                  low_cpu_mem_usage=True,)
    llm_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    llm_batch_size = 4
    llm_tokenizer.pad_token = llm_tokenizer.eos_token
    llm_tokenizer.padding_side = "left"

    # 평가용 Dataset, Collator 구성
    eval_dataset = RAGDataset(
        tokenizer=llm_tokenizer,
        is_train=False,
        dataset_path=os.path.join(rag_cache_path,"data","nq_open_dpr","nq_dev"),
    )
    eval_collator = RAGCollator(
        tokenizer=llm_tokenizer,
        is_train=False,
    )
    # ModelRAG 구성 및 평가 실행
    model_rag = PromptRAG_UserAssistant()
    model_rag.set_model(llm_model)
    model_rag.set_tokenizer(llm_tokenizer)
    model_rag.set_retriever(searcher_bm25)

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=llm_batch_size,
        shuffle=False,
        collate_fn=eval_collator,
    )
    class rag_config:
        device = "cuda"
        model_dtype = "cast_bf16"

    print(f"\n==== Evaluating {model_rag.__class__.__name__} ====\n")

    metrics = eval_for_rag(model_rag, eval_loader, rag_config, tokenizer)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    uid = metrics.pop("uid")
    questions = metrics.pop("question")

    print(f"====== evaluation ====")
    for k, v in metrics.items():
        print(f"   {k}: {v}")
    print(f"======================")

    # 예시 출력 5개
    print(f"\n====== Sample Predictions ({model_rag.__class__.__name__}) ======")
    for u, q, a, p in list(zip(uid, questions, answers, prediction))[:5]:
        print(f"[Q] {q}\n[A] {a}\n[P] {p}\n{'-'*40}")



Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_dev'
Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_train'


Aug 04, 2025 8:34:09 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false



==== Evaluating PromptRAG_UserAssistant ====



Evaluating:   0%|          | 0/1629 [00:00<?, ?it/s]

W0804 08:36:37.915000 60876 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] torch._dynamo hit config.cache_size_limit (8)
W0804 08:36:37.915000 60876 site-packages/torch/_dynamo/convert_frame.py:844] [0/8]    function: 'wrapper' (/root/anaconda3/envs/nlp311/lib/python3.11/site-packages/transformers/utils/generic.py:927)
W0804 08:36:37.915000 60876 site-packages/torch/_dynamo/convert_frame.py:844] [0/8]    last reason: 0/0: tensor 'L['kwargs']['cache_position']' size mismatch at index 0. expected 787, actual 1
W0804 08:36:37.915000 60876 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0804 08:36:37.915000 60876 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


   accuracy: 0.19769762087490406
   rouge1: 0.17048953306435954
   rouge2: 0.0790341499551017
   rougeL: 0.1696137464479741
   rougeLsum: 0.16997751289385893

[Q] who sings does he love me with reba
[A] ['Linda Davis']
[P] Beyoncé!parableparableparableparableparableparable
----------------------------------------
[Q] where do the great lakes meet the ocean
[A] ['the Saint Lawrence River']
[P] **The Great Lakes meet the ocean at the St
----------------------------------------
[Q] when does the new my hero academia movie come out
[A] ['July 5 , 2018']
[P] There is no new My Hero Academia movie announced
----------------------------------------
[Q] who was the creator of victoria 's secret
[A] ['Roy Raymond']
[P] Raymond Raymondparableparableparableparableparableparableparable
----------------------------------------
[Q] when did wesley leave last of the summer wine
[A] ['2002']
[P] John Wesley, played by actor Brian Capron,
----------------------------------------


## 15-3. Parsing generation results

In [10]:
from custom_prompt_rag import PromptRAG_JSON

if DO_ZEROSHOT_RAG:
    from dataset.rag import donwload_dataset_nq_open_dpr, RAGDataset, RAGCollator
    from pyserini.search.lucene import LuceneSearcher
    from model_rag import ModelRAG

    # 데이터셋 및 BM25 인덱스 준비
    rag_cache_path = os.path.join(LOCAL_CACHE_PATH, "rag")
    donwload_dataset_nq_open_dpr(rag_cache_path)

    PREBUILT_INDEX_NAME_BM25 = "wikipedia-dpr-100w"
    LOCAL_INDEX_NAME_BM25 = os.path.join(rag_cache_path, "lucene-index.wikipedia-dpr-100w.20210120.d1b9e6")
    # it might take a 10-20 minutes to download the index, recommand to use drive cache, if drive capacity is enough

    try:
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)
    except:
        import shutil
        searcher_bm25 = LuceneSearcher.from_prebuilt_index(prebuilt_index_name=PREBUILT_INDEX_NAME_BM25)
        index_dir = searcher_bm25.index_dir
        shutil.move(index_dir, LOCAL_INDEX_NAME_BM25)
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)

    # LLaMA 모델 로딩
    model_name_or_path = "meta-llama/Llama-3.2-1B-Instruct"
    llm_model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                                  torch_dtype=torch.bfloat16,
                                                                  device_map={"": 0},
                                                                  low_cpu_mem_usage=True,)
    llm_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    llm_batch_size = 4
    llm_tokenizer.pad_token = llm_tokenizer.eos_token
    llm_tokenizer.padding_side = "left"

    # 평가용 Dataset, Collator 구성
    eval_dataset = RAGDataset(
        tokenizer=llm_tokenizer,
        is_train=False,
        dataset_path=os.path.join(rag_cache_path,"data","nq_open_dpr","nq_dev"),
    )
    eval_collator = RAGCollator(
        tokenizer=llm_tokenizer,
        is_train=False,
    )
    # ModelRAG 구성 및 평가 실행
    model_rag = PromptRAG_JSON()
    model_rag.set_model(llm_model)
    model_rag.set_tokenizer(llm_tokenizer)
    model_rag.set_retriever(searcher_bm25)

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=llm_batch_size,
        shuffle=False,
        collate_fn=eval_collator,
    )
    class rag_config:
        device = "cuda"
        model_dtype = "cast_bf16"

    print(f"\n==== Evaluating {model_rag.__class__.__name__} ====\n")

    metrics = eval_for_rag(model_rag, eval_loader, rag_config, tokenizer)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    uid = metrics.pop("uid")
    questions = metrics.pop("question")

    print(f"====== evaluation ====")
    for k, v in metrics.items():
        print(f"   {k}: {v}")
    print(f"======================")

    # 예시 출력 5개
    print(f"\n====== Sample Predictions ({model_rag.__class__.__name__}) ======")
    for u, q, a, p in list(zip(uid, questions, answers, prediction))[:5]:
        print(f"[Q] {q}\n[A] {a}\n[P] {p}\n{'-'*40}")

Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_dev'
Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_train'

==== Evaluating PromptRAG_JSON ====



Evaluating:   0%|          | 0/1629 [00:00<?, ?it/s]

W0804 12:47:43.736000 81979 site-packages/torch/_dynamo/convert_frame.py:844] [1/8] torch._dynamo hit config.cache_size_limit (8)
W0804 12:47:43.736000 81979 site-packages/torch/_dynamo/convert_frame.py:844] [1/8]    function: 'wrapper' (/root/anaconda3/envs/nlp311/lib/python3.11/site-packages/transformers/utils/generic.py:927)
W0804 12:47:43.736000 81979 site-packages/torch/_dynamo/convert_frame.py:844] [1/8]    last reason: 1/0: tensor 'L['kwargs']['cache_position']' size mismatch at index 0. expected 807, actual 1
W0804 12:47:43.736000 81979 site-packages/torch/_dynamo/convert_frame.py:844] [1/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0804 12:47:43.736000 81979 site-packages/torch/_dynamo/convert_frame.py:844] [1/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


   accuracy: 0.14458940905602455
   rouge1: 0.13696652200873127
   rouge2: 0.061458538902898194
   rougeL: 0.1359017447121812
   rougeLsum: 0.13623258424869994

[Q] who sings does he love me with reba
[A] ['Linda Davis']
[P] "Does He Love You"{"Artist": "
----------------------------------------
[Q] where do the great lakes meet the ocean
[A] ['the Saint Lawrence River']
[P] "Erie Canal"}
----------------------------------------
[Q] when does the new my hero academia movie come out
[A] ['July 5 , 2018']
[P] "My Hero Academia: Two Heroes", "
----------------------------------------
[Q] who was the creator of victoria 's secret
[A] ['Roy Raymond']
[P] "Leslie Wexner"}}parable
----------------------------------------
[Q] when did wesley leave last of the summer wine
[A] ['2002']
[P] "18 May 2002"}
----------------------------------------


# 16. Enhancing RAG with Llama-3.2-1B-Instruct (4.3)

### PCW (Prediction-Critic-Wrapper): 모델이 생성한 답변을, 또 다른(또는 함수)이 평가 수정하는 방식의 구조

즉, 한 번의 답변 생성 후 그게 맞는지/ 더 나은 답이 있는지를 판단하거나 보완하는 구조이다.

질문 → [Generator] → 답변 → [Critic or Corrector] → 최종 정제된 답변

In [None]:
from custom_prompt_rag import PromptRAG_UserAssistantSelfRAG

if DO_ZEROSHOT_RAG:
    from dataset.rag import donwload_dataset_nq_open_dpr, RAGDataset, RAGCollator
    from pyserini.search.lucene import LuceneSearcher
    from model_rag import ModelRAG

    # 데이터셋 및 BM25 인덱스 준비
    rag_cache_path = os.path.join(LOCAL_CACHE_PATH, "rag")
    donwload_dataset_nq_open_dpr(rag_cache_path)

    PREBUILT_INDEX_NAME_BM25 = "wikipedia-dpr-100w"
    LOCAL_INDEX_NAME_BM25 = os.path.join(rag_cache_path, "lucene-index.wikipedia-dpr-100w.20210120.d1b9e6")
    # it might take a 10-20 minutes to download the index, recommand to use drive cache, if drive capacity is enough

    try:
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)
    except:
        import shutil
        searcher_bm25 = LuceneSearcher.from_prebuilt_index(prebuilt_index_name=PREBUILT_INDEX_NAME_BM25)
        index_dir = searcher_bm25.index_dir
        shutil.move(index_dir, LOCAL_INDEX_NAME_BM25)
        searcher_bm25 = LuceneSearcher(index_dir=LOCAL_INDEX_NAME_BM25)

    # LLaMA 모델 로딩
    model_name_or_path = "meta-llama/Llama-3.2-1B-Instruct"
    llm_model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                                                  torch_dtype=torch.bfloat16,
                                                                  device_map={"": 0},
                                                                  low_cpu_mem_usage=True,)
    llm_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    llm_batch_size = 4
    llm_tokenizer.pad_token = llm_tokenizer.eos_token
    llm_tokenizer.padding_side = "left"

    # 평가용 Dataset, Collator 구성
    eval_dataset = RAGDataset(
        tokenizer=llm_tokenizer,
        is_train=False,
        dataset_path=os.path.join(rag_cache_path,"data","nq_open_dpr","nq_dev"),
    )
    eval_collator = RAGCollator(
        tokenizer=llm_tokenizer,
        is_train=False,
    )
    # ModelRAG 구성 및 평가 실행
    model_rag = PromptRAG_UserAssistantSelfRAG()
    model_rag.set_model(llm_model)
    model_rag.set_tokenizer(llm_tokenizer)
    model_rag.set_retriever(searcher_bm25)

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=llm_batch_size,
        shuffle=False,
        collate_fn=eval_collator,
    )
    class rag_config:
        device = "cuda"
        model_dtype = "cast_bf16"

    print(f"\n==== Evaluating {model_rag.__class__.__name__} ====\n")

    metrics = eval_for_rag(model_rag, eval_loader, rag_config, tokenizer)
    prediction = metrics.pop("prediction")
    answers = metrics.pop("answers")
    uid = metrics.pop("uid")
    questions = metrics.pop("question")

    passages_all, _ = model_rag.search(queries=questions, qids=uid, k=5)
    contexts = [model_rag.build_context(passages) for passages in passages_all]
    final_outputs = model_rag.apply_self_rag_postprocess(questions, contexts, prediction, answers)

    print(f"====== evaluation ====")
    for k, v in metrics.items():
        print(f"   {k}: {v}")
    print(f"======================")

    # 예시 출력 5개
    print(f"\n====== Sample Predictions ({model_rag.__class__.__name__}) ======")
    for u, q, a, p in list(zip(uid, questions, answers, prediction))[:5]:
        print(f"[Q] {q}\n[A] {a}\n[P] {p}\n{'-'*40}")



Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_dev'
Dataset already exists at 'local_cache/rag/data/nq_open_dpr/nq_train'


Aug 05, 2025 10:04:01 AM org.apache.lucene.store.MemorySegmentIndexInputProvider <init>
INFO: Using MemorySegmentIndexInput with Java 21; to disable start with -Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false



==== Evaluating PromptRAG_UserAssistant_CoT ====



Evaluating:   0%|          | 0/1629 [00:00<?, ?it/s]

W0805 10:05:15.078000 146010 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] torch._dynamo hit config.cache_size_limit (8)
W0805 10:05:15.078000 146010 site-packages/torch/_dynamo/convert_frame.py:844] [0/8]    function: 'wrapper' (/root/anaconda3/envs/nlp311/lib/python3.11/site-packages/transformers/utils/generic.py:927)
W0805 10:05:15.078000 146010 site-packages/torch/_dynamo/convert_frame.py:844] [0/8]    last reason: 0/0: tensor 'L['kwargs']['cache_position']' size mismatch at index 0. expected 805, actual 1
W0805 10:05:15.078000 146010 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W0805 10:05:15.078000 146010 site-packages/torch/_dynamo/convert_frame.py:844] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html.


   accuracy: 0.1602455871066769
   rouge1: 0.1269146607634703
   rouge2: 0.05783548633510274
   rougeL: 0.1263918400180863
   rougeLsum: 0.12652540901581497

[Q] who sings does he love me with reba
[A] ['Linda Davis']
[P] Reba McEntireparableparableparableparable
----------------------------------------
[Q] where do the great lakes meet the ocean
[A] ['the Saint Lawrence River']
[P] The Great Lakes meet the ocean at the St
----------------------------------------
[Q] when does the new my hero academia movie come out
[A] ['July 5 , 2018']
[P] "My Hero Academia: Two Heroes"
----------------------------------------
[Q] who was the creator of victoria 's secret
[A] ['Roy Raymond']
[P] Leslie Wexnerparableparableparableparableparable
----------------------------------------
[Q] when did wesley leave last of the summer wine
[A] ['2002']
[P] The 25th series of "Last of the
----------------------------------------
