<a href="https://colab.research.google.com/github/onewon1234/AI_DL_Project/blob/main/global/global_roberta_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from scipy.stats import loguniform
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import itertools
from itertools import permutations
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/강의/AI를 위한 딥러닝/AI_DL_Project/code

[Errno 2] No such file or directory: '/content/drive/MyDrive/강의/AI를 위한 딥러닝/AI_DL_Project/code'
/content


In [None]:
# Load the competition CSV files from the mounted Google Drive.
# NOTE: despite the `_path` suffix, `train_path` and `submission_path` hold
# DataFrames (the results of pd.read_csv), not path strings.
train_path = pd.read_csv('/content/drive/MyDrive/data/daycon_sentence/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/data/daycon_sentence/test.csv')
submission_path = pd.read_csv('/content/drive/MyDrive/data/daycon_sentence/sample_submission.csv')

In [None]:
# Re-read the sample submission relative to a base directory.
# BASE_DIR was never defined in this notebook (the cell raised NameError), so
# it is set here to the same Drive folder the other CSVs are read from.
BASE_DIR = '/content/drive/MyDrive/data/daycon_sentence/'
submission_path = pd.read_csv(BASE_DIR + "sample_submission.csv")

NameError: name 'BASE_DIR' is not defined

In [None]:
submission_path.info()
submission_path.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1780 entries, 0 to 1779
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        1780 non-null   object
 1   answer_0  1780 non-null   int64 
 2   answer_1  1780 non-null   int64 
 3   answer_2  1780 non-null   int64 
 4   answer_3  1780 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 69.7+ KB


Unnamed: 0,ID,answer_0,answer_1,answer_2,answer_3
0,TEST_0000,0,1,2,3
1,TEST_0001,0,1,2,3
2,TEST_0002,0,1,2,3
3,TEST_0003,0,1,2,3
4,TEST_0004,0,1,2,3


In [None]:
# data_utils.py
import pandas as pd
import numpy as np

def load_data(train_path, test_path):
    """Read the training and test CSV files.

    Args:
        train_path: Location of the training CSV, passed to ``pd.read_csv``.
        test_path: Location of the test CSV, passed to ``pd.read_csv``.

    Returns:
        A ``(train, test)`` tuple of DataFrames, read in that order.
    """
    train_frame, test_frame = (
        pd.read_csv(csv_path) for csv_path in (train_path, test_path)
    )
    return train_frame, test_frame

def make_labels(df):
    """Convert answer columns into per-sentence position labels.

    Each row of ``answer_0`` .. ``answer_3`` is read as "the sentence index
    found at position k". The row is inverted so that entry ``i`` of the
    label is the position at which sentence ``i`` appears.

    Args:
        df: DataFrame with integer columns ``answer_0`` .. ``answer_3``.

    Returns:
        ``np.ndarray`` built from one 4-element label list per row.
    """
    answer_columns = [f'answer_{i}' for i in range(4)]
    per_row_labels = []
    for ordering in df[answer_columns].values:
        positions = [0, 0, 0, 0]
        # Walk the row position by position and record where each sentence sits.
        for position, sentence in enumerate(ordering):
            positions[sentence] = position
        per_row_labels.append(positions)
    return np.array(per_row_labels)

# Dataset 클래스
4개의 문장을 [SEP]로 이어 붙여 하나의 입력 텍스트로 만들고, 토크나이저로 인코딩해 모델(klue/roberta-large)에 넣을 수 있는 형태로 바꿔줌

In [None]:
# ✅ 1. 데이터셋 클래스
from torch.utils.data import Dataset

class GlobalOrderDataset(Dataset):
    """Torch dataset that packs four sentences into one tokenized sequence.

    Args:
        df: DataFrame with string columns ``sentence_0`` .. ``sentence_3``.
        tokenizer: Callable tokenizer; called with ``padding='max_length'``,
            ``truncation=True``, ``max_length`` and ``return_tensors='pt'``.
        labels: Optional indexable of per-sample labels. When ``None`` the
            dataset is unlabeled (e.g. test data) and items carry no
            ``'labels'`` key.
        max_length: Fixed sequence length used for padding/truncation.

    NOTE(review): the text already contains literal ``[CLS]``/``[SEP]``
    markers and the tokenizer is called with its default special-token
    handling, so it may add its own markers around them. Left unchanged so
    inputs stay identical to what existing checkpoints were trained on.
    """

    def __init__(self, df, tokenizer, labels=None, max_length=256):
        self.sentences = df[[f'sentence_{i}' for i in range(4)]].values
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (rows of the source DataFrame)."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return a dict of tensors.

        The dict holds every tensor the tokenizer returns, with the leading
        batch dimension squeezed away, plus ``'labels'`` when labels exist.
        """
        sents = self.sentences[idx]
        text = '[CLS] ' + ' [SEP] '.join(sents) + ' [SEP]'
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # return_tensors='pt' yields a leading batch dimension of 1; drop it.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        # Return unconditionally: previously this sat inside the `if`, so
        # unlabeled datasets produced None for every sample.
        return item


# Model 클래스
- AutoModel (예: Roberta) 사용
- 문장 4개를 넣었을 때 그 순서를 예측
- 출력은 [batch, 4, 4] 크기의 행렬 → 각 문장이 어떤 위치에 있어야 하는지 예측

In [None]:
# ✅ 2. 모델 정의
import torch
import torch.nn as nn
from transformers import AutoModel
class GlobalOrderModel(nn.Module):
    """Encoder plus MLP head scoring every (sentence, position) pair.

    The first-token hidden state of the encoder is mapped to 16 values that
    are reshaped to ``[batch, 4, 4]``: one row of 4 position scores per
    sentence.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='klue/roberta-large'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        encoder_dim = self.bert.config.hidden_size

        head_layers = [
            nn.Linear(encoder_dim, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 4 * 4),  # 4 sentences x 4 position classes
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and head.

        Returns:
            ``{"logits": [batch, 4, 4]}``, preceded by a ``"loss"`` entry
            (cross-entropy over the flattened per-sentence rows) when
            ``labels`` is given.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0]  # hidden state at sequence index 0
        flat_scores = self.classifier(first_token)
        result = {"logits": flat_scores.view(-1, 4, 4)}
        if labels is None:
            return result
        criterion = nn.CrossEntropyLoss()
        loss = criterion(flat_scores.view(-1, 4), labels.view(-1))
        # Keep "loss" as the first key, as in the original output dict.
        return {"loss": loss, **result}


In [None]:
def compute_metrics(eval_pred):
    """Compute per-sentence and whole-order accuracy for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``. The argmax is taken over axis 2
            of ``logits`` and compared elementwise with ``labels``.

    Returns:
        Dict with ``sentence_accuracy`` (mean of elementwise matches) and
        ``full_order_accuracy`` (fraction of rows where every entry matches).
        On any exception the error is printed and an empty dict is returned.
    """
    try:
        logits, labels = eval_pred
        print("✅ compute_metrics 호출됨")
        print("logits shape:", np.shape(logits))    # e.g. (1471, 4, 4)
        print("labels shape:", np.shape(labels))    # expected to be e.g. (1471, 4)

        # Elementwise hit mask: predicted position == labelled position.
        hits = np.argmax(logits, axis=2) == labels
        per_sentence = hits.mean()
        whole_order = hits.all(axis=1).mean()

        print(f"🎯 sentence_acc: {per_sentence:.4f}, full_order_acc: {whole_order:.4f}")

        return {
            "sentence_accuracy": per_sentence,
            "full_order_accuracy": whole_order,
        }
    except Exception as e:
        print(f"❌ compute_metrics 내부 오류: {e}")
        return {}


In [None]:
model = GlobalOrderModel(model_name="klue/roberta-large")  # Build the model on the klue/roberta-large (RoBERTa) encoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use the GPU when one is available, otherwise the CPU
model.to(device)  # Move the model to the selected device


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GlobalOrderModel(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [None]:
from transformers import AutoTokenizer

# ✅ RoBERTa 전용 tokenizer 사용
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

# Build the training dataset.
# The labels must exist before the dataset is constructed; the previous order
# referenced `train_labels` one line before defining it, which fails on a
# fresh kernel.
train_labels = make_labels(train_path)
train_dataset = GlobalOrderDataset(train_path, tokenizer, labels=train_labels)

In [None]:
train_dataset[0]

{'input_ids': tensor([    0,     0, 18966,  3726,  2073,  4653,  3747,  2079,  5767,  2047,
          2069,  1750,  2318,  5144,  2067,  2764,  1295,  1513,  2062,    18,
             2,  3839,  2470,  4803,  2073,  7243,  2031,  2170,  2318,  4901,
          2138,  3894,  2205,  2307,    16,  3772,  2125,  3844,  2138,  6627,
          2205,  2259,   842,  5291,  2085,  1295,  1513,  2062,    18,     2,
          3731, 11187, 18966,  4568,  2079,  4653,  4119,  2073,  4646, 19521,
          4901,  2085,  1295,  1513,  2259,  3862,  3828,  2069,  4196,  2085,
         10149,  2069, 16954,    18,     2,   544,  4653,  2259,  5131,  5391,
          2183,  2470,  3809,  6233,  6965,  2496,  2051,  6251,  2079,  3662,
          2047,  2069,  7848, 11187,  6202,  4538,    18,     2,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [None]:
import os

from transformers import AutoTokenizer, Trainer, TrainingArguments

# RoBERTa tokenizer matching the encoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

# Labels for the full training set.
labels = make_labels(train_path)

# Training configuration for the full-data run (no evaluation).
# `report_to="none"` replaces the deprecated WANDB_DISABLED environment
# variable (transformers warns it will be removed in v5) and matches the other
# TrainingArguments in this notebook. It turns off every logging integration.
training_args = TrainingArguments(
    output_dir="./global_results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="no",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    warmup_steps=500,
    weight_decay=0.01,
    load_best_model_at_end=False,
    seed=42,
    report_to="none",
)

# Trainer for the full-data run: no eval dataset and no metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    compute_metrics=None,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# 학습 실행

In [None]:
trainer.train()

Step,Training Loss
100,1.3765
200,1.2867
300,1.0273
400,0.8348
500,0.6714
600,0.6026
700,0.578
800,0.5431
900,0.5296
1000,0.4595


TrainOutput(global_step=2300, training_loss=0.4603814475432686, metrics={'train_runtime': 4979.3022, 'train_samples_per_second': 7.382, 'train_steps_per_second': 0.462, 'total_flos': 0.0, 'train_loss': 0.4603814475432686, 'epoch': 5.0})

In [None]:
import os
import shutil

# Remove every intermediate `checkpoint-*` directory written by the Trainer.
# The directories are discovered on disk instead of being listed by hand: the
# previous hard-coded names (multiples of 368 steps) do not match a run that
# logs global_step=2300 over 5 epochs, so they would have deleted nothing.
results_dir = "/content/global_results"
if os.path.isdir(results_dir):
    for entry in os.listdir(results_dir):
        if entry.startswith("checkpoint-"):
            shutil.rmtree(os.path.join(results_dir, entry), ignore_errors=True)


In [None]:
# Save the model currently held by `trainer`, together with the tokenizer.
# NOTE: the directory is named "best_model", but the TrainingArguments for this
# run use eval_strategy="no" and load_best_model_at_end=False, so what is saved
# is the model state at the end of training, not a selected best checkpoint.
save_path = "/content/global_results/best_model"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

('/content/global_results/best_model/tokenizer_config.json',
 '/content/global_results/best_model/special_tokens_map.json',
 '/content/global_results/best_model/vocab.txt',
 '/content/global_results/best_model/added_tokens.json',
 '/content/global_results/best_model/tokenizer.json')

# 튜닝

In [None]:
# Split the training data for hyper-parameter tuning: 80% train / 20%
# validation, with a fixed random_state so the split is reproducible.
train_split_df, val_df = train_test_split(train_path, test_size=0.2, random_state=42)
train_split_labels = make_labels(train_split_df)
val_labels = make_labels(val_df)

# Wrap both splits as labelled datasets sharing the same tokenizer.
train_split_dataset = GlobalOrderDataset(train_split_df, tokenizer, labels=train_split_labels)
val_dataset = GlobalOrderDataset(val_df, tokenizer, labels=val_labels)


In [None]:
from transformers import TrainingArguments, Trainer
from scipy.stats import loguniform
import numpy as np
import pandas as pd


In [None]:
import os
import numpy as np
import pandas as pd
import torch
import shutil
from scipy.stats import loguniform
from transformers import AutoTokenizer, Trainer, TrainingArguments

def run_global_tuning(train_split_dataset, val_dataset, tokenizer, n_trials=2):
    """Random-search fine-tuning of ``GlobalOrderModel`` with a resumable log.

    Each trial samples a learning rate, weight decay and epoch count, trains a
    fresh model, evaluates it, saves it under
    ``./global_results/trial_<k>/best_model`` and appends a row to
    ``./global_results/tuning_log.csv``. When that log already exists, trials
    resume after the number of rows it contains.

    Args:
        train_split_dataset: Training dataset passed to the Trainer.
        val_dataset: Evaluation dataset passed to the Trainer.
        tokenizer: Tokenizer passed to the Trainer and saved with each model.
        n_trials: Total number of trials, counting those already logged.

    Returns:
        DataFrame with the single logged trial that has the highest
        ``full_order_accuracy``, or an empty DataFrame when no trial has
        produced a result (previously this case raised ``KeyError``).
    """
    import gc  # local import: only needed for the per-trial memory cleanup

    results_path = './global_results/tuning_log.csv'
    if os.path.exists(results_path):
        results = pd.read_csv(results_path).to_dict(orient='records')
        start_trial = len(results)
    else:
        results = []
        start_trial = 0

    for trial in range(start_trial, n_trials):
        print(f"\n🎯 Trial {trial + 1} 시작")
        # Sample this trial's hyper-parameters.
        lr = float(loguniform.rvs(1.5e-5, 3.5e-5))
        wd = float(loguniform.rvs(0.01, 0.07))
        epochs = int(np.random.randint(9, 13))
        batch_size = 16
        # Warm up over 2% of the (floor-estimated) number of optimizer steps.
        total_steps = (len(train_split_dataset) // batch_size) * epochs
        warmup = int(total_steps * 0.02)

        args = TrainingArguments(
            output_dir=f'./global_results/trial_{trial+1}',
            learning_rate=lr,
            weight_decay=wd,
            warmup_steps=warmup,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=64,
            num_train_epochs=epochs,
            gradient_accumulation_steps=1,
            lr_scheduler_type='linear',
            logging_dir='./roberta_logs',
            logging_steps=100,
            save_strategy="epoch",
            save_total_limit=2,
            eval_strategy='epoch',  # evaluate after every epoch
            seed=42,
            load_best_model_at_end=True,
            metric_for_best_model='full_order_accuracy',  # criterion for the best checkpoint
            greater_is_better=True,
            report_to='none',
            fp16=True,
            optim='adamw_torch_fused'
        )

        trainer = Trainer(
            model=GlobalOrderModel("klue/roberta-large"),
            args=args,
            train_dataset=train_split_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,  # user-defined metrics
            callbacks=[],
        )

        try:
            trainer.train()

            # Final evaluation of the model held by the trainer.
            eval_result = trainer.evaluate()
            sentence_acc = eval_result.get("eval_sentence_accuracy", None)
            full_order_acc = eval_result.get("eval_full_order_accuracy", None)
            eval_loss = eval_result.get("eval_loss", None)

            save_path = f'./global_results/trial_{trial+1}/best_model'
            try:
                trainer.save_model(save_path)
                tokenizer.save_pretrained(save_path)
                model_saved = True
            except Exception as e:
                # A failed save is recorded in the log instead of aborting the trial.
                print(f"⚠️ 모델 저장 실패: {e}")
                model_saved = False
                save_path = "FAILED"

            results.append({
                'trial': trial + 1,
                'learning_rate': lr,
                'weight_decay': wd,
                'warmup_steps': warmup,
                'epochs': epochs,
                'sentence_accuracy': sentence_acc,
                'full_order_accuracy': full_order_acc,
                'eval_loss': eval_loss,
                'model_saved': model_saved,
                'save_path': save_path
            })
            # Persist after every trial so an interrupted session can resume.
            pd.DataFrame(results).to_csv(results_path, index=False)

            # Remove intermediate checkpoints; only best_model is kept.
            output_dir = f'./global_results/trial_{trial+1}'
            for subdir in os.listdir(output_dir):
                if subdir.startswith("checkpoint"):
                    shutil.rmtree(os.path.join(output_dir, subdir), ignore_errors=True)

            print(f"✅ Trial {trial+1} 완료 | 저장 경로: {save_path}")

        except Exception as e:
            print(f"⛔ Trial {trial+1} 중 오류 발생: {e}")
            break
        finally:
            # Release this trial's model before the next one is created, so
            # consecutive large models do not pile up in (GPU) memory.
            del trainer
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    if not results:
        # Nothing to rank: no prior log rows and no trial finished.
        print("\n⚠️ 완료된 Trial이 없어 상위 Trial을 선택할 수 없습니다.")
        return pd.DataFrame()

    print("\n🏆 상위 Trial:")
    top_trials = pd.DataFrame(results).sort_values(by="full_order_accuracy", ascending=False).head(1)
    print(top_trials)
    return top_trials


In [None]:
top_trials = run_global_tuning(train_split_dataset, val_dataset, tokenizer, n_trials=2)


🎯 Trial 1 시작


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Sentence Accuracy,Full Order Accuracy
1,1.1794,1.072099,0.443576,0.028552
2,0.6539,0.462166,0.844494,0.625425
3,0.2792,0.322806,0.897689,0.789939
4,0.1772,0.309631,0.905676,0.808973
5,0.0984,0.344414,0.917233,0.835486
6,0.0396,0.402695,0.918423,0.836166
7,0.0176,0.431295,0.919782,0.842284
8,0.0145,0.431568,0.923861,0.845683
9,0.007,0.447171,0.926241,0.851801
10,0.0067,0.466715,0.926751,0.852481


✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.4436, full_order_acc: 0.0286
✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.8445, full_order_acc: 0.6254
✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.8977, full_order_acc: 0.7899
✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.9057, full_order_acc: 0.8090
✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.9172, full_order_acc: 0.8355
✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.9184, full_order_acc: 0.8362
✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.9198, full_order_acc: 0.8423
✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.9239, full_order_acc: 0.8457
✅ compute_metrics 호출됨
logits shape: (147

✅ compute_metrics 호출됨
logits shape: (1471, 4, 4)
labels shape: (1471, 4)
🎯 sentence_acc: 0.9266, full_order_acc: 0.8532
✅ Trial 1 완료 | 저장 경로: ./global_results/trial_1/best_model

🎯 Trial 2 시작


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
import pandas as pd
import os

# Load the tuning log.
# The CSV must already exist at this relative path, otherwise read_csv raises
# FileNotFoundError.
df = pd.read_csv("./global_results/tuning_log.csv")

# Keep only the trials whose model was saved.
df = df[df["model_saved"] == True]

# When evaluation results exist, pick the trial with the highest
# full_order_accuracy; otherwise fall back to the lowest trial number.
# NOTE: `.iloc[0]` raises IndexError if no row survived the filter above.
if "full_order_accuracy" in df.columns and df["full_order_accuracy"].notna().any():
    top_trial = df.sort_values("full_order_accuracy", ascending=False).iloc[0]
else:
    top_trial = df.sort_values("trial", ascending=True).iloc[0]  # fallback

# Show the path of the selected model.
best_model_path = top_trial["save_path"]
print(f"🏆 선택된 Best Model 경로: {best_model_path}")


FileNotFoundError: [Errno 2] No such file or directory: './global_results/tuning_log.csv'

In [None]:
# ✅ 실제 폴더 존재하는지 확인 (예시로 trial_2)
print(os.listdir(best_model_path))


In [None]:
# Load the selected model.
# `load_file` is imported here because the only other import of it lives in a
# later cell, so this cell failed with NameError on a top-to-bottom run.
from safetensors.torch import load_file

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GlobalOrderModel("klue/roberta-large")

# Read the weights from the safetensors file and load them into the model.
state_dict = load_file(f"{best_model_path}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

In [None]:
import shutil

# Copy the selected trial's model directory to a fixed location;
# dirs_exist_ok=True lets the copy run again over an existing directory.
final_model_dir = "./global_results/best_model_custom"
shutil.copytree(best_model_path, final_model_dir, dirs_exist_ok=True)
print(f"📦 최종 best model 저장됨: {final_model_dir}")


# 전체 데이터 재학습 및 추론

In [None]:
# Build the dataset over the full training data.
train_labels = make_labels(train_path)
train_dataset = GlobalOrderDataset(train_path, tokenizer, labels=train_labels)

# Reuse the hyper-parameters of the best tuning trial (`top_trial`).
# Values read from the pandas row are cast to plain Python numbers.
# `eval_strategy` is used, as in the other TrainingArguments of this notebook,
# instead of the older `evaluation_strategy` spelling.
args = TrainingArguments(
    output_dir="./global_results/best_final",
    learning_rate=float(top_trial['learning_rate']),
    weight_decay=float(top_trial['weight_decay']),
    warmup_steps=int(top_trial['warmup_steps']),
    per_device_train_batch_size=16,
    num_train_epochs=int(top_trial['epochs']),
    logging_dir='./retrain_logs',
    save_strategy="no",  # no automatic checkpoints; the model is saved manually below
    eval_strategy="no",
    report_to='none',
    fp16=True,
    optim="adamw_torch_fused"
)

trainer = Trainer(
    model=GlobalOrderModel("klue/roberta-large"),
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# Retrain on the full data.
trainer.train()

# Save the final model and tokenizer.
trainer.save_model("./global_results/final_model")
tokenizer.save_pretrained("./global_results/final_model")


In [None]:
# ✅ 모델 클래스 직접 정의 (model.py 없이도 OK)
import torch
import torch.nn as nn
from transformers import AutoModel

class GlobalOrderModel(nn.Module):
    """Sentence-order model: pretrained encoder followed by an MLP head.

    NOTE: this repeats the ``GlobalOrderModel`` definition given earlier in
    the notebook so the inference section can run without it; executing this
    cell rebinds the name.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='klue/roberta-large'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        width = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(in_features=width, out_features=1024),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(in_features=1024, out_features=4 * 4),
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"logits": [batch, 4, 4]}``, with ``"loss"`` first when labels are given."""
        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        scores = self.classifier(hidden[:, 0])  # head on the hidden state at index 0
        outputs = {}
        if labels is not None:
            # Cross-entropy over one 4-way row per sentence.
            outputs["loss"] = nn.CrossEntropyLoss()(scores.view(-1, 4), labels.view(-1))
        outputs["logits"] = scores.view(-1, 4, 4)
        return outputs


In [None]:
from safetensors.torch import load_file
import torch

# Select the GPU when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

best_model_path = "./global_results/best_model_custom"  # <- make sure the path is set to exactly this directory

# Rebuild the architecture, load the saved weights, and switch to eval mode.
model = GlobalOrderModel("klue/roberta-large")
state_dict = load_file(f"{best_model_path}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
model.eval()


In [None]:
# ✅ 직접 정의한 Dataset 클래스
from torch.utils.data import Dataset
import torch

class GlobalOrderDataset(Dataset):
    """Dataset that joins four sentences into one tokenized input.

    Args:
        df: DataFrame with columns ``sentence_0`` .. ``sentence_3``.
        tokenizer: Callable tokenizer used to encode the joined text.
        labels: Optional indexable of per-sample labels; ``None`` for
            unlabeled data.
        max_length: Fixed length used for padding and truncation.
    """

    def __init__(self, df, tokenizer, labels=None, max_length=256):
        sentence_columns = [f'sentence_{i}' for i in range(4)]
        self.sentences = df[sentence_columns].values
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Number of samples."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the tokenizer's tensors for sample ``idx`` (plus ``'labels'`` if any)."""
        joined = ' [SEP] '.join(self.sentences[idx])
        encoded = self.tokenizer(
            '[CLS] ' + joined + ' [SEP]',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        sample = {}
        for key, tensor in encoded.items():
            # Drop the leading batch dimension added by return_tensors='pt'.
            sample[key] = tensor.squeeze(0)
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [None]:
# inference.py
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
def predict(model, test_df, tokenizer, device, batch_size=32):
    """Predict the sentence order for every row of ``test_df``.

    The model's ``'logits'`` are treated as a score for placing sentence ``i``
    (second axis) at position ``j`` (third axis). For each sample, the
    assignment of sentences to positions with the highest total score over
    all permutations is chosen. When the per-sentence argmax already forms a
    valid permutation, the result is the same as before. When two sentences
    prefer the same position, the old per-row argmax produced an answer with
    a duplicated and a missing sentence; this decoding always returns a valid
    permutation.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning a
            dict with a ``'logits'`` tensor.
        test_df: DataFrame passed to ``GlobalOrderDataset``.
        tokenizer: Tokenizer passed to ``GlobalOrderDataset``.
        device: Device the input tensors are moved to.
        batch_size: Inference batch size.

    Returns:
        Integer ``np.ndarray`` with one row per sample, where entry ``k`` is
        the index of the sentence placed at position ``k``. An empty
        ``(0, 4)`` array is returned when ``test_df`` yields no batches.
    """
    from itertools import permutations  # local import keeps this cell self-contained

    test_dataset = GlobalOrderDataset(test_df, tokenizer, labels=None)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    batch_scores = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask)['logits']  # model returns a dict
            batch_scores.append(logits.float().cpu().numpy())
    if not batch_scores:
        # np.concatenate would raise on an empty list.
        return np.empty((0, 4), dtype=np.int64)
    scores = np.concatenate(batch_scores, axis=0)

    n = scores.shape[1]
    # Every way to assign positions to sentences: candidates[p, i] is the
    # position given to sentence i under permutation p.
    candidates = np.array(list(permutations(range(n))))
    # totals[s, p] = sum_i scores[s, i, candidates[p, i]]. Raw logits suffice:
    # a per-row softmax only shifts every permutation's total by the same amount.
    totals = scores[:, np.arange(n), candidates].sum(axis=-1)
    positions = candidates[totals.argmax(axis=1)]
    # Invert "sentence -> position" into "position -> sentence" (answer_k).
    return np.argsort(positions, axis=1)

def save_submission(test_df, answers, submission_path, output_path):
    """Write predictions into a copy of the sample submission file.

    Args:
        test_df: Not used by this function; kept for the call signature.
        answers: 2-D array indexed as ``answers[:, i]``; column ``i`` is
            written to ``answer_i`` for ``i`` in 0..3.
        submission_path: Path of the sample submission CSV to read.
        output_path: Path of the CSV to write (without the index).
    """
    answer_columns = {f'answer_{i}': answers[:, i] for i in range(4)}
    submission = pd.read_csv(submission_path).assign(**answer_columns)
    submission.to_csv(output_path, index=False)


# 예측 및 저장

In [None]:

# Run prediction on the test set.
answers = predict(
    model=model,
    test_df=test_df,
    tokenizer=tokenizer,
    device=device,
    batch_size=32
)

# Write the submission file.
# NOTE(review): the sample submission is read from /content here, while the
# data-loading cell reads it from the Drive data folder — confirm that the
# file exists at this path.
save_submission(
    test_df=test_df,
    answers=answers,
    submission_path="/content/sample_submission.csv",
    output_path="/content/submission.csv"
)

print("✅ submission.csv 저장 완료")

1차 결과: 0.82134
# 2차 코드 점검

## 혼동행렬 분석 결과
- ✅ 결과 해석 (요약)

| 정답 → 예측 | 가장 많이 헷갈린 조합 |
| --- | --- |
| 0 → 1 | 53건 (문장 0을 1번 위치로 착각) |
| 1 → 3 | 46건 |
| 2 → 3 | 50건 |
| 3 → 2 | 51건 |

  🔍 구체적 해석
대각선 값이 높을수록 (1350 이상) → 정확히 예측한 경우.

대각선 바깥 값은 오답인데, 특히 다음 케이스가 문제:

문장 2와 문장 3은 서로 자주 헷갈림 (2→3: 50건, 3→2: 51건)

문장 0은 거의 안 헷갈리는데도 0→1: 53건 존재함

후반 문장(2, 3) 사이의 연결성이 약하거나, 모델이 전개 흐름을 파악하지 못하고 있음

문장 0 → 1 착각은 시작 문장 판단 근거 부족을 의미

ex. [SEP]으로만 문장 구분했을 때, 첫 문장만의 특성 학습이 어렵다면 생김


### 개선 아이디어
| 문제           | 해결 방안                                                            |
| ------------ | ---------------------------------------------------------------- |
| 문장 0/1 구분 애매 | 문장 시작을 명확히 학습시키기 위한 **문장 위치 임베딩 추가**                             |
| 문장 2/3 혼동    | Pairwise loss 도입 or 후반 문장 강조 학습 (e.g. position-aware classifier) |
| 전체 미세 성능 향상  | 앙상블 (Global + Pairwise), 또는 warmup 증가, 학습 에폭 증가                  |


| 위치 | 사용 모델 |
| --- | --- |
| 0–1, 1–2 | 기존 Global 예측 유지 |
| 2–3 | pairwise 모델로 sent_2 vs sent_3 우선순위 재결정 |
