In [1]:
import pandas as pd
import numpy as np
import re
import shutil
import os
os.environ["WANDB_DISABLED"] = "true"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
from sklearn.model_selection import train_test_split
from scipy.stats import kendalltau, spearmanr
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)



In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

# 데이터 로드 및 전처리

In [4]:
# %cd /content/drive/MyDrive/25-1학기/AI를 위한 딥러닝/team_project
# base_path = "/content/drive/MyDrive/25-1학기/AI를 위한 딥러닝/team_project"
base_path = '.'

In [5]:
# 데이터 로드
train_path = pd.read_csv(f'{base_path}/data/train_extended.csv')
test_path = pd.read_csv(f'{base_path}/data/test.csv')
submission_path = pd.read_csv(f'{base_path}/data/sample_submission.csv')

In [6]:
# 텍스트 정제
def clean_text(text):
  # 특수문자 제거
  text = re.sub(r'[^\w\s]', '', text)
  # 소문자 변환: 한글에는 무의미
  text = text.lower()
  # 불필요한 공백 제거
  text = ' '.join(text.split())
  return text

In [7]:
# 텍스트 정제
for i in range(4):
    train_path[f'sentence_{i}'] = train_path[f'sentence_{i}'].apply(clean_text)
    test_path[f'sentence_{i}'] = test_path[f'sentence_{i}'].apply(clean_text)

In [8]:
def load_data(train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

def make_labels(df):
    # answer_0 ~ answer_3 → [문장0은 몇 번째, 문장1은 몇 번째, ...]
    answers = df[[f'answer_{i}' for i in range(4)]].values
    labels = []
    for row in answers:
        label = [0]*4
        for pos, sent_idx in enumerate(row):
            label[sent_idx] = pos
        labels.append(label)
    return np.array(labels)

# 모델 정의 및 학습 준비

## Dataset 클래스
 4개의 문장을 [SEP]로 묶어서 BERT에 넣을 수 있게 바꿔줌

In [9]:
# 데이터셋 클래스
class GlobalOrderDataset(Dataset):
    def __init__(self, df, tokenizer, labels=None, max_length=128):
        self.sentences = df[[f'sentence_{i}' for i in range(4)]].values
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sents = self.sentences[idx]
        text = '[CLS] ' + ' [SEP] '.join(sents) + ' [SEP]'
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}

        # labels가 None이 아닐 때만 labels 추가
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return item

## Model 클래스
- AutoModel (예: Roberta) 사용
- 문장 4개를 넣었을 때 그 순서를 예측
- 출력은 [batch, 4, 4] 크기의 행렬 → 각 문장이 어떤 위치에 있어야 하는지 예측

In [10]:
class GlobalOrderModel(nn.Module):
    def __init__(self, model_name='klue/roberta-large'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # BERT 대부분 동결 (과적합 방지)
        for param in self.bert.parameters():
            param.requires_grad = False

        # 마지막 6개 레이어만 fine-tuning
        for layer in self.bert.encoder.layer[-6:]:
            for param in layer.parameters():
                param.requires_grad = True

        # 단순한 분류기 (과적합 방지)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.4),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.4),

            nn.Linear(256, 4 * 4)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        # BERT 인코딩
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # [CLS] 토큰을 사용
        pooled = outputs.last_hidden_state[:, 0]

        # 분류
        logits = self.classifier(pooled)

        if labels is not None:
            loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
            loss = loss_fn(logits.view(-1, 4), labels.view(-1))
            return {"loss": loss, "logits": logits.view(-1, 4, 4)}
        else:
            return {"logits": logits.view(-1, 4, 4)}

In [11]:
def compute_metrics(eval_pred):
    try:
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=2)

        # 1. 전체 순서 정확도 (4개 문장이 모두 올바른 순서)
        full_order_accuracy = (preds == labels).all(axis=1).mean()

        # 2. 개별 문장 위치 정확도
        sentence_accuracy = (preds == labels).mean()

        # 3. Kendall's tau
        tau_scores = []
        for pred, label in zip(preds, labels):
            try:
                tau, _ = kendalltau(pred, label)
                tau_scores.append(tau if not np.isnan(tau) else 0.0)
            except:
                tau_scores.append(0.0)
        avg_tau = np.mean(tau_scores)

        # 4. Spearman's rho
        rho_scores = []
        for pred, label in zip(preds, labels):
            try:
                rho, _ = spearmanr(pred, label)
                rho_scores.append(rho if not np.isnan(rho) else 0.0)
            except:
                rho_scores.append(0.0)
        avg_rho = np.mean(rho_scores)

        # # 5. Position-wise accuracy
        # position_accuracies = []
        # for pos in range(4):
        #     correct_pos = sum(1 for p, t in zip(preds, labels) if p[pos] == t[pos])
        #     position_accuracies.append(correct_pos / len(labels))

        # # 6. Inversion accuracy
        # def count_inversions(pred, label):
        #     inversions = 0
        #     n = len(pred)
        #     for i in range(n):
        #         for j in range(i+1, n):
        #             if (pred[i] < pred[j] and label[i] > label[j]) or \
        #                (pred[i] > pred[j] and label[i] < label[j]):
        #                 inversions += 1
        #     return inversions

        # max_inversions = 6  # 4개 문장의 최대 뒤바뀜 개수: C(4,2) = 6
        # inversion_scores = []
        # for pred, label in zip(preds, labels):
        #     inv_count = count_inversions(pred, label)
        #     accuracy = 1 - (inv_count / max_inversions)
        #     inversion_scores.append(accuracy)
        # inversion_accuracy = np.mean(inversion_scores)

        # 5. Adjacent Pair Accuracy (인접 쌍 정확도)
        adjacent_correct = 0
        total_adjacent = 0

        for i in range(0, len(pred), 6):
            if i + 6 > len(pred):
                break

            group_preds = pred[i:i+6]
            group_labels = labels[i:i+6]

            # 인접 쌍만 추출 (0-1, 1-2, 2-3) - 인덱스 수정
            if len(group_preds) >= 6:
                adjacent_preds = [group_preds[0], group_preds[2], group_preds[4]]
                adjacent_labels = [group_labels[0], group_labels[2], group_labels[4]]

                for pred, label in zip(adjacent_preds, adjacent_labels):
                    if pred == label:
                        adjacent_correct += 1
                    total_adjacent += 1

        adjacent_accuracy = adjacent_correct / total_adjacent if total_adjacent > 0 else 0.0

        # 6. Long-range Pair Accuracy (원거리 쌍 정확도)
        long_range_correct = 0
        total_long_range = 0

        for i in range(0, len(pred), 6):
            if i + 6 > len(pred):
                break

            group_preds = pred[i:i+6]
            group_labels = labels[i:i+6]

            # 원거리 쌍 추출 (0-2, 0-3, 1-3) - 인덱스 수정
            if len(group_preds) >= 6:
                long_range_preds = [group_preds[1], group_preds[3], group_preds[5]]
                long_range_labels = [group_labels[1], group_labels[3], group_labels[5]]

                for pred, label in zip(long_range_preds, long_range_labels):
                    if pred == label:
                        long_range_correct += 1
                    total_long_range += 1

        long_range_accuracy = long_range_correct / total_long_range if total_long_range > 0 else 0.0

        return {
            "full_order_accuracy": full_order_accuracy,
            "sentence_accuracy": sentence_accuracy,
            "kendall_tau": avg_tau,
            "spearman_rho": avg_rho,
            # "position_accuracies": position_accuracies,
            # "inversion_accuracy": inversion_accuracy,
            "adjacent_pair_accuracy": adjacent_accuracy,
            "long_range_pair_accuracy": long_range_accuracy
        }
    except Exception as e:
        print(f"❌ compute_metrics 내부 오류: {e}")
        return {}

In [12]:
model_name = "klue/bert-base"

model = GlobalOrderModel(model_name=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

## 학습 준비

In [13]:
# train/val 분리 (20% → 검증에 사용)
train_df_split, val_df = train_test_split(train_path, test_size=0.2, random_state=42)

# 라벨 생성
train_labels = make_labels(train_df_split)
val_labels = make_labels(val_df)

In [14]:
# 학습 데이터셋 생성
train_dataset = GlobalOrderDataset(train_df_split, tokenizer, labels=train_labels)
val_dataset = GlobalOrderDataset(val_df, tokenizer, labels=val_labels)

In [15]:
# TrainingArguments
training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=5,
    learning_rate=3e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./bert_logs",
    logging_steps=100,
    warmup_steps=200,
    weight_decay=0.1,
    gradient_accumulation_steps=4,
    lr_scheduler_type="cosine",
    max_grad_norm=1.0,

    load_best_model_at_end=True,
    metric_for_best_model='full_order_accuracy',
    greater_is_better=True,
    seed=42,
    fp16=True,

    # 💡 핵심 수정
    ddp_find_unused_parameters=False,   # DDP 문제 방지
    dataloader_pin_memory=True,
)

# Trainer 설정 (전체 학습용)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.005)]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# 학습 실행

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,Full Order Accuracy,Sentence Accuracy,Kendall Tau,Spearman Rho,Adjacent Pair Accuracy,Long Range Pair Accuracy
0,0.7131,0.583711,0.715233,0.867068,0.910835,0.943745,0.0,0.0
2,0.4064,0.362348,0.992603,0.996415,0.997359,0.998311,0.0,0.0
4,0.3847,0.358761,0.996571,0.998285,0.998734,0.999178,0.0,0.0


TrainOutput(global_step=2755, training_loss=0.5496384795477949, metrics={'train_runtime': 747.624, 'train_samples_per_second': 943.917, 'train_steps_per_second': 3.685, 'total_flos': 0.0, 'train_loss': 0.5496384795477949, 'epoch': 4.99546690843155})

In [17]:
# checkpoints = ["checkpoint-1103", "checkpoint-2206", "checkpoint-3309"]
# for ckpt in checkpoints:
#     shutil.rmtree(f"/content/roberta_results/{ckpt}", ignore_errors=True)

In [18]:
# best checkpoint 기준으로 모델 저장
save_path = "./bert_results/best_bert_model"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

('./bert_results/best_bert_model/tokenizer_config.json',
 './bert_results/best_bert_model/special_tokens_map.json',
 './bert_results/best_bert_model/vocab.txt',
 './bert_results/best_bert_model/added_tokens.json',
 './bert_results/best_bert_model/tokenizer.json')

# 예측 및 저장

In [19]:
def predict(model, test_df, tokenizer, device, batch_size=32):
    test_dataset = GlobalOrderDataset(test_df, tokenizer, labels=None)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    all_preds = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # 모델 출력이 [batch, 4, 4] 형태로 나옴
            # 각 문장별로 4개의 위치에 대한 확률
            logits = model(input_ids, attention_mask)['logits']

            # 각 문장이 어떤 위치에 가야하는지 예측
            # [batch, 4, 4] -> [batch, 4] (각 문장의 최적 위치)
            preds = logits.argmax(-1).cpu().numpy()
            all_preds.append(preds)

    all_preds = np.concatenate(all_preds, axis=0)

    # [문장0은 몇 번째, ...] → [answer_0, answer_1, ...]로 역변환
    answers = []
    for row in all_preds:
        answer = [0] * 4
        for sent_idx, pos in enumerate(row):
            answer[pos] = sent_idx
        answers.append(answer)

    return np.array(answers)

def save_submission(answers, submission_path, output_path):
    sub = submission_path.copy()
    for i in range(4):
        sub[f'answer_{i}'] = answers[:, i]
    sub.to_csv(output_path, index=False)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
# 예측 수행
answers = predict(
    model=model,
    test_df=test_path,
    tokenizer=tokenizer,
    device=device,
    batch_size=32
)

# 제출 파일 저장
save_submission(
    answers=answers,
    submission_path=submission_path,
    output_path=f"./data/submission_bert.csv"
)

print("✅ 제출파일 저장 완료")

✅ 제출파일 저장 완료
