In [1]:
import pandas as pd
import numpy as np
import re
import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
from scipy.stats import loguniform
from sklearn.model_selection import train_test_split
import itertools
from itertools import permutations
import shutil
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from safetensors.torch import load_file
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)




# 1. 데이터 준비 및 전처리

In [3]:
# Load the competition CSVs from ../data.
# NOTE: despite the `_path` suffix, `train_path` and `submission_path` hold the
# DataFrames returned by pd.read_csv, not file paths.
train_path = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
submission_path = pd.read_csv('../data/sample_submission.csv')

In [None]:
# Sanity-check the sample-submission frame: .info() prints a summary, and
# .head() (the last expression) is rendered as the cell output.
submission_path.info()
submission_path.head()

In [5]:
# Text cleaning
def clean_text(text):
    """Normalize one sentence before tokenization.

    Removes every character that is neither a word character nor whitespace,
    lower-cases the result and collapses whitespace runs into single spaces.

    Args:
        text: Raw sentence. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as an empty sentence; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, so one missing cell would
        # otherwise abort the whole `.apply(clean_text)` pass.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # Remove special characters (anything outside \w and \s).
    text = re.sub(r'[^\w\s]', '', text)
    # Lower-case; this has no effect on Hangul.
    text = text.lower()
    # Collapse redundant whitespace and trim both ends.
    return ' '.join(text.split())

In [6]:
# Clean the four sentence columns of both frames in place.
for i in range(4):
    column = f'sentence_{i}'
    for frame in (train_path, test_df):
        frame[column] = frame[column].apply(clean_text)

In [7]:
# data_utils.py
def load_data(train_path, test_path):
    """Read the train and test CSV files.

    Args:
        train_path: Path (or buffer) of the training CSV.
        test_path: Path (or buffer) of the test CSV.

    Returns:
        Tuple ``(train, test)`` of DataFrames, in that order.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    return frames[0], frames[1]

def make_labels(df):
    """Convert answer columns into per-sentence position labels.

    ``answer_p`` holds the index of the sentence placed at position ``p``.
    The returned label row is the inverse view: entry ``s`` is the position
    at which sentence ``s`` appears.

    Args:
        df: DataFrame with integer columns ``answer_0`` .. ``answer_3``.

    Returns:
        ``np.ndarray`` with one 4-element label row per input row.
    """
    answer_cols = [f'answer_{k}' for k in range(4)]
    encoded = []
    for order in df[answer_cols].values:
        slots = [0, 0, 0, 0]
        for position, sentence in enumerate(order):
            slots[sentence] = position
        encoded.append(slots)
    return np.array(encoded)

Dataset ÌÅ¥ÎûòÏä§
 * 4Í∞úÏùò Î¨∏Ïû•ÏùÑ [SEP]Î°ú Î¨∂Ïñ¥ÏÑú BERTÏóê ÎÑ£ÏùÑ Ïàò ÏûàÍ≤å Î∞îÍøîÏ§å

In [8]:
# 1. Dataset class
class GlobalOrderDataset(Dataset):
    """Torch dataset that packs the four sentences of a row into one input.

    Each item is the tokenizer encoding of
    ``[CLS] s0 [SEP] s1 [SEP] s2 [SEP] s3 [SEP]`` padded/truncated to
    ``max_length``; when labels were supplied, a ``labels`` tensor is added.

    NOTE(review): the tokenizer is called with its default settings; if those
    defaults also add special tokens, [CLS]/[SEP] end up duplicated at the
    edges. Left unchanged so inputs stay identical to those already used for
    training -- confirm before changing.

    Args:
        df: DataFrame with string columns ``sentence_0`` .. ``sentence_3``.
        tokenizer: Callable tokenizer returning a mapping of tensors with a
            leading batch dimension of 1 (``return_tensors='pt'``).
        labels: Optional array-like with one label row per sample. ``None``
            for unlabeled (inference) data.
        max_length: Sequence length used for padding and truncation.
    """

    def __init__(self, df, tokenizer, labels=None, max_length=256):
        self.sentences = df[[f'sentence_{i}' for i in range(4)]].values
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sents = self.sentences[idx]
        text = '[CLS] ' + ' [SEP] '.join(sents) + ' [SEP]'
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop the batch dimension of 1 so the DataLoader can stack samples.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        # Always return the item: previously this return sat inside the
        # `if`, so unlabeled datasets yielded None for every sample.
        return item

# 2. 학습 준비

Model 클래스
- AutoModel (예: Roberta) 사용
- 문장 4개를 넣었을 때 그 순서를 예측
- 출력은 [batch, 4, 4] 크기의 행렬 → 각 문장이 어떤 위치에 있어야 하는지 예측

In [9]:
class GlobalOrderModel(nn.Module):
    """Encoder plus MLP head that scores every (sentence, position) pair.

    The encoder output is pooled into one vector per sample and passed through
    a three-layer head producing 16 values, viewed as ``[batch, 4, 4]``:
    for each of the 4 sentences, one logit per candidate position.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='klue/roberta-large'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        width = self.bert.config.hidden_size

        # Deeper classifier head with LayerNorm after each hidden Linear;
        # the last layer emits 4 sentences x 4 position classes.
        head_layers = [
            nn.Linear(width, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 4 * 4),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"logits"}`` or, when labels are given, ``{"loss", "logits"}``.

        ``labels`` is flattened to one class index per sentence and compared
        against the logits with cross-entropy.
        """
        hidden_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        # Pool: average of the first-token vector and the mean of all later
        # positions. The mean runs over every position after the first and
        # does not consult attention_mask.
        first_token = hidden_states[:, 0]
        rest_mean = hidden_states[:, 1:].mean(dim=1)
        pooled = (first_token + rest_mean) / 2

        flat_logits = self.classifier(pooled)
        order_logits = flat_logits.view(-1, 4, 4)

        if labels is None:
            return {"logits": order_logits}

        loss = nn.CrossEntropyLoss()(flat_logits.view(-1, 4), labels.view(-1))
        return {"loss": loss, "logits": order_logits}

In [10]:
def compute_metrics(eval_pred):
    """Compute sentence-level and full-order accuracy for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)`` where ``logits`` has a class axis
            at index 2 and ``labels`` matches the arg-max shape.

    Returns:
        Dict with ``sentence_accuracy`` (fraction of correctly placed
        sentences) and ``full_order_accuracy`` (fraction of samples with all
        sentences correct). On any exception the error is printed and an
        empty dict is returned.
    """
    try:
        logits, labels = eval_pred
        hits = np.argmax(logits, axis=2) == labels
        return {
            "sentence_accuracy": hits.mean(),
            "full_order_accuracy": hits.all(axis=1).mean(),
        }
    except Exception as e:
        print(f"‚ùå compute_metrics ÎÇ¥Î∂Ä Ïò§Î•ò: {e}")
        return {}

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the order model on top of the klue/roberta-large encoder.
model = GlobalOrderModel(model_name="klue/roberta-large")
# Move the model to the chosen device (as the last expression, this also
# displays the module repr).
model.to(device)

In [12]:
# Load the tokenizer that belongs to the klue/roberta-large checkpoint.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

In [13]:
# Hold out 20% of the training frame for validation (fixed seed).
train_df_split, val_df = train_test_split(
    train_path,
    random_state=42,
    test_size=0.2,
)

# Build the per-sentence position labels for each split.
train_labels, val_labels = make_labels(train_df_split), make_labels(val_df)

In [14]:
# üîπ ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÉùÏÑ±
# Wrap the 80% training split and the 20% validation split as torch datasets,
# each paired with the labels built from the same split.
train_dataset = GlobalOrderDataset(train_df_split, tokenizer, labels=train_labels)
val_dataset = GlobalOrderDataset(val_df, tokenizer, labels=val_labels)

In [None]:
# Display the first encoded training sample as a sanity check.
train_dataset[0]

In [None]:
# Display the first encoded validation sample as a sanity check.
val_dataset[0]

In [17]:
# üîπ Ï†ÑÏ≤¥ ÎùºÎ≤® ÏÉùÏÑ±
# Labels for the full training frame (not consumed by the Trainer below).
labels = make_labels(train_path)

# TrainingArguments for the baseline run: evaluate and checkpoint every epoch,
# then restore the checkpoint with the best full_order_accuracy.
training_args = TrainingArguments(
    output_dir="./global_results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    warmup_steps=500,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='full_order_accuracy',
    greater_is_better=True,
    seed=42,
    # Disable logging integrations explicitly, like the other
    # TrainingArguments in this notebook, instead of relying only on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Trainer setup: trains on `train_dataset`, validates on `val_dataset`, and
# stops early after 2 evaluations without a gain of at least 0.001 in the
# monitored metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# 3. 학습 실행

In [None]:
# Run the baseline fine-tuning with the Trainer configured above.
trainer.train()

In [16]:
# import shutil

# checkpoints = ["checkpoint-368", "checkpoint-736", "checkpoint-1104", "checkpoint-1472", "checkpoint-1840"]
# for ckpt in checkpoints:
#     shutil.rmtree(f"/content/global_results/{ckpt}", ignore_errors=True)

In [None]:
# Save the model currently held by the trainer, plus the tokenizer files,
# into one directory.
save_path = "./global_results/best_model"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

# 4. 하이퍼파라미터 튜닝

In [30]:
# Split the data for tuning: an 80/20 split with random_state=42, with labels
# and datasets built for each part. Note that `val_df`, `val_labels` and
# `val_dataset` are reassigned here.
train_split_df, val_df = train_test_split(train_path, test_size=0.2, random_state=42)
train_split_labels = make_labels(train_split_df)
val_labels = make_labels(val_df)

train_split_dataset = GlobalOrderDataset(train_split_df, tokenizer, labels=train_split_labels)
val_dataset = GlobalOrderDataset(val_df, tokenizer, labels=val_labels)


In [None]:
def run_global_tuning(train_split_dataset, val_dataset, tokenizer, n_trials=2):
    """Random-search fine-tuning hyperparameters and return the best trial.

    Each trial samples a learning rate, weight decay and batch size, trains a
    fresh ``GlobalOrderModel`` with early stopping, evaluates it, saves the
    model under ``./global_results/trial_<n>/best_model`` and appends one row
    to ``./global_results/tuning_log.csv``. If that log already exists, the
    search resumes after the trials it contains.

    Args:
        train_split_dataset: Training dataset for every trial.
        val_dataset: Validation dataset used for evaluation / model selection.
        tokenizer: Tokenizer handed to the Trainer and saved with each model.
        n_trials: Total number of trials the log should contain when done.

    Returns:
        One-row DataFrame with the trial that has the highest
        ``full_order_accuracy``. Empty when no trial has completed yet.
    """
    results_path = './global_results/tuning_log.csv'
    if os.path.exists(results_path):
        # Resume: trials already logged are not repeated.
        results = pd.read_csv(results_path).to_dict(orient='records')
        start_trial = len(results)
    else:
        results = []
        start_trial = 0

    for trial in range(start_trial, n_trials):
        print(f"\nüéØ Trial {trial + 1} ÏãúÏûë")
        lr = float(loguniform.rvs(1.5e-5, 3.5e-5))
        wd = float(loguniform.rvs(0.01, 0.07))
        epochs = 10
        batch_size = int(np.random.randint(16, 32))
        # Warm up over 2% of the planned optimizer steps.
        total_steps = (len(train_split_dataset) // batch_size) * epochs
        warmup = int(total_steps * 0.02)

        args = TrainingArguments(
            output_dir=f'./global_results/trial_{trial+1}',
            learning_rate=lr,
            weight_decay=wd,
            warmup_steps=warmup,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=64,
            num_train_epochs=epochs,
            gradient_accumulation_steps=1,
            lr_scheduler_type='linear',
            logging_dir='./roberta_logs',
            logging_steps=100,
            save_strategy="epoch",
            save_total_limit=2,
            eval_strategy='epoch',  # evaluate every epoch
            seed=42,
            load_best_model_at_end=True,
            metric_for_best_model='full_order_accuracy',  # criterion for "best"
            greater_is_better=True,
            report_to='none',
            fp16=True,
            optim='adamw_torch_fused'
        )

        trainer = Trainer(
            model=GlobalOrderModel("klue/roberta-large"),
            args=args,
            train_dataset=train_split_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,  # custom metrics
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)],
        )

        try:
            trainer.train()

            # Final evaluation of the model held by the trainer.
            eval_result = trainer.evaluate()
            sentence_acc = eval_result.get("eval_sentence_accuracy", None)
            full_order_acc = eval_result.get("eval_full_order_accuracy", None)
            eval_loss = eval_result.get("eval_loss", None)

            save_path = f'./global_results/trial_{trial+1}/best_model'
            try:
                trainer.save_model(save_path)
                tokenizer.save_pretrained(save_path)
                model_saved = True
            except Exception as e:
                print(f"‚ö†Ô∏è Î™®Îç∏ Ï†ÄÏû• Ïã§Ìå®: {e}")
                model_saved = False
                save_path = "FAILED"

            results.append({
                'trial': trial + 1,
                'learning_rate': lr,
                'weight_decay': wd,
                'warmup_steps': warmup,
                # Logged because warmup_steps was derived from it; without it
                # a trial cannot be reproduced from the log.
                'batch_size': batch_size,
                'epochs': epochs,
                'sentence_accuracy': sentence_acc,
                'full_order_accuracy': full_order_acc,
                'eval_loss': eval_loss,
                'model_saved': model_saved,
                'save_path': save_path
            })
            # Persist after every trial so an interrupted search can resume.
            pd.DataFrame(results).to_csv(results_path, index=False)

            # Remove intermediate checkpoints; only best_model is kept.
            output_dir = f'./global_results/trial_{trial+1}'
            for subdir in os.listdir(output_dir):
                if subdir.startswith("checkpoint"):
                    shutil.rmtree(os.path.join(output_dir, subdir), ignore_errors=True)

            print(f"‚úÖ Trial {trial+1} ÏôÑÎ£å | Ï†ÄÏû• Í≤ΩÎ°ú: {save_path}")

        except Exception as e:
            # Best-effort search: report the failure and stop, keeping the
            # trials logged so far.
            print(f"‚õî Trial {trial+1} Ï§ë Ïò§Î•ò Î∞úÏÉù: {e}")
            break

    print("\nüèÜ ÏÉÅÏúÑ Trial:")
    results_df = pd.DataFrame(results)
    if results_df.empty or "full_order_accuracy" not in results_df.columns:
        # No completed trial: sorting by the missing column would raise
        # KeyError, so return the (empty) frame as-is.
        top_trials = results_df.head(1)
    else:
        top_trials = results_df.sort_values(by="full_order_accuracy", ascending=False).head(1)
    print(top_trials)
    return top_trials

In [None]:
# Load the tuning log.
df = pd.read_csv("./global_results/tuning_log.csv")

# Keep only the trials whose model was saved.
df = df[df["model_saved"] == True]

# If evaluation results exist, pick the highest full_order_accuracy; otherwise
# fall back to the lowest trial number.
# NOTE: `.iloc[0]` raises IndexError when no row survives the filter above.
if "full_order_accuracy" in df.columns and df["full_order_accuracy"].notna().any():
    top_trial = df.sort_values("full_order_accuracy", ascending=False).iloc[0]
else:
    top_trial = df.sort_values("trial", ascending=True).iloc[0]  # fallback

# Show the path of the selected model.
best_model_path = top_trial["save_path"]
print(f"üèÜ ÏÑ†ÌÉùÎêú Best Model Í≤ΩÎ°ú: {best_model_path}")


In [None]:
# Check that the selected model folder exists by listing its contents
# (os.listdir raises if the directory is missing).
print(os.listdir(best_model_path))

In [None]:
# Load the model: build a fresh GlobalOrderModel and pick the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GlobalOrderModel("klue/roberta-large")

# Load the weights from the safetensors file of the selected trial.
# load_state_dict is strict by default, so the checkpoint keys must match the
# GlobalOrderModel definition currently in scope.
state_dict = load_file(f"{best_model_path}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

In [None]:
# Copy the selected trial's model directory to a fixed location;
# dirs_exist_ok=True lets the copy run again over an existing target.
final_model_dir = "./global_results/best_model_custom"
shutil.copytree(best_model_path, final_model_dir, dirs_exist_ok=True)
print(f"üì¶ ÏµúÏ¢Ö best model Ï†ÄÏû•Îê®: {final_model_dir}")


# 5. 추론

In [None]:
# üîπ Ï†ÑÏ≤¥ ÌïôÏäµ Îç∞Ïù¥ÌÑ∞ÏÖã Íµ¨ÏÑ±
# Dataset over the full training frame (no validation hold-out).
train_labels = make_labels(train_path)
train_dataset = GlobalOrderDataset(train_path, tokenizer, labels=train_labels)

# Reuse the hyperparameters of the best tuning trial (`top_trial`).
args = TrainingArguments(
    output_dir="./global_results/best_final",
    # Cast the pandas/numpy scalars to plain Python numbers.
    learning_rate=float(top_trial['learning_rate']),
    weight_decay=float(top_trial['weight_decay']),
    warmup_steps=int(top_trial['warmup_steps']),
    per_device_train_batch_size=16,
    num_train_epochs=int(top_trial['epochs']),
    logging_dir='./retrain_logs',
    save_strategy="no",  # the model is saved manually below
    # Spelled `eval_strategy` like the other TrainingArguments in this
    # notebook; the older `evaluation_strategy` name is deprecated and is
    # rejected by newer transformers releases.
    eval_strategy="no",
    report_to='none',
    fp16=True,
    optim="adamw_torch_fused"
)

trainer = Trainer(
    model=GlobalOrderModel("klue/roberta-large"),
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# Retrain on all training data.
trainer.train()

# Save the final model together with its tokenizer.
trainer.save_model("./global_results/final_model")
tokenizer.save_pretrained("./global_results/final_model")


In [None]:
# Model class defined inline (works without a separate model.py).
class GlobalOrderModel(nn.Module):
    """Encoder plus MLP head that scores every (sentence, position) pair.

    This definition must mirror the architecture the checkpoints were trained
    with: ``load_state_dict`` is strict by default, so a head with different
    layers (or different layer indices inside the ``nn.Sequential``) fails to
    load, and different pooling would silently change the predictions.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='klue/roberta-large'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Same three-layer head as used for training; the indices of the
        # parameterised layers (0, 1, 4, 5, 8) define the state-dict keys.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 4 * 4)  # 4 sentences x 4 position classes
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"logits"}`` or, when labels are given, ``{"loss", "logits"}``.

        Logits have shape ``[batch, 4, 4]``: one row of position scores per
        sentence.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Pool exactly as during training: average of the first-token vector
        # and the mean of all later positions.
        cls_token = outputs.last_hidden_state[:, 0]
        last_hidden = outputs.last_hidden_state[:, 1:].mean(dim=1)
        pooled = (cls_token + last_hidden) / 2

        logits = self.classifier(pooled)
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, 4), labels.view(-1))
            return {"loss": loss, "logits": logits.view(-1, 4, 4)}
        else:
            return {"logits": logits.view(-1, 4, 4)}


In [None]:

# Pick the device, then rebuild the model and load the copied best weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

best_model_path = "./global_results/best_model_custom"  # make sure the path matches this directory

# load_state_dict is strict by default: the checkpoint keys must match the
# GlobalOrderModel definition currently in scope.
model = GlobalOrderModel("klue/roberta-large")
state_dict = load_file(f"{best_model_path}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
model.eval()


In [None]:
# Dataset class defined inline.
class GlobalOrderDataset(Dataset):
    """Torch dataset that packs the four sentences of a row into one input.

    Each item is the tokenizer encoding of
    ``[CLS] s0 [SEP] s1 [SEP] s2 [SEP] s3 [SEP]`` padded/truncated to
    ``max_length``; a ``labels`` tensor is added only when labels were given.

    Args:
        df: DataFrame with string columns ``sentence_0`` .. ``sentence_3``.
        tokenizer: Callable tokenizer returning a mapping of tensors with a
            leading batch dimension of 1 (``return_tensors='pt'``).
        labels: Optional array-like with one label row per sample.
        max_length: Sequence length used for padding and truncation.
    """

    def __init__(self, df, tokenizer, labels=None, max_length=256):
        sentence_cols = [f'sentence_{i}' for i in range(4)]
        self.sentences = df[sentence_cols].values
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        joined = ' [SEP] '.join(self.sentences[idx])
        encoded = self.tokenizer(
            f'[CLS] {joined} [SEP]',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Strip the batch dimension of 1 from every returned tensor.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [None]:
def predict(model, test_df, tokenizer, device, batch_size=32):
    """Predict the sentence order for every row of ``test_df``.

    The model scores each of the four positions for each of the four
    sentences. Taking the arg-max position per sentence independently can
    assign two sentences to the same position, which is not an ordering (one
    sentence is overwritten and an untouched slot keeps its filler value).
    Instead, the highest-scoring one-to-one assignment among the 24
    permutations is chosen. When the per-sentence arg-max is already a valid
    permutation and has no ties, the result is the same as before.

    Args:
        model: Callable ``model(input_ids, attention_mask)`` returning a
            mapping whose ``'logits'`` entry has shape ``[batch, 4, 4]``.
        test_df: Frame with the sentence columns expected by
            ``GlobalOrderDataset``.
        tokenizer: Tokenizer forwarded to ``GlobalOrderDataset``.
        device: Device the input tensors are moved to.
        batch_size: Inference batch size.

    Returns:
        Integer array of shape ``(len(test_df), 4)`` in ``answer_0`` ..
        ``answer_3`` layout: column ``p`` holds the index of the sentence
        placed at position ``p``. An empty test set yields shape ``(0, 4)``.
    """
    from itertools import permutations

    test_dataset = GlobalOrderDataset(test_df, tokenizer, labels=None)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model.eval()

    # candidates[k, s] = position assigned to sentence s by permutation k.
    candidates = np.array(list(permutations(range(4))))
    sentence_idx = np.arange(4)
    all_positions = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # [batch, 4, 4]: scores[b, s, p] = score of sentence s at position p.
            logits = model(input_ids, attention_mask)['logits']
            scores = logits.float().cpu().numpy()

            # [batch, 24]: total score of each candidate assignment.
            totals = scores[:, sentence_idx, candidates].sum(axis=-1)
            all_positions.append(candidates[totals.argmax(axis=1)])

    if not all_positions:
        # np.concatenate rejects an empty list; return an empty result.
        return np.empty((0, 4), dtype=int)

    positions = np.concatenate(all_positions, axis=0)

    # Invert "sentence -> position" into "position -> sentence"
    # ([answer_0, answer_1, ...]); for a permutation, argsort is its inverse.
    return np.argsort(positions, axis=1)

def save_submission(test_df, answers, submission_path, output_path):
    """Fill the sample-submission template with predictions and write it out.

    Args:
        test_df: Not used by this function.
        answers: 2-D array; column ``k`` is written to ``answer_k`` for
            ``k`` in 0..3.
        submission_path: CSV template to read.
        output_path: Destination CSV (written without the index).
    """
    template = pd.read_csv(submission_path)
    filled = template.assign(**{f'answer_{k}': answers[:, k] for k in range(4)})
    filled.to_csv(output_path, index=False)

# 6. 예측 및 저장

In [None]:
# Run prediction on the test set.
answers = predict(
    model=model,
    test_df=test_df,
    tokenizer=tokenizer,
    device=device,
    batch_size=32
)

# Save the submission file. The template is read from ../data/, the same
# location the notebook loads sample_submission.csv from in its data-loading
# cell, instead of a second copy expected in the working directory.
save_submission(
    test_df=test_df,
    answers=answers,
    submission_path="../data/sample_submission.csv",
    output_path="./submission.csv"
)

print("‚úÖ submission.csv Ï†ÄÏû• ÏôÑÎ£å")