In [1]:
# 1. 라이브러리 임포트
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import set_seed
from sklearn.metrics import accuracy_score, f1_score


In [2]:
# 2. Load the dataset (BANKING77)
ds = load_dataset("banking77")

# The class names live on the "label" feature of the train split;
# the number of classes is simply how many names there are.
train_features = ds["train"].features
labels = train_features["label"].names
num_labels = len(labels)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/298k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/93.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

In [3]:
# 3. Tokenizer & model (XLM-R)
model_ckpt = "xlm-roberta-base"

# The classification head is not part of the checkpoint and is initialised
# randomly when the model is built. Seed the RNGs *before* that happens so a
# fresh "Restart & Run All" produces the same initial head. 42 is chosen to
# match the default `seed` of TrainingArguments.
set_seed(42)

tok = AutoTokenizer.from_pretrained(model_ckpt)

# Store the class-id <-> intent-name mapping in the model config so that saved
# checkpoints and downstream predictions report intent names, not LABEL_n.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 4. Tokenization
def tokenize(batch):
    """Encode the "text" column of a batch with the shared tokenizer.

    Each text is truncated and padded to max_length=64.
    """
    encode_kwargs = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tok(batch["text"], **encode_kwargs)


# Tokenize every split in batches, then rename the target column from
# "label" to "labels".
ds_tok = ds.map(tokenize, batched=True).rename_column("label", "labels")

# Expose only these three columns, as torch tensors.
ds_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/10003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

In [5]:

# 5. Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-element iterable unpacked as (logits, labels).

    Returns:
        dict with keys "accuracy" and "macro_f1".
    """
    logits, true_ids = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(true_ids, predicted_ids)
    macro_f1 = f1_score(true_ids, predicted_ids, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}

In [6]:
# 6. Training setup (lightweight settings for a low-spec machine)
training_config = {
    "output_dir": "./out_xlmr",
    "learning_rate": 2e-5,
    "num_train_epochs": 1,  # a single epoch only
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    # Evaluate and checkpoint once per epoch.
    "eval_strategy": "epoch",  # note: this keyword replaces the older `evaluation_strategy`
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "report_to": "none",
}
args = TrainingArguments(**training_config)

In [7]:

# 7. Define the Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    # NOTE(review): the dataset as loaded only has train/test splits, so the
    # test split doubles as the evaluation set here. Because
    # load_best_model_at_end=True selects a checkpoint from these evaluations,
    # consider carving a validation split out of train if more than one epoch
    # is ever run -- confirm whether that matters for this experiment.
    eval_dataset=ds_tok["test"],
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in Trainer.__init__ (it raised a FutureWarning
    # in this notebook); `processing_class=` is the replacement keyword.
    processing_class=tok,
)


  trainer = Trainer(


In [8]:
# 8. Start training
# Left as the cell's last expression so the returned TrainOutput
# (global step, training loss, runtime metrics) is displayed below the cell.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,2.7443,2.118907,0.642208,0.589854


TrainOutput(global_step=1251, training_loss=3.1303256223146483, metrics={'train_runtime': 275.3017, 'train_samples_per_second': 36.335, 'train_steps_per_second': 4.544, 'total_flos': 329209024287360.0, 'train_loss': 3.1303256223146483, 'epoch': 1.0})

In [9]:
# 9. Evaluation
# evaluate() is called without arguments; the returned dict holds the loss and
# the compute_metrics values under `eval_`-prefixed keys, plus runtime stats.
results = trainer.evaluate()
print("📊 XLM-R Results:", results)



📊 XLM-R Results: {'eval_loss': 2.1189074516296387, 'eval_accuracy': 0.6422077922077922, 'eval_macro_f1': 0.5898535567980643, 'eval_runtime': 12.5398, 'eval_samples_per_second': 245.619, 'eval_steps_per_second': 15.391, 'epoch': 1.0}
