In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. NSMC 데이터 로드
nsmc_dataset = load_dataset("nsmc")
print(nsmc_dataset)

2025-01-02 08:45:32.340879: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-02 08:45:32.357905: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-02 08:45:32.357925: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-02 08:45:32.357936: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 08:45:32.361637: I tensorflow/core/platform/cpu_feature_g

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


In [2]:
train = nsmc_dataset['train']
cols = train.column_names
cols

['id', 'document', 'label']

In [3]:
for i in range(5):
    for col in cols:
        print(col, ":", train[col][i])
    print('\n')

id : 9976970
document : 아 더빙.. 진짜 짜증나네요 목소리
label : 0


id : 3819312
document : 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
label : 1


id : 10265843
document : 너무재밓었다그래서보는것을추천한다
label : 0


id : 9045019
document : 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
label : 0


id : 6483659
document : 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
label : 1




In [4]:
# 2. klue/bert-base 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

In [5]:
# 3. 데이터 전처리 함수 정의
def preprocess_data(data):
    return tokenizer(
        data['document'],
        padding="max_length",
        truncation=True,
        return_token_type_ids=False,
    )

In [6]:
# 4. 데이터셋 전처리
tokenized_datasets = nsmc_dataset.map(preprocess_data, batched=True)
tokenized_datasets = tokenized_datasets.rename_column(
    "label", "labels")  # Trainer에서 label 이름을 맞추기 위함
tokenized_datasets.set_format("torch")

In [7]:
# 5. 학습 및 검증 데이터셋 준비
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

In [8]:
# 6. 모델 로드
model = AutoModelForSequenceClassification.from_pretrained(
    'klue/bert-base', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# 7. 평가 메트릭 정의
def compute_metrics(pred):
    predictions = torch.argmax(torch.tensor(pred.predictions), dim=1)
    labels = pred.label_ids
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary')
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [10]:
# 8. 학습 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)



In [11]:
# 9. Trainer 객체 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [12]:
from time import time

# 10. 모델 학습
start_time = time()
trainer.train()
end_time = time()

training_time = start_time - end_time

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.305,0.237803,0.902,0.894524,0.913002,0.903668
2,0.2183,0.24426,0.90658,0.914282,0.898701,0.906425
3,0.0973,0.302069,0.90548,0.898624,0.915544,0.907005


In [13]:
# 11. 평가
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

Evaluation results: {'eval_loss': 0.3020688593387604, 'eval_accuracy': 0.90548, 'eval_precision': 0.8986236206963777, 'eval_recall': 0.9155444325269138, 'eval_f1': 0.9070051160960252, 'eval_runtime': 151.9116, 'eval_samples_per_second': 329.139, 'eval_steps_per_second': 10.289, 'epoch': 3.0}


In [14]:
from transformers import DataCollatorWithPadding

# 1. Data Collator 생성 (Dynamic Padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 2. Training Arguments에 group_by_length 추가
training_args_bucketing = TrainingArguments(
    output_dir="./results_bucketing",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_bucketing",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    group_by_length=True  # Bucketing 활성화
)

# 3. Trainer 생성
trainer_bucketing = Trainer(
    model=model,
    args=training_args_bucketing,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Dynamic Padding 활성화
    compute_metrics=compute_metrics
)

# 4. 모델 학습
start_time = time()
trainer_bucketing.train()
end_time = time()

training_time_bucketing = end_time - start_time

  trainer_bucketing = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1533,0.298389,0.90022,0.887468,0.918246,0.902595
2,0.0992,0.391471,0.90402,0.896721,0.91471,0.905626
3,0.0324,0.488648,0.90332,0.897914,0.911612,0.904711


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [15]:
# 5. 평가
eval_result_bucketing = trainer_bucketing.evaluate()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [17]:
# STEP 4와 STEP 5 결과 비교
print("==== 성능 비교 ====")
print("STEP 4 결과:", eval_result)
print("STEP 5 (Bucketing + Dynamic Padding) 결과:", eval_result_bucketing)

print("\n==== 학습 시간 비교 ====")
print("STEP 4 학습 시간:", -training_time)
print("STEP 5 학습 시간:", training_time_bucketing)

==== 성능 비교 ====
STEP 4 결과: {'eval_loss': 0.3020688593387604, 'eval_accuracy': 0.90548, 'eval_precision': 0.8986236206963777, 'eval_recall': 0.9155444325269138, 'eval_f1': 0.9070051160960252, 'eval_runtime': 151.9116, 'eval_samples_per_second': 329.139, 'eval_steps_per_second': 10.289, 'epoch': 3.0}
STEP 5 (Bucketing + Dynamic Padding) 결과: {'eval_loss': 0.391470730304718, 'eval_accuracy': 0.90402, 'eval_precision': 0.8967209284212166, 'eval_recall': 0.9147102053787789, 'eval_f1': 0.9056262413718511, 'eval_runtime': 151.7473, 'eval_samples_per_second': 329.495, 'eval_steps_per_second': 10.3, 'epoch': 3.0}

==== 학습 시간 비교 ====
STEP 4 학습 시간: 4644.297319412231
STEP 5 학습 시간: 4671.733449220657


### STEP5보다 STEP4가 더 빠르고 정확한 성능을 가지는 것으로 나타났다.      
           
일반적인 상황에서는 반대가 되어야 하는데 이렇게 나온 이유를 생각해보면....     
NSMC 데이터셋은 긴 문장이 아니라 짧은 문장들로 구성되어 있다.    
따라서 비슷한 길이의 샘플을 그룹으로 묶어 배치를 만든 Bucketing과 동적 패딩 Dynamic Padding의 효과가 그리 크지 않아 성능에 미치는 영향이 미미했을 것으로 생각된다.