In [1]:
##### CoLA 데이터세트 불러오기
import torch
from torchtext.datasets import CoLA
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

## 배치를 토큰화하고, 패딩, 절사(truncation), 반환 형식을 설정하는 함수
def collator(batch, tokenizer, device):
    source, labels, texts = zip(*batch)
    tokenized = tokenizer(
        texts,
        padding        = "longest",  # 가장 긴 시퀀스에 대해 패딩을 적용
        truncation     = True,       # 입력 시퀀스 길이가 최대 길이 초과한 경우 자름
        return_tensors = "pt"        # 파이토치 텐서 형태로 반환
    )
    input_ids      = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    labels         = torch.tensor(labels, dtype=torch.long).to(device)
    return input_ids, attention_mask, labels


train_data = list(CoLA(split="train"))
valid_data = list(CoLA(split="dev"))
test_data  = list(CoLA(split="test"))

tokenizer           = AutoTokenizer.from_pretrained("gpt2")
# GPT-2는 사전 학습시 패딩기법 사용 x > 패딩 토큰 포함 x > eos 토큰 사용해 패딩
tokenizer.pad_token = tokenizer.eos_token

epochs     = 3
batch_size = 16
device     = "cuda" if torch.cuda.is_available() else "cpu"

train_dataloader = DataLoader(
    train_data,
    batch_size = batch_size,
    collate_fn = lambda x: collator(x, tokenizer, device),
    shuffle    = True,
)
valid_dataloader = DataLoader(
    valid_data, batch_size = batch_size, collate_fn = lambda x: collator(x, tokenizer, device)
)
test_dataloader = DataLoader(
    test_data, batch_size = batch_size, collate_fn = lambda x: collator(x, tokenizer, device)
)

print("Train Dataset Length :", len(train_data))
print("Valid Dataset Length :", len(valid_data))
print("Test Dataset Length :", len(test_data))

Train Dataset Length : 8550
Valid Dataset Length : 526
Test Dataset Length : 515


In [2]:
##### GPT-2 모델 설정
from torch import optim
from transformers import GPT2ForSequenceClassification

# GPT-2 모델을 기반으로 하는 문장 분류 모델 
# (GPT-2와 유사, 분류위해 최종 출력 계층이 미세 조정됨)
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = "gpt2",
    num_labels                    = 2   # 분류 레이블 수
).to(device)

# GPT-2는 사전 학습시 패딩기법 사용 x > 패딩 토큰 포함 x > eos 토큰 사용해 패딩
model.config.pad_token_id = model.config.eos_token_id
optimizer                 = optim.Adam(model.parameters(), lr=5e-5)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
##### GPT-2 모델 학습 및 검증
import numpy as np
from torch import nn

# 정확도 계산 함수
def calc_accuracy(preds, labels):
    pred_flat   = np.argmax(preds, axis=1).flatten() # 큰 원소의 인덱스 반환
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 학습 함수
def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids      = input_ids,
            attention_mask = attention_mask,
            labels         = labels
        )

        loss        = outputs.loss
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

# 평가 함수
def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0
        
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids      = input_ids,
                attention_mask = attention_mask,
                labels         = labels
            )
            
            logits = outputs.logits

            loss      = criterion(logits, labels)
            logits    = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            accuracy  = calc_accuracy(logits, label_ids)
            
            val_loss     += loss
            val_accuracy += accuracy
    
    val_loss     = val_loss/len(dataloader)
    val_accuracy = val_accuracy/len(dataloader)
    return val_loss, val_accuracy


best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val Accuracy {val_accuracy:.4f}")

    # 손실값이 전보다 작으면 모델 가중치를 저장
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "../models/GPT2ForSequenceClassification.pt")
        print("Saved the model weights")

Epoch 1: Train Loss: 0.5891 Val Loss: 0.5670 Val Accuracy 0.6972
Saved the model weights
Epoch 2: Train Loss: 0.4725 Val Loss: 0.4558 Val Accuracy 0.7925
Saved the model weights
Epoch 3: Train Loss: 0.3310 Val Loss: 0.6027 Val Accuracy 0.7527


In [4]:
##### 모델 평가
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path = "gpt2",
    num_labels                    = 2
).to(device)
model.config.pad_token_id = model.config.eos_token_id
model.load_state_dict(torch.load("../models/GPT2ForSequenceClassification.pt"))

test_loss, test_accuracy = evaluation(model, test_dataloader)
print(f"Test Loss : {test_loss:.4f}")
print(f"Test Accuracy : {test_accuracy:.4f}")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss : 0.5616
Test Accuracy : 0.7229
