<a href="https://colab.research.google.com/github/rkawkclzls/TTT/blob/master/Untitled26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch

import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel
from datasets import load_dataset
from transformers import get_linear_schedule_with_warmup

# Pick the GPU when one is available, otherwise fall back to the CPU.
# `device` is read by train() and evaluate() to move each batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the AG News dataset (the 'train' and 'test' splits are used below).
ds = load_dataset("ag_news")

# Load the OpenAI GPT tokenizer and give it a padding token.
# collate_fn calls the tokenizer with padding=True, which needs a pad token;
# the unknown token is reused for that purpose here.
# NOTE(review): presumably the pretrained tokenizer ships without a pad token — confirm.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
tokenizer.pad_token = tokenizer.unk_token

# collate_fn: turn a list of dataset rows into one tokenized batch
def collate_fn(batch):
    """Tokenize a batch of examples and pair it with its label tensor.

    Args:
        batch: iterable of mappings, each with a 'text' and a 'label' entry.

    Returns:
        A tuple ``(encoded, labels)`` where ``encoded`` is the output of the
        module-level ``tokenizer`` (padded to the longest text in the batch,
        truncated to at most 128 tokens, as PyTorch tensors) and ``labels``
        is a ``torch.LongTensor`` built from the 'label' values.
    """
    sentences = []
    targets = []
    for example in batch:
        sentences.append(example['text'])
        targets.append(example['label'])

    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    encoded = tokenizer(sentences, **tokenizer_options)
    label_tensor = torch.LongTensor(targets)
    return encoded, label_tensor

# Factory for data loaders that batch with collate_fn
def create_data_loader(dataset, batch_size, shuffle):
    """Wrap ``dataset`` in a ``DataLoader`` that collates with ``collate_fn``.

    Args:
        dataset: the dataset to iterate over.
        batch_size: number of examples per batch.
        shuffle: whether the loader should shuffle the examples.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "collate_fn": collate_fn,
    }
    return DataLoader(dataset, **loader_options)

# Build the data loaders: the training split is shuffled, the test split
# is iterated in its stored order. Both use batches of 32 examples.
train_loader = create_data_loader(ds['train'], batch_size=32, shuffle=True)
test_loader = create_data_loader(ds['test'], batch_size=32, shuffle=False)

# TextClassifier model definition
class TextClassifier(nn.Module):
    """Sequence classifier built on the pretrained OpenAI GPT encoder.

    The encoder's hidden state at the last *real* (non-padding) token of each
    sequence is passed through dropout and a linear layer to produce one
    logit per class.

    Args:
        num_classes: number of output classes (defaults to 4).
    """

    def __init__(self, num_classes=4):
        super().__init__()
        self.encoder = OpenAIGPTModel.from_pretrained('openai-gpt')
        self.dropout = nn.Dropout(0.1)
        # The classifier input width follows the encoder's embedding size.
        self.classifier = nn.Linear(self.encoder.config.n_embd, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return class logits for a batch of token id sequences.

        Args:
            input_ids: token ids, indexed as (batch, sequence position).
            attention_mask: optional mask with the same leading dimensions as
                ``input_ids``; non-zero marks real tokens, zero marks padding.
                When omitted, every position is treated as a real token.

        Returns:
            The output of the linear classifier: one row of ``num_classes``
            logits per sequence in the batch.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        if attention_mask is None:
            # No padding information: use the final position of every sequence.
            pooled_output = last_hidden_state[:, -1, :]
        else:
            # Batches are padded to a common length, so position -1 is a
            # padding slot for every sequence shorter than the longest one.
            # Pick the highest position whose mask entry is set instead.
            batch_size, seq_len = last_hidden_state.shape[:2]
            hidden_device = last_hidden_state.device
            positions = torch.arange(seq_len, device=hidden_device)
            real_token = (attention_mask.to(hidden_device) != 0).long()
            # Masked positions contribute 0, real positions contribute their
            # index, so argmax yields the index of the last real token
            # (and 0 for a row that is entirely masked).
            last_token_index = (real_token * positions).argmax(dim=1)
            batch_index = torch.arange(batch_size, device=hidden_device)
            pooled_output = last_hidden_state[batch_index, last_token_index]

        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

# Instantiate the model and move it to the selected device
model = TextClassifier().to(device)

# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# Learning-rate scheduler: linear schedule with no warmup steps.
# train() steps the scheduler once per batch, so the schedule length is
# (batches per epoch) * (number of epochs).
# NOTE(review): the 3 below is hard-coded separately from `n_epochs` in the
# training loop; the two must be kept equal for the schedule to span training.
total_steps = len(train_loader) * 3  # 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training routine: one pass over the data loader
def train(model, dataloader, optimizer, scheduler, loss_fn):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        dataloader: yields ``(encoded, labels)`` pairs where ``encoded`` has
            'input_ids' and 'attention_mask' entries.
        optimizer: stepped once per batch.
        scheduler: learning-rate scheduler, stepped once per batch.
        loss_fn: callable taking ``(outputs, labels)`` and returning the loss.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for encoded, targets in dataloader:
        # Move the batch to the module-level `device`.
        token_ids = encoded['input_ids'].to(device)
        mask = encoded['attention_mask'].to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(token_ids, mask)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    n_batches = len(dataloader)
    return running_loss / n_batches

# Evaluation routine: accuracy over a data loader
def evaluate(model, dataloader):
    """Return the fraction of examples in ``dataloader`` classified correctly.

    Args:
        model: module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode before the loop.
        dataloader: yields ``(encoded, labels)`` pairs where ``encoded`` has
            'input_ids' and 'attention_mask' entries.

    Returns:
        Number of correct predictions divided by the number of labels seen.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for encoded, targets in dataloader:
            # Move the batch to the module-level `device`.
            token_ids = encoded['input_ids'].to(device)
            mask = encoded['attention_mask'].to(device)
            targets = targets.to(device)

            logits = model(token_ids, mask)
            # The predicted class is the index of the largest logit per row.
            predicted = torch.max(logits, 1)[1]
            n_seen += targets.size(0)
            hits = predicted == targets
            n_correct += hits.sum().item()
    return n_correct / n_seen

# Training loop: run train() once per epoch and report the mean batch loss.
# NOTE(review): `total_steps` for the scheduler is computed with a hard-coded
# 3 epochs; keep `n_epochs` equal to that value.
n_epochs = 3
for epoch in range(n_epochs):
    train_loss = train(model, train_loader, optimizer, scheduler, loss_fn)
    print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}")

# Final evaluation on the test split, run once after all epochs
test_acc = evaluate(model, test_loader)
print(f"Final Test Accuracy: {test_acc:.4f}")

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/816k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/458k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


model.safetensors:   0%|          | 0.00/479M [00:00<?, ?B/s]