In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

# Step 1: 데이터 전처리 - AutoTokenizer 사용
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 데이터셋 로드
dataset = load_dataset("csv", data_files="./datas/justice/justice_train.csv")

# 텍스트와 레이블 추출
texts = dataset['train']['scenario']
labels = dataset['train']["label"]

# Step 2: 레이블링 - 정의로우면 0, 아니면 1로 레이블링
labels = torch.tensor(labels)
labels = 1 - labels  # 0과 1을 반대로 매핑

# Step 3: 데이터셋 분할 - 트레인 데이터셋과 테스트 데이터셋
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# 토큰화 및 인코딩
train_encoded_inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
test_encoded_inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")

# Step 4: 모델 학습 - Hugging Face Hub에서 모델 가져오기
model = AutoModelForSequenceClassification.from_pretrained("textattack/bert-base-uncased-rotten_tomatoes")

# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-1,
    per_device_train_batch_size=512,  # 배치 크기를 32로 조정
    per_device_eval_batch_size=512,  # 배치 크기를 32로 조정
    num_train_epochs=1,  # 학습 에폭 수를 1로 줄임
    weight_decay=0.1,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Dataset.from_dict({
        'input_ids': train_encoded_inputs['input_ids'],
        'attention_mask': train_encoded_inputs['attention_mask'],
        'labels': train_labels,
    }),
    eval_dataset=Dataset.from_dict({
        'input_ids': test_encoded_inputs['input_ids'],
        'attention_mask': test_encoded_inputs['attention_mask'],
        'labels': test_labels,
    }),
)

# 모델 학습
trainer.train()

# Step 5: 최종 결과 측정 - 정확도로 측정
eval_result = trainer.evaluate(eval_dataset=torch.utils.data.TensorDataset(test_encoded_inputs['input_ids'], test_encoded_inputs['attention_mask'], test_labels))
accuracy = eval_result['eval_accuracy']
print("Accuracy:", accuracy)

Found cached dataset csv (C:/Users/User/.cache/huggingface/datasets/csv/default-6304ef94639b87b5/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 17432
  Num Epochs = 1
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 35
  Number of trainable parameters = 109483778


Epoch,Training Loss,Validation Loss


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

# 1. 데이터 전처리
class EthicsDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        self.data = pd.read_csv(data_path)
        self.labels = self.data['label'].values
        self.texts = self.data['scenario'].values
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label)
        }

# 2. 데이터셋 로드 및 전처리
data_path = '/Users/minsoo/Downloads/ethics/justice/justice_train.csv'
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
dataset = EthicsDataset(data_path, tokenizer)
train_dataset, eval_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

# 3. 모델 학습
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=1e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)

model.to(device)

for epoch in range(3):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    average_loss = total_loss / len(train_loader)
    
    print(f"Epoch {epoch+1}:")
    print(f"  Training Loss: {average_loss}")
    
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            
            total += labels.size(0)
            correct += (predictions == labels).sum().item()
    
    accuracy = correct / total
    print(f"  Evaluation Accuracy: {accuracy}")

# 4. 최종 결과
print(f"Final Evaluation Accuracy: {accuracy}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch 1:
  Training Loss: 0.5252090042063949
  Evaluation Accuracy: 0.7967423721036935
Epoch 2:
  Training Loss: 0.34706535946338546
  Evaluation Accuracy: 0.8095893553567332
Epoch 3:
  Training Loss: 0.23324432597760486
  Evaluation Accuracy: 0.8226657490250058
Final Evaluation Accuracy: 0.8226657490250058
