In [3]:
pip install transformers

Collecting transformers
  Using cached transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting safetensors>=0.3.1
  Downloading safetensors-0.3.1-cp38-cp38-win_amd64.whl (263 kB)
     ---------------------------------------- 0.0/263.9 kB ? eta -:--:--
     ------------------------------------ - 256.0/263.9 kB 7.9 MB/s eta 0:00:01
     -------------------------------------- 263.9/263.9 kB 5.4 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp38-cp38-win_amd64.whl (3.5 MB)
     ---------------------------------------- 0.0/3.5 MB ? eta -:--:--
     - -------------------------------------- 0.1/3.5 MB 4.2 MB/s eta 0:00:01
     ----- ---------------------------------- 0.5/3.5 MB 5.0 MB/s eta 0:00:01
     ---------- ----------------------------- 0.9/3.5 MB 6.1 MB/s eta 0:00:01
     -------------- ------------------------- 1.2/3.5 MB 7.1 MB/s eta 0:00:01
     ------------------ --------------------- 1.6/3.5 MB 7.4 MB/s eta 0:00:01
     -----------

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

# Load the training data; the dataset class below reads its 'facts' and
# 'first_party_winner' columns.
df = pd.read_csv('train1.csv')

# Load the BERT tokenizer that matches the 'bert-base-uncased' checkpoint
# used for the classifier further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes the ``facts`` column and pairs it with its label.

    Args:
        dataframe: pandas DataFrame with ``facts`` and ``first_party_winner`` columns.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: length every sequence is padded / truncated to.

    Each item is a dict with:
        ``input_ids``, ``attention_mask``: the tokenizer's tensors with the
            leading batch axis removed.
        ``label``: 0-dim ``torch.long`` tensor built from ``first_party_winner``.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded sample at *position* ``index``.

        Raises:
            IndexError: if ``index`` is outside the frame's row range.
            ValueError: if the label cannot be converted to an integer
                (for example a missing value).
        """
        # Samplers hand out positions 0..len-1, so look rows up by position.
        # Label-based ``.loc`` only matches positions for a default RangeIndex
        # and breaks on filtered / split / re-indexed frames.
        facts = str(self.data['facts'].iloc[index])
        first_party_winner = self.data['first_party_winner'].iloc[index]

        # Convert the text to BERT input format
        encoding = self.tokenizer.encode_plus(
            facts,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the batch axis added by return_tensors='pt'; a bare
        # squeeze() would also remove any other size-1 axis.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            # Explicit integer class id so batches collate to a long tensor
            # regardless of how pandas parsed the column.
            'label': torch.tensor(int(first_party_winner), dtype=torch.long)
        }

# Hyperparameters
max_length = 128
batch_size = 16
num_epochs = 5
learning_rate = 1e-5
random_seed = 42

# Seed PyTorch before anything random happens in this cell: the classification
# head is newly initialised when the model is loaded and the DataLoader shuffles,
# so without a seed every run trains from a different start in a different order.
torch.manual_seed(random_seed)

# Build the dataset and data loader
dataset = CustomDataset(df, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load the BERT classification model (two output classes)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Train the model on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    running_loss = 0.0
    model.train()

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    epoch_loss = running_loss / len(dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}')

# Save the trained model and its tokenizer side by side so both can be
# reloaded from the same directory.
model.save_pretrained('bert_classifier_model')
tokenizer.save_pretrained('bert_classifier_model')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [18]:
from transformers import BertTokenizer

# Load the stock 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Save the tokenizer object to files under 'tokenizer_directory'
tokenizer.save_pretrained('tokenizer_directory')

('tokenizer_directory\\tokenizer_config.json',
 'tokenizer_directory\\special_tokens_map.json',
 'tokenizer_directory\\vocab.txt',
 'tokenizer_directory\\added_tokens.json')

In [19]:
from transformers import BertTokenizer

# Reload the tokenizer from the local 'tokenizer_directory' copy; this rebinds
# the notebook-level `tokenizer` name.
tokenizer = BertTokenizer.from_pretrained('tokenizer_directory')

In [20]:
from transformers import BertForSequenceClassification

# NOTE: this loads the 'bert-base-uncased' checkpoint directly, so what is saved
# to 'model_directory' is NOT the classifier trained earlier in this notebook
# (that one is written to 'bert_classifier_model'). It also rebinds `model`.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Save the model object to files under 'model_directory'
model.save_pretrained('model_directory')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [21]:
# Reload the model from 'model_directory'; this rebinds the notebook-level
# `model` name to the reloaded object.
model = BertForSequenceClassification.from_pretrained('model_directory')

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

# Load the test data
df_test = pd.read_csv('test1.csv')  # change this to match your test data file name

# Load the tokenizer from 'bert_classifier_model', the directory the training
# cell saved it to.
tokenizer = BertTokenizer.from_pretrained('bert_classifier_model')

# Dataset class (inference: no labels)
class CustomDataset(Dataset):
    """Dataset that tokenizes the ``facts`` column of an unlabeled DataFrame.

    Args:
        dataframe: pandas DataFrame with a ``facts`` column.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: length every sequence is padded / truncated to.

    Each item is a dict with ``input_ids`` and ``attention_mask``: the
    tokenizer's tensors with the leading batch axis removed.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded sample at *position* ``index``.

        Raises:
            IndexError: if ``index`` is outside the frame's row range.
        """
        # Samplers hand out positions 0..len-1, so look the row up by position.
        # Label-based ``.loc`` only matches positions for a default RangeIndex.
        facts = str(self.data['facts'].iloc[index])

        # Convert the text to BERT input format
        encoding = self.tokenizer.encode_plus(
            facts,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the batch axis added by return_tensors='pt'; a bare
        # squeeze() would also remove any other size-1 axis.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

# Inference settings
max_length = 128
batch_size = 16

# Wrap the test frame in a dataset and iterate it in row order (no shuffling),
# so the collected predictions line up with the rows of df_test.
dataset_test = CustomDataset(df_test, tokenizer, max_length)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

# Load the classifier from the directory the training cell saved it to
model = BertForSequenceClassification.from_pretrained('bert_classifier_model', num_labels=2)

# Pick the device, move the model onto it and switch to evaluation mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

predictions = []

# Run the test batches through the model with gradient tracking disabled
with torch.no_grad():
    for batch in dataloader_test:
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        probabilities = torch.softmax(outputs.logits, dim=1)
        # The most probable class index per row is the predicted label
        predictions += torch.argmax(probabilities, dim=1).tolist()

# Attach the predictions to the test frame and print the columns of interest
output_columns = ['ID', 'first_party', 'second_party', 'facts', 'predicted_first_party_winner']
df_test['predicted_first_party_winner'] = predictions
print(df_test[output_columns])