In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip3 install torch



In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

In [4]:
config = {
    "learning_rate": 2e-5,
    "epoch": 10,
    "batch_size": 32,
}

CFG = SimpleNamespace(**config)

In [5]:
RandomState=110

train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/GBT해커톤/Data/train_df_1007.csv")
test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/GBT해커톤/Data/test_df_1007.csv")
sample_submission = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/GBT해커톤/Data/sample_submission.csv")

In [6]:
print(len(test_df))
print(len(sample_submission))

23405
23405


In [7]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = ElectraTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=len(train_df['분류'].unique())).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizer'.


pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=300): # max_length 300으로 지정
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
train_df.drop(columns=['제목'], inplace=True)
test_df.drop(columns=['제목'], inplace=True)

In [12]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=RandomState)

# 데이터셋 생성
train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [13]:
# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)



In [14]:
# 학습
model.train()
best_f1 = 0.0
patience = 3  # 성능 향상이 없을 때 기다리는 에포크 수
patience_counter = 0

for epoch in range(CFG.epoch):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    # 검증 결과 출력
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    print(f"Validation F1 Score: {val_f1:.2f}")

    # 조기 종료 체크
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0  # 성능 향상이 있었으므로 카운터 초기화
        # 모델 저장 등 추가 작업을 여기서 수행할 수 있습니다.
    else:
        patience_counter += 1

    # patience 초과 시 학습 종료
    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

Epoch 1/10: 100%|██████████| 1358/1358 [18:53<00:00,  1.20it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.28


Epoch 2/10: 100%|██████████| 1358/1358 [18:08<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.45


Epoch 3/10: 100%|██████████| 1358/1358 [18:09<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.53


Epoch 4/10: 100%|██████████| 1358/1358 [18:08<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.55


Epoch 5/10: 100%|██████████| 1358/1358 [18:09<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.61


Epoch 6/10: 100%|██████████| 1358/1358 [18:09<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.60


Epoch 7/10: 100%|██████████| 1358/1358 [18:08<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.61


Epoch 8/10: 100%|██████████| 1358/1358 [18:09<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.62


Epoch 9/10: 100%|██████████| 1358/1358 [18:09<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]


Validation F1 Score: 0.61


Epoch 10/10: 100%|██████████| 1358/1358 [18:09<00:00,  1.25it/s]
Validating: 100%|██████████| 340/340 [01:45<00:00,  3.22it/s]

Validation F1 Score: 0.63





In [17]:
# 테스트 세트 추론
model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

Testing: 100%|██████████| 732/732 [03:47<00:00,  3.22it/s]


In [18]:
sample_submission["분류"] = decoded_predictions

sample_submission.to_csv("/content/drive/MyDrive/Colab Notebooks/GBT해커톤/submission/1007_submission_DY.csv", encoding='UTF-8-sig', index=False)