# Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install torch



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

# Hyperparameter

In [None]:
config = {
    "learning_rate": 2e-5,
    "epoch": 8,
    "batch_size": 32,
}

CFG = SimpleNamespace(**config)

# Load Data

In [None]:
RandomState=110

train_df = pd.read_csv("/content/drive/MyDrive/gbt해커톤/data/train_df_1012.csv")
test_df = pd.read_csv("/content/drive/MyDrive/gbt해커톤//data/test_df_1012.csv")
sample_submission = pd.read_csv("/content/drive/MyDrive/gbt해커톤/data/sample_submission.csv")

In [None]:
print(len(test_df))
print(len(sample_submission))

23405
23405


# Load Model

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = ElectraTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=len(train_df['분류'].unique())).to(device)

# Custom Dataset

In [None]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=200):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
train_df.drop(columns=['제목'], inplace=True)
test_df.drop(columns=['제목'], inplace=True)

In [None]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=RandomState)

# 데이터셋 생성
train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [None]:
# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)



In [12]:
# 학습
model.train()
best_f1 = 0.0
patience = 2  # 성능 향상이 없을 때 기다리는 에포크 수
patience_counter = 0

for epoch in range(CFG.epoch):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    # 검증 결과 출력
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    print(f"Validation F1 Score: {val_f1:.2f}")

    # 조기 종료 체크
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0  # 성능 향상이 있었으므로 카운터 초기화
        # 모델 저장 등 추가 작업을 여기서 수행할 수 있습니다.
    else:
        patience_counter += 1

    # patience 초과 시 학습 종료
    if patience_counter >= patience:
        print("Early stopping triggered.")
        break


Epoch 1/8: 100%|██████████| 1358/1358 [24:13<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:13<00:00,  2.54it/s]


Validation F1 Score: 0.30


Epoch 2/8: 100%|██████████| 1358/1358 [23:50<00:00,  1.05s/it]
Validating: 100%|██████████| 340/340 [02:13<00:00,  2.54it/s]


Validation F1 Score: 0.45


Epoch 3/8: 100%|██████████| 1358/1358 [23:49<00:00,  1.05s/it]
Validating: 100%|██████████| 340/340 [02:13<00:00,  2.54it/s]


Validation F1 Score: 0.51


Epoch 4/8: 100%|██████████| 1358/1358 [23:50<00:00,  1.05s/it]
Validating: 100%|██████████| 340/340 [02:13<00:00,  2.54it/s]


Validation F1 Score: 0.55


Epoch 5/8: 100%|██████████| 1358/1358 [23:50<00:00,  1.05s/it]
Validating: 100%|██████████| 340/340 [02:13<00:00,  2.54it/s]


Validation F1 Score: 0.57


Epoch 6/8: 100%|██████████| 1358/1358 [23:49<00:00,  1.05s/it]
Validating: 100%|██████████| 340/340 [02:14<00:00,  2.53it/s]


Validation F1 Score: 0.61


Epoch 7/8: 100%|██████████| 1358/1358 [23:51<00:00,  1.05s/it]
Validating: 100%|██████████| 340/340 [02:13<00:00,  2.54it/s]


Validation F1 Score: 0.61


Epoch 8/8: 100%|██████████| 1358/1358 [23:52<00:00,  1.05s/it]
Validating: 100%|██████████| 340/340 [02:14<00:00,  2.53it/s]

Validation F1 Score: 0.61





# Inference

In [13]:
# 테스트 세트 추론
model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

Testing: 100%|██████████| 732/732 [04:49<00:00,  2.53it/s]


# Submission

In [None]:
sample_submission["분류"] = decoded_predictions

sample_submission.to_csv("/content/drive/MyDrive/gbt해커톤/submission/1012_submission.csv", encoding='UTF-8-sig', index=False)

In [15]:
sample_submission['분류'].value_counts()

Unnamed: 0_level_0,count
분류,Unnamed: 1_level_1
지역,12212
경제:부동산,1464
사회:사건_사고,1109
경제:반도체,857
사회:사회일반,457
정치:국회_정당,424
사회:교육_시험,416
사회:의료_건강,361
경제:취업_창업,349
스포츠:올림픽_아시안게임,307


Unnamed: 0_level_0,count
분류,Unnamed: 1_level_1
지역,12212
경제:부동산,1464
사회:사건_사고,1109
경제:반도체,857
사회:사회일반,457
정치:국회_정당,424
사회:교육_시험,416
사회:의료_건강,361
경제:취업_창업,349
스포츠:올림픽_아시안게임,307
