# Import

In [1]:
# Mount Google Drive at /content/drive; the CSV inputs and the submission
# output used later in this notebook are all read from / written to that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install torch

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

# Load Data

In [4]:
# Seed used for the stochastic parts of this notebook (e.g. the shuffled
# training DataLoader). Applied immediately so a fresh "Run All" is repeatable.
RandomState=110
torch.manual_seed(RandomState)

# All input CSVs live in the same Drive folder; keep that folder in one place.
DATA_DIR = "/content/drive/MyDrive/gbt해커톤/data"

# Training data, test data, and the submission template to be filled in.
train_df = pd.read_csv(f"{DATA_DIR}/train_df_1009.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test_df_1009.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [5]:
# Sanity check: print the row count of the test set and of the submission
# template, one per line, so they can be compared by eye.
for frame in (test_df, sample_submission):
    print(len(frame))

23405
23405


# Load Model

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch

# Checkpoint shared by the tokenizer and the classifier.
PRETRAINED_NAME = "beomi/KcELECTRA-base-v2022"

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_NAME)

# One output class per distinct value of the '분류' (category) column.
n_classes = len(train_df['분류'].unique())
model = ElectraForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=n_classes)
model = model.to(device)

# Custom Dataset

In [7]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of raw inputs; each item is converted with
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids aligned with ``texts``,
            or ``None`` when no labels are available (e.g. at inference time).
        tokenizer: Callable tokenizer that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping containing ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=200):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the text at position ``item``.

        Returns:
            dict with keys ``text`` (the string that was tokenized),
            ``input_ids`` and ``attention_mask`` (flattened to 1-D tensors),
            and ``labels`` (a ``torch.long`` scalar; ``-1`` when the dataset
            was built without labels).
        """
        text = str(self.texts[item])
        # -1 is a placeholder for unlabeled data so every item has the same keys.
        label = self.labels[item] if self.labels is not None else -1
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # The tokenizer output is flattened so the DataLoader can stack
            # items into (batch, max_len) -- assumes a (1, max_len) result.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [8]:
# Drop the '제목' (title) column; only the '키워드' (keywords) column is fed to the model.
# Reassign instead of mutating in place and tolerate an already-missing column,
# so re-running this cell does not raise KeyError.
train_df = train_df.drop(columns=['제목'], errors='ignore')
test_df = test_df.drop(columns=['제목'], errors='ignore')

In [9]:
# Optimizer and training hyperparameters
config = {
    "learning_rate": [2e-5, 2e-5, 2e-5, 2e-5, 1e-5, 1e-5],  # learning rate for each epoch
    "epoch": 6,
    "batch_size": 64,
}

CFG = SimpleNamespace(**config)

# The training loop looks up learning_rate[epoch] for every epoch; validate the
# schedule here so a mismatch fails immediately instead of with an IndexError
# part-way through a multi-hour training run.
if len(CFG.learning_rate) < CFG.epoch:
    raise ValueError(
        f"learning_rate has {len(CFG.learning_rate)} entries but epoch is {CFG.epoch}; "
        "provide one learning rate per epoch"
    )

# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are passed explicitly to keep the hyperparameters of the
# deprecated class (eps=1e-6, weight_decay=0.0); torch's own defaults differ.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CFG.learning_rate[0],
    eps=1e-6,
    weight_decay=0.0,
)




In [10]:
# Label encoding: give each distinct '분류' (category) value an integer id,
# numbered in the order returned by unique().
categories = train_df['분류'].unique()
label_encoder = dict(zip(categories, range(len(categories))))
train_df['label'] = train_df['분류'].map(label_encoder)

# Build the datasets from the '키워드' (keywords) column.
train_texts = train_df['키워드'].tolist()
train_labels = train_df['label'].tolist()
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
test_dataset = TextDataset(test_df['키워드'].tolist(), None, tokenizer)  # no labels

# Build the data loaders: shuffle training batches, keep test order fixed so
# predictions line up with the submission rows.
loader_kwargs = {"batch_size": CFG.batch_size}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [11]:
# Training
model.train()

for epoch in range(CFG.epoch):
    # Apply this epoch's learning rate to every parameter group.
    epoch_lr = CFG.learning_rate[epoch]
    for group in optimizer.param_groups:
        group['lr'] = epoch_lr

    total_loss = 0.0
    progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}')
    for batch in progress:
        optimizer.zero_grad()

        # Passing labels makes the model return the classification loss.
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean training loss over the batches of this epoch.
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{CFG.epoch} - Training Loss: {avg_train_loss:.4f}")

Epoch 1/6: 100%|██████████| 849/849 [30:32<00:00,  2.16s/it]


Epoch 1/6 - Training Loss: 1.6360


Epoch 2/6: 100%|██████████| 849/849 [30:36<00:00,  2.16s/it]


Epoch 2/6 - Training Loss: 0.9009


Epoch 3/6: 100%|██████████| 849/849 [30:30<00:00,  2.16s/it]


Epoch 3/6 - Training Loss: 0.6955


Epoch 4/6: 100%|██████████| 849/849 [30:36<00:00,  2.16s/it]


Epoch 4/6 - Training Loss: 0.5640


Epoch 5/6: 100%|██████████| 849/849 [30:51<00:00,  2.18s/it]


Epoch 5/6 - Training Loss: 0.4199


Epoch 6/6: 100%|██████████| 849/849 [30:43<00:00,  2.17s/it]

Epoch 6/6 - Training Loss: 0.3566





Public leaderboard score: 0.5812718413

# Inference

In [12]:
# Test-set inference
model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # Predicted class id = index of the largest logit along dim 1.
        batch_preds = logits.argmax(dim=1)
        test_predictions += batch_preds.cpu().tolist()

# Label decoding: invert the encoder to map integer ids back to category names.
label_decoder = {idx: name for name, idx in label_encoder.items()}
decoded_predictions = []
for pred in test_predictions:
    decoded_predictions.append(label_decoder[pred])

Testing: 100%|██████████| 366/366 [05:05<00:00,  1.20it/s]


# Submission

In [13]:
# Fill the submission template with the decoded category names and save it to
# Drive (UTF-8 with BOM, no index column).
submission_path = "/content/drive/MyDrive/gbt해커톤/submission/1009_ver2_2차시도_submission.csv"

sample_submission["분류"] = decoded_predictions
sample_submission.to_csv(submission_path, index=False, encoding='UTF-8-sig')

In [14]:
# Show how many test rows were assigned to each predicted category, as a quick
# check on how skewed the predictions are.
sample_submission['분류'].value_counts()

Unnamed: 0_level_0,count
분류,Unnamed: 1_level_1
지역,12188
경제:부동산,1307
사회:사건_사고,1115
경제:반도체,973
사회:사회일반,541
사회:의료_건강,441
사회:교육_시험,413
경제:취업_창업,399
정치:국회_정당,394
스포츠:올림픽_아시안게임,338
