# **KcBert**

In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 데이터 로드

In [8]:
train_df = pd.read_csv('/content/drive/MyDrive/GBT 해커톤/data/train_df_1007.csv')
test_df = pd.read_csv('/content/drive/MyDrive/GBT 해커톤/data/test_df_1007.csv')

In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54314 entries, 0 to 54313
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      54314 non-null  object
 1   분류      54314 non-null  object
 2   제목      54314 non-null  object
 3   키워드     54314 non-null  object
dtypes: object(4)
memory usage: 1.7+ MB


In [10]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23405 entries, 0 to 23404
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      23405 non-null  object
 1   제목      23405 non-null  object
 2   키워드     23405 non-null  object
dtypes: object(3)
memory usage: 548.7+ KB


### 모델링

- epoch: 5
- learning rate: 1e-5
- batch size: 32
- max length: 256

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

In [18]:
config = {
    "learning_rate": 1e-5,
    "epoch": 5,
    "batch_size": 32
}

CFG = SimpleNamespace(**config)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('beomi/KcBERT-base')
model = BertForSequenceClassification.from_pretrained('beomi/KcBERT-base', num_labels=len(train_df['분류'].unique())).to(device)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/KcBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [21]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=42)

# 데이터셋 생성
train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [23]:
train_dataset[0]

{'text': '용인시 하수시설 기흥레스피아 하수 처리 시설 증설 특례시 특례시 기흥구 하갈동 하수 처리 시설 기흥레스피아 증설 사업비 투입 특례 기흥구 일대 각종 하수 처리 시설 증설 판단 하수처리시설 기흥레스피아 하루 처리 용량 사업비 설비 증설 기흥레스피아 하수 처리 구역 기흥구 일대 상갈 공공 민간 임대 주택 건설 사업 민간임대주택사업 상갈 공공 민간 임대 주택 사업 상갈동 일원 가구 민간 임대 주택 기흥 역세권 인근 가구 주택 건설 사업 각종 원활 하수처리 시설 증설',
 'input_ids': tensor([    2, 22241,  4039, 12358, 11714,   414,  4879, 10770, 19773, 12358,
          9810, 15277,  2680,  4461,  3210,  4219,  4039,  3210,  4219,  4039,
           414,  4879,  4228,  3352,  4320,  4163, 12358,  9810, 15277,   414,
          4879, 10770, 19773,  2680,  4461,  9569,  4167, 13333,  3210,  4219,
           414,  4879,  4228, 27542, 11122, 12358,  9810, 15277,  2680,  4461,
          8885, 12358, 11385, 11714,   414,  4879, 10770, 19773,  8525,  9810,
          2355,  4667,  9569,  4167,  1849,  4167,  2680,  4461,   414,  4879,
         10770, 19773, 12358,  9810, 11581,   414,  4879,  4228, 27542,  1801,
          4320, 11172, 11217, 10862, 13546, 11793,  9569, 11217, 27026, 10597,
    

In [24]:
# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)



In [25]:
# 학습
model.train()
for epoch in range(CFG.epoch):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    val_loss = 0.0  # 손실을 계산할 변수 추가
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)  # labels 추가
            val_loss += outputs.loss.item()  # 손실을 누적

            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    # 검증 결과 출력
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    val_loss /= len(val_loader)  # 평균 손실 계산
    print(f"Validation F1 Score: {val_f1:.2f}, Validation Loss: {val_loss:.4f}")

Epoch 1/5: 100%|██████████| 1087/1087 [25:32<00:00,  1.41s/it]
Validating: 100%|██████████| 272/272 [02:22<00:00,  1.90it/s]


Validation F1 Score: 0.36, Validation Loss: 0.9628


Epoch 2/5: 100%|██████████| 1087/1087 [25:08<00:00,  1.39s/it]
Validating: 100%|██████████| 272/272 [02:23<00:00,  1.90it/s]


Validation F1 Score: 0.49, Validation Loss: 0.8072


Epoch 3/5: 100%|██████████| 1087/1087 [25:06<00:00,  1.39s/it]
Validating: 100%|██████████| 272/272 [02:23<00:00,  1.90it/s]


Validation F1 Score: 0.54, Validation Loss: 0.7456


Epoch 4/5: 100%|██████████| 1087/1087 [25:07<00:00,  1.39s/it]
Validating: 100%|██████████| 272/272 [02:23<00:00,  1.90it/s]


Validation F1 Score: 0.58, Validation Loss: 0.7664


Epoch 5/5: 100%|██████████| 1087/1087 [25:08<00:00,  1.39s/it]
Validating: 100%|██████████| 272/272 [02:23<00:00,  1.90it/s]

Validation F1 Score: 0.59, Validation Loss: 0.8125





In [26]:
# 테스트 세트 추론
model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

Testing: 100%|██████████| 732/732 [06:37<00:00,  1.84it/s]


In [27]:
sample_submission = pd.read_csv("/content/drive/MyDrive/GBT 해커톤/data/sample_submission.csv")
sample_submission["분류"] = decoded_predictions

sample_submission.to_csv("/content/drive/MyDrive/GBT 해커톤/data/submission_KcBert_1007.csv", encoding='UTF-8-sig', index=False)

In [28]:
result = pd.read_csv("/content/drive/MyDrive/GBT 해커톤/data/submission_KcBert_1007.csv")
result.head()

Unnamed: 0,ID,분류
0,TEST_00000,지역
1,TEST_00001,사회:사회일반
2,TEST_00002,정치:행정_자치
3,TEST_00003,경제:취업_창업
4,TEST_00004,지역


In [29]:
result['분류'].value_counts()

Unnamed: 0_level_0,count
분류,Unnamed: 1_level_1
지역,11969
경제:부동산,1377
사회:사건_사고,1204
경제:반도체,1141
사회:사회일반,627
정치:국회_정당,514
스포츠:올림픽_아시안게임,386
사회:교육_시험,382
사회:의료_건강,361
경제:취업_창업,337
