# **KcBert-base**

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd

### 데이터 로드

In [5]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/seojin/data/train_df_1012.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/seojin/data/test_df_1012.csv')

### 모델링

- epoch: 7
- learning rate: 2e-5
- batch size: 64
- max length: 256

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace
from transformers import get_linear_schedule_with_warmup

In [7]:
config = {
    "learning_rate": 2e-5,
    "epoch": 7,
    "batch_size": 64
}

CFG = SimpleNamespace(**config)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base')
model = BertForSequenceClassification.from_pretrained('beomi/kcbert-base', num_labels=len(train_df['분류'].unique())).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [10]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
# train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=42)

# # 데이터셋 생성
# train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
# val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
# test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# # 데이터 로더 생성
# train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [11]:
# 5-fold 교차 검증 설정
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)
best_f1 = 0.0  # 최상의 F1 스코어 초기화

In [None]:
# 옵티마이저 및 학습 파라미터 설정
# optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate)
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * CFG.epoch)

In [12]:
# k-fold 루프
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    print(f'Fold {fold + 1}/{n_splits}')

    # Fold에 맞는 데이터 분할
    train_subset = train_df.iloc[train_idx]
    val_subset = train_df.iloc[val_idx]

    # 데이터셋 생성
    train_dataset = TextDataset(train_subset.키워드.tolist(), train_subset.label.tolist(), tokenizer)
    val_dataset = TextDataset(val_subset.키워드.tolist(), val_subset.label.tolist(), tokenizer)

    # 데이터 로더 생성
    train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)

    # 모델 및 옵티마이저 초기화
    model = BertForSequenceClassification.from_pretrained('beomi/kcbert-base', num_labels=len(train_df['분류'].unique())).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * CFG.epoch)

    counter = 0  # Early stopping counter

    # 에폭 루프
    for epoch in range(CFG.epoch):
        model.train()

        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}, Fold {fold + 1}'):
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Validation
        model.eval()
        val_predictions = []
        val_true_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f'Validating Fold {fold + 1}'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                _, preds = torch.max(outputs.logits, dim=1)
                val_predictions.extend(preds.cpu().tolist())
                val_true_labels.extend(labels.cpu().tolist())

        # F1 스코어 계산
        current_f1 = f1_score(val_true_labels, val_predictions, average='macro')

        if current_f1 > best_f1:
            best_f1 = current_f1
            torch.save(model.state_dict(), f'model_best_f1_fold{fold + 1}.pth')
            print(f"Model saved for Fold {fold + 1} with F1 Score: {best_f1:.4f}")
        else:
            counter += 1

        if counter >= 2:  # patience 적용
            print(f"Early stopping for Fold {fold + 1}")
            break

print(f'Best F1 score across all folds: {best_f1:.4f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1/5


Epoch 1/7, Fold 1: 100%|██████████| 679/679 [06:10<00:00,  1.83it/s]
Validating Fold 1: 100%|██████████| 170/170 [00:53<00:00,  3.21it/s]


Model saved for Fold 1 with F1 Score: 0.4068


Epoch 2/7, Fold 1: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 1: 100%|██████████| 170/170 [00:53<00:00,  3.20it/s]


Model saved for Fold 1 with F1 Score: 0.4835


Epoch 3/7, Fold 1: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 1: 100%|██████████| 170/170 [00:53<00:00,  3.19it/s]


Model saved for Fold 1 with F1 Score: 0.5490


Epoch 4/7, Fold 1: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 1: 100%|██████████| 170/170 [00:53<00:00,  3.20it/s]


Model saved for Fold 1 with F1 Score: 0.5746


Epoch 5/7, Fold 1: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 1: 100%|██████████| 170/170 [00:53<00:00,  3.20it/s]


Model saved for Fold 1 with F1 Score: 0.5878


Epoch 6/7, Fold 1: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 1: 100%|██████████| 170/170 [00:53<00:00,  3.21it/s]


Model saved for Fold 1 with F1 Score: 0.5957


Epoch 7/7, Fold 1: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 1: 100%|██████████| 170/170 [00:53<00:00,  3.20it/s]


Model saved for Fold 1 with F1 Score: 0.5978
Fold 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/7, Fold 2: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 2: 100%|██████████| 170/170 [00:52<00:00,  3.21it/s]
Epoch 2/7, Fold 2: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 2: 100%|██████████| 170/170 [00:53<00:00,  3.21it/s]


Early stopping for Fold 2
Fold 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/7, Fold 3: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 3: 100%|██████████| 170/170 [00:52<00:00,  3.21it/s]
Epoch 2/7, Fold 3: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 3: 100%|██████████| 170/170 [00:52<00:00,  3.22it/s]


Early stopping for Fold 3
Fold 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/7, Fold 4: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 4: 100%|██████████| 170/170 [00:52<00:00,  3.22it/s]
Epoch 2/7, Fold 4: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 4: 100%|██████████| 170/170 [00:52<00:00,  3.21it/s]


Early stopping for Fold 4
Fold 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/7, Fold 5: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 5: 100%|██████████| 170/170 [00:53<00:00,  3.19it/s]
Epoch 2/7, Fold 5: 100%|██████████| 679/679 [06:08<00:00,  1.84it/s]
Validating Fold 5: 100%|██████████| 170/170 [00:52<00:00,  3.22it/s]

Early stopping for Fold 5
Best F1 score across all folds: 0.5978





In [13]:
# 테스트 세트 추론
model.eval()
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)
test_predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩 및 제출 파일 생성
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

Testing: 100%|██████████| 366/366 [01:53<00:00,  3.22it/s]


In [1]:
# 초기화
# patience = 2  # 개선되지 않을 경우 기다리는 에폭 수
# best_f1 = 0.0  # 최상의 F1 스코어 초기화
# counter = 0  # 카운터 초기화

# for epoch in range(CFG.epoch):
#     model.train()

#     # 학습 단계
#     for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
#         optimizer.zero_grad()

#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#     # Validation
#     model.eval()
#     val_predictions = []
#     val_true_labels = []

#     with torch.no_grad():
#         for batch in tqdm(val_loader, desc='Validating'):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)

#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             _, preds = torch.max(outputs.logits, dim=1)
#             val_predictions.extend(preds.cpu().tolist())
#             val_true_labels.extend(labels.cpu().tolist())

#     # F1 스코어 계산
#     current_f1 = f1_score(val_true_labels, val_predictions, average='macro')

#     # Early stopping 체크
#     if current_f1 > best_f1:
#         best_f1 = current_f1  # 최상의 F1 스코어 갱신
#         counter = 0  # 카운터 초기화
#         torch.save(model.state_dict(), f'model_best_f1.pth')  # 모델 저장
#         print(f"Model saved with F1 Score: {best_f1:.4f}")
#     else:
#         counter += 1  # 카운터 증가

#     print(f"Epoch {epoch + 1}, F1 Score: {current_f1:.4f}")

#     # Early stopping이 활성화되면 훈련 종료
#     if counter >= patience:
#         print("Early stopping triggered. Training stopped.")
#         break

In [2]:
# 테스트 세트 추론
# model.eval()
# test_predictions = []
# with torch.no_grad():
#     for batch in tqdm(test_loader, desc='Testing'):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask)
#         _, preds = torch.max(outputs.logits, dim=1)
#         test_predictions.extend(preds.cpu().tolist())

# # 라벨 디코딩
# label_decoder = {i: label for label, i in label_encoder.items()}
# decoded_predictions = [label_decoder[pred] for pred in test_predictions]

In [None]:
sample_submission = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/seojin/data/sample_submission.csv")
sample_submission["분류"] = decoded_predictions

sample_submission.to_csv("/content/drive/MyDrive/Colab Notebooks/seojin/baseline_kobert_1012_5fold.csv", encoding='UTF-8-sig', index=False)

In [17]:
result = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/seojin/baseline_kobert_1012_5fold.csv")
result.head()

Unnamed: 0,ID,분류
0,TEST_00000,지역
1,TEST_00001,사회:사회일반
2,TEST_00002,정치:행정_자치
3,TEST_00003,경제:취업_창업
4,TEST_00004,지역


In [18]:
result['분류'].value_counts()

Unnamed: 0_level_0,count
분류,Unnamed: 1_level_1
지역,12670
경제:부동산,1473
사회:사건_사고,1208
경제:반도체,1115
정치:국회_정당,420
사회:장애인,370
사회:사회일반,357
스포츠:올림픽_아시안게임,356
경제:취업_창업,343
경제:자동차,329
