In [None]:
# Python 3.6 설치
!sudo apt-get update -q
!sudo apt-get install python3.6
!sudo apt-get install python3.6-distutils

# Python 3.6 버전을 Colab의 기본 Python으로 설정
!update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1
!update-alternatives --set python3 /usr/bin/python3.6

# pip 재설치
!wget https://bootstrap.pypa.io/pip/3.6/get-pip.py
!python3 get-pip.py


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,031 kB]
Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:13 http://archive.ubuntu.com/ubuntu jammy-backports InRelease


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

In [3]:
config = {
    "learning_rate": 2e-5,
    "epoch": 8,
    "batch_size": 32
}

CFG = SimpleNamespace(**config)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# 데이터 로드

train_df = pd.read_csv('/content/drive/MyDrive/DAT/train_df_1012.csv')
test_df = pd.read_csv('/content/drive/MyDrive/DAT/test_df_1012.csv')

In [5]:
# 모델 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('jinmang2/kpfbert')
model = BertForSequenceClassification.from_pretrained('jinmang2/kpfbert', num_labels=len(train_df['분류'].unique())).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/276k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/454M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jinmang2/kpfbert and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=200):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [7]:
train_df.drop(columns=['제목'], inplace=True)
test_df.drop(columns=['제목'], inplace=True)

In [8]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=123)

# 데이터셋 생성
train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [9]:
'''
# 레이블 인코더 매핑 및 검증
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 매핑 결과 확인
print("Label Encoder Mapping:")
print(label_encoder)

# 학습 및 검증 데이터셋에 포함된 고유한 레이블 확인
train_unique_labels = train_df['label'].unique()
val_unique_labels = val_df['label'].unique()

print("\nUnique labels in training set:", train_unique_labels)
print("Unique labels in validation set:", val_unique_labels)

# 레이블이 올바르게 매핑되었는지 확인
missing_train_labels = [label for label in train_unique_labels if label not in label_encoder.values()]
missing_val_labels = [label for label in val_unique_labels if label not in label_encoder.values()]

print("\nMissing labels in training set:", missing_train_labels)
print("Missing labels in validation set:", missing_val_labels)
'''

'\n# 레이블 인코더 매핑 및 검증\nlabel_encoder = {label: i for i, label in enumerate(train_df[\'분류\'].unique())}\ntrain_df[\'label\'] = train_df[\'분류\'].map(label_encoder)\n\n# 매핑 결과 확인\nprint("Label Encoder Mapping:")\nprint(label_encoder)\n\n# 학습 및 검증 데이터셋에 포함된 고유한 레이블 확인\ntrain_unique_labels = train_df[\'label\'].unique()\nval_unique_labels = val_df[\'label\'].unique()\n\nprint("\nUnique labels in training set:", train_unique_labels)\nprint("Unique labels in validation set:", val_unique_labels)\n\n# 레이블이 올바르게 매핑되었는지 확인\nmissing_train_labels = [label for label in train_unique_labels if label not in label_encoder.values()]\nmissing_val_labels = [label for label in val_unique_labels if label not in label_encoder.values()]\n\nprint("\nMissing labels in training set:", missing_train_labels)\nprint("Missing labels in validation set:", missing_val_labels)\n'

In [9]:
# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)



In [11]:
from sklearn.metrics import f1_score
import torch

# Early Stopping 설정
patience = 2  # 성능이 향상되지 않는 에포크 수
best_score = 0
early_stop_counter = 0

# 학습
model.train()
for epoch in range(CFG.epoch):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    # 검증 결과 출력 및 Early Stopping 체크
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    print(f"Validation F1 Score: {val_f1:.2f}")

    # Early Stopping 조건
    if val_f1 > best_score:
        best_score = val_f1
        early_stop_counter = 0  # Reset counter when improvement is seen
        print("New best score achieved.")
    else:
        early_stop_counter += 1
        print(f"No improvement. Early stopping counter: {early_stop_counter}/{patience}")

        if early_stop_counter >= patience:
            print("Early stopping triggered. Training stopped.")
            break

# 최종 모델 저장
torch.save(model, '/content/drive/MyDrive/DAT/final_model6.pt')
print("Model saved after training completion.")

Epoch 1/8: 100%|██████████| 1358/1358 [24:44<00:00,  1.09s/it]
Validating: 100%|██████████| 340/340 [02:17<00:00,  2.47it/s]


Validation F1 Score: 0.46
New best score achieved.


Epoch 2/8: 100%|██████████| 1358/1358 [24:23<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:17<00:00,  2.48it/s]


Validation F1 Score: 0.54
New best score achieved.


Epoch 3/8: 100%|██████████| 1358/1358 [24:23<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:17<00:00,  2.47it/s]


Validation F1 Score: 0.57
New best score achieved.


Epoch 4/8: 100%|██████████| 1358/1358 [24:24<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:17<00:00,  2.47it/s]


Validation F1 Score: 0.59
New best score achieved.


Epoch 5/8: 100%|██████████| 1358/1358 [24:23<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:17<00:00,  2.47it/s]


Validation F1 Score: 0.60
New best score achieved.


Epoch 6/8: 100%|██████████| 1358/1358 [24:24<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:17<00:00,  2.47it/s]


Validation F1 Score: 0.61
New best score achieved.


Epoch 7/8: 100%|██████████| 1358/1358 [24:24<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:17<00:00,  2.47it/s]


Validation F1 Score: 0.61
New best score achieved.


Epoch 8/8: 100%|██████████| 1358/1358 [24:26<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:18<00:00,  2.46it/s]


Validation F1 Score: 0.62
New best score achieved.
Model saved after training completion.


In [11]:
# 저장된 모델 로드
model = torch.load('/content/drive/MyDrive/DAT/final_model6.pt')
model.to(device)

  model = torch.load('/content/drive/MyDrive/DAT/final_model6.pt')


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(36440, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
# 테스트 세트 추론
model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

Testing: 100%|██████████| 732/732 [05:08<00:00,  2.37it/s]


In [13]:
sample_submission = pd.read_csv("/content/drive/MyDrive/DAT/sample_submission.csv")
sample_submission["분류"] = decoded_predictions

sample_submission.to_csv("/content/drive/MyDrive/DAT/yk_submission6.csv", encoding='UTF-8-sig', index=False)

In [14]:
import pandas as pd
result = pd.read_csv("/content/drive/MyDrive/DAT/yk_submission6.csv")
result

Unnamed: 0,ID,분류
0,TEST_00000,사회:사회일반
1,TEST_00001,사회:사회일반
2,TEST_00002,정치:행정_자치
3,TEST_00003,경제:취업_창업
4,TEST_00004,지역
...,...,...
23400,TEST_23400,지역
23401,TEST_23401,문화:요리_여행
23402,TEST_23402,지역
23403,TEST_23403,지역


In [15]:
result['분류'].value_counts()

Unnamed: 0_level_0,count
분류,Unnamed: 1_level_1
지역,11764
경제:부동산,1500
사회:사건_사고,1158
경제:반도체,937
사회:사회일반,631
정치:국회_정당,512
사회:의료_건강,444
사회:교육_시험,405
경제:경제일반,346
스포츠:올림픽_아시안게임,341
