In [None]:
# Python 3.6 설치
!sudo apt-get update -q
!sudo apt-get install python3.6
!sudo apt-get install python3.6-distutils

# Python 3.6 버전을 Colab의 기본 Python으로 설정
!update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 1
!update-alternatives --set python3 /usr/bin/python3.6

# pip 재설치
!wget https://bootstrap.pypa.io/pip/3.6/get-pip.py
!python3 get-pip.py


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,031 kB]
Get:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:13 http://archive.ubuntu.com/ubuntu jammy-backports InRelease


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

In [12]:
config = {
    "learning_rate": 2e-5,
    "epoch": 10,
    "batch_size": 32
}

CFG = SimpleNamespace(**config)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# 데이터 로드

train_df = pd.read_csv('/content/drive/MyDrive/DAT/train_df_1012.csv')
test_df = pd.read_csv('/content/drive/MyDrive/DAT/test_df_1012.csv')

In [7]:
# 모델 로드
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('jinmang2/kpfbert')
model = BertForSequenceClassification.from_pretrained('jinmang2/kpfbert', num_labels=len(train_df['분류'].unique())).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/276k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/454M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jinmang2/kpfbert and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [9]:
train_df.drop(columns=['제목'], inplace=True)
test_df.drop(columns=['제목'], inplace=True)

In [10]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=123)

# 데이터셋 생성
train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [None]:
'''
# 레이블 인코더 매핑 및 검증
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 매핑 결과 확인
print("Label Encoder Mapping:")
print(label_encoder)

# 학습 및 검증 데이터셋에 포함된 고유한 레이블 확인
train_unique_labels = train_df['label'].unique()
val_unique_labels = val_df['label'].unique()

print("\nUnique labels in training set:", train_unique_labels)
print("Unique labels in validation set:", val_unique_labels)

# 레이블이 올바르게 매핑되었는지 확인
missing_train_labels = [label for label in train_unique_labels if label not in label_encoder.values()]
missing_val_labels = [label for label in val_unique_labels if label not in label_encoder.values()]

print("\nMissing labels in training set:", missing_train_labels)
print("Missing labels in validation set:", missing_val_labels)
'''

Label Encoder Mapping:
{'지역': 0, '사회:사건_사고': 1, '사회:여성': 2, '문화:전시_공연': 3, '경제:부동산': 4, '사회:장애인': 5, '문화:미술_건축': 6, '스포츠:올림픽_아시안게임': 7, 'IT_과학:콘텐츠': 8, '경제:무역': 9, '경제:유통': 10, '사회:의료_건강': 11, '정치:국회_정당': 12, '경제:반도체': 13, '사회:교육_시험': 14, '사회:미디어': 15, '경제:산업_기업': 16, 'IT_과학:모바일': 17, '문화:출판': 18, '경제:경제일반': 19, '정치:청와대': 20, '문화:문화일반': 21, '사회:사회일반': 22, '경제:취업_창업': 23, '문화:학술_문화재': 24, '국제': 25, '스포츠:골프': 26, '정치:외교': 27, '스포츠:축구': 28, '사회:노동_복지': 29, '정치:선거': 30, '경제:서비스_쇼핑': 31, 'IT_과학:IT_과학일반': 32, '문화:종교': 33, 'IT_과학:보안': 34, '경제:자동차': 35, '경제:금융_재테크': 36, '문화:방송_연예': 37, '사회:환경': 38, '경제:자원': 39, '문화:요리_여행': 40, '사회:날씨': 41, '정치:정치일반': 42, '스포츠:농구_배구': 43, '정치:행정_자치': 44, '문화:생활': 45, 'IT_과학:인터넷_SNS': 46, '스포츠:월드컵': 47, 'IT_과학:과학': 48, '정치:북한': 49, '문화:음악': 50, '스포츠:스포츠일반': 51, '문화:영화': 52, '경제:증권_증시': 53, '경제:외환': 54, '스포츠:야구': 55}

Unique labels in training set: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 

In [11]:
# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)



In [None]:
from sklearn.metrics import f1_score
import torch

# Early Stopping 설정
patience = 2  # 성능이 향상되지 않는 에포크 수
best_score = 0
early_stop_counter = 0

# 학습
model.train()
for epoch in range(CFG.epoch):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    # 검증 결과 출력 및 Early Stopping 체크
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    print(f"Validation F1 Score: {val_f1:.2f}")

    # Early Stopping 조건
    if val_f1 > best_score:
        best_score = val_f1
        early_stop_counter = 0  # Reset counter when improvement is seen
        print("New best score achieved.")
    else:
        early_stop_counter += 1
        print(f"No improvement. Early stopping counter: {early_stop_counter}/{patience}")

        if early_stop_counter >= patience:
            print("Early stopping triggered. Training stopped.")
            break

# 최종 모델 저장
torch.save(model, '/content/drive/MyDrive/DAT/final_model4.pt')
print("Model saved after training completion.")

Epoch 1/10: 100%|██████████| 1358/1358 [30:21<00:00,  1.34s/it]
Validating: 100%|██████████| 340/340 [02:44<00:00,  2.07it/s]


Validation F1 Score: 0.47
New best score achieved.


Epoch 2/10: 100%|██████████| 1358/1358 [29:42<00:00,  1.31s/it]
Validating: 100%|██████████| 340/340 [02:44<00:00,  2.07it/s]


Validation F1 Score: 0.54
New best score achieved.


Epoch 3/10: 100%|██████████| 1358/1358 [29:43<00:00,  1.31s/it]
Validating: 100%|██████████| 340/340 [02:43<00:00,  2.08it/s]


Validation F1 Score: 0.58
New best score achieved.


Epoch 4/10: 100%|██████████| 1358/1358 [29:42<00:00,  1.31s/it]
Validating: 100%|██████████| 340/340 [02:44<00:00,  2.07it/s]


Validation F1 Score: 0.57
No improvement. Early stopping counter: 1/2


Epoch 5/10: 100%|██████████| 1358/1358 [29:42<00:00,  1.31s/it]
Validating: 100%|██████████| 340/340 [02:43<00:00,  2.08it/s]


Validation F1 Score: 0.61
New best score achieved.


Epoch 6/10: 100%|██████████| 1358/1358 [29:40<00:00,  1.31s/it]
Validating: 100%|██████████| 340/340 [02:43<00:00,  2.07it/s]


Validation F1 Score: 0.61
New best score achieved.


Epoch 7/10: 100%|██████████| 1358/1358 [29:42<00:00,  1.31s/it]
Validating: 100%|██████████| 340/340 [02:43<00:00,  2.08it/s]


Validation F1 Score: 0.62
New best score achieved.


Epoch 8/10:   3%|▎         | 43/1358 [00:56<28:34,  1.30s/it]

In [None]:
# 테스트 세트 추론
model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

In [None]:
sample_submission = pd.read_csv("/content/drive/MyDrive/DAT/sample_submission.csv")
sample_submission["분류"] = decoded_predictions

sample_submission.to_csv("/content/drive/MyDrive/DAT/yk_submission5.csv", encoding='UTF-8-sig', index=False)

In [None]:
import pandas as pd
result = pd.read_csv("/content/drive/MyDrive/DAT/yk_submission5.csv")
result

In [None]:
result['분류'].value_counts()