# **KcELECTRA-base**
- 학습률 스케줄러 ver.2 이용
- max_len : 200
- (기존 방식) 키워드만 이용 (제목 컬럼은 제거)



# Import

In [1]:
# Mount Google Drive at /content/drive; the CSV inputs and the submission file
# used later in this notebook live under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# !pip3 install torch

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import get_linear_schedule_with_warmup

# Hyperparameter

In [4]:
# Training hyperparameters. `config` keeps the raw mapping, while `CFG` exposes
# the same values as attributes (CFG.learning_rate, CFG.epoch, CFG.batch_size).
config = dict(
    learning_rate=5e-5,
    epoch=10,
    batch_size=32,
)
CFG = SimpleNamespace(**config)

# Load Data

In [5]:
# Seed passed to train_test_split so the train/validation split is reproducible.
RandomState = 110

# Single root for the competition CSVs. Building every path from one constant
# keeps the three reads consistent (the test path previously contained a
# stray duplicate separator: "gbt해커톤//data").
DATA_DIR = "/content/drive/MyDrive/gbt해커톤/data"

train_df = pd.read_csv(f"{DATA_DIR}/train_df_1012.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test_df_1012.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [6]:
# Sanity check: print the row count of the test set and of the submission
# template (one number per line, test set first).
for frame in (test_df, sample_submission):
    print(len(frame))

23405
23405


# Load Model

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch

# Hub identifier shared by the tokenizer and the classifier.
PRETRAINED_NAME = "beomi/KcELECTRA-base-v2022"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_NAME)

# One output class per distinct value found in the '분류' column.
num_classes = len(train_df['분류'].unique())
model = ElectraForSequenceClassification.from_pretrained(
    PRETRAINED_NAME,
    num_labels=num_classes,
)
model = model.to(device)

# Custom Dataset

In [8]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of texts; each item is converted with
            ``str()`` before tokenization.
        labels: Indexable collection of labels aligned with ``texts``, or
            ``None`` when no labels are available.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=200):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the text, its flattened token tensors and its label tensor."""
        text = str(self.texts[item])

        # Unlabelled datasets get the placeholder label -1.
        if self.labels is None:
            label = -1
        else:
            label = self.labels[item]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'text': text}
        # flatten() turns each returned tensor into a 1-D tensor.
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

In [9]:
# Remove the '제목' column from both frames in place.
for frame in (train_df, test_df):
    frame.drop(columns=['제목'], inplace=True)

In [10]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=RandomState)

# 데이터셋 생성
train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [11]:
# Display the first training sample: a dict with 'text', 'input_ids',
# 'attention_mask' and 'labels'.
train_dataset[0]

{'text': '용인특례시 외국 주식 이해 온라인 비대 강의 모집 용인시민대학 강태희 온라인 교육 강태희 용인특례시 용인시민대학 강좌 외국주식시장 이해 선정 온라인 교육 대학 수준 양질 인문 교양 강좌 용인시민대학 실생활 전문지식 강좌 주식 전문가 외국 주식 이해 강의 구성 투자 세계 미국 주식 이해 주식 미국 기초 종목 ETF 미국 강의 정책 중국 정부 주식시장 강의 트렌드 중국 산업 종목 선정 실전 투자 주제 강의 거주 홈페이지 평생 학습관 이달 신청 선착순 선정 화상 연결 비대면 교육 외국 주식 관심 만큼 금융지식 현명 투자 강좌 해외주식 관심 강좌 건전 투자방법 저작권자',
 'input_ids': tensor([    2, 23659,  4908,  4899,  4172,  8534,  9263,  8209, 13405, 28976,
         19436, 27980, 41973,  4289, 13566, 51318,  4342, 13405,  8735, 51318,
          4342, 23659,  4908,  4899,  4172, 41973,  4289, 13566, 38220,  8534,
         21010,  8536,  8209, 19460, 13405,  8735,  8901,  8168, 41429, 37264,
         32336, 38220, 41973,  4289, 13566, 46130, 47189,  4128, 38220,  9263,
          9978,  8534,  9263,  8209, 19436, 16511,  9234,  8365,  8062,  9263,
          8209,  9263,  8062, 13255, 24504, 40712,  8062, 19436,  8305,  7979,
          7995, 23910, 19436, 37620,  7979, 11788, 24504, 19460, 38896,  9234,
         14784, 19436, 16

In [12]:
# Optimizer and learning-rate schedule.
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate)

# Linear schedule without warmup, sized to the total number of training
# batches (batches per epoch * number of epochs).
total_training_steps = len(train_loader) * CFG.epoch
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
# Early-stopping state.
patience = 2  # number of epochs to wait without improvement before stopping
best_f1 = 0.0  # best validation macro-F1 seen so far
counter = 0  # consecutive epochs without improvement

for epoch in range(CFG.epoch):
    model.train()

    # Training phase: one optimizer step and one scheduler step per batch.
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Only the logits are needed here, so the labels are not passed to
            # the model (that would compute a loss nobody reads) and they do
            # not need to be moved to the device.
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(batch['labels'].tolist())

    # Macro-averaged F1 over the whole validation set.
    current_f1 = f1_score(val_true_labels, val_predictions, average='macro')

    # Early-stopping check: keep the weights of the best epoch on disk.
    if current_f1 > best_f1:
        best_f1 = current_f1
        counter = 0
        torch.save(model.state_dict(), 'model_best_f1.pth')
        print(f"Model saved with F1 Score: {best_f1:.4f}")
    else:
        counter += 1

    print(f"Epoch {epoch + 1}, F1 Score: {current_f1:.4f}")

    # Stop training once `patience` epochs in a row failed to improve.
    if counter >= patience:
        print("Early stopping triggered. Training stopped.")
        break

Epoch 1/10: 100%|██████████| 1358/1358 [24:10<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:22<00:00,  2.38it/s]


Model saved with F1 Score: 0.3571
Epoch 1, F1 Score: 0.3571


Epoch 2/10: 100%|██████████| 1358/1358 [24:18<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:22<00:00,  2.39it/s]


Model saved with F1 Score: 0.4567
Epoch 2, F1 Score: 0.4567


Epoch 3/10: 100%|██████████| 1358/1358 [24:19<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:23<00:00,  2.38it/s]


Model saved with F1 Score: 0.5037
Epoch 3, F1 Score: 0.5037


Epoch 4/10: 100%|██████████| 1358/1358 [24:18<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:23<00:00,  2.37it/s]


Model saved with F1 Score: 0.5601
Epoch 4, F1 Score: 0.5601


Epoch 5/10: 100%|██████████| 1358/1358 [24:20<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:22<00:00,  2.38it/s]


Model saved with F1 Score: 0.6022
Epoch 5, F1 Score: 0.6022


Epoch 6/10: 100%|██████████| 1358/1358 [24:19<00:00,  1.08s/it]
Validating: 100%|██████████| 340/340 [02:22<00:00,  2.38it/s]


Model saved with F1 Score: 0.6078
Epoch 6, F1 Score: 0.6078


Epoch 7/10: 100%|██████████| 1358/1358 [24:17<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:22<00:00,  2.38it/s]


Model saved with F1 Score: 0.6219
Epoch 7, F1 Score: 0.6219


Epoch 8/10: 100%|██████████| 1358/1358 [24:19<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:23<00:00,  2.37it/s]


Model saved with F1 Score: 0.6296
Epoch 8, F1 Score: 0.6296


Epoch 9/10: 100%|██████████| 1358/1358 [24:18<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:23<00:00,  2.37it/s]


Model saved with F1 Score: 0.6316
Epoch 9, F1 Score: 0.6316


Epoch 10/10:  16%|█▌        | 211/1358 [03:45<20:41,  1.08s/it]

# Inference

In [None]:
# Test-set inference.
import os

# The training loop saves the weights of the best validation epoch to
# 'model_best_f1.pth'. Restore them before predicting; otherwise inference
# runs with whatever weights are currently in memory (the epochs after the
# best one, or a partially trained epoch if training was interrupted).
best_checkpoint = 'model_best_f1.pth'
if os.path.exists(best_checkpoint):
    model.load_state_dict(torch.load(best_checkpoint, map_location=device))
else:
    print(f"Checkpoint '{best_checkpoint}' not found; predicting with the current in-memory weights.")

model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=1)
        test_predictions.extend(preds.cpu().tolist())

# Label decoding: invert label_encoder to map integer ids back to class names.
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

# Submission

In [None]:
# Fill the submission template with the decoded class names and write it out
# as a CSV without the index column.
submission_path = "/content/drive/MyDrive/gbt해커톤/submission/1013_kcelectra_lr.csv"
sample_submission["분류"] = decoded_predictions
sample_submission.to_csv(submission_path, encoding='UTF-8-sig', index=False)

In [None]:
# Display how many rows of the submission were assigned to each predicted class.
sample_submission['분류'].value_counts()

.