# Import

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip3 install torch



In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

# Hyperparameter

In [5]:
config = {
    "learning_rate": 2e-5,
    "epoch": 8,
    "batch_size": 32,
}

CFG = SimpleNamespace(**config)

# Load Data

In [6]:
RandomState=110

train_df = pd.read_csv("/content/drive/MyDrive/gbt해커톤/data/train_df_1008.csv")
test_df = pd.read_csv("/content/drive/MyDrive/gbt해커톤//data/test_df_1008.csv")
sample_submission = pd.read_csv("/content/drive/MyDrive/gbt해커톤/data/sample_submission.csv")

In [7]:
print(len(test_df))
print(len(sample_submission))

23405
23405


# Load Model

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = ElectraTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=len(train_df['분류'].unique())).to(device)

# Custom Dataset

In [9]:
# 데이터셋 클래스 정의
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=200):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [10]:
train_df.drop(columns=['제목'], inplace=True)
test_df.drop(columns=['제목'], inplace=True)

In [11]:
# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=RandomState)

# 데이터셋 생성
train_dataset = TextDataset(train_df.키워드.tolist(), train_df.label.tolist(), tokenizer)
val_dataset = TextDataset(val_df.키워드.tolist(), val_df.label.tolist(), tokenizer)
test_dataset = TextDataset(test_df.키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False)

In [12]:
# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)



threshold 조절하긴 했는데 의미있게 반영되지 않아서 해당 내용은 무시해도되고  
아래에서 로짓확률로 저장하는 부분이 이전 코드와 달라진 부분  

In [13]:
# 학습
model.train()
for epoch in range(CFG.epoch):
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# 검증
model.eval()
val_predictions = []
val_true_labels = []
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Validating'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        val_predictions.extend(logits.cpu().tolist())
        val_true_labels.extend(labels.cpu().tolist())

# 지역 범주에 대해 스레스홀드 조절
threshold = 0.7  # 지역 범주에 대해 0.7로 설정
specific_class_index = label_encoder['지역']  # 지역 범주의 인덱스

adjusted_predictions = []
for pred in val_predictions:
    adjusted_pred = pred.copy()
    if pred[specific_class_index] < threshold:
        adjusted_pred[specific_class_index] = -float('inf')  # 스레스홀드 미만이면 해당 범주로 예측되지 않도록 설정
    adjusted_predictions.append(adjusted_pred)

# 최종 예측
final_predictions = [torch.argmax(torch.tensor(pred)).item() for pred in adjusted_predictions]

# 검증 결과 출력
val_f1 = f1_score(val_true_labels, final_predictions, average='macro')
print(f"Validation F1 Score: {val_f1:.2f}")

Epoch 1/8: 100%|██████████| 1358/1358 [24:38<00:00,  1.09s/it]
Epoch 2/8: 100%|██████████| 1358/1358 [24:34<00:00,  1.09s/it]
Epoch 3/8: 100%|██████████| 1358/1358 [24:31<00:00,  1.08s/it]
Epoch 4/8: 100%|██████████| 1358/1358 [24:32<00:00,  1.08s/it]
Epoch 5/8: 100%|██████████| 1358/1358 [24:31<00:00,  1.08s/it]
Epoch 6/8: 100%|██████████| 1358/1358 [24:30<00:00,  1.08s/it]
Epoch 7/8: 100%|██████████| 1358/1358 [24:33<00:00,  1.09s/it]
Epoch 8/8: 100%|██████████| 1358/1358 [24:33<00:00,  1.09s/it]
Validating: 100%|██████████| 340/340 [02:15<00:00,  2.51it/s]


Validation F1 Score: 0.61


# Inference

In [14]:
# 테스트 세트 추론
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        test_predictions.extend(logits.cpu().tolist())

# 특정 범주에 대해 스레스홀드 조절 (테스트 세트)
adjusted_test_predictions = []
for pred in test_predictions:
    adjusted_pred = pred.copy()
    if pred[specific_class_index] < threshold:
        adjusted_pred[specific_class_index] = -float('inf')  # 스레스홀드 미만이면 해당 범주로 예측되지 않도록 설정
    adjusted_test_predictions.append(adjusted_pred)

Testing: 100%|██████████| 732/732 [04:50<00:00,  2.52it/s]


In [22]:
# 예측 확률을 데이터프레임으로 변환
df_predictions = pd.DataFrame(adjusted_test_predictions)

# CSV 파일로 저장
df_predictions.to_csv("/content/drive/MyDrive/gbt해커톤/data/predictions.csv", index=False, encoding='UTF-8-sig')

# Submission

In [19]:
# 최종 테스트 예측
final_test_predictions = [torch.argmax(torch.tensor(pred)).item() for pred in adjusted_test_predictions]

# 최종 예측 결과를 sample_submission에 적용

sample_submission = pd.read_csv("/content/drive/MyDrive/gbt해커톤/data/sample_submission.csv")
sample_submission["분류"] = [list(label_encoder.keys())[pred] for pred in final_test_predictions]


In [20]:
# 결과를 CSV 파일로 저장
sample_submission.to_csv("/content/drive/MyDrive/gbt해커톤/submission/1008_submission.csv", encoding='UTF-8-sig', index=False)

In [21]:
sample_submission['분류'].value_counts()

Unnamed: 0_level_0,count
분류,Unnamed: 1_level_1
지역,12103
경제:부동산,1425
사회:사건_사고,1082
경제:반도체,1026
사회:사회일반,626
사회:교육_시험,462
정치:국회_정당,399
스포츠:올림픽_아시안게임,383
사회:의료_건강,359
경제:취업_창업,335
