# Import

In [None]:
# Mount Google Drive so the dataset and submission folders are reachable from Colab.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Install PyTorch through a shell escape (version is not pinned).
# NOTE(review): `%pip install ...` is usually preferred in notebooks so the
# package lands in the running kernel's environment — confirm before changing.
!pip3 install torch



In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import ElectraTokenizer, ElectraForSequenceClassification, AdamW
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from types import SimpleNamespace
import pandas as pd

# Hyperparameter

In [None]:
# Training hyperparameters, kept both as a plain dict (`config`) and as an
# attribute-style namespace (`CFG`) for dotted access in later cells.
config = dict(
    learning_rate=2e-5,
    epoch=6,
    batch_size=32,
)

CFG = SimpleNamespace(**config)

# Load Data

In [None]:
# Seed used for reproducibility. In the cells visible here it was defined but
# never applied, so DataLoader shuffling and classifier-head initialisation
# differed from run to run; seed torch's global RNG with it.
RandomState = 110
torch.manual_seed(RandomState)

# Single source of truth for the data directory (the test path previously
# contained a stray double slash).
DATA_DIR = "/content/drive/MyDrive/gbt해커톤/data"

train_df = pd.read_csv(f"{DATA_DIR}/train_df_1008.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test_df_1008.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Sanity check: the test set and the submission template should have the
# same number of rows (one line printed per frame).
for frame in (test_df, sample_submission):
    print(len(frame))

23405
23405


# Custom Dataset

In [None]:
# Dataset class definition
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids aligned with ``texts``,
            or ``None`` for unlabeled data (items then carry no ``'labels'`` key).
        tokenizer: Tokenizer object; it is invoked as ``tokenizer(text, ...)``
            and must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return a dict of 1-D tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (flattened to
            one dimension) and, when labels were supplied, ``'labels'`` as a
            ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {
            # return_tensors='pt' adds a leading batch dimension; drop it so
            # the DataLoader's default collate can stack items.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Drop the title column ('제목') from both frames.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
train_df = train_df.drop(columns=['제목'], errors='ignore')
test_df = test_df.drop(columns=['제목'], errors='ignore')

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

In [None]:
# Stage 1 target: binary flag — '지역' (region) vs. everything else ('비지역').
train_df['지역_여부'] = train_df['분류'].apply(lambda x: '지역' if x == '지역' else '비지역')

# Train / validation split
X_train, X_val, y_train, y_val = train_test_split(train_df['키워드'], train_df['지역_여부'], test_size=0.2, random_state=42)

# Label encoding (fit on the training split only)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# Dataset / loader construction
tokenizer = ElectraTokenizer.from_pretrained("beomi/KcELECTRA-base-v2022")
train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_len=200)
val_dataset = TextDataset(X_val.tolist(), y_val.tolist(), tokenizer, max_len=200)

train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size)

# Model preparation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The stage-1 model predicts the encoded '지역_여부' flag, so the head size must
# match the encoder's classes. Sizing it from the full '분류' column created
# output units with no corresponding label, which could be predicted at
# inference time and then fail to decode.
num_labels = len(label_encoder.classes_)
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base-v2022", num_labels=num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizer'.


pytorch_model.bin:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training (stage 1: region vs. non-region) with early stopping on macro F1.
best_f1 = 0.0
patience = 2
patience_counter = 0
best_state = None  # CPU copy of the weights from the best validation epoch

for epoch in range(CFG.epoch):
    # Re-enter training mode every epoch: validation below switches the model
    # to eval mode, and without this call every epoch after the first would
    # train with dropout disabled.
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())

    # Report validation result
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')
    print(f"Validation F1 Score: {val_f1:.2f}")

    # Early-stopping bookkeeping
    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        # Snapshot the best weights on CPU so they can be restored after the
        # loop; otherwise inference would use the last (non-best) epoch.
        best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

# Restore the weights of the best validation epoch (if any epoch improved).
if best_state is not None:
    model.load_state_dict(best_state)

Epoch 1/6: 100%|██████████| 1358/1358 [24:17<00:00,  1.07s/it]
Validating: 100%|██████████| 340/340 [02:14<00:00,  2.53it/s]


Validation F1 Score: 0.87


Epoch 2/6: 100%|██████████| 1358/1358 [24:00<00:00,  1.06s/it]
Validating: 100%|██████████| 340/340 [02:14<00:00,  2.53it/s]


Validation F1 Score: 0.89


Epoch 3/6: 100%|██████████| 1358/1358 [24:03<00:00,  1.06s/it]
Validating: 100%|██████████| 340/340 [02:15<00:00,  2.52it/s]


Validation F1 Score: 0.89


Epoch 4/6: 100%|██████████| 1358/1358 [24:05<00:00,  1.06s/it]
Validating:  41%|████      | 138/340 [00:54<01:24,  2.40it/s]

In [None]:
# Stage 2: fine-grained classifier for the rows flagged as '비지역' (non-region).
is_non_region = train_df['지역_여부'].eq('비지역')
non_region_df = train_df[is_non_region]

# Train / validation split on the non-region subset
(
    X_train_non_region,
    X_val_non_region,
    y_train_non_region,
    y_val_non_region,
) = train_test_split(
    non_region_df['키워드'],
    non_region_df['분류'],
    test_size=0.2,
    random_state=42,
)

# Label encoding (fit on the training split, reused for validation)
label_encoder_non_region = LabelEncoder()
y_train_non_region = label_encoder_non_region.fit_transform(y_train_non_region)
y_val_non_region = label_encoder_non_region.transform(y_val_non_region)

# Datasets and loaders
train_dataset_non_region = TextDataset(
    X_train_non_region.tolist(),
    y_train_non_region.tolist(),
    tokenizer,
    max_len=200,
)
val_dataset_non_region = TextDataset(
    X_val_non_region.tolist(),
    y_val_non_region.tolist(),
    tokenizer,
    max_len=200,
)

train_loader_non_region = DataLoader(
    train_dataset_non_region,
    batch_size=CFG.batch_size,
    shuffle=True,
)
val_loader_non_region = DataLoader(
    val_dataset_non_region,
    batch_size=CFG.batch_size,
)

# Model and optimizer: one output unit per distinct non-region category
num_labels_non_region = len(non_region_df['분류'].unique())
model_non_region = ElectraForSequenceClassification.from_pretrained(
    "beomi/KcELECTRA-base-v2022",
    num_labels=num_labels_non_region,
)
model_non_region = model_non_region.to(device)
optimizer_non_region = AdamW(
    model_non_region.parameters(),
    lr=CFG.learning_rate,
)


In [None]:
# Training (stage 2: non-region sub-categories) with early stopping on macro F1.
# `patience` is the value defined in the stage-1 training cell.
best_f1_non_region = 0.0
patience_counter_non_region = 0
best_state_non_region = None  # CPU copy of the weights from the best validation epoch

for epoch in range(CFG.epoch):
    # Re-enter training mode every epoch: validation below switches the model
    # to eval mode, and without this call every epoch after the first would
    # train with dropout disabled.
    model_non_region.train()
    for batch in tqdm(train_loader_non_region, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer_non_region.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model_non_region(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_non_region.step()

    # Validation
    model_non_region.eval()
    val_predictions_non_region = []
    val_true_labels_non_region = []
    with torch.no_grad():
        for batch in tqdm(val_loader_non_region, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model_non_region(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions_non_region.extend(preds.cpu().tolist())
            val_true_labels_non_region.extend(labels.cpu().tolist())

    # Report validation result
    val_f1_non_region = f1_score(val_true_labels_non_region, val_predictions_non_region, average='macro')
    print(f"Validation F1 Score: {val_f1_non_region:.2f}")

    # Early-stopping bookkeeping
    if val_f1_non_region > best_f1_non_region:
        best_f1_non_region = val_f1_non_region
        patience_counter_non_region = 0
        # Snapshot the best weights on CPU so they can be restored after the
        # loop; otherwise inference would use the last (non-best) epoch.
        best_state_non_region = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model_non_region.state_dict().items()
        }
    else:
        patience_counter_non_region += 1

    if patience_counter_non_region >= patience:
        print("Early stopping triggered.")
        break

# Restore the weights of the best validation epoch (if any epoch improved).
if best_state_non_region is not None:
    model_non_region.load_state_dict(best_state_non_region)

# Inference

In [None]:
# Test-set inference, stage 1: predict '지역' (region) vs. '비지역' (non-region).
test_texts = test_df['키워드']
# Work on a plain list so later lookups are positional; indexing the Series
# with `test_texts[i]` is label-based and only coincides with position while
# the frame keeps its default 0..n-1 index.
test_text_list = test_texts.tolist()
test_dataset = TextDataset(test_text_list, [0]*len(test_text_list), tokenizer, max_len=200)  # labels are dummy zeros
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size)

# Only class ids known to the stage-1 encoder can be decoded. The stage-1 head
# may have been created with more output units than that (its size was derived
# from the '분류' column), so restrict the argmax to the decodable ids.
n_stage1_classes = len(label_encoder.classes_)

model.eval()
test_predictions_stage1 = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing Stage 1'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits[:, :n_stage1_classes], dim=1)
        test_predictions_stage1.extend(preds.cpu().tolist())

# Label decoding (stage 1)
label_decoder_stage1 = {i: label for i, label in enumerate(label_encoder.classes_)}
decoded_predictions_stage1 = [label_decoder_stage1[pred] for pred in test_predictions_stage1]

# Select the rows predicted as '비지역' for the second stage
non_region_indices = [i for i, pred in enumerate(decoded_predictions_stage1) if pred == '비지역']
non_region_texts = [test_text_list[i] for i in non_region_indices]

# Dataset construction (stage 2)
test_dataset_non_region = TextDataset(non_region_texts, [0]*len(non_region_texts), tokenizer, max_len=200)  # labels are dummy zeros
test_loader_non_region = DataLoader(test_dataset_non_region, batch_size=CFG.batch_size)

# Test-set inference, stage 2: fine-grained category for the '비지역' rows
model_non_region.eval()
test_predictions_stage2 = []
with torch.no_grad():
    for batch in tqdm(test_loader_non_region, desc='Testing Stage 2'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model_non_region(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions_stage2.extend(preds.cpu().tolist())

# Label decoding (stage 2)
label_decoder_stage2 = {i: label for i, label in enumerate(label_encoder_non_region.classes_)}
decoded_predictions_stage2 = [label_decoder_stage2[pred] for pred in test_predictions_stage2]

# Merge: rows predicted '지역' keep that label, the rest take the stage-2 label
final_predictions = decoded_predictions_stage1.copy()
for i, idx in enumerate(non_region_indices):
    final_predictions[idx] = decoded_predictions_stage2[i]

# Every test row must have exactly one final prediction.
assert len(final_predictions) == len(test_df)

# Submission

In [None]:
# Load a fresh copy of the submission template and attach the final
# predictions as the '분류' column.
sample_submission = pd.read_csv(
    '/content/drive/MyDrive/gbt해커톤/data/sample_submission.csv'
).assign(**{"분류": final_predictions})

# Write the submission as CSV (BOM-prefixed UTF-8, without the index column).
sample_submission.to_csv(
    "/content/drive/MyDrive/gbt해커톤/submission/1008_submission.csv",
    index=False,
    encoding='UTF-8-sig',
)

In [None]:
# Display how many rows of the submission fall into each value of '분류'.
sample_submission['분류'].value_counts()