In [None]:
from transformers import ElectraTokenizer, ElectraForTokenClassification

# NER_TAG 맵핑
# - 매장명(STR), 메뉴(N), 수량(CNT)
# -> B-STR, I-STR, B-N, I-N, B-CNT, I-CNT 
label_list = ["0", "1", "2", "3", "4", "5", "6"]  # BIO 태깅

# KoELECTRA tokenizer와 모델 로드
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
model = ElectraForTokenClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator", num_labels=len(label_list)
)

# 레이블 리스트
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# 모델에 레이블 매핑 정보 추가
model.config.id2label = id2label
model.config.label2id = label2id


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# JSON 파일 열기
import json
file_path = './mecab/ner_all.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# json 파일 정리 token은 token대로  tag는 tag대로
processed_data = {
    "tokens": [item["tokens"] for item in data],
    "ner_tags": [item["ner_tags"] for item in data],
}

In [4]:
def preprocess_function(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,  # 토큰 단위로 입력
        padding="max_length",
        truncation=True,
        max_length=128
    )
    
    labels = []
    for i, label in enumerate(examples["tokens"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # 토큰과 원래 단어 매핑
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # 손실 계산에서 무시
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(label2id[label[word_idx]] if word_idx is not None else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["tokens"] = labels
    return tokenized_inputs

# Dataset 생성 및 전처리
from datasets import Dataset
raw_dataset = Dataset.from_dict(processed_data)
processed_dataset = raw_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/402 [00:00<?, ? examples/s]


ValueError: word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast` class).

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
train_dataloader = DataLoader(processed_dataset, shuffle=True, batch_size=16, collate_fn=data_collator)


In [None]:
from transformers import AdamW, get_scheduler
import torch

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

from tqdm import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        progress_bar.update(1)

print("Fine-tuning 완료!")


In [None]:
model.save_pretrained("./koelectra-ner-finetuned")
tokenizer.save_pretrained("./koelectra-ner-finetuned")

# 평가 모드 전환
model.eval()