In [1]:
import json

import torch
import evaluate
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import ElectraTokenizer, ElectraForTokenClassification, Trainer, TrainingArguments
from transformers import ElectraTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Data loading and preprocessing
def load_data(json_path):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        Whatever object the JSON document deserializes to.
    """
    with open(json_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def preprocess_data(data, tokenizer, label_map, max_length=128):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Each entry's "tokens" list is encoded with ``is_split_into_words=True``
    and padded/truncated to ``max_length``. The label sequence has one slot
    per produced token: the first sub-token of every word carries that word's
    tag, and every other position (positions whose word id is ``None`` and
    continuation sub-tokens) is set to -100.

    Args:
        data: Iterable of dicts, each with a "tokens" list and a parallel
            "ner_tags" list (one tag per token).
        tokenizer: Tokenizer whose returned encoding provides ``word_ids()``.
        label_map: Mapping from tag name to integer id. Used only to convert
            tags that are given as strings; non-string tags are kept as-is.
        max_length: Length every encoded sequence is padded/truncated to.

    Returns:
        A ``Dataset`` with "input_ids", "attention_mask" and "labels" columns.

    Raises:
        ValueError: If an entry's "tokens" and "ner_tags" differ in length.
        KeyError: If a string tag is missing from ``label_map``.
    """
    tokenized_data = {"input_ids": [], "attention_mask": [], "labels": []}
    for position, entry in enumerate(data):
        tokens = entry["tokens"]
        tags = entry["ner_tags"]
        if len(tokens) != len(tags):
            raise ValueError(
                f"entry {position}: {len(tokens)} tokens but {len(tags)} ner_tags"
            )

        # Accept tag names as well as ids so the labels column is always numeric.
        tag_ids = [label_map[tag] if isinstance(tag, str) else tag for tag in tags]

        tokenized = tokenizer(
            tokens,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

        # word_ids() has one element per *token*; each element is the index of
        # the source word, or None. Iterate over token positions (not over the
        # tags) so every label is written at the token it belongs to.
        labels = []
        previous_word_id = None
        for word_id in tokenized.word_ids():
            if word_id is None or word_id == previous_word_id:
                labels.append(-100)  # ignored by the loss
            else:
                labels.append(tag_ids[word_id])
            previous_word_id = word_id

        tokenized_data["input_ids"].append(tokenized["input_ids"])
        tokenized_data["attention_mask"].append(tokenized["attention_mask"])
        tokenized_data["labels"].append(labels)

    return Dataset.from_dict(tokenized_data)

In [4]:
# Define the BIO tag set and load the annotated data
LABELS = {0: "O", 1: "B-PLACE", 2: "I-PLACE", 3: "B-FOOD", 4: "I-FOOD", 5: "B-QTY", 6: "I-QTY"}
label_map = {v: k for k, v in LABELS.items()}  # tag name -> id (inverse of LABELS)
data = load_data("tagged_one_all.json")

# Load the KoELECTRA fast tokenizer; preprocess_data calls word_ids() on the
# encoding. ElectraTokenizerFast is imported in the notebook's import cell.
tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-base-v3-discriminator")
dataset = preprocess_data(data, tokenizer, label_map)

# Split into training / validation sets (80 / 20, fixed seed)
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_split['train']
val_dataset = dataset_split['test']


In [None]:
# 2. Model and training configuration
# Pass the tag vocabulary along with num_labels so the label names are stored
# in the model config and saved with the checkpoint.
model = ElectraForTokenClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator",
    num_labels=len(LABELS),
    id2label=LABELS,
    label2id=label_map,
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,  # reload the checkpoint with the best validation score
    metric_for_best_model="f1",  # matches the "f1" key returned by compute_metrics
)

# Evaluation metric: seqeval (requires the `seqeval` package to be installed)
metric = evaluate.load("seqeval")

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: To be able to use evaluate-metric/seqeval, you need to install the following dependencies['seqeval'] using 'pip install seqeval' for instance'

In [None]:
# compute_metrics, written the same way as the existing code
def compute_metrics(p):
    """Turn raw model outputs into overall precision/recall/F1/accuracy.

    Args:
        p: A pair ``(predictions, labels)``; the predicted class is taken as
            the argmax over the last axis of ``predictions``.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", read from the
        "overall_*" entries of ``metric.compute``.
    """
    scores_per_class, gold_ids = p
    predicted_ids = scores_per_class.argmax(axis=-1)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions labelled -100 are excluded on both sides.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([LABELS[pred] for pred, _ in kept])
        true_labels.append([LABELS[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=results["overall_precision"],
        recall=results["overall_recall"],
        f1=results["overall_f1"],
        accuracy=results["overall_accuracy"],
    )

# Wire the model, training arguments, train/validation splits, tokenizer and
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the model; the inference cell loads it back from this directory
trainer.save_model("./ner_koelectra_model_gpt")


In [6]:
# 3. Load the saved model and run prediction
def predict_ner(sentence, model, tokenizer):
    """Predict a BIO tag for every token of a single sentence.

    The sentence is split into characters (whitespace dropped) and encoded as
    ONE pre-split sequence. Passing a list with ``is_split_into_words=False``
    would instead be interpreted as a batch of separate one-character
    sentences, of which only the first would ever be decoded.

    NOTE(review): character-level splitting assumes the model was trained on
    character-level "tokens" — confirm against the training data.

    Args:
        sentence: Raw input string.
        model: Token-classification model; called with the encoded inputs and
            expected to return an object with a ``logits`` attribute.
        tokenizer: Tokenizer used to encode the sentence and map ids back to
            token strings.

    Returns:
        List of ``(token, tag)`` pairs, one per encoded token (whatever
        special tokens the tokenizer adds are included). Empty list when the
        sentence has no non-whitespace characters.
    """
    characters = [ch for ch in sentence if not ch.isspace()]
    if not characters:
        return []

    tokenized_input = tokenizer(
        characters,
        is_split_into_words=True,  # one sequence of pre-split units, not a batch
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    # Run on whatever device the model lives on, when it exposes one.
    device = getattr(model, "device", None)
    model_inputs = {
        name: tensor.to(device) if device is not None else tensor
        for name, tensor in tokenized_input.items()
    }

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        outputs = model(**model_inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0].tolist())
    predicted_tags = [LABELS[tag_id] for tag_id in predictions[0].tolist()]
    return list(zip(tokens, predicted_tags))

# Load the saved model and tokenizer from the directory written by the training cell.
# Use the same fast tokenizer class as training so inference tokenizes identically.
model = ElectraForTokenClassification.from_pretrained("./ner_koelectra_model_gpt")
tokenizer = ElectraTokenizerFast.from_pretrained("./ner_koelectra_model_gpt")

# Run a prediction
sentence = "명태본가에서 비빔밥 두 개랑 김치찌개 하나 추가요."
result = predict_ner(sentence, model, tokenizer)
print(result)

OSError: Incorrect path_or_model_id: './ner_koelectra_model_gpt'. Please provide either the path to a local folder or the repo_id of a model on the Hub.