In [36]:
from datasets import load_dataset

dataset = load_dataset("kor_ner")

In [37]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'annot_text', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 2928
    })
    test: Dataset({
        features: ['text', 'annot_text', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 366
    })
    validation: Dataset({
        features: ['text', 'annot_text', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 366
    })
})

In [38]:
dataset['train']['text'][0], dataset['train']['annot_text'][0]

('나도 때늦은 홍길동이보다는 이 사회의 기본 구조를 변혁시키는 쪽이 더 많은 사람을 보다 효과적으로 구할 수 있다는 것쯤은 알고 있오.',
 '나도 때늦은 <홍길동:PS>이보다는 이 사회의 기본 구조를 변혁시키는 쪽이 더 많은 사람을 보다 효과적으로 구할 수 있다는 것쯤은 알고 있오.')

In [39]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np


# 데이터셋 로드 및 토크나이저 초기화
# dataset = load_dataset("conll2003")
checkpoint = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint,add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=9)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, max_length=512, padding='max_length')
    labels = []
    for i, label in enumerate(examples[f'ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [41]:
# 토큰화 및 레이블 정렬
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/366 [00:00<?, ? examples/s]

In [42]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'annot_text', 'tokens', 'pos_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2928
    })
    test: Dataset({
        features: ['text', 'annot_text', 'tokens', 'pos_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 366
    })
    validation: Dataset({
        features: ['text', 'annot_text', 'tokens', 'pos_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 366
    })
})

In [43]:
# 훈련 인자 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [44]:
# 메트릭 계산 함수
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # 메트릭을 위해 -100 레이블 제거
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

metric = load_metric("seqeval")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [45]:
# 트레이너 초기화 및 훈련 시작
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,2.2179
20,2.0273
30,1.6921
40,1.1401
50,0.4675
60,0.3739
70,0.3764
80,0.3619
90,0.3213
100,0.318


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=549, training_loss=0.3423364113588802, metrics={'train_runtime': 239.3716, 'train_samples_per_second': 36.696, 'train_steps_per_second': 2.294, 'total_flos': 2295375968452608.0, 'train_loss': 0.3423364113588802, 'epoch': 3.0})

In [48]:
trainer.evaluate(tokenized_datasets['validation'])

{'eval_loss': 0.1352897584438324,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.9617105917454003,
 'eval_runtime': 3.7046,
 'eval_samples_per_second': 98.795,
 'eval_steps_per_second': 6.208,
 'epoch': 3.0}