| [07_information_extraction/02_Bert实体识别.ipynb](https://github.com/shibing624/nlp-tutorial/blob/main/07_information_extraction/02_Bert实体识别.ipynb)  | Bert实体识别  |[Open In Colab](https://colab.research.google.com/github/shibing624/nlp-tutorial/blob/main/07_information_extraction/02_Bert实体识别.ipynb) |

# Bert实体识别
适用于品牌、人名、地址等名称的识别，是序列标注任务的一种解决方法。

基于transformers的预训练模型，识别人名、地名、机构名、时间等实体，模型地址：https://huggingface.co/models

In [4]:
!pip install transformers seqeval

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [5]:
import os

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import pipeline

# NOTE(review): presumably a workaround for the abort that happens when more
# than one OpenMP runtime is loaded into the process — confirm it is needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# The same Hub checkpoint supplies both the model weights and the tokenizer.
model_name = "shibing624/bert4ner-base-chinese"
nlp = pipeline("ner", model=model_name, tokenizer=model_name)

# Tag a sample sentence. Each result dict describes one token; the recorded
# output reports entities under generic names such as 'LABEL_8'.
sequence = "王宏伟来自北京，是个警察，喜欢去王府井游玩儿。"
ner_results = nlp(sequence)
print(ner_results)


[{'entity': 'LABEL_8', 'score': 0.9998587, 'index': 1, 'word': '王', 'start': 0, 'end': 1}, {'entity': 'LABEL_5', 'score': 0.99971896, 'index': 2, 'word': '宏', 'start': 1, 'end': 2}, {'entity': 'LABEL_5', 'score': 0.99990284, 'index': 3, 'word': '伟', 'start': 2, 'end': 3}, {'entity': 'LABEL_2', 'score': 0.9999732, 'index': 4, 'word': '来', 'start': 3, 'end': 4}, {'entity': 'LABEL_2', 'score': 0.9999744, 'index': 5, 'word': '自', 'start': 4, 'end': 5}, {'entity': 'LABEL_1', 'score': 0.99971133, 'index': 6, 'word': '北', 'start': 5, 'end': 6}, {'entity': 'LABEL_4', 'score': 0.99962795, 'index': 7, 'word': '京', 'start': 6, 'end': 7}, {'entity': 'LABEL_2', 'score': 0.9999791, 'index': 8, 'word': '，', 'start': 7, 'end': 8}, {'entity': 'LABEL_2', 'score': 0.9999846, 'index': 9, 'word': '是', 'start': 8, 'end': 9}, {'entity': 'LABEL_2', 'score': 0.9999859, 'index': 10, 'word': '个', 'start': 9, 'end': 10}, {'entity': 'LABEL_2', 'score': 0.9999852, 'index': 11, 'word': '警', 'start': 10, 'end': 11}, 

不使用pipeline，自己编写NER任务预测代码：

In [6]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from seqeval.metrics.sequence_labeling import get_entities

# NOTE(review): presumably a workaround for the abort that happens when more
# than one OpenMP runtime is loaded into the process — confirm it is needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Load the tokenizer and the token-classification model from the HuggingFace Hub.
model_name = "shibing624/bert4ner-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Tag names ordered by class id: the id predicted by the model is used as an
# index into this list, so the order must not be changed.
label_list = [
    'I-ORG',   # 0
    'B-LOC',   # 1
    'O',       # 2
    'B-ORG',   # 3
    'I-LOC',   # 4
    'I-PER',   # 5
    'B-TIME',  # 6
    'I-TIME',  # 7
    'B-PER',   # 8
]

# Sample input for the demo below.
sentence = "王宏伟来自北京，是个警察，喜欢去王府井游玩儿。"


def get_entity(sentence: str) -> None:
    """Tag ``sentence`` with the NER model and print the entities found in it.

    Prints, in order: the input sentence, the list of ``(token, label)``
    pairs (leading and trailing special tokens excluded), the line
    ``Sentence entity:`` and the list of ``(entity_text, entity_type)``
    tuples.

    Uses the module-level ``tokenizer``, ``model`` and ``label_list``.

    Entity text is cut out of ``sentence`` with the character offsets reported
    by the tokenizer rather than with token indices, so the result stays
    correct when a token does not correspond to exactly one character (Latin
    words, numbers, whitespace that the tokenizer drops, ...).
    NOTE(review): offset mappings are only available from "fast" tokenizers;
    confirm ``AutoTokenizer`` resolves to one for this checkpoint.

    Args:
        sentence: Raw text to analyse.
    """
    # A single tokenizer call gives the model input and, for every token, the
    # (start, end) character span it covers in ``sentence``.
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True)
    input_ids = encoding["input_ids"]
    offsets = encoding["offset_mapping"][0].tolist()

    # Read the tokens from the very ids that are fed to the model, so tokens
    # and predictions cannot get out of step.
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

    with torch.no_grad():
        logits = model(input_ids).logits
    label_ids = torch.argmax(logits, dim=2)[0].tolist()

    # [1:-1] drops the special tokens added at both ends of the sequence.
    char_tags = [
        (token, label_list[label_id]) for token, label_id in zip(tokens, label_ids)
    ][1:-1]
    char_offsets = offsets[1:-1]
    print(sentence)
    print(char_tags)

    pred_labels = [label for _, label in char_tags]
    # get_entities yields (type, start, end) over pred_labels; ``end`` is
    # inclusive, hence the end offset of token ``end`` closes the slice.
    entities = [
        (sentence[char_offsets[start][0]: char_offsets[end][1]], entity_type)
        for entity_type, start, end in get_entities(pred_labels)
    ]

    print("Sentence entity:")
    print(entities)


# Run the tagger on the sample sentence; the results are printed, nothing is returned.
get_entity(sentence)

王宏伟来自北京，是个警察，喜欢去王府井游玩儿。
[('王', 'B-PER'), ('宏', 'I-PER'), ('伟', 'I-PER'), ('来', 'O'), ('自', 'O'), ('北', 'B-LOC'), ('京', 'I-LOC'), ('，', 'O'), ('是', 'O'), ('个', 'O'), ('警', 'O'), ('察', 'O'), ('，', 'O'), ('喜', 'O'), ('欢', 'O'), ('去', 'O'), ('王', 'B-LOC'), ('府', 'I-LOC'), ('井', 'I-LOC'), ('游', 'O'), ('玩', 'O'), ('儿', 'O'), ('。', 'O')]
Sentence entity:
[('王宏伟', 'PER'), ('北京', 'LOC'), ('王府井', 'LOC')]


本节完。