# 命名實體標注(Named Entity Recognition, NER)

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nlp = pipeline("ner")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
(…)conll03-english/resolve/main/config.json: 100%|█████████████████████████████████| 998/998 [00:00<00:00, 332kB/s]
model.safetensors: 100%|██████████████████████████████████████████████████████| 1.33G/1.33G [00:58<00:00, 22.8MB/s]
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initiali

In [3]:
# 測試資料
sequence = "Hugging Face Inc. is a company based in New York City. " \
           "Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge."

# 推測答案
import pandas as pd
df = pd.DataFrame(nlp(sequence))
df

Unnamed: 0,entity,score,index,word,start,end
0,I-ORG,0.999511,1,Hu,0,2
1,I-ORG,0.989597,2,##gging,2,7
2,I-ORG,0.99797,3,Face,8,12
3,I-ORG,0.999376,4,Inc,13,16
4,I-LOC,0.999341,11,New,40,43
5,I-LOC,0.999193,12,York,44,48
6,I-LOC,0.999341,13,City,49,53
7,I-LOC,0.986336,19,D,79,80
8,I-LOC,0.939624,20,##UM,80,82
9,I-LOC,0.912139,21,##BO,82,84


In [4]:
nlp(sequence)

[{'entity': 'I-ORG',
  'score': 0.9995109,
  'index': 1,
  'word': 'Hu',
  'start': 0,
  'end': 2},
 {'entity': 'I-ORG',
  'score': 0.9895975,
  'index': 2,
  'word': '##gging',
  'start': 2,
  'end': 7},
 {'entity': 'I-ORG',
  'score': 0.9979704,
  'index': 3,
  'word': 'Face',
  'start': 8,
  'end': 12},
 {'entity': 'I-ORG',
  'score': 0.9993759,
  'index': 4,
  'word': 'Inc',
  'start': 13,
  'end': 16},
 {'entity': 'I-LOC',
  'score': 0.9993406,
  'index': 11,
  'word': 'New',
  'start': 40,
  'end': 43},
 {'entity': 'I-LOC',
  'score': 0.9991928,
  'index': 12,
  'word': 'York',
  'start': 44,
  'end': 48},
 {'entity': 'I-LOC',
  'score': 0.9993411,
  'index': 13,
  'word': 'City',
  'start': 49,
  'end': 53},
 {'entity': 'I-LOC',
  'score': 0.98633635,
  'index': 19,
  'word': 'D',
  'start': 79,
  'end': 80},
 {'entity': 'I-LOC',
  'score': 0.9396239,
  'index': 20,
  'word': '##UM',
  'start': 80,
  'end': 82},
 {'entity': 'I-LOC',
  'score': 0.91213864,
  'index': 21,
  'word'

# 結合Tokenizer

In [5]:
# 載入相關套件
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# 結合分詞器(Tokenizer)
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
(…)cased/resolve/main/tokenizer_config.json: 100%|██████████████████████████████| 29.0/29.0 [00:00<00:00, 14.5kB/s]
(…)bert-base-cased/resolve/main/config.json: 100%|█████████████████████████████████| 570/570 [00:00<00:00, 189kB/s]
(…)o/bert-base-cased/resolve/main/vocab.txt: 100%|███████████████████████████

In [6]:
# NER 類別 
label_list = [
    "O",       # 非實體
    "B-MISC",  # 雜項實體的開頭，接在另一雜項實體的後面
    "I-MISC",  # 雜項實體
    "B-PER",   # 人名的開頭，接在另一人名的後面
    "I-PER",   # 人名
    "B-ORG",   # 組織的開頭，接在另一組織的後面
    "I-ORG",   # 組織
    "B-LOC",   # 地名的開頭，接在另一地名的後面
    "I-LOC"    # 地名
]

# 測試資料
sequence = "Hugging Face Inc. is a company based in New York City. " \
           "Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge."

# 推測答案
inputs = tokenizer(sequence, return_tensors="pt")
tokens = inputs.tokens()

outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=2)

for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, model.config.id2label[prediction]))

('[CLS]', 'O')
('Hu', 'I-ORG')
('##gging', 'I-ORG')
('Face', 'I-ORG')
('Inc', 'I-ORG')
('.', 'O')
('is', 'O')
('a', 'O')
('company', 'O')
('based', 'O')
('in', 'O')
('New', 'I-LOC')
('York', 'I-LOC')
('City', 'I-LOC')
('.', 'O')
('Its', 'O')
('headquarters', 'O')
('are', 'O')
('in', 'O')
('D', 'I-LOC')
('##UM', 'I-LOC')
('##BO', 'I-LOC')
(',', 'O')
('therefore', 'O')
('very', 'O')
('##c', 'O')
('##lose', 'O')
('to', 'O')
('the', 'O')
('Manhattan', 'I-LOC')
('Bridge', 'I-LOC')
('.', 'O')
('[SEP]', 'O')
