In [10]:
from transformers import pipeline
# Build a token-classification pipeline; aggregation_strategy='simple' makes it
# return merged entity groups (with character offsets) instead of raw tokens.
ner_pipe = pipeline(
    'token-classification',
    model='roberthsu2003/models_for_ner',
    aggregation_strategy='simple',
)
inputs = '美國國務卿魯比歐反駁卡拉斯和其他歐洲領袖，他表示，「這些人只是在玩無聊的把戲與說這些話，他們的退場戰略是什麼？這個星球上唯一正積極嘗試結束這場衝突的人，就是美國總統川普，我們應該幫助他實現」。'
res = ner_pipe(inputs)
print(res)

# Group the recognised surface strings by entity type. The text is sliced out
# of the original input via start/end rather than taken from r['word'],
# which (see the printed result) has spaces between the characters.
res_result = {}
for r in res:
    entity_name = r['entity_group']
    start, end = r['start'], r['end']
    res_result.setdefault(entity_name, []).append(inputs[start:end])

res_result

Device set to use cpu


[{'entity_group': 'LOC', 'score': np.float32(0.9980444), 'word': '美 國', 'start': 0, 'end': 2}, {'entity_group': 'PER', 'score': np.float32(0.9996342), 'word': '魯 比 歐', 'start': 5, 'end': 8}, {'entity_group': 'PER', 'score': np.float32(0.99959224), 'word': '卡 拉 斯', 'start': 10, 'end': 13}, {'entity_group': 'LOC', 'score': np.float32(0.9995218), 'word': '歐 洲', 'start': 16, 'end': 18}, {'entity_group': 'LOC', 'score': np.float32(0.9993503), 'word': '美 國', 'start': 78, 'end': 80}, {'entity_group': 'PER', 'score': np.float32(0.9975341), 'word': '川 普', 'start': 82, 'end': 84}]


{'LOC': ['美國', '歐洲', '美國'], 'PER': ['魯比歐', '卡拉斯', '川普']}

In [32]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import numpy as np

# Load the fine-tuned NER checkpoint and its matching tokenizer once; the
# checkpoint name is kept in a single variable so the two cannot drift apart.
checkpoint = 'roberthsu2003/models_for_ner'
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# label_list is indexed positionally by predicted class id further down, so
# build it ordered by id instead of relying on the dict's insertion order.
# Assumes the ids are contiguous starting at 0 (true for the labels shown below).
label_list = [
    label
    for _, label in sorted(model.config.id2label.items(), key=lambda kv: int(kv[0]))
]
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

In [46]:
from pprint import pprint
def predict_ner(text):
    """Predict one BIO tag per sub-word token of ``text``.

    The text is tokenized, run through the module-level ``model``, and the
    arg-max class id of every position is mapped to its label through
    ``label_list``. Positions whose word id is ``None`` are skipped (in the
    output below these are the first and last positions, i.e. the special
    tokens the tokenizer adds).

    Note that the result has one tag per *token*, not per word: a word that is
    split into several sub-word pieces contributes several tags.

    Args:
        text: The raw input string. Only the first sequence of the batch is
            decoded.

    Returns:
        list[str]: The predicted tags, aligned with ``tokenizer.tokenize(text)``.
    """
    # Local import: torch is not imported at notebook level, but it is
    # necessarily installed because the tokenizer is asked for 'pt' tensors.
    import torch

    inputs = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
    # Inference only: skip building the autograd graph instead of building it
    # and detaching afterwards.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = np.argmax(outputs.logits.numpy(), axis=-1)
    print(predictions)
    word_ids = inputs.word_ids()
    print(word_ids)

    return [
        label_list[pred]
        for word_id, pred in zip(word_ids, predictions[0])
        if word_id is not None
    ]

def get_entities(tags):
    """Collapse a BIO tag sequence into entity spans.

    A span is closed when an ``'O'`` tag is seen, when a ``B-`` tag starts a
    new entity, or when the entity type changes. Adjacent entities with no
    ``'O'`` between them are therefore reported separately instead of being
    merged into one span carrying the first entity's type.

    An ``I-`` tag that does not continue an open span of the same type is
    treated leniently and starts a new span.

    Args:
        tags: Sequence of tag strings such as ``'O'``, ``'B-PER'``, ``'I-LOC'``.

    Returns:
        list[tuple[int, int, str]]: ``(start, end, entity_type)`` triples with
        ``end`` exclusive, in order of appearance.
    """
    entities = []
    start_index = -1
    current_entity_type = None
    for i, tag in enumerate(tags):
        # Strip the two-character "B-" / "I-" prefix to get the entity type.
        entity_type = None if tag == 'O' else tag[2:]

        # Close the open span on 'O', on an explicit begin tag, or on a type
        # change (e.g. I-PER directly followed by I-LOC).
        if start_index != -1 and (
            tag == 'O'
            or tag.startswith('B-')
            or entity_type != current_entity_type
        ):
            entities.append((start_index, i, current_entity_type))
            start_index = -1
            current_entity_type = None

        # Open a new span if this tag belongs to an entity and none is open.
        if tag != 'O' and start_index == -1:
            start_index = i
            current_entity_type = entity_type

    # Flush a span that runs to the end of the sequence.
    if start_index != -1:
        entities.append((start_index, len(tags), current_entity_type))
    return entities

text = "徐國堂 interest to 台北上班"
ner_tags = predict_ner(text)
entities = get_entities(ner_tags)
print(entities)

# Entity indices refer to sub-word tokens (special tokens excluded), so map
# them back to character positions and slice the original text. Joining the
# token strings instead would leak WordPiece "##" markers and drop whitespace
# for any entity made of multi-piece or space-separated words.
offsets = tokenizer(
    text, add_special_tokens=False, return_offsets_mapping=True
)['offset_mapping']
print('Entities:')
for start, end, entity_type in entities:
    # `end` is exclusive, so the last token of the span is end - 1.
    entity_text = text[offsets[start][0]:offsets[end - 1][1]]
    print(f"- {entity_text}:{entity_type}")


[[0 1 2 2 0 0 0 0 5 6 0 0 0]]
[None, 0, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, None]
[(0, 3, 'PER'), (7, 9, 'LOC')]
Entities:
- 徐國堂:PER
- 台北:LOC
