In [1]:
def extract_entities(sents, entity_types=None):
    """Extract entity spans from ``sents`` using jieba's POS tagger.

    The text is segmented with ``jieba.posseg.cut(sents, use_paddle=True)``.
    Every token is then located in the original string, searching forward
    from the end of the previous token, to recover its character offsets.
    Only tokens whose tag is in ``entity_types`` are returned.

    Args:
        sents: The text to analyse (a single string).
        entity_types: Optional collection of tag names to keep. When omitted,
            ``{'PER', 'LOC', 'ORG', 'TIME'}`` is used.

    Returns:
        A list of dicts in order of appearance, each with the keys
        ``start`` (inclusive offset), ``end`` (exclusive offset), ``value``
        (the token text), ``entity`` (the tag) and ``confidence`` (always
        ``None``). Empty or ``None`` input yields an empty list.

    Raises:
        ValueError: If a token produced by the tagger cannot be found in
            ``sents`` at or after the end of the previous token.
    """
    if not sents:
        return []

    # Kept as a function-level import, as in the original cell, so defining
    # the function does not by itself import jieba.
    import jieba.posseg as pseg

    if entity_types is None:
        wanted = {'PER', 'LOC', 'ORG', 'TIME'}
    else:
        wanted = set(entity_types)

    entities = []
    running_offset = 0
    for word, flag in pseg.cut(sents, use_paddle=True):
        # Search from the running offset so that a repeated word maps to its
        # own occurrence rather than to the first one in the string.
        word_offset = sents.index(word, running_offset)
        # The offset must advance for every token, kept or not, so that the
        # positions of later tokens stay correct.
        running_offset = word_offset + len(word)
        if flag in wanted:
            entities.append({"start": word_offset,
                             "end": running_offset,
                             'value': word,
                             'entity': flag,
                             "confidence": None,
                             })
    return entities

# Quick check on a short sentence: the two place names come back as LOC spans.
extract_entities("我爱北京天安门")

[{'start': 2, 'end': 4, 'value': '北京', 'entity': 'LOC', 'confidence': None},
 {'start': 4, 'end': 7, 'value': '天安门', 'entity': 'LOC', 'confidence': None}]

In [2]:
from ipymarkup import show_ascii_markup

text = "我爱北京天安门"

# Convert each extracted entity dict into a (start, end, label) triple.
spans = [
    (entity['start'], entity['end'], entity['entity'])
    for entity in extract_entities(text)
]

show_ascii_markup(text, spans)

我爱北京天安门
  LOLOC


In [3]:
from ipymarkup import show_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN

# Assign a colour to each entity label, then render the `text` / `spans`
# built in the previous cell.
entity_palette = palette(PER=BLUE, ORG=RED, LOC=GREEN)
show_box_markup(text, spans, palette=entity_palette)

In [4]:
# Input mixing Latin and Chinese text. For reference, the tag filter that
# extract_entities applies by default is equivalent to:
# [(word, flag) for word, flag in words if flag in {'PER', 'LOC', 'ORG', 'TIME'}]
extract_entities('Rami Eid正在纽约石溪大学学习')

[{'start': 10,
  'end': 16,
  'value': '纽约石溪大学',
  'entity': 'ORG',
  'confidence': None}]

In [5]:
# Time expression: the whole weekday-plus-time-range phrase is returned as one TIME span.
extract_entities('周四下午三点到五点开会')

[{'start': 0,
  'end': 9,
  'value': '周四下午三点到五点',
  'entity': 'TIME',
  'confidence': None}]

In [6]:
# Longer news-style sentence that yields a TIME, an ORG and a PER span.
extract_entities('7月10日晚上7点左右，六安市公安局裕安分局平桥派出所接到辖区居民戴某报警')

[{'start': 0,
  'end': 9,
  'value': '7月10日晚上7点',
  'entity': 'TIME',
  'confidence': None},
 {'start': 12,
  'end': 27,
  'value': '六安市公安局裕安分局平桥派出所',
  'entity': 'ORG',
  'confidence': None},
 {'start': 33, 'end': 35, 'value': '戴某', 'entity': 'PER', 'confidence': None}]

In [7]:
import jieba

# For comparison: jieba.tokenize yields (word, start, end) tuples directly,
# including whitespace and punctuation tokens.
result = jieba.tokenize('am I talking to a bot?')
for word, start, end in result:
    print("word %s\t\t start: %d \t\t end:%d" % (word, start, end))

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/fv/7k1qk5v11dn33sdcngv2wbnm0000gn/T/jieba.cache
Loading model cost 0.971 seconds.
Prefix dict has been built successfully.


word am		 start: 0 		 end:2
word  		 start: 2 		 end:3
word I		 start: 3 		 end:4
word  		 start: 4 		 end:5
word talking		 start: 5 		 end:12
word  		 start: 12 		 end:13
word to		 start: 13 		 end:15
word  		 start: 15 		 end:16
word a		 start: 16 		 end:17
word  		 start: 17 		 end:18
word bot		 start: 18 		 end:21
word ?		 start: 21 		 end:22
