In [1]:
import torch
import jieba
import numpy as np
from classifier import BertForMaskClassification
from transformers import AutoTokenizer, AutoConfig, BertForTokenClassification

label_list = ["O","COMMA","PERIOD","COLON"]

label2punct = {
    "COMMA": "，",
    "PERIOD": "。",
    "COLON":"：",
}

model_name_or_path = "output"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = BertForMaskClassification.from_pretrained(model_name_or_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def punct(text):

    tokenize_words = jieba.lcut(''.join(text))
    mask_tokens = []
    for word in tokenize_words:
        mask_tokens.extend(word)
        mask_tokens.append("[MASK]")
    tokenized_inputs = tokenizer(mask_tokens,is_split_into_words=True, return_tensors="pt")
    with torch.no_grad():   
        logits = model(**tokenized_inputs).logits
    predictions = logits.argmax(-1).tolist()
    predictions = predictions[0]
    tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][0])

    result =[]
    print(tokens)
    print(predictions)
    for token, prediction in zip(tokens, predictions):
        if token =="[CLS]" or token =="[SEP]":
            continue
        if token == "[MASK]":
            label = label_list[prediction]
            if label != "O":
                punct = label2punct[label]
                result.append(punct)
        else:
            result.append(token)

    return "".join(result)


In [3]:
text1 = '对于腰痛治疗专家建议谨慎手术治疗推荐物理康复治疗与心理治疗相结合'
text2 = '全腹未触及包块肝脾肋下未触及胆囊未触及Murphy征阴性肾脏未触及'
text3 = '未见毛细血管搏动征未闻及水冲脉未闻及枪击音等'
text4 = '本病病性属实证病位在胰腑与脾胃有关'
text5 = '肝浊音界正常肝上界位于锁骨中线第五肋间移动浊音阴性肾区无叩痛'

In [4]:
print(punct(text1))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/home/tongtao.ling/miniconda3/envs/t5/lib/python3.9/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmps_ew_jtg' -> '/tmp/jieba.cache'
Loading model cost 0.935 seconds.
Prefix dict has been built successfully.


['[CLS]', '对', '于', '[MASK]', '腰', '痛', '[MASK]', '治', '疗', '[MASK]', '专', '家', '建', '议', '[MASK]', '谨', '慎', '[MASK]', '手', '术', '[MASK]', '治', '疗', '[MASK]', '推', '荐', '[MASK]', '物', '理', '[MASK]', '康', '复', '[MASK]', '治', '疗', '[MASK]', '与', '[MASK]', '心', '理', '治', '疗', '[MASK]', '相', '结', '合', '[MASK]', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0]
对于腰痛治疗。专家建议，谨慎手术治疗，推荐物理康复治疗与心理治疗相结合。


In [5]:
print(punct(text2))

['[CLS]', '全', '腹', '[MASK]', '未', '[MASK]', '触', '及', '[MASK]', '包', '块', '[MASK]', '肝', '[MASK]', '脾', '[MASK]', '肋', '[MASK]', '下', '[MASK]', '未', '[MASK]', '触', '及', '[MASK]', '胆', '囊', '[MASK]', '未', '[MASK]', '触', '及', '[MASK]', 'm', 'u', 'r', 'p', 'h', 'y', '[MASK]', '征', '[MASK]', '阴', '性', '[MASK]', '肾', '脏', '[MASK]', '未', '[MASK]', '触', '及', '[MASK]', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
全腹未触及包块，肝脾肋下未触及，胆囊未触及，murphy征阴性，肾脏未触及，


In [6]:
print(punct(text3))

['[CLS]', '未', '见', '[MASK]', '毛', '细', '血', '管', '[MASK]', '搏', '动', '[MASK]', '征', '[MASK]', '未', '闻', '[MASK]', '及', '水', '[MASK]', '冲', '脉', '[MASK]', '未', '闻', '[MASK]', '及', '[MASK]', '枪', '击', '[MASK]', '音', '等', '[MASK]', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0]
未见毛细血管搏动征，未闻及水冲脉，未闻及枪击音等。


In [7]:
print(punct(text4))

['[CLS]', '本', '病', '[MASK]', '病', '性', '[MASK]', '属', '[MASK]', '实', '证', '[MASK]', '病', '位', '[MASK]', '在', '[MASK]', '胰', '腑', '[MASK]', '与', '[MASK]', '脾', '胃', '[MASK]', '有', '关', '[MASK]', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0]
本病病性属实证，病位在胰腑，与脾胃有关。


In [8]:
print(punct(text5))

['[CLS]', '肝', '[MASK]', '浊', '音', '[MASK]', '界', '[MASK]', '正', '常', '[MASK]', '肝', '上', '界', '[MASK]', '位', '于', '[MASK]', '锁', '骨', '[MASK]', '中', '线', '[MASK]', '第', '五', '[MASK]', '肋', '间', '[MASK]', '移', '动', '[MASK]', '浊', '音', '[MASK]', '阴', '性', '[MASK]', '肾', '区', '[MASK]', '无', '[MASK]', '叩', '痛', '[MASK]', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0]
肝浊音界正常，肝上界位于锁骨中线第五肋间，移动浊音阴性，肾区无叩痛。
