## 电商评论模型调用




### 加载模型

In [1]:
import torch
from run_ecom_senti import MODEL_CLASSES  

# Notebook configuration: which training run to load and where to run it.
task_name = 'skincare_patch'  # name of this task
runs_name = 'all_in_one_ground' # name of the current run
subdict = 'skincare' # category name; its subtype definitions must already be written to SUBTYPE.json
target_device="cpu"  
# target_device="cuda:0"  # device the model is moved to (GPU or CPU)
config_path = f'/data/projects/bert_pytorch/{task_name}_out/{runs_name}/config.json'
vocab_path = f'/data/projects/bert_pytorch/{task_name}_out/{runs_name}/vocab.txt'
model_path = f'/data/projects/bert_pytorch/{task_name}_out/{runs_name}/checkpoint-4200/pytorch_model.bin' 
# NOTE: the weights above come from the checkpoint-4200 directory, not the
# final training step; to use the final-step model, change the path accordingly.

# Look up the config/model/tokenizer classes registered under 'ecom_senti'
# and instantiate each of them from the files of the selected run.
config_class, model_class, tokenizer_class = MODEL_CLASSES['ecom_senti']
config = config_class.from_pretrained(config_path, num_labels=4)
tokenizer = tokenizer_class.from_pretrained(vocab_path)
model = model_class.from_pretrained(model_path, config=config)
model.to(torch.device(target_device))  
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
model.eval()

BertEcomCommentMultiPolarV4(
  (dropout): Dropout(p=0.1, inplace=False)
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
      

### 加载函数

In [6]:
import json
import os
import re
import time

import numpy as np
from torchviz import make_dot, make_dot_from_trace

from data_preprocess import convert_text
from mtl_manual import write_predictions
from utils_ecom_senti import (
    RawResult, 
    SquadExample,
    convert_examples_to_features,
    convert_polar_examples_to_features, find_positions
)
from utils_glue import InputExample

def to_list(tensor):
    """Return the values of ``tensor`` as (nested) plain Python lists.

    The tensor is detached from the autograd graph and moved to the CPU
    before the conversion.
    """
    host_tensor = tensor.detach().cpu()
    return host_tensor.tolist()


# Load the subtype definitions of the configured category (``subdict``) from
# SUBTYPE.json in the current working directory.  The file is opened in a
# ``with`` block so the handle is closed, and decoded explicitly as UTF-8
# (the JSON interchange encoding) instead of the platform default.
with open(os.path.join(os.getcwd(), 'SUBTYPE.json'), 'r', encoding='utf-8') as subtype_file:
    TRANS_SUBTYPE = json.load(subtype_file)[subdict]
# Subtype names in file order; find_subtype() indexes into this list.
subtype_list = list(TRANS_SUBTYPE.keys())


# Polarity mapping.  Keys are the stringified class index looked up by
# general_sentiment (str() of the argmax returned by calc_polar); values are
# the polarity scores stored in each opinion's 'polarity' field.
# NOTE(review): presumably 1 = negative, 3 = neutral, 5 = positive — confirm.
label_map = {
    '2': 5,
    '0': 1,
    '1': 3
}


def find_subtype(idx):
    """Return the subtype name stored at position ``idx`` of ``subtype_list``."""
    subtype_name = subtype_list[idx]
    return subtype_name



def calc_polar(text, opinions, inputs, seq_outputs, max_seq_length=256):
    """Predict one polarity class index per extracted opinion.

    For every opinion a 0/1 mask of length ``max_seq_length`` is built from
    the position of its opinion term in ``text``; the masks are batched and
    passed to the global ``model`` together with ``seq_outputs`` and the
    attention mask, both repeated once per opinion.

    Args:
        text: Text in which ``find_positions`` locates each opinion term.
        opinions: Sequence of dicts, each providing an ``'opinionTerm'`` key.
        inputs: Keyword arguments of the preceding model call; must contain
            ``'attention_mask'``.  The dict is copied, not modified.
        seq_outputs: Tensor forwarded to the model as ``'seq_embeddings'``.
        max_seq_length: Length of each opinion mask.

    Returns:
        A list with the argmax (over axis 1) of the model output for each
        opinion, in the order of ``opinions``.
    """
    opinion_masks = []
    for opinion in opinions:
        opinion_mask = [0] * max_seq_length
        op_start, op_end = find_positions(text, [opinion['opinionTerm']])
        if op_start == -2 or op_start > max_seq_length - 5:
            # Term not usable (-2 is presumably the "not found" value of
            # find_positions — TODO confirm) or too close to the end of the
            # sequence: mark position 0 instead.
            opinion_mask[0] = 1
        else:
            # Positions are shifted by 4 and clamped to the last valid index.
            # NOTE(review): the offset presumably skips the tokens placed in
            # front of the document text — confirm against the featurizer.
            for i in range(op_start, op_end):
                opinion_mask[min(i + 4, max_seq_length - 1)] = 1
        opinion_masks.append(opinion_mask)

    device = torch.device(target_device)
    all_opinion_mask = torch.tensor(opinion_masks, dtype=torch.float32).to(device)
    num_opinions = len(opinion_masks)

    with torch.no_grad():
        # Work on a shallow copy so the caller's ``inputs`` dict keeps its
        # original entries.
        polar_inputs = dict(inputs)
        polar_inputs.update({
            'opinion_mask': all_opinion_mask,
            'seq_embeddings': seq_outputs.repeat(num_opinions, 1, 1),
            'attention_mask': inputs['attention_mask'].repeat(num_opinions, 1),
        })
        outputs = model(**polar_inputs)
    return np.argmax(outputs.detach().cpu().numpy(), axis=1).tolist()


def general_sentiment(text, tokenizer=tokenizer, model=model):
    """Extract aspect/opinion terms and their polarity from one review text.

    The text is truncated to 250 characters, converted to model features and
    run through ``model``.  For each per-subtype output an aspect term and an
    opinion term are decoded with ``write_predictions``; entries whose aspect
    term is longer than one character are kept and then scored by
    ``calc_polar``.

    Args:
        text: Raw review text.
        tokenizer: Tokenizer used to build the features (defaults to the
            tokenizer loaded by the notebook).
        model: Model used for the extraction pass (defaults to the model
            loaded by the notebook).

    Returns:
        A tuple ``(ret, timings)``.  ``ret`` is a dict with the converted
        ``'text'`` and a list ``'opinions'`` of dicts holding
        ``'aspectSubtype'``, ``'aspectTerm'``, ``'opinionTerm'`` and
        ``'polarity'``.  ``timings`` is a list of five durations in seconds:
        preprocessing, model forward pass, term post-processing, polarity
        model call and polarity mapping.
    """
    # Long sentences are not supported yet, so only the head of the text is used.
    text = text[:250]
    start = time.time()
    converted_text = convert_text(text)

    # Build the model features and run the model.
    example = SquadExample(
        qas_id=0,
        question_text=list(TRANS_SUBTYPE.values())[0],
        doc_tokens=converted_text,
        label=1,
    )
    features = convert_examples_to_features(examples=[example],
                                            tokenizer=tokenizer,
                                            max_seq_length=256,
                                            doc_stride=128,
                                            max_query_length=20,
                                            is_training=False,
                                            label_list=[1, 3, 5],
                                            trans_subtype=TRANS_SUBTYPE)
    device = torch.device(target_device)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long).to(device)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long).to(device)
    all_question_ids = torch.tensor([-1] * len(features), dtype=torch.long).to(device)
    with torch.no_grad():
        inputs = {
            'input_ids': all_input_ids,
            'attention_mask': all_input_mask,
            'token_type_ids': all_segment_ids,  # XLM don't use segment_ids
            'question_ids': all_question_ids,
        }
        time1 = time.time()
        outputs = model(**inputs)
        time2 = time.time()

    # outputs[0]/[1] hold the aspect start/end logits and outputs[2]/[3] the
    # opinion start/end logits; only the first row of each entry is used.
    unique_id = features[0].unique_id
    all_results = []
    all_op_results = []
    for asp_start, asp_end, op_start, op_end in zip(outputs[0], outputs[1],
                                                    outputs[2], outputs[3]):
        all_results.append(RawResult(unique_id=unique_id,
                                     start_logits=to_list(asp_start[0]),
                                     end_logits=to_list(asp_end[0])))
        all_op_results.append(RawResult(unique_id=unique_id,
                                        start_logits=to_list(op_start[0]),
                                        end_logits=to_list(op_end[0])))

    # Decode the terms from the model outputs.
    ret = {
        'text': converted_text,
        'opinions': []
    }
    prediction_kwargs = dict(n_best_size=5,
                             max_answer_length=20,
                             do_lower_case=True,
                             verbose_logging=False,
                             version_2_with_negative=True,
                             null_score_diff_threshold=0)
    for idx, (asp_result, op_result) in enumerate(zip(all_results, all_op_results)):
        # Stop before idx runs past the known subtypes; find_subtype(idx)
        # would raise IndexError otherwise.
        if idx >= len(TRANS_SUBTYPE):
            break
        asp, _ = write_predictions([example], features, [asp_result], **prediction_kwargs)  # aspect term
        op, _ = write_predictions([example], features, [op_result], **prediction_kwargs)  # opinion term
        asp_ret = asp[0].replace(" ", "")
        op_ret = op[0].replace(" ", "")
        # Only aspect terms longer than one character are kept.
        if len(asp_ret) > 1:
            ret['opinions'].append({
                'aspectSubtype': find_subtype(idx),
                'aspectTerm': asp_ret,
                'opinionTerm': op_ret,
            })

    time3 = time.time()
    # Default for the no-opinion case so the timing list below is always
    # computable (the polarity model duration is then reported as 0).
    time4 = time3

    if ret['opinions']:
        polars = calc_polar(example.doc_tokens, ret['opinions'], inputs, outputs[4])
        time4 = time.time()
        for opinion, polar in zip(ret['opinions'], polars):
            if not opinion['opinionTerm']:
                # No opinion term was extracted: fall back to polarity 3.
                opinion['polarity'] = 3
            else:
                opinion['polarity'] = label_map.get(str(polar), 3)
    time5 = time.time()

    # Durations: preprocessing, model forward pass, term post-processing,
    # polarity model call, polarity mapping.
    return ret, [time1 - start, time2 - time1, time3 - time2, time4 - time3, time5 - time4]
    

### 文本测试

- GPU 单进程 0.2s 一条 GPU利用率8%
- CPU 单进程 0.5s 一条 CPU利用率超过100%(多核)

```python

import time
start = time.time()
for i in range(100):
    p = general_sentiment(text)
print('cost time', time.time() - start)

```




In [8]:
# Run the pipeline once on a sample review and print the elapsed wall-clock time.
text = "这个真的不推荐,眼霜还是买大牌吧,完全不吸收,担心会长脂肪粒"
timer = []
start = time.time()
for i in range(1):
    # general_sentiment returns the result dict and its per-stage timings.
    ret, t = general_sentiment(text)
#     timer.append(t)
print('cost time', time.time() - start)
# Last expression of the cell: display the extraction result.
ret





cost time 0.3654806613922119


{'text': '这个真的不推荐,眼霜还是买大牌吧,完全不吸收,担心会长脂肪粒',
 'opinions': [{'aspectSubtype': 'Fat Granule',
   'aspectTerm': '脂肪粒',
   'opinionTerm': '担心会长脂肪粒',
   'polarity': 3}]}

### 对比godolphin结果

In [9]:
import requests
# Send the same text to the godolphin analysis service for comparison.
payload = {
    'text': text,
    'category': 'SkinCare'  # replace with the matching godolphin category
}
headers = {
    'Content-Type': "application/json",
    'Cache-Control': "no-cache",
    }

# A timeout is set so the cell fails instead of hanging forever when the
# service is unreachable or stalls.
r = requests.post('http://godolphin.ym:30078/analysis', json=payload, headers=headers, timeout=30)
# Last expression of the cell: display the decoded JSON response.
r.json()


{'ok': True,
 'data': {'data': [{'aspectSubtype': 'Fat Granule',
    'aspectType': 'Product',
    'polarity': 3,
    'aspectTerm': ['脂肪粒'],
    'opinionTerm': [''],
    'aspect_confidence': 1,
    'opinion_confidence': 0.76811594,
    'subtype_strategy': 'R102/n-脂肪粒/a-3,3/lc',
    'opinion_strategy': '{}NEU/p',
    'mapping': '',
    'summary': '脂肪粒'},
   {'aspectSubtype': 'Brand Equity',
    'aspectType': 'Branding',
    'polarity': 1,
    'aspectTerm': ['大牌'],
    'opinionTerm': ['担心'],
    'aspect_confidence': 1,
    'opinion_confidence': 0.80595875,
    'subtype_strategy': 'R102/n-大牌/a-1,1/lc',
    'opinion_strategy': 's/担心/oNEG/p/1-KPT/g/1',
    'mapping': '',
    'summary': '大牌担心'}],
  'lang': '',
  'is_spam': False,
  'model_version': '',
  'text_polarity': 1}}