使用最优模型预测train+dev数据

In [0]:
# Inspect the GPU attached to this runtime (shell escape to nvidia-smi)
! nvidia-smi

Wed May 20 15:21:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# Install dependencies.
# fastNLP is pinned to 0.5.5, the release this notebook was originally run with, so that
# a fresh runtime resolves the same package and the fastNLP.core.Const /
# fastNLP.core.predictor.Predictor imports used below keep working.
! pip install fastNLP==0.5.5

Collecting fastNLP
[?25l  Downloading https://files.pythonhosted.org/packages/fc/a5/956a2678ee29e7a50b33c06d0390644a184701b585013c94d90106bdcb4c/FastNLP-0.5.5.tar.gz (274kB)
[K     |████████████████████████████████| 276kB 6.5MB/s 
Collecting nltk>=3.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 18.4MB/s 
Building wheels for collected packages: fastNLP, nltk
  Building wheel for fastNLP (setup.py) ... [?25l[?25hdone
  Created wheel for fastNLP: filename=FastNLP-0.5.5-cp36-none-any.whl size=332757 sha256=00efc36e1d1ace3518f8ad0e0ef9eb114a2a8fc0f0c4b16651a2a31f719c8081
  Stored in directory: /root/.cache/pip/wheels/13/b9/4e/7e7b9c2e3deae523b2eec14157ed112ce09bf1dee5483a48ae
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp36-none-any.whl size=1434673 sha256=960bba9a511159cc5497f

加载数据集

In [0]:
import os
import sys
import codecs
import copy

import torch
from fastNLP.core import Const
from fastNLP.core.predictor import Predictor
from fastNLP.io import PeopleDailyNERLoader

# Make the project checkout on Google Drive importable. Position 0 is the front of
# sys.path, i.e. the highest search priority, so this copy of myClue is found first.
myclue_root = '/content/drive/My Drive/my_framework/qyt_clue/'
sys.path.insert(0, myclue_root)

import myClue  # noqa
# Print where the module was actually loaded from, to confirm the right copy was picked up.
print('myClue module path :{}'.format(myClue.__file__))
from myClue.core import logger  # noqa
from myClue.core.utils import print_data_bundle  # noqa
from myClue.tools.serialize import load_serialize_obj  # noqa

myClue module path :/content/drive/My Drive/my_framework/qyt_clue/myClue/__init__.py


In [0]:
# Predict on the training data and write the predicted labels to a CoNLL-style file.
model_path = '/content/drive/My Drive/my_framework/qyt_clue/data/weibo_NER/model_bert_fine_tuning'
model_file = os.path.join(model_path, 'best_BertCRF_f_2020-05-20-06-14-43-235587')
train_file = '/content/drive/My Drive/my_framework/qyt_clue/data/weibo_NER/train.conll'
predict_output_file = '/content/drive/My Drive/my_framework/qyt_clue/data/weibo_NER/train_bert_predict.conll'
char_vocab_pkl_file = os.path.join(model_path, 'vocab_char.pkl')
target_vocab_pkl_file = os.path.join(model_path, 'target_char.pkl')

# Load the data
data_loader = PeopleDailyNERLoader()
data_bundle = data_loader.load({'train': train_file})
print_data_bundle(data_bundle)
dataset = data_bundle.datasets['train']
# Keep an untouched copy: `dataset` has its raw character field renamed and indexed
# below, while the original characters are still needed when writing the output file.
dataset_original = copy.deepcopy(dataset)

# Load the vocabularies that were serialized next to the model
char_vocab = load_serialize_obj(char_vocab_pkl_file)
logger.info('char_vocab:{}'.format(char_vocab))
target_vocab = load_serialize_obj(target_vocab_pkl_file)
logger.info('target_vocab:{}'.format(target_vocab))

# Load the model.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
use_cuda = torch.cuda.is_available()
# Without CUDA, remap all storages to the CPU so that a checkpoint saved with GPU
# tensors can still be loaded; with CUDA, map_location=None is torch.load's default,
# so the GPU path behaves exactly as before.
model = torch.load(model_file, map_location=None if use_cuda else 'cpu')
if use_cuda:
    model = model.cuda()
    logger.info('use cuda')
model.eval()
logger.info('模型加载完毕:\n{}'.format(model))

# Pre-process: expose the characters under the model's input field name, add the
# sequence-length field, flag input/target fields, then index with the char vocabulary.
dataset.rename_field(field_name=Const.RAW_CHAR, new_field_name=Const.INPUT)
dataset.add_seq_len(field_name=Const.INPUT)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET, Const.INPUT_LEN)
char_vocab.index_dataset(dataset, field_name=Const.INPUT)

# Predict. `predictor` is deliberately left at module level: the dev-data cell reuses it.
predictor = Predictor(model)
predict_output = predictor.predict(data=dataset, seq_len_field_name=Const.INPUT_LEN)
pred_results = predict_output.get(Const.OUTPUT)

# Decode the predicted label ids and write one "char<TAB>label" line per character,
# with a blank line after every sentence.
with codecs.open(predict_output_file, mode='w', encoding='utf8') as fw:
    for datarow, pred_result in zip(dataset_original, pred_results):
        pred_result = [target_vocab.to_word(pred_item) for pred_item in pred_result]
        row_chars = datarow[Const.RAW_CHAR]
        # zip stops at the shorter sequence, so any length mismatch is silently truncated.
        fw.write(''.join('{}\t{}\n'.format(char, label) for char, label in zip(row_chars, pred_result)))
        fw.write('\n')
logger.info('predict_output_file：{}'.format(predict_output_file))

2020-05-20 15:21:51 I [utils.py:16] dataset name : train
2020-05-20 15:21:51 I [utils.py:17] dataset len : 1350
2020-05-20 15:21:51 I [utils.py:18] dataset example : 
2020-05-20 15:21:51 I [utils.py:19] 
+------------------------------------------+------------------------------------------+
| raw_chars                                | target                                   |
+------------------------------------------+------------------------------------------+
| ['科', '技', '全', '方', '位', '资', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['对', '，', '输', '给', '一', '个', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'B-PER... |
| ['今', '天', '下', '午', '起', '来', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['今', '年', '拜', '年', '不', '短', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['浑', '身', '酸', '疼', '，', '两', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
+------------------------------------------+------------------------------------------+
2020-05-20 15:21:51 I [utils.py:20

+-------------+-----------+--------+
| field_names | raw_chars | target |
+-------------+-----------+--------+
|   is_input  |   False   | False  |
|  is_target  |   False   | False  |
| ignore_type |           |        |
|  pad_value  |           |        |
+-------------+-----------+--------+


2020-05-20 15:21:52 I [<ipython-input-4-9c0f8b19b9f4>:16] char_vocab:Vocabulary(['科', '技', '全', '方', '位']...)
2020-05-20 15:21:52 I [<ipython-input-4-9c0f8b19b9f4>:18] target_vocab:Vocabulary(['O', 'B-PER.NOM', 'I-PER.NOM', 'B-LOC.NAM', 'I-LOC.NAM']...)
2020-05-20 15:22:08 I [<ipython-input-4-9c0f8b19b9f4>:23] use cuda
2020-05-20 15:22:08 I [<ipython-input-4-9c0f8b19b9f4>:25] 模型加载完毕:
BertCRF(
  (embed): BertEmbedding(
    (dropout_layer): Dropout(p=0.5, inplace=False)
    (model): _BertWordModel(
      (encoder): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(3405, 768)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0): BertLayer(
              (attention): BertAttention(
     

In [0]:
# Predict on the dev data and write the predicted labels to a CoNLL-style file.
# This cell reuses `predictor`, `char_vocab` and `target_vocab` from the training-data
# cell, so that cell has to be run first.
# NOTE: the variable is still called `train_file` and the file is still registered under
# the 'train' split key, which is why the log below reports "dataset name : train"
# (presumably the loader needs a 'train' entry; confirm before renaming).
train_file = '/content/drive/My Drive/my_framework/qyt_clue/data/weibo_NER/dev.conll'
predict_output_file = '/content/drive/My Drive/my_framework/qyt_clue/data/weibo_NER/dev_bert_predict.conll'

# Load the data
data_loader = PeopleDailyNERLoader()
data_bundle = data_loader.load({'train': train_file})
print_data_bundle(data_bundle)
dataset = data_bundle.datasets['train']
# Untouched copy: the original characters are read back from it when writing the output.
dataset_original = copy.deepcopy(dataset)

# Pre-process exactly as for the training data: rename the character field to the model
# input field, add sequence lengths, flag input/target fields, index with the vocabulary.
dataset.rename_field(field_name=Const.RAW_CHAR, new_field_name=Const.INPUT)
dataset.add_seq_len(field_name=Const.INPUT)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET, Const.INPUT_LEN)
char_vocab.index_dataset(dataset, field_name=Const.INPUT)

# Predict
predict_output = predictor.predict(data=dataset, seq_len_field_name=Const.INPUT_LEN)
pred_results = predict_output.get(Const.OUTPUT)

# Decode label ids to label strings; emit "char<TAB>label" per character and a blank
# line after every sentence.
with codecs.open(predict_output_file, mode='w', encoding='utf8') as fw:
    for original_row, label_ids in zip(dataset_original, pred_results):
        label_words = map(target_vocab.to_word, label_ids)
        sentence_lines = [
            '{}\t{}\n'.format(ch, tag)
            for ch, tag in zip(original_row[Const.RAW_CHAR], label_words)
        ]
        fw.write(''.join(sentence_lines))
        fw.write('\n')
logger.info('predict_output_file：{}'.format(predict_output_file))

2020-05-20 15:23:55 I [utils.py:16] dataset name : train
2020-05-20 15:23:55 I [utils.py:17] dataset len : 270
2020-05-20 15:23:55 I [utils.py:18] dataset example : 
2020-05-20 15:23:55 I [utils.py:19] 
+------------------------------------------+------------------------------------------+
| raw_chars                                | target                                   |
+------------------------------------------+------------------------------------------+
| ['口', '腔', '溃', '疡', '加', '上', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['用', '最', '大', '努', '力', '去', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['无', '论', '如', '何', '，', '不', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['总', '要', '相', '信', '那', '句', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
| ['清', '清', '淡', '然', '朴', '普', ... | ['O', 'O', 'O', 'O', 'O', 'O', 'O', '... |
+------------------------------------------+------------------------------------------+
2020-05-20 15:23:55 I [utils.py:20]

+-------------+-----------+--------+
| field_names | raw_chars | target |
+-------------+-----------+--------+
|   is_input  |   False   | False  |
|  is_target  |   False   | False  |
| ignore_type |           |        |
|  pad_value  |           |        |
+-------------+-----------+--------+


2020-05-20 15:24:08 I [<ipython-input-5-1d165135c1ab>:28] predict_output_file：/content/drive/My Drive/my_framework/qyt_clue/data/weibo_NER/dev_bert_predict.conll
