In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import warnings
import os
import sys
import re

# Put the parent directory on the import path; presumably this is what makes
# the `bailian_nlp` package importable from this notebook - TODO confirm.
sys.path.append("../")
# NOTE(review): this suppresses every warning for the whole session, including
# deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import os


# Package root, resolved against the notebook's current working directory.
root_dir = os.path.abspath('../bailian_nlp/')

# Directory holding the generated POS dataset; created on first use.
data_path = os.path.join(root_dir, 'datadrive/bailian/pos')
if not os.path.exists(data_path):
    os.makedirs(data_path, exist_ok=True)

# Train / validation files written by the data-building step.
train_path, valid_path = (
    os.path.join(data_path, csv_name) for csv_name in ('train.csv', 'valid.csv')
)

# Pretrained Chinese BERT directory; the POS model artefacts are stored next to it.
model_dir = os.path.join(root_dir, 'datadrive/models/chinese_L-12_H-768_A-12/')
init_checkpoint_pt, bert_config_file, vocab_file, model_pt, config_file = (
    os.path.join(model_dir, file_name)
    for file_name in (
        'bert_model.bin',
        'bert_config.json',
        'vocab.txt',
        'pos.bin',
        'pos.json',
    )
)



In [4]:
def build_data(seed=None):
    """Build the character-level POS training and validation files.

    Reads four corpora from the parent directory of ``data_path``
    (``final_baidu-23w.txt``, ``fake.txt``, ``special.txt`` and ``single.txt``).
    Every non-empty line is parsed into ``word/flag`` tokens, each word is
    split into characters, and every character receives a positional tag:
    ``S_<flag>`` for a single-character word, otherwise ``B_<flag>``, zero or
    more ``I_<flag>`` and a closing ``E_<flag>``.

    Each output row is ``<tags>△△△<characters>`` (both space-joined), written
    below a ``0△△△1`` header line to ``train_path`` or ``valid_path``. Lines
    from the first two corpora are sent to the validation file when a uniform
    random draw is <= 0.006; the other two corpora are training-only. Lines
    that yield no ``word/flag`` token are skipped instead of being written as
    empty rows.

    Args:
        seed: Optional seed for the train/validation split. ``None`` (the
            default) keeps the split non-deterministic, as it was before this
            parameter existed.

    Raises:
        OSError: If one of the corpus files cannot be opened or an output
            file cannot be written.
    """
    # Imported once per call rather than once per line / per token.
    import random
    import re

    from bailian_nlp.modules.data.tokenization import _is_control

    token_pattern = re.compile(r'(.+?)/(?:([a-z]{1,2})(?:$| ))')

    raw_data_dir = os.path.dirname(data_path)
    seg_file = os.path.join(raw_data_dir, 'final_baidu-23w.txt')
    fake_file = os.path.join(raw_data_dir, 'fake.txt')
    special_file = os.path.join(raw_data_dir, 'special.txt')
    dict_file = os.path.join(raw_data_dir, 'single.txt')

    delimiter = '△△△'

    # Share of lines (first two corpora only) routed to the validation file.
    valid_fraction = 0.006
    splittable_corpora = 2

    # Entity flags that are not trusted on a single-character word.
    single_char_entity_flags = {'nt', 'ti', 'nr', 'ns', 'nz'}

    # Characters that are replaced by the literal token 'unk'.
    replace_chars = {
        '\x97',
        '\uf076',
        "\ue405",
        "\ue105",
        "\ue415",
        '\x07',
        '\x7f',
        '\u3000',
        '\xa0',
        ' ',
    }

    rng = random.Random(seed)

    def needs_replacement(char):
        # `_is_control` is the project's tokenizer helper; its exact rule is
        # defined outside this notebook.
        return char in replace_chars or char.isspace() or _is_control(char)

    # All six files are opened up front so that a missing corpus fails before
    # anything is processed. The encoding is explicit because the data is
    # Chinese text and the platform default is not guaranteed to be UTF-8.
    # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
    with open(seg_file, encoding='utf-8') as fin1, \
            open(fake_file, encoding='utf-8') as fin2, \
            open(special_file, encoding='utf-8') as fin3, \
            open(dict_file, encoding='utf-8') as fin4, \
            open(train_path, 'w', encoding='utf-8') as train_f, \
            open(valid_path, 'w', encoding='utf-8') as valid_f:

        train_f.write(f'0{delimiter}1\n')
        valid_f.write(f'0{delimiter}1\n')

        for k, fin in enumerate([fin1, fin2, fin3, fin4]):
            for line in fin:
                line = line.strip()
                if not line:
                    continue

                words = []
                flags = []
                for word, flag in token_pattern.findall(line):
                    char_list = ['unk' if needs_replacement(c) else c for c in word]

                    # The pattern's `.+?` guarantees at least one character.
                    if len(char_list) == 1:
                        # Drop implausible single-character entities.
                        if flag in single_char_entity_flags:
                            flag = 'xx'
                        tag_list = [f'S_{flag}']
                    else:
                        tag_list = (
                            [f'B_{flag}']
                            + [f'I_{flag}'] * (len(char_list) - 2)
                            + [f'E_{flag}']
                        )

                    words.extend(char_list)
                    flags.extend(tag_list)

                if not words:
                    # Nothing matched `word/flag`; an empty row would only
                    # pollute the dataset.
                    continue

                assert len(words) == len(flags)

                if k < splittable_corpora and rng.random() <= valid_fraction:
                    fout = valid_f
                else:
                    fout = train_f

                fout.write(delimiter.join([
                    ' '.join(flags),
                    ' '.join(words)
                ]))
                fout.write('\n')
            
            
            
# Regenerate train.csv and valid.csv; both files are opened in write mode, so
# any existing content is overwritten.
build_data()
    

In [3]:
# Regular training run: build the data loaders, create the model, train one epoch.

from importlib import reload

import torch
import torch.nn as nn

from bailian_nlp.modules import BertNerData as NerData
from bailian_nlp.modules.models import bert_models
from bailian_nlp.modules.train import train

# Re-execute the project modules so the latest on-disk code is used.
reload(bert_models)
reload(train)

data = NerData.create(
    train_path,
    valid_path,
    vocab_file,
    data_type="bert_uncased",
    is_cls=False,
    max_seq_len=424,
    batch_size=128,
)

model = bert_models.BertBiLSTMAttnCRF.create(
    len(data.label2idx),
    bert_config_file,
    init_checkpoint_pt,
    enc_hidden_dim=256,
)
model.get_n_trainable_params()

num_epochs = 1
learner = train.NerLearner(
    model,
    data,
    best_model_path=model_pt,
    lr=0.001,
    clip=1.0,
    sup_labels=data.id2label,
    # Total step budget: one step per training batch, per epoch.
    t_total=num_epochs * len(data.train_dl),
)

learner.fit(num_epochs, target_metric='f1')


HBox(children=(IntProgress(value=0, description='bert data', max=793801, style=ProgressStyle(description_width…

2019-04-15 17:44:14,432 DEBUG: get_data cost 223.842493s





HBox(children=(IntProgress(value=0, description='bert data', max=2537, style=ProgressStyle(description_width='…

2019-04-15 17:44:15,578 DEBUG: get_data cost 1.02457s





2019-04-15 17:44:21,139 INFO: Resuming train... Current epoch 0.


HBox(children=(IntProgress(value=0, max=24807), HTML(value='')))

2019-04-15 18:59:51,846 INFO: 
epoch 1, average train epoch loss=6.3807






HBox(children=(IntProgress(value=0, max=80), HTML(value='')))




2019-04-15 19:00:01,235 INFO: on epoch 0 by max_f1: 0.926
2019-04-15 19:00:01,236 INFO: Saving new best model...


              precision    recall  f1-score   support

       <pad>      0.000     0.000     0.000         0
       [CLS]      1.000     1.000     1.000      2537
         B_t      0.971     0.970     0.970       824
         E_t      0.973     0.969     0.971       809
         S_w      0.998     0.998     0.998      5788
        B_nt      0.913     0.952     0.932      3361
        I_nt      0.939     0.977     0.957     15657
        E_nt      0.909     0.924     0.917      3250
        B_ti      0.962     0.978     0.970      1870
        I_ti      0.961     0.985     0.973      2685
        E_ti      0.973     0.989     0.980      1828
        B_nr      0.965     0.969     0.967      2799
        I_nr      0.944     0.964     0.954      2274
        E_nr      0.960     0.947     0.953      2760
         B_v      0.919     0.916     0.917      2957
         E_v      0.921     0.919     0.920      2897
         B_p      0.959     0.972     0.966       144
         E_p      0.966    

In [None]:
# Longer run: rebuild the learner around the existing model and data with a
# step budget sized for ten epochs.
num_epochs = 10
learner = train.NerLearner(
    model,
    data,
    best_model_path=model_pt,
    lr=0.001,
    clip=1.0,
    sup_labels=data.id2label,
    t_total=num_epochs * len(data.train_dl),
)

learner.fit(num_epochs, target_metric='f1')

In [None]:
# Resume training from the released POS tagger's saved model.

from importlib import reload

from bailian_nlp.released import pos

reload(pos)
tagger = pos.PosTagger()
tagger.init_env(for_train=True)

learner = tagger.learner
data = learner.data

num_epochs = 1
learner.load_model()
learner.t_total = num_epochs * len(data.train_dl)

# Report on the union of the dataset's labels (minus the first entry) and the
# labels the learner already tracks.
label_pool = set(data.id2label[1:]) | set(learner.sup_labels)
learner.sup_labels = list(label_pool)

learner.fit(num_epochs, target_metric='f1')


2019-04-17 10:47:39,699 INFO: load default user_dict in /home/liuxiang/Projects/ner-bert/bailian_nlp/datadrive/dict/user_dict.txt
2019-04-17 10:47:39,702 INFO: 本次加载词条数：3
2019-04-17 10:47:39,703 INFO: 当前总词条数: 3
2019-04-17 10:47:47,988 INFO: found pos model file in /home/liuxiang/Projects/ner-bert/bailian_nlp/datadrive/models/chinese_L-12_H-768_A-12/pos.bin
2019-04-17 10:47:48,867 INFO: pos model loads success!


HBox(children=(IntProgress(value=0, description='bert data', max=793769, style=ProgressStyle(description_width…

2019-04-17 10:51:25,215 DEBUG: get_data cost 211.709043s





HBox(children=(IntProgress(value=0, description='bert data', max=2574, style=ProgressStyle(description_width='…

2019-04-17 10:51:26,159 DEBUG: get_data cost 0.94289s





2019-04-17 10:51:26,982 INFO: found pos model file in /home/liuxiang/Projects/ner-bert/bailian_nlp/datadrive/models/chinese_L-12_H-768_A-12/pos.bin
2019-04-17 10:51:27,241 INFO: pos model loads success!
2019-04-17 10:51:27,492 INFO: Resuming train... Current epoch 0.


HBox(children=(IntProgress(value=0, max=24806), HTML(value='')))

In [36]:
import torch
# Sanity check: does PyTorch report a usable CUDA device in this environment?
torch.cuda.is_available()

True

In [None]:
# Build a prediction data loader from the validation file and the current learner.
# `reload` is imported here so the cell does not depend on another cell having
# imported it first.
from importlib import reload

# Use the same package-qualified path as the other cells in this notebook
# (`bailian_nlp.modules...`) instead of a bare top-level `modules` package.
from bailian_nlp.modules.data import bert_data

reload(bert_data)
dl = bert_data.get_bert_data_loader_for_predict(valid_path, learner)

In [None]:
# Optional step, currently disabled: reload the saved model before predicting.
# learner.load_model()
# Run the learner over `dl`, the prediction data loader built from valid_path.
preds = learner.predict(dl)

In [None]:
# Span-level report comparing the predictions with the labels carried by `dl`.
# The import uses the package-qualified path, consistent with the other
# `bailian_nlp.modules...` imports in this notebook.
# NOTE(review): assumes `plot_metrics` lives under `bailian_nlp/modules/utils` - confirm.
from bailian_nlp.modules.utils.plot_metrics import get_bert_span_report

clf_report = get_bert_span_report(dl, preds)
print(clf_report)

In [11]:
# Create the released POS tagger for inference.
from importlib import reload

from bailian_nlp.released import pos

# Re-execute the module first so the tagger is built from the current code.
reload(pos)

tagger = pos.PosTagger()


2019-04-16 10:20:07,876 INFO: load default user_dict in /home/liuxiang/Projects/ner-bert/bailian_nlp/datadrive/dict/user_dict.txt
2019-04-16 10:20:07,878 INFO: 本次加载词条数：3
2019-04-16 10:20:07,879 INFO: 当前总词条数: 3
2019-04-16 10:20:08,918 INFO: found pos model file in /home/liuxiang/Projects/ner-bert/bailian_nlp/datadrive/models/chinese_L-12_H-768_A-12/pos.bin
2019-04-16 10:20:09,212 INFO: pos model loads success!


In [19]:
import time

# Sample sentences used for manual spot checks. Only the last entry is tagged;
# move another sentence to the end (or change the index below) to try it.
sample_texts = [
    '近日，编程猫（深圳点猫科技有限公司）正式对外宣布完成B轮1.2亿元融资。本轮融资由高瓴资本领投，清流资本、清晗基金跟投，天使轮投资者猎豹移动继续跟投。',
    '未来编程教育产业将蓬勃发展，编程猫作为提供工具与内容的企业，有望长期处于行业领跑者地位。',
    '美年大健康产业（集团）有限公司美年大健康产业（集团）有限公司美年大健康产业（集团）有限公司始创于2004年,是中国健康体检和医疗服务集团,总部位于上海,深耕布局北京、深圳、沈阳、广州、成都、武汉、...',
    '百炼智能百炼智能',
    '高越君冯是聪',
    '周光明确否认CEO佟显侨和衡量推动发出公司公告',
    '周光明确否认CEO佟显侨和CTO衡量说的罪名',
    '董事',
    '一言九鼎',
    '客户包括雀巢、洲际酒店、瑞士航空、德意志银行、红牛、瑞士联合银行等世界知名公司。',
    '药方越是多的，越表明病是难的于治疗',
]
text = sample_texts[-1]

# perf_counter is monotonic, so the elapsed time cannot be distorted by
# system clock adjustments the way time.time() can.
st = time.perf_counter()
res = tagger.cut(text)
ed = time.perf_counter()
print(ed - st)
res

HBox(children=(IntProgress(value=0, description='bert data', max=1, style=ProgressStyle(description_width='ini…

2019-04-16 17:26:36,117 DEBUG: get_data cost 0.037379s





HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

2019-04-16 17:26:36,214 DEBUG: text_array_for_predict cost 0.136597s



0.1380758285522461


[[('药方', 'n'),
  ('越是', 'd'),
  ('多', 'a'),
  ('的', 'u'),
  ('，', 'w'),
  ('越', 'd'),
  ('表明', 'v'),
  ('病', 'n'),
  ('是', 'v'),
  ('难', 'a'),
  ('的', 'u'),
  ('于', 'p'),
  ('治疗', 'v')]]