In [1]:
import os
import re
import random
import logging
from esun_phoneme_tool.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import math
from sklearn.preprocessing import normalize

import torch
import torch.optim as optim
from torch import Tensor
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from torch.nn import CrossEntropyLoss
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import tokenize_and_map, RunningAverage

CN_EN_RE = r'[\u4e00-\u9fa5A-Za-z]'
FORMAT = '%(asctime)s %(levelname)s: %(message)s'
logging.basicConfig(level=logging.DEBUG, format=FORMAT)
# pd.set_option('display.max_rows', None)

In [2]:
class TextDataset(Dataset):
    def __init__(self, tokenizer, texts, trues=None,
                 pad_token_label_id=0, max_length=512, for_train=True):
        self.tokenizer = tokenizer
        self.texts = texts
        self.trues = trues
        self.pad_token_label_id = pad_token_label_id
        self.max_length = max_length
        self.for_train = for_train

    def __getitem__(self, idx):
        q_texts = self.texts[idx]

        processed_tokens = ['[CLS]'] 
        for text in q_texts:
            tokens, text2token, token2text = tokenize_and_map(self.tokenizer, text)

            cut_index = self.max_length - 50
            if cut_index < len(tokens):
                cut_text_index = text2token.index(cut_index)
                tokens = tokens[:cut_index]
                            
            processed_tokens += tokens + ['[SEP]']
            
        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(processed_tokens))
        token_type_ids = torch.tensor([0] * len(processed_tokens))
        attention_mask = torch.tensor([1] * len(processed_tokens))

        outputs = (input_ids, token_type_ids, attention_mask, )

        if self.for_train:
            true = self.trues[idx]
            label = torch.tensor(true)
            outputs += (label, )

        info = {
            'tokens': tokens,
            'text': q_texts, # [ ] 並非實際進BERT的句子(512上限)
        }
        outputs += (info, )
        return outputs
    
    def __len__(self):
        return len(self.texts)

    def create_mini_batch(self, samples):
        outputs = list(zip(*samples))

        # zero pad 到同一序列長度
        input_ids = pad_sequence(outputs[0], batch_first=True)
        token_type_ids = pad_sequence(outputs[1], batch_first=True)
        attention_mask = pad_sequence(outputs[2], batch_first=True)

        batch_output = (input_ids, token_type_ids, attention_mask)
    
        if self.for_train:
            labels = torch.stack(outputs[3])
            batch_output += (labels, )
        else:
            infos = outputs[3]
            batch_output += (infos, )

        return batch_output

In [3]:
def train_batch(model, data, optimizer, device, criterion):
    model.train()
    input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]

    outputs = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        labels=labels
    )

    loss = criterion(outputs.logits.view(-1, 2), labels.view(-1))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(model, valid_loader):
    model.eval()
    device = 'cuda' if next(model.parameters()).is_cuda else 'cpu'

    loss_averager = RunningAverage()
    acc_averager = RunningAverage()

    tp, fp, fn = 0, 0, 0    
    with torch.no_grad():
        for data in tqdm(valid_loader, desc='evaluate'):
            input_ids, token_type_ids, attention_mask, labels = [d.to(device) for d in data]

            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss_averager.add(outputs.loss.item())
            
            corrects = (outputs.logits.argmax(dim=-1) == labels).cpu().tolist()
            preds = outputs.logits.argmax(dim=-1).cpu().tolist()

            for label, pred in zip(labels, preds):
                tp += 1 if (label, pred) == (0, 0) else 0
                fp += 1 if (label, pred) == (1, 0) else 0
                fn += 1 if (label, pred) == (0, 1) else 0
            
            acc_averager.add_all(corrects)
    precision = tp / (tp + fp) if tp + fp > 0 else None
    recall = tp / (tp + fn) if tp + fn > 0 else None
    f1 = 2 / (1 / precision + 1 / recall) if precision and recall else None
    
    evaluation = {
        'loss': loss_averager.get(), 
        'accuracy':acc_averager.get(),
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    return evaluation

In [4]:
def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True

def _empty_cache():
#     try:
    with torch.no_grad():
        torch.cuda.empty_cache()
#     except:
#         pass

In [23]:
def bert_main(texts, trues,
              model_dir='/home/jovyan/if-beautiful-text/owen_dev/if_beautiful_text/cache_dir/bert-base-chinese',
              save_dir='./models/question_check/'):
    lr = 0.00001
    train_batch_size = 8
    evaluate_batch_size = 64
    max_iter = 100000
    show_train_per_iter = 100
    show_eval_per_iter = 100
    save_per_iter = 1000
    cpu_workers = 4
    checkpoint_folder = None

    assert save_per_iter % show_eval_per_iter == 0

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logging.info(f'device: {device}')

    tokenizer = BertTokenizer.from_pretrained(model_dir)

    global SKIP_TOKEN_IDS, SKIP_TOKENS
    SKIP_TOKEN_IDS = [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]
    SKIP_TOKENS = [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]

    if not checkpoint_folder:
        model = BertForSequenceClassification.from_pretrained(
            model_dir, 
            return_dict=True,
            num_labels=2)
    else:
        model = BertForSequenceClassification.from_pretrained(checkpoint_folder)

    model.to(device)

#     dataset = TextDataset(tokenizer, texts, trues)

#     setup_seed(0)
#     CUT_RATIO = 0.8
#     train_size = int(CUT_RATIO * len(dataset))
#     valid_size = len(dataset) - train_size
#     train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

    # t_texts, t_trues = prepare_data('/home/jovyan/wm-insur-call-qa/owen/training_data_generated_train_0119.tsv')
    # train_dataset = TextDataset(tokenizer, t_texts, t_trues)

    # v_texts, v_trues = prepare_data('/home/jovyan/wm-insur-call-qa/owen/training_data_generated_valid_0119.tsv')
    # valid_dataset = TextDataset(tokenizer, v_texts, v_trues)
    
    CUT_RATIO = 0.8
    train_size = int(CUT_RATIO * len(texts))
    train_dataset = TextDataset(tokenizer, texts[:train_size], trues[:train_size])
    valid_dataset = TextDataset(tokenizer, texts[train_size:], trues[train_size:])
    
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=train_batch_size,
        collate_fn=train_dataset.create_mini_batch,
        shuffle=True,
        num_workers=cpu_workers)
    valid_loader = DataLoader(
        dataset=valid_dataset,
        batch_size=evaluate_batch_size,
        collate_fn=valid_dataset.create_mini_batch,
        shuffle=True,
        num_workers=cpu_workers)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
    
    # customerize weight
    weight = [5, 1]
    weight = torch.tensor(weight).to(device)
    weight = weight.float()
    
    criterion = CrossEntropyLoss(weight=weight)
    
    i = 1
    is_running = True
    loss_averager = RunningAverage()
    max_f1 = 0
    while is_running:
        for train_data in train_loader:
            loss = train_batch(model, train_data, optimizer, device, criterion)
            loss_averager.add(loss)

            if i % show_train_per_iter == 0:
                logging.info('train_loss [{iter}]: {train_loss}'.format(
                    iter=i, train_loss=loss_averager.get()))
                loss_averager.flush()

            if i % show_eval_per_iter == 0:
                evaluation = evaluate(model, valid_loader)
                logging.info('valid_evaluation: loss={loss}, accuracy={accuracy}, '
                             'precision={precision}, recall={recall}, f1={f1}'
                             .format(**evaluation))
                if evaluation['f1']:
                    if evaluation['f1'] > max_f1:
                        eval_loss = evaluation['loss']
                        path = os.path.join(save_dir, f'question_check_weight5_1_step{i}_loss{eval_loss}/')
                        logging.info(f'Save model at {path}')
                        model.save_pretrained(path)
                        max_f1 = evaluation['f1']

            if i == max_iter:
                is_running = False
                break
            i += 1

        scheduler.step()

In [40]:
int(8966*0.8)

7172

In [24]:
bert_main(texts, trues)

2021-04-29 17:06:41,710 INFO: device: cuda
Some weights of the model checkpoint at /home/jovyan/if-beautiful-text/owen_dev/if_beautiful_text/cache_dir/bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassif

KeyboardInterrupt: 

### 測試結果

In [6]:
def prepare_data(path):
    df = pd.read_csv(path, sep='\t')
    
    texts, trues = [], []
    for i in df.index:
        question = df.loc[i, 'question']
        answer = df.loc[i, 'answer']
        
        question = ''.join([q for q in question if re.match(CN_EN_RE, q)])
        answer = ''.join([a for a in answer if re.match(CN_EN_RE, a)])

        texts.append([question, answer])
        label = 1 if df.loc[i, 'label'] == 5 else 0
        trues.append(label)
    return texts, trues

path = '/home/jovyan/wm-insur-call-qa/owen/data/qa_raw_copy.tsv'
texts, trues = prepare_data(path)

In [7]:
# 實驗1 - 自定義辭庫抓關鍵字
keys = '本人|解約|保費|健康|親自|日常支出|申請|說明|字號|風險|來源|繳費|終止|用途|後三碼|末三碼|簽名|錄音|\
電話|不當行銷|搭售|出生|先生|小姐|電訪|訪問|告知事項|風險|不當話術|不影響|受益人|知道|正確|變更|權利|義務|\
了解|確定|同意|清楚|收取|文件|恢復|協助|表單|一次繳清|躉繳|確認|投保|薪資|土地|帳戶|保單|管理|投資|貸款|房貸|房租|利息|收入\
年|月|日|保險|條款|照會|方便|帳單|股票|基金|投資|核對|辦理|契約'
tmp_texts = []

tokenizer = Tokenizer()

for text in texts:
#     tokens, token_types = tokenizer.tokenize(text[0])
    keys_lst = keys.split('|')
#     text_keyword = [token for token in tokens if token in keys_lst]
    text_keyword = [k for k in keys_lst if re.search(k, text[0])]
#     if not text_keyword:
#         print(text, text_keyword)
    tmp_texts.append([text[1]] + text_keyword)

NameError: name 'texts' is not defined

In [19]:
# 實驗2 - tf-idf
tokenizer = Tokenizer()
stop_words = '玉山|玉山銀行|保代|保代部|保險代理部|您好|你好|這裡|這邊|之後|打擾|敝姓|員工|編號|員編|總行|消金|中心|不好意思|呃|嗯'

# clean stop words
token_texts = []
for text in texts:
    tokens, token_types = tokenizer.tokenize(text[0])
    tmp_tokens = []
    for token, token_type in zip(tokens, token_types):
        if token_type == 'CN' and token not in stop_words.split('|'):
            tmp_tokens.append(token)
    token_texts.append(' '.join(tmp_tokens))

# get top 5 token from tf-idf
tfidf_vec = TfidfVectorizer()
tfidf_mat = tfidf_vec.fit_transform(token_texts,)
vocab_dict =tfidf_vec.vocabulary_#获得所有文本的关键字和其位置的dict
weight = tfidf_mat.toarray()
feat = np.argsort(-weight)#降序排序
total_key_word = []
for l in range(len(token_texts)):
    values_word = []
    for j in range(5):#获取每类文本的5个关键字
        values_word.append([k for k,v in vocab_dict.items() if v == feat[l,j]])
    total_key_word.append(values_word)
total_key_word[:100]

Building prefix dict from /home/jovyan/.local/lib/python3.6/site-packages/esun_phoneme_tool/jieba_dict_pool/general_jieba_0002.txt ...
2021-04-28 16:03:28,089 DEBUG: Building prefix dict from /home/jovyan/.local/lib/python3.6/site-packages/esun_phoneme_tool/jieba_dict_pool/general_jieba_0002.txt ...
Loading model from cache /tmp/jieba.u9c666b2c1c48c114768336a4722d3b92.cache
2021-04-28 16:03:28,090 DEBUG: Loading model from cache /tmp/jieba.u9c666b2c1c48c114768336a4722d3b92.cache
Loading model cost 0.658 seconds.
2021-04-28 16:03:28,748 DEBUG: Loading model cost 0.658 seconds.
Prefix dict has been built successfully.
2021-04-28 16:03:28,749 DEBUG: Prefix dict has been built successfully.


[[['小姐'], ['本人'], ['請問'], ['一下'], ['禮儀']],
 [['江淑'], ['方便'], ['訪問'], ['電話'], ['小姐']],
 [['江淑'], ['小姐'], ['本人'], ['請問'], ['福盛']],
 [['憑證'], ['保證'], ['繳交'], ['近期'], ['透過']],
 [['林建銘'], ['先生'], ['本人'], ['請問'], ['福利']],
 [['萬能'], ['大智'], ['變額'], ['巴黎'], ['法國']],
 [['投資型'], ['淨值'], ['方面'], ['保證'], ['變動']],
 [['天母'], ['親自'], ['藉由'], ['本人'], ['健康']],
 [['參考'], ['成本'], ['管理費'], ['進入'], ['條款']],
 [['扣掉'], ['解約'], ['提前'], ['當年度'], ['當時']],
 [['繳交'], ['來源'], ['資金'], ['解約'], ['保費']],
 [['還是'], ['保費'], ['就是'], ['繳交'], ['來源']],
 [['詢問'], ['儲蓄'], ['一次'], ['繳交'], ['來源']],
 [['規劃'], ['確實'], ['付費'], ['綜合'], ['考量']],
 [['都會'], ['提領'], ['近期'], ['透過'], ['部分']],
 [['林森'], ['林冠廷'], ['藉由'], ['提領'], ['這次']],
 [['提領'], ['部分'], ['契約'], ['終止'], ['以及']],
 [['提領'], ['無法'], ['一次'], ['原本'], ['恢復']],
 [['小姐'], ['本人'], ['請問'], ['一下'], ['禮儀']],
 [['近期'], ['透過'], ['依照'], ['感謝'], ['稍後']],
 [['安聯'], ['號碼'], ['收到'], ['接下來'], ['問題']],
 [['城東'], ['藉由'], ['這次'], ['要保人'], ['旁邊']],
 [['為何'], ['用途'], ['方便'], ['這次'], ['資金']],
 [['

In [20]:
from itertools import chain
q_tokens = [list(chain(*sentence)) for sentence in total_key_word]
processed_texts = [q_token + text for q_token, text in zip(q_tokens, texts)]

In [21]:
processed_texts[:10]

[['小姐',
  '本人',
  '請問',
  '一下',
  '禮儀',
  '您好這裡是玉山銀行您好這裡是玉山銀行保險代理部敝姓唐員工編號請問是湯拱運儲小姐本人嗎您好',
  '喂喂喂'],
 ['江淑',
  '方便',
  '訪問',
  '電話',
  '小姐',
  '呃您好這邊是玉山銀行總行保險代理部敝姓唐員工編號請問是江淑華小姐本人嗎呃呃您好請問她在嗎方便跟他做個電話訪問嗎',
  '喔謝謝'],
 ['江淑',
  '小姐',
  '本人',
  '請問',
  '福盛',
  '您好呃您好這邊是玉山銀行總行保險代理部敝姓唐員工編號請問是江淑華小姐本人嗎',
  '嘿您好對'],
 ['憑證',
  '保證',
  '繳交',
  '近期',
  '透過',
  '呃您好不好意思打擾您感謝您近期透過本行辦理首期保費繳交憑證那依照保險法令的要求為保證您的權益稍後電話訪問的內容會全程錄音請問您同意嗎',
  '好可可以'],
 ['林建銘', '先生', '本人', '請問', '福利', '您好這裡是玉山銀行總行保代部敝姓張員工編號請問是林建銘先生本人嗎', '喂嘿是'],
 ['萬能',
  '大智',
  '變額',
  '巴黎',
  '法國',
  '呃您好感謝您近期透過本行投保法國巴黎人壽大智富變額萬能壽險繳費年期為輪繳那依照保險法令的要求為保障您的權益稍後電話訪問內容將會全程錄音請問您同意嗎',
  '好可以'],
 ['投資型',
  '淨值',
  '方面',
  '保證',
  '變動',
  '好謝謝您那請問您是否知道本次購買的是法國巴黎人壽的投資型保險須自行承擔淨值變動的風險並無投資方面的保證呢',
  '喔是了解'],
 ['天母',
  '親自',
  '藉由',
  '本人',
  '健康',
  '好的那您是否是藉由天母分行的陳若期在旁邊協助並由要被保險您本人親自簽名且被保險人健康告知事項由您本人親自填寫呢',
  '嘿是的'],
 ['參考',
  '成本',
  '管理費',
  '進入',
  '條款',
  '好的那請問理專是否有提供保險條款給您參考並說明產品內容來確認符合您的需求呢好的那請問您是否知道您繳的保費會扣除相關費用後像是保單管理費或是保險成本等才進入投資帳戶呢',
  '有的是知道'],
 ['扣

In [8]:
# 實驗3 - 標準話術權威控制
with open('phrase.txt', 'r') as fr:
    lines = fr.readlines()
    
    lines = [line.strip('\n').strip('\t') for line in lines]
    tmp = []
    for line in lines:
        line = line.strip('\n').strip('\t')
        line = ''.join([l for l in line if re.match(CN_EN_RE, l)])
        if line:
            tmp.append(line)
lines = tmp
lines

['您好請問是先生小姐本人嗎',
 '您好這裡是玉山銀行總行保代部分行敝姓員工編號請問是先生小姐本人嗎',
 '請問您的出生年月日是',
 '請問您知道本次購買的人壽保險不是存款如果辦理解約將可能只領回部分已繳保費',
 '請問您投保時是由分行的從旁協助並由您本人親自填寫健康告知事項及簽名的嗎',
 '請問您投保時是由分行的從旁協助並由要保人及被保險人親自簽名且被保險人的健康告知事項都是親自填寫的嗎',
 '招攬人員有向您說明產品內容且符合您的保險需求繳交保費並不影響您的日常支出請問正確嗎',
 '接下來會為您撥放一段語音宣告請先不要掛斷電話本行人員並未鼓勵或勸誘以辦理貸款保單借款定存解約或保單解約保單終止之方式從事投資理財或購買保險若有上述資金運用於本次產品須事先審慎評估自身財務狀況與風險承受能力並願承擔因財務槓桿操作方式所面臨的風險及保單轉投保之權益損失除辦理貸款或保單借款需支付本金及利息外該產品可能發生之相關風險及最大可能損失請問您是否都瞭解呢',
 '為維護您的資料安全這裡簡單跟您核對基本資料您的身分證字號是AXXX請問後三碼是',
 '請問您是否知道本次購買的是人壽的投資型保險需自行承擔淨值變動的風險並無投資方面的保證',
 '請問您投保時是否皆由分行的在旁邊協助並由要保人及被保險人親自簽名且被保險人之健康告知事項皆由被保險人確認後親自填寫',
 '請問您投保時是否皆由分行的在旁邊協助並由法定代理人您協助要保人及被保險人親自簽名且被保險人之健康告知事項皆由被保險人之法定代理人確認後親自填寫',
 '請問法定代理人欄位是否由您本人親自簽名',
 '請問您投保時的各項文件是否皆由分行的在旁邊協助簽名欄位是否由您親自蓋手印確認且被保險人之健康告知事項皆由被保險人確認後親自填寫',
 '請問理專是否有提供保險條款給您參考並說明產品內容來確認符合您的需求',
 '請問您是否知道您所繳的保費會扣除相關費用後才進入投資帳戶',
 '請問您是否知道如果中途部分提領或提前解約保險公司會依受理當時的保單帳戶價值扣除當年度的解約費用後再給付給您',
 '請問您本次投保繳交保費的資金來源是否為',
 '請問您是否已事先審慎評估自身財務狀況與風險承受能力並願承擔因財務槓桿操作方式所面臨的風險及辦理保單解約轉投保之權益損失除辦理貸款或保單借款需支付本金及利息外還有該產品可能發生之相關

In [9]:
# Count diff ratio and replace sentence
from difflib import SequenceMatcher

def similar(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

tmp_texts = []
for text in texts:   
    diff_ratio = [similar(text[0], line) for line in lines]
    if max(diff_ratio) > 0.70:
        simi_sentence = lines[np.argmax(diff_ratio)]
        tmp_texts.append([simi_sentence, text[1]])
    else:
        tmp_texts.append([text[0], text[1]])
            #         print(df.loc[i, 'question'])
#     df.loc[i, 'question_x'] = simi_sentence
len(tmp_texts)

8966

In [58]:
# 用交集判斷相似
questions = []
for line in lines:
    line, _ = tokenizer.tokenize(line)
    questions.append(set(line))

df = pd.DataFrame()
tmp_texts = []
for text in texts:
    token, _ = tokenizer.tokenize(text[0])
    token = set(token)
    diff_ratio = [len(q & token) / len(q) for q in questions]
    
#     simi_sentence = lines[np.argmax(diff_ratio)]
#     df = df.append({'text':text[0], 'simi':simi_sentence, 'ratio':max(diff_ratio)}, ignore_index=True)
    
    if max(diff_ratio) > 0.60:
        simi_sentence = lines[np.argmax(diff_ratio)]
        tmp_texts.append([simi_sentence, text[1]])
    else:
        tmp_texts.append([text[0], text[1]])

In [57]:
df.loc[df['ratio']<0.7, :]

Unnamed: 0,ratio,simi,text
15,0.666667,請問法定代理人欄位是否由您本人親自簽名,好的請問您這次辦理契約終止以及部分提領的文件是否藉由林森分行的林冠廷在旁邊協助並由要保人您親...
20,0.444444,請問您是否知道辦理契約終止後就無法再恢復成原本的保單內容呢,好的謝謝您接下來幾項問題項您確認我們有收到您安聯人壽保單號碼ql的契約終止文件一份請問您是否...
22,0.333333,請問您是否知道辦理契約終止後就無法再恢復成原本的保單內容呢,好的好的方便問您這次辦理契約終止的資金用途為何呢
23,0.388889,請問您是否知道辦理契約終止後就無法再恢復成原本的保單內容呢,好的了解與您確認好的了解與您確認本行人員並未鼓勵或勸誘已契約終止的方式中止的方式來購買新保單...
28,0.466667,請問分行的是否有向您說明產品內容並確認符合您的需求,好的謝謝您接下來幾項問題向您確認我們有收到您安聯人壽保單號碼ql的契約終止文件一份請問您是否...
...,...,...,...
8955,0.500000,請問您本次辦理貸款及保險是否有新申請玉山網路銀行,謝謝您那除了辦理貸款或保單借款需支付的本金及利息外還有該產品可能發生之相關風險及最大可能損失...
8956,0.571429,請問您本次辦理貸款及保險是否有新申請玉山網路銀行,謝謝您那本行人員並未鼓勵或勸誘辦理貸款保單借款保單解約保單終止及定存解約方式購買保險請問是否...
8960,0.645161,感謝您近期透過本行投保人壽繳費年期為年躉繳依照保險法令的要求為保障您的權益稍後電話訪問內容將...,依照保險法令的要求為保障您的權益稍後電話訪問的內容會全程錄音請問您同意嗎
8961,0.444444,請問您是否知道辦理契約終止後就無法再恢復成原本的保單內容呢,好的謝謝您以下幾個問題與您確認我們有收到您安聯人壽保單號碼ql一一九七三九八三的契約終止文件...


In [7]:
# 實驗4 - stopwords 清理
tokenizer = Tokenizer()
stop_words = '玉山|玉山銀行|保代|保代部|保險代理部|您好|你好|這裡|這邊|之後|打擾|敝姓|員工|編號|員編|總行|消金|中心|不好意思|呃|嗯|感謝|謝謝'

# clean stop words
token_texts = []
for text in texts:
    tokens, token_types = tokenizer.tokenize(text[0])
    tmp_tokens = []
    for token, token_type in zip(tokens, token_types):
        if token_type == 'CN' and token not in stop_words.split('|'):
            tmp_tokens.append(token)
    token_texts.append([''.join(tmp_tokens), text[1]])

Building prefix dict from /home/jovyan/.local/lib/python3.6/site-packages/esun_phoneme_tool/jieba_dict_pool/general_jieba_0002.txt ...
2021-04-28 11:50:11,249 DEBUG: Building prefix dict from /home/jovyan/.local/lib/python3.6/site-packages/esun_phoneme_tool/jieba_dict_pool/general_jieba_0002.txt ...
Loading model from cache /tmp/jieba.u9c666b2c1c48c114768336a4722d3b92.cache
2021-04-28 11:50:11,252 DEBUG: Loading model from cache /tmp/jieba.u9c666b2c1c48c114768336a4722d3b92.cache
Loading model cost 0.570 seconds.
2021-04-28 11:50:11,821 DEBUG: Loading model cost 0.570 seconds.
Prefix dict has been built successfully.
2021-04-28 11:50:11,822 DEBUG: Prefix dict has been built successfully.


In [61]:
int(8966*0.8)

7172

In [60]:
df = pd.DataFrame({'text':texts, 'label':trues})
df

Unnamed: 0,text,label
0,"[您好這裡是玉山銀行您好這裡是玉山銀行保險代理部敝姓唐員工編號請問是湯拱運儲小姐本人嗎您好,...",0
1,[呃您好這邊是玉山銀行總行保險代理部敝姓唐員工編號請問是江淑華小姐本人嗎呃呃您好請問她在嗎方...,0
2,"[您好呃您好這邊是玉山銀行總行保險代理部敝姓唐員工編號請問是江淑華小姐本人嗎, 嘿您好對]",1
3,[呃您好不好意思打擾您感謝您近期透過本行辦理首期保費繳交憑證那依照保險法令的要求為保證您的權...,1
4,"[您好這裡是玉山銀行總行保代部敝姓張員工編號請問是林建銘先生本人嗎, 喂嘿是]",1
...,...,...
8961,[好的謝謝您以下幾個問題與您確認我們有收到您安聯人壽保單號碼ql一一九七三九八三的契約終止文...,1
8962,[好的請問您這次辦理契約終止的文件是否皆由士林分行的林傳仲在旁邊協助並由要保人您本人親自簽名...,1
8963,"[好的方便請問您這次辦理契約終止的資金用途為何呢, 呃家用]",1
8964,"[好的了解請問您是否知道辦理契約終止後就無法再恢復成原本的保單內容呢, 嗯哼可以知道嗯]",1


In [59]:
bert_main(tmp_texts, trues)
# 原版
# 2021-04-28 12:06:25,462 INFO: valid_evaluation: loss=0.08820323068006285, accuracy=0.9682274247491639, precision=0.8877005347593583, recall=0.8217821782178217, f1=0.853470437017995
# 2021-04-28 13:13:16,066 INFO: valid_evaluation: loss=0.13116201947860676, accuracy=0.9671125975473801, precision=0.949685534591195, recall=0.7475247524752475, f1=0.8365650969529086

# 權威控制
# 2021-04-28 13:50:51,148 INFO: valid_evaluation: loss=0.13724370429228092, accuracy=0.9671125975473801, precision=0.9333333333333333, recall=0.7623762376237624, f1=0.8392370572207085
# 2021-04-28 13:59:52,614 INFO: valid_evaluation: loss=0.13895737874353753, accuracy=0.9665551839464883, precision=0.9329268292682927, recall=0.7574257425742574, f1=0.8360655737704918

# 僅自定義關鍵字
# 2021-04-28 13:18:45,353 INFO: valid_evaluation: loss=0.11003045374848719, accuracy=0.9637681159420289, precision=0.8442211055276382, recall=0.8316831683168316, f1=0.8379052369077307
# 2021-04-28 13:30:49,645 INFO: valid_evaluation: loss=0.13326888428679828, accuracy=0.9626532887402452, precision=0.9041916167664671, recall=0.7475247524752475, f1=0.8184281842818428

# 去除自定義停用字
# 2021-04-28 11:55:27,290 INFO: valid_evaluation: loss=0.09807703650460162, accuracy=0.967670011148272, precision=0.9044943820224719, recall=0.7970297029702971, f1=0.8473684210526315
# 2021-04-28 12:00:14,627 INFO: valid_evaluation: loss=0.14567224899756498, accuracy=0.9654403567447045, precision=0.9605263157894737, recall=0.7227722772277227, f1=0.824858757062147

# tf-idf top 5 + 原句
# 2021-04-28 16:14:43,515 INFO: valid_evaluation: loss=0.1148602656120884, accuracy=0.9654403567447045, precision=0.8932584269662921, recall=0.7871287128712872, f1=0.8368421052631579
# 2021-04-28 16:26:33,965 INFO: valid_evaluation: loss=0.1434477025578762, accuracy=0.9648829431438127, precision=0.9112426035502958, recall=0.7623762376237624, f1=0.830188679245283

# set版權威控制
# 2021-04-29 09:50:23,936 INFO: valid_evaluation: loss=0.10592393101803188, accuracy=0.9665551839464883, precision=0.8736842105263158, recall=0.8217821782178217, f1=0.846938775510204
# 2021-04-29 09:56:15,590 INFO: valid_evaluation: loss=0.1380416681861569, accuracy=0.9659977703455964, precision=0.937888198757764, recall=0.7475247524752475, f1=0.8319559228650137

# rule-base
# 'precision': 0.7540106951871658,  'recall': 0.6588785046728972, 'f1': 0.7032418952618454

2021-04-29 09:48:10,988 INFO: device: cuda
Some weights of the model checkpoint at /home/jovyan/if-beautiful-text/owen_dev/if_beautiful_text/cache_dir/bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassif

KeyboardInterrupt: 

In [13]:
df_train = df.iloc[:7000, :].copy()
df_valid = df.iloc[7000:8000, :].copy()
df_test = df.iloc[8000:, :].copy()

In [62]:
# rule-base 判斷
import re

preds = []
for text in texts[int(0.8*8966):]:   
    wrong_ans_1 = '什麼'
    correct_ans_1 = '理解|了解|知道|明白|是|ok|可以|沒錯|對|好|有|同意|正確|清楚|曉得|懂|不影響|不會|沒問題|確認'
    ques_1 = '不影響|不會|不會影響'
    ques_1_ans = '不影響|不會|不會影響'
    ques_2 = '日常|支出|用途'
    ques_2_ans = '儲蓄|貸款|投資|家用|基金|買|房屋|車|購物|生活|支出'
    ques_3 = '資金來源'
    ques_3_ans = '薪水|薪資|儲蓄'
    
    rst = 0
    # 問題對照規則
    if any([re.search(a, text[0]) for a in ques_1.split('|')]):
        if any([re.search(a, text[1]) for a in ques_1_ans.split('|')]):
            rst = 1
    if any([re.search(a, text[0]) for a in ques_2.split('|')]):
        if any([re.search(a, text[1]) for a in ques_2_ans.split('|')]):
            rst = 1
    if any([re.search(a, text[0]) for a in ques_3.split('|')]):
        if any([re.search(a, text[1]) for a in ques_3_ans.split('|')]):
            rst = 1
            
    # 一般規則
    if any([re.search(a, text[1]) for a in wrong_ans_1.split('|')]):
        rst = 0
    elif any([re.search('不'+a, text[1]) for a in correct_ans_1.split('|')]):
        rst = 0
    elif any([re.search(a, text[1]) for a in correct_ans_1.split('|')]):
        rst = 1
        
    preds.append(rst)
labels = trues[int(0.8*8966):]
# labels = df.label.values.tolist()

In [12]:
# rule-base 判斷
import re

preds = []
for text in texts[int(0.8*8966):]:   
    wrong_ans_1 = '什麼'
    correct_ans_1 = '理解|了解|知道|明白|是|ok|可以|沒錯|對|好|有|同意|正確|清楚|曉得|懂|不影響|不會|沒問題|確認'
    ques_1 = '不影響|不會|不會影響'
    ques_1_ans = '不影響|不會|不會影響'
    ques_2 = '日常|支出|用途'
    ques_2_ans = '儲蓄|貸款|投資|家用|基金|買|房屋|車|購物|生活|支出'
    ques_3 = '資金來源'
    ques_3_ans = '薪水|薪資|儲蓄'
    
    rst = 0
    # 問題對照規則
#     if any([re.search(a, text[0]) for a in ques_1.split('|')]):
#         if any([re.search(a, text[1]) for a in ques_1_ans.split('|')]):
#             rst = 1
#     if any([re.search(a, text[0]) for a in ques_2.split('|')]):
#         if any([re.search(a, text[1]) for a in ques_2_ans.split('|')]):
#             rst = 1
#     if any([re.search(a, text[0]) for a in ques_3.split('|')]):
#         if any([re.search(a, text[1]) for a in ques_3_ans.split('|')]):
#             rst = 1
            
    # 一般規則
    if any([re.search(a, text[1]) for a in wrong_ans_1.split('|')]):
        rst = 0
#     elif any([re.search('不'+a, text[1]) for a in correct_ans_1.split('|')]):
#         rst = 0
    elif any([re.search(a, text[1]) for a in correct_ans_1.split('|')]):
        rst = 1
        
    preds.append(rst)
labels = trues[int(0.8*8966):]
# labels = df.label.values.tolist()

In [15]:
tp, fp, fn = 0, 0, 0    
for i, (label, pred) in enumerate(zip(labels, preds)):
    tp += 1 if (label, pred) == (0, 0) else 0
    if (label, pred) == (1, 0):
        fp += 1
#         print('誤判為 錯', texts[7172+i])
    else:
        fp += 0
    if (label, pred) == (0, 1):
        fn += 1 
        print('誤判為 對', texts[7172+i])
    else:
        fn += 0    
    
precision = tp / (tp + fp) if tp + fp > 0 else None
recall = tp / (tp + fn) if tp + fn > 0 else None
f1 = 2 / (1 / precision + 1 / recall) if precision and recall else None

evaluation = {
    'precision': precision,
    'recall': recall,
    'f1': f1
}
evaluation

誤判為 對 ['好的請問您本次投保繳交保費的資金來源是否為保單解約金呢', '噢是噢']
誤判為 對 ['呃您好不好意思打擾您感謝您近期透過本行辦理契約終止依照保險法令的要求為保障您的權益稍後電話訪問的內容會全程錄音請問您同意嗎', '欸我聽不太清楚捏']
誤判為 對 ['謝謝您的同意那為了維護您的資料安全跟您簡單核對一下您的身分證字號是t二零一四三六請問後三碼是多少呢', '啊不是你你講這樣我不太清楚欸']
誤判為 對 ['好的請問您此次辦理契約終止的文件是否皆由斗六分行的顏慧凌在旁邊協助並由要保人您本人親自簽名呢', '噢你是說斗六玉山銀行']
誤判為 對 ['對就是是不是由斗六分行的顏慧凌在旁邊協助您那並且是由要保人就是您本人親自在文件上簽名呢', '好']
誤判為 對 ['好的那方便請問您此次辦理契約終止的資金用途為何呢', '呃對我要HTR那個第七年了嗎我要第七年了HTR嗯']
誤判為 對 ['您好呃請問這部分有清楚嗎', '喂喂欸我聽到了那我要說話嗎我要說是這樣子嗎']
誤判為 對 ['嗯好的那再次與您確認本保單之規劃您是否已確實了解投保目的保險需求並經綜合考量財務狀況以及付費能力且不影響您的日常支出呢', '欸等一下我在忙好不好喂那個十']
誤判為 對 ['呃您好好再跟您確認一下本保單之規劃您是否已確實了解投保目的保險需求並經綜合考量財務狀況以及付費能力且不影響您的日常支出呢', '欸你你你再說一次繳我我聽不大懂欸']
誤判為 對 ['呃那薪資的部分呢', '薪資也是呃就應該是應該應該是這樣講對啦就是薪資然後存在保險裡面對在那個那個儲蓄裡面嘛']
誤判為 對 ['對這部分有清楚嗎', '欸你嗯沒有聽很清楚你再講一次']
誤判為 對 ['請問本次保費的資金來源是否為買賣不動產', '嘿噢不是是呃本本來就有的欸那個本來的存款嘿']
誤判為 對 ['噢噢好請問招攬人員是否有提供您躉繳與分期繳分期繳等不同繳費方式選擇請問他有提供給你嗎', '呃不好意思那個']
誤判為 對 ['依照保險法令的要求為保障您的權益稍後電話訪問的內容會全程錄音請問您同意嗎', '欸對呀欸好像是昨天嗎']
誤判為 對 ['噢好的謝謝您那請問您是否知道辦理契約終止後就無法再恢復成原本的保單內容呢', '是昨天嗎對呀']
誤判為 對 ['好謝謝呃嘿是那第四題是請問招攬人員是否有提供您躉繳與

{'precision': 0.7361963190184049,
 'recall': 0.594059405940594,
 'f1': 0.6575342465753424}

In [26]:
# pd.set_option('display.max_rows', None)
df = df.loc[df['question'].apply(lambda x: any(i in x for i in ['本人親自簽名'])), ['question', 'answer']]['question_x'] = 1

AttributeError: 'int' object has no attribute 'loc'

In [25]:
df.head()

Unnamed: 0,question,answer,label,question_x
0,您好這裡是玉山銀行您好這裡是玉山銀行保險代理部敝姓唐員工編號請問是湯拱運儲小姐本人嗎您好,喂喂喂,0,
1,呃您好這邊是玉山銀行總行保險代理部敝姓唐員工編號請問是江淑華小姐本人嗎呃呃您好請問她在嗎方便...,喔謝謝,0,
2,您好呃您好這邊是玉山銀行總行保險代理部敝姓唐員工編號請問是江淑華小姐本人嗎,嘿您好對,1,
3,呃您好不好意思打擾您感謝您近期透過本行辦理首期保費繳交憑證那依照保險法令的要求為保證您的權益...,好可可以,1,
4,您好這裡是玉山銀行總行保代部敝姓張員工編號請問是林建銘先生本人嗎,喂嘿是,1,


In [34]:
sum(df['question_x'].apply(lambda x: x != None))

4523