In [59]:
from difflib import SequenceMatcher

import pandas as pd
import numpy as np

In [60]:
import os
os.listdir('/home/jovyan/wm-insur-call-qa/eric/speaker-separation/test_zone/test_result/')

['電訪-房貸壽險-法巴.wav',
 '電訪-房貸壽險-法巴-DPRNN1.wav',
 '電訪-房貸壽險-法巴-DPRNN2.wav',
 '電訪-房貸壽險-法巴-fix-DPRNN1.wav',
 '電訪-房貸壽險-法巴-fix-DPRNN2.wav',
 '電訪-傳統型台幣-中壽.wav',
 '電訪-投資型-安聯-異常.wav',
 '電訪-傳統型台幣-南山.wav',
 '電訪-傳統型台幣-中壽-異常.wav',
 '電訪-投資型-安聯.wav',
 '電訪-傳統型外幣-中壽.wav',
 '電訪-傳統型台幣-南山-異常.wav',
 '電訪-房貸壽險-南山.wav',
 'DPRNN2-電訪-投資型-安聯.wav',
 'DPRNN2-電訪-投資型-安聯-異常.wav',
 'DPRNN2-電訪-房貸壽險-法巴.wav',
 'DPRNN2-電訪-房貸壽險-南山.wav',
 'DPRNN2-電訪-傳統型外幣-中壽.wav',
 'DPRNN2-電訪-傳統型台幣-南山.wav',
 'DPRNN2-電訪-傳統型台幣-南山-異常.wav',
 'DPRNN2-電訪-傳統型台幣-中壽.wav',
 'DPRNN2-電訪-傳統型台幣-中壽-異常.wav',
 'DPRNN1-電訪-投資型-安聯.wav',
 'DPRNN1-電訪-投資型-安聯-異常.wav',
 'DPRNN1-電訪-房貸壽險-法巴.wav',
 'DPRNN1-電訪-房貸壽險-南山.wav',
 'DPRNN1-電訪-傳統型外幣-中壽.wav',
 'DPRNN1-電訪-傳統型台幣-南山.wav',
 'DPRNN1-電訪-傳統型台幣-南山-異常.wav',
 'DPRNN1-電訪-傳統型台幣-中壽.wav',
 'DPRNN1-電訪-傳統型台幣-中壽-異常.wav']

In [61]:
# 取得語者分離與STT結果
def ctm_to_groupdata(data_path, group_name):
    # load stt results
    ctm_df = pd.read_csv(data_path, sep=' ', header=None)
    ctm_df = ctm_df.iloc[:, :-1]
    ctm_df.columns = ['file_name', '1', 'start_time', 'stay_time', 'word']
    selectors = ctm_df.groupby('file_name')
    group_data = selectors.get_group(group_name)
    group_data = group_data.reset_index(drop=True)
    return group_data

In [62]:
def get_times(group_data):
    times = []
    for i in group_data.index:
        start = group_data.loc[i, 'start_time']
        end = (start + group_data.loc[i, 'stay_time']).round(2)
        if times:
            if start == times[-1][1]:
                last_start, last_end = times.pop()
                start = last_start
        times.append((start, end))
    return times

In [63]:
def overlap_ratio(x_start, x_end, y_start, y_end):
    if y_end <= x_start or x_end <= y_start:
        return 0
    overlap_start = max(x_start, y_start)
    overlap_end = min(x_end, y_end)

    x_long = x_end - x_start
    overlap_long = overlap_end - overlap_start
    return overlap_long / x_long

In [64]:
# 簡易版-單人句子取得
def get_sentences(group_data, group_times):
    sentences, starts, ends = [], [], []
    seg, time = [], []
    for i in group_data.index:
        start = group_data.loc[i, 'start_time']
        end = (start + group_data.loc[i, 'stay_time']).round(2)
        text = group_data.loc[i, 'word']
        ratio = sum([overlap_ratio(start, end, time[0], time[1]) for time in group_times])
        if ratio >= 0.6:
            seg += [text]
            time += [start, end]
        elif seg:
            sentences.append(' '.join(seg))
            starts.append(time[0])
            ends.append(time[-1]) 
            seg, time = [], []
            
    df = pd.DataFrame({
        'start': starts,
        'end': ends,
        'sentence': sentences,
    })
    return df

In [65]:
# 複雜版-雙音軌對照
def _get_ctm_sametime_word(ctm, start, end):
    ctm_starts = ctm.start_time.values
    ctm_ends = ctm_starts + ctm.stay_time.values
    simi_index = np.argmax([overlap_ratio(start, end, s, e) for s, e in zip(ctm_starts, ctm_ends)])
    return ctm.word[simi_index]

def get_sentences_once(group_ctm, x_times, y_times, x_ctm, y_ctm):
    """
    # data type:
    _times: [(2.91, 3.15), (9.36, 10.38), ...]
    _ctm: pd.DataFrame.colunms=['file_name', '1', 'start_time', 'stay_time', 'word']
    """
    x_sentences, x_starts, x_ends, y_sentences, y_starts, y_ends = [], [], [], [], [], []
    x_seg, x_time, y_seg, y_time = [], [], [], []
    for i in group_ctm.index:
        start = group_ctm.loc[i, 'start_time']
        end = (start + group_ctm.loc[i, 'stay_time']).round(2)
        text = group_ctm.loc[i, 'word']
        x_ratio = sum([overlap_ratio(start, end, time[0], time[1]) for time in x_times])
        y_ratio = sum([overlap_ratio(start, end, time[0], time[1]) for time in y_times])

        is_x, is_y = False, False
        if x_ratio >= 0.6 and y_ratio >= 0.6:
            # do some compare
            x_word = _get_ctm_sametime_word(x_ctm, start, end)
            y_word = _get_ctm_sametime_word(y_ctm, start, end)

            if x_word == text and y_word == text:
                pass
            elif x_word == text:
                y_ratio = 0
            elif y_word == text: 
                x_ratio = 0
            else:
                pass
        else:
            if text == '<SIL>':
                pass
            elif x_ratio > y_ratio:
                is_x = True
            elif x_ratio <= y_ratio: 
                is_y = True
        
        if x_ratio >= 0.6 or is_x:
            x_seg += [text]
            x_time += [start, end]
        elif x_seg:
            x_sentences.append(' '.join(x_seg))
            x_starts.append(x_time[0])
            x_ends.append(x_time[-1]) 
            x_seg, x_time = [], []
            
        if y_ratio >= 0.6 or is_y:
            y_seg += [text]
            y_time += [start, end]
        elif y_seg:
            y_sentences.append(' '.join(y_seg))
            y_starts.append(y_time[0])
            y_ends.append(y_time[-1]) 
            y_seg, y_time = [], []
        
        if (x_ratio >= 0.6 or is_x) and (y_ratio >= 0.6 or is_y):
            print(f'| # | # | {text} {x_ratio} {y_ratio} | {start} {end}')
        elif (x_ratio >= 0.6 or is_x):
            print(f'| # |   | {text} {x_ratio} {y_ratio} | {start} {end}')
        elif (y_ratio >= 0.6 or is_y):        
            print(f'|   | # | {text} {x_ratio} {y_ratio} | {start} {end}')
        else:
            print(f'|   |   | {text} {x_ratio} {y_ratio} | {start} {end}')
        
    x_df = pd.DataFrame({'start': x_starts, 'end': x_ends, 'sentence': x_sentences})
    y_df = pd.DataFrame({'start': y_starts, 'end': y_ends, 'sentence': y_sentences})
    return x_df, y_df

In [None]:
# 產出行員與顧客之個人逐字稿
from scipy.io import wavfile
from python_speech_features import mfcc
import librosa

def prepare_calculate_rms(wav_path, rate):
    y, _ = librosa.load(wav_path, sr=rate)
    rms_sample = librosa.feature.rms(y=y)
    _max, _min = np.amax(rms_sample), np.amin(rms_sample)
    return y, _max, _min 

# group_ctm, x_times, y_times, x_ctm, y_ctm
def get_sentences_rms(origin_ctm, wav_path_a, wav_path_b, rate=8000):
    y_a, _max_a, _min_a = prepare_calculate_rms(wav_path_a, rate)
    y_b, _max_b, _min_b = prepare_calculate_rms(wav_path_b, rate)

    a_sentences, a_starts, a_ends, b_sentences, b_starts, b_ends = [], [], [], [], [], []
    a_seg, a_time, b_seg, b_time = [], [], [], []

    a_times, b_times = [], []
    for i in origin_ctm.index:
        start = origin_ctm.loc[i, 'start_time']
        end = (start + origin_ctm.loc[i, 'stay_time']).round(2)
        text = origin_ctm.loc[i, 'word']
        if text == '<SIL>':
            continue

        rms_a = librosa.feature.rms(y=y_a[int(start*rate): int(end*rate)])
        rms_a = (rms_a - _min_a) / (_max_a - _min_a)
        rms_b = librosa.feature.rms(y=y_b[int(start*rate): int(end*rate)])
        rms_b = (rms_b - _min_b) / (_max_b - _min_b)
#         # 圖像化顯示結果
#         if np.mean(rms_a) > np.mean(rms_b):
#             print('| # |   |', text,'|', start, end, '|', np.mean(rms_a), np.mean(rms_b))
#         else:
#             print('|   | # |', text,'|', start, end, '|', np.mean(rms_a), np.mean(rms_b))

        
        if np.mean(rms_a) >= np.mean(rms_b):
            a_seg += [text]
            a_time += [start, end]
        elif a_seg:
            a_sentences.append(' '.join(a_seg))
            a_starts.append(a_time[0])
            a_ends.append(a_time[-1]) 
            a_seg, a_time = [], []
            
        if np.mean(rms_a) <= np.mean(rms_b):
            b_seg += [text]
            b_time += [start, end]
        elif b_seg:
            b_sentences.append(' '.join(b_seg))
            b_starts.append(b_time[0])
            b_ends.append(b_time[-1]) 
            b_seg, b_time = [], []
  
        
    a_df = pd.DataFrame({'start': a_starts, 'end': a_ends, 'sentence': a_sentences})
    b_df = pd.DataFrame({'start': b_starts, 'end': b_ends, 'sentence': b_sentences})
    return a_df, b_df

In [66]:
# 話術比對
MIN_THRESHOLD = 0.4

def _similar(str1, str2):
    return SequenceMatcher(None, str1, str2).ratio()

def get_similar_tokens(tokens, sample_str, max_score):
    _str = ''.join(i.word for i in tokens)

    start_idx = 0
    temp = tokens
    for i in range(len(tokens)):
        start_idx += len(tokens[i].word)
        score = _similar(_str[start_idx:], sample_str)
        if  score >= max_score :
            max_score = score
            temp = tokens[i+1:]
        if score < MIN_THRESHOLD:
            break
    tokens = temp        
    _str = ''.join(i.word for i in tokens)
    end_idx = len(_str)

    for i in range(len(tokens)-1, 0, -1):
        end_idx -= len(tokens[i].word)
        score = _similar(_str[0:end_idx], sample_str)
        if  score >= max_score:
            max_score = score
            tokens = tokens[: i]
        elif score < MIN_THRESHOLD:
            break
    return tokens, max_score

def get_time_span(df, ori_question):
    sign = ['，', '。', '?', '？', '、', '!', '！', '/', 'O', '○', ' ', ',', '.', '_', '(', ')', '（', '）', '\n']

    sample_str = ori_question
    for s in sign:
        sample_str = sample_str.replace(s, '')
    items = [row for index, row in df.iterrows()]
    similar_tokens = []
    max_score = MIN_THRESHOLD
    for i in range(len(items) - len(sample_str)):
        _str = ''.join(i[-1] for i in items[i: i+len(sample_str)])
        score = _similar(_str, sample_str)
        if score >= max_score:
            max_score = score
            similar_tokens.append([items[i: i+len(sample_str)], score])

    if len(similar_tokens) == 0:
        return {'start_time': 0, 'end_time': 0, 'tokens':[], 'max_score': 0, 'tokens_text': [], 'ori_question': ori_question}            

    try:
        tokens = similar_tokens[-1][0]
        max_score = similar_tokens[-1][1]
        tokens, max_score = get_similar_tokens(tokens, sample_str, max_score)
        return {'start_time': tokens[0].start_time, 'end_time': tokens[-1].start_time+tokens[-1].stay_time,\
                'tokens': tokens, 'max_score': max_score, 'tokens_text': [i.word for i in tokens], 'ori_question': ori_question}
    except:
        return {'start_time': 0, 'end_time': 0, 'tokens': [], 'max_score': max_score, 'tokens_text': [], 'ori_question': ori_question} 

In [91]:
# 取得語者分離與STT結果
wav = '電訪-傳統型台幣-中壽'
origin_name = f'/home/jovyan/wm-insur-call-qa/eric/speaker-separation/test_zone/test_result/{wav}.wav'
origin_ctm = ctm_to_groupdata('/home/jovyan/exchanging-pool/to_owen/func_asr/stt_result/ctm/ctm', origin_name)

group_1_name = f'/home/jovyan/wm-insur-call-qa/eric/speaker-separation/test_zone/test_result/DPRNN1-{wav}.wav'
group_1_ctm = ctm_to_groupdata('/home/jovyan/exchanging-pool/to_owen/func_asr/stt_result/ctm/ctm', group_1_name)
group_1_times = get_times(group_1_ctm)

group_2_name = f'/home/jovyan/wm-insur-call-qa/eric/speaker-separation/test_zone/test_result/DPRNN2-{wav}.wav'
group_2_ctm = ctm_to_groupdata('/home/jovyan/exchanging-pool/to_owen/func_asr/stt_result/ctm/ctm', group_2_name)
group_2_times = get_times(group_2_ctm)

In [None]:
# 產出行員與顧客之個人逐字稿(舊)
# group_1_asr, group_2_asr = get_sentences_once(origin_ctm, group_1_times, group_2_times, group_1_ctm, group_2_ctm)

In [93]:
# 產出行員與顧客之個人逐字稿
group_1_asr, group_2_asr = get_sentences_rms(origin_ctm, group_1_name, group_2_name)

In [94]:
# 辨識顧客語音
times_1 = sum([e - s for s, e in zip(group_1_asr.start, group_1_asr.end)])
times_2 = sum([e - s for s, e in zip(group_2_asr.start, group_2_asr.end)])
customer_ctm = group_1_asr if times_1 < times_2 else group_2_asr   # customer is group 1 or 2
customer_ctm

Unnamed: 0,start,end,sentence
0,10.23,10.68,喂 你好
1,24.87,26.1,嘿 對 對
2,48.81,49.71,喔 喔
3,61.71,62.04,喔
4,63.36,64.02,阿 對
5,68.76,70.26,五 六 零 一 九
6,86.91,87.21,保
7,107.88,108.18,對
8,118.14,118.53,有
9,125.31,125.64,對


In [95]:
# 載入話術
questions = ["""您好！這裡是玉山銀行總行個金處/OO分行/OO消金中心，
               敝姓O，員工編號OOOOO，請問是○○○先生/小姐本人嗎？""",
            '感謝您近期透過本行投保○○人壽○○○，繳費年期為O年，依照保險法令的要求，為保障您的權益，稍後電話訪問內容將會全程錄音，請問您同意嗎？'
            '為維護您的資料安全，這裡簡單跟您核對基本資料，您的身分證字號是，請問後三碼是？',
            '請問您的出生年月日是?',
            '請問您是否知道本次購買的是○○人壽的保險，不是存款，如果辦理解約將可能只領回部分已繳保費？',
            '請問您投保時，是否皆由○○消金中心的○○○，在旁邊協助，並由您本人○○○親自簽名，且被保險人之健康告知事項皆由您確認後親自填寫？',
            '請問○○消金中心的○○○是否有向您說明產品內容，並確認符合您的需求？',
            '請問招攬人員是否有提供您一次繳清與分期繳等不同繳費方式選擇？',
            '請問您本次投保繳交保費的資金來源是否為',
            """請問您是否已事先審慎評估自身財務狀況與風險承受能力，
               並願承擔因財務槓桿操作方式所面臨的風險及辦理保單解約轉投保之權益損失，
               除辦理貸款或保單借款需支付本金及利息外，
               還有該產品可能發生之相關風險及最大可能損失，
               且本行人員並未鼓勵或勸誘以辦理貸款、保單借款、保單解約/保單終止及定存解約之方式購買保險，
               請問您是否已瞭解？""",
            '與您確認，本保單之規劃您是否已確實瞭解投保目的、保險需求，並經綜合考量財務狀況以及付費能力，且不影響您的日常支出？',
            '與您再次確認上述投保內容和本次貸款並沒有搭售或不當行銷的情形發生，請問是否正確?',
            '請問您本次辦理貸款及保險，是否有新申請玉山網路銀行？']

In [96]:
def get_report(customer_ctm, origin_ctm, questions, base_score=0.4):
    # find the question  time span (if score > base_score)
    q_result = [get_time_span(origin_ctm, q) for q in questions]
    q_result = sorted(q_result, key=lambda x: x['start_time'])

    # record  time info
    temp_record = []
    for i, qs in enumerate(q_result):
        # add question and time start & end
        temp_record.append([qs['start_time'], qs['end_time'], qs['max_score'], qs['ori_question'], ''.join(qs['tokens_text'])])

    for i in range(len(temp_record)):
        q_end = q_result[i]['end_time']
#         next_q_start = q_result[i]['end_time']
        sentences = group_2_asr.loc[customer_ctm['start'].apply(lambda x: q_end-2 < x < q_end+5), :]
        ans = sentences.values[0].tolist() if not sentences.empty else None
        if ans and (temp_record[i][2] != 0):
            ans.append(''.join(ans.pop().split()))
            temp_record[i] += ans
        else:
            temp_record[i] += [0, 0, None]
    report = pd.DataFrame(temp_record, columns=["q_start_time", "q_end_time", "score", "question", "recognize_result",
                                                "reply_start_time", "reply_end_time", "reply"])
    return report        
get_report(customer_ctm, origin_ctm, questions)

Unnamed: 0,q_start_time,q_end_time,score,question,recognize_result,reply_start_time,reply_end_time,reply
0,0.0,0.0,0.0,請問招攬人員是否有提供您一次繳清與分期繳等不同繳費方式選擇？,,0.0,0.0,
1,0.0,0.0,0.0,與您再次確認上述投保內容和本次貸款並沒有搭售或不當行銷的情形發生，請問是否正確?,,0.0,0.0,
2,0.0,0.0,0.0,請問您本次辦理貸款及保險，是否有新申請玉山網路銀行？,,0.0,0.0,
3,13.53,24.84,0.506024,您好！這裡是玉山銀行總行個金處/OO分行/OO消金中心，\n 敝...,這裡是那個玉山銀行然後三和分行齁敝姓李木子李那我們我的員編是零七二八二請問是味道梁先生嗎本人嗎,24.87,26.1,嘿對對
4,27.27,61.65,0.657658,感謝您近期透過本行投保○○人壽○○○，繳費年期為O年，依照保險法令的要求，為保障您的權益，稍...,感謝您就是近期透過本行投保中國人壽的好事連連利率變動型中山保險然後繳繳費的年期為四年那依照保...,61.71,62.04,喔
5,65.19,67.26,1.0,請問您的出生年月日是?,請問您的出生年月日是,68.76,70.26,五六零一九
6,76.77,85.17,0.780488,請問您是否知道本次購買的是○○人壽的保險，不是存款，如果辦理解約將可能只領回部分已繳保費？,請問您是否知道本次購買的是中國人壽的保險不是存款如果辦理借閱證可能指令恢復分期繳保費,86.91,87.21,保
7,88.71,106.8,0.566929,請問您投保時，是否皆由○○消金中心的○○○，在旁邊協助，並由您本人○○○親自簽名，且被保險人...,請問您操作的時候是否有分行的邱泳圈在旁邊做協助病友要保人味道兩極被保險人回到量親自簽名而且被...,107.88,108.18,對
8,112.92,117.18,0.791667,請問○○消金中心的○○○是否有向您說明產品內容，並確認符合您的需求？,是否有跟您說明產品的內容並確認符合您的需求,118.14,118.53,有
9,119.82,123.63,0.947368,請問您本次投保繳交保費的資金來源是否為,請問您等次投保繳交保費的資金來源是否為,125.31,125.64,對


In [94]:
group_1_data.head(30)

Unnamed: 0,file_name,1,start_time,stay_time,word
0,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,2.91,0.24,你好
1,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,9.36,1.02,欸
2,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,15.6,0.6,你好
3,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,16.41,0.33,您好
4,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,16.77,0.27,請問
5,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,17.04,0.18,是
6,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,17.22,0.27,鄭
7,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,17.49,0.27,互
8,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,17.76,0.27,為
9,/home/jovyan/wm-insur-call-qa/eric/speaker-sep...,1,18.03,0.42,先生


In [68]:
texts = []
short_text = ''

a_times, b_times = [], []
for i in origin_ctm.index:
    start = origin_ctm.loc[i, 'start_time']
    end = (start + origin_ctm.loc[i, 'stay_time']).round(2)
    text = origin_ctm.loc[i, 'word']
    ratio_1 = sum([overlap_ratio(start, end, time[0], time[1]) for time in group_1_times])
    ratio_2 = sum([overlap_ratio(start, end, time[0], time[1]) for time in group_2_times])
    a = '#' if ratio_1 >= 0.6 else ' '
    b = '#' if ratio_2 >= 0.6 else ' '
    X_sample = mfcc(wav[int(start*rate): int(end*rate)], samplerate=rate, numcep=24, nfilt=26, nfft=1024)
    a_times.append((start, end)) if ratio_1 >= 0.6 and ratio_2 < 0.6 else ' '
    b_times.append((start, end)) if ratio_2 >= 0.6 and ratio_1 < 0.6  else ' '    
    print('|', a,'|', b, '|', text, ratio_1, ratio_2, '|', start, end, '|',np.mean(X_sample), '|', np.amin(X_sample), '|', np.amax(X_sample))

|   | # | 喂 0 1.0 | 11.1 11.55 | -3.6874181483757504 | -70.15956325956294 | 48.76372863553524
| # |   | 您好 1.0 0 | 11.94 12.33 | -3.157034261596508 | -42.08573270087548 | 22.97389221452257
| # |   | 這裡 1.0 0 | 12.33 12.54 | -3.5375134599015308 | -29.382028153435503 | 16.262393519968608
| # |   | 是 1.0 0 | 12.54 12.66 | -2.7501129127587665 | -30.905405377054837 | 17.991108691002204
| # |   | 玉山銀行 1.0 0 | 12.66 13.26 | -3.3900002274654404 | -40.606826129239515 | 20.816071237758884
| # |   | 客服中心 1.0 0 | 13.92 14.49 | -3.8129626260051235 | -40.697688906976566 | 20.82139933091174
| # |   | 敝姓 1.0 0 | 14.49 14.82 | -3.879897111826479 | -42.78958186762297 | 20.01874730335837
| # |   | 劉 1.0 0 | 14.82 15.09 | -3.9205482589173197 | -43.77845392132365 | 37.66788311672365
| # |   | 員工 1.0 0 | 15.09 15.36 | -3.383539934559926 | -37.484801993056 | 24.163268802967135
| # |   | 編號 1.0 0 | 15.36 15.72 | -3.2744947195911847 | -45.21614238349287 | 22.457133442738307
| # |   | 零 1.0 0 | 15.72 15.9 | -3.