In [1]:
# 원본은 내 pc(Dell)의 deepcell-speechbrain-2 Docker에 있음

import speechbrain
from speechbrain.utils.edit_distance import wer_details_for_batch
import pandas as pd
import numpy as np

In [10]:
# 음절 단위 토큰화, 빈칸을 _로 표시

def char_tokenizer(s):
    """Split ``s`` into one token per non-space character.

    Space characters (' ') are not emitted as tokens. Instead, the first
    character that follows one or more spaces is prefixed with '_' to mark a
    word boundary. Trailing spaces leave no trace in the result.

    Args:
        s: Text to tokenize; iterated character by character.

    Returns:
        List of tokens, e.g. '나는 어제' -> ['나', '는', '_어', '제'].
    """
    tokens = []
    after_space = False
    for ch in s:
        if ch == ' ':
            # Remember the boundary; it is attached to the next real character.
            after_space = True
        else:
            tokens.append('_' + ch if after_space else ch)
            after_space = False
    return tokens

# Sanity check: characters that follow a space come out prefixed with '_'.
print(char_tokenizer('나는 어제 치킨을 먹었다'))

['나', '는', '_어', '제', '_치', '킨', '을', '_먹', '었', '다']


In [3]:
def remove_space_symbol(s):
    """Return ``s`` with every space character (' ') removed."""
    without_spaces = s.replace(' ', '')
    return without_spaces
    
# print(remove_space_symbol(ref))

In [7]:
def get_norm_text(ref, hyp):
    """Align ``hyp`` to ``ref`` character by character and normalize hyp's word spacing.

    Spaces are ignored while the edit-distance table is built, so the
    alignment is driven by the characters alone. During traceback, every
    hypothesis token that equals its aligned reference token (ignoring the
    '_' word-boundary marker) is replaced by the reference token, i.e. it
    gains or loses the leading '_' so that it agrees with ``ref``.

    Args:
        ref: Reference sentence (str).
        hyp: Hypothesis sentence (str).

    Returns:
        Tuple ``(ref_norm, hyp_norm)`` of two equally long token lists in
        sentence order. ``hyp_norm`` holds '' where a reference token has no
        hypothesis counterpart, and ``ref_norm`` holds '' where a hypothesis
        token has no reference counterpart.
    """
    refs = char_tokenizer(ref)
    hyps = char_tokenizer(hyp)

    # Space-free strings; index i here corresponds to token i in refs / hyps.
    ref_nospace = ref.replace(' ', '')
    hyp_nospace = hyp.replace(' ', '')

    rlen = len(refs)
    hlen = len(hyps)

    scores = np.zeros((hlen+1, rlen+1), dtype=np.int32)

    # Initialize and fill the distance matrix over characters, ignoring spaces.
    for r in range(rlen+1):
        scores[0, r] = r
    for h in range(1, hlen+1):
        scores[h, 0] = scores[h-1, 0] + 1
        for r in range(1, rlen+1):
            sub_or_cor = scores[h-1, r-1] + (0 if ref_nospace[r-1] == hyp_nospace[h-1] else 1)

            insert = scores[h-1, r] + 1
            delete = scores[h, r-1] + 1

            scores[h, r] = min(sub_or_cor, insert, delete)

    # Traceback from the bottom-right corner and collect the aligned tokens.
    h, r = hlen, rlen
    ref_norm, hyp_norm = [], []

    while r > 0 or h > 0:
        if h == 0:
            # Top border: only reference tokens remain.
            last_h, last_r = h, r - 1
        elif r == 0:
            # Left border: only hypothesis tokens remain.
            last_h, last_r = h - 1, r
        else:
            sub_or_cor = scores[h-1, r-1] + (0 if ref_nospace[r-1] == hyp_nospace[h-1] else 1)
            insert = scores[h-1, r] + 1
            delete = scores[h, r-1] + 1

            # The diagonal move is taken only when it is strictly cheapest.
            if sub_or_cor < min(insert, delete):
                last_h, last_r = h - 1, r - 1
            else:
                last_h, last_r = (h-1, r) if insert < delete else (h, r-1)

        # Every move, border moves included, must emit one token pair and
        # advance; otherwise the loop would never terminate.
        c_hyp = hyps[last_h] if last_h == h-1 else ''
        c_ref = refs[last_r] if last_r == r-1 else ''

        h, r = last_h, last_r

        # Word-spacing normalization: a matching token adopts ref's '_' marker.
        if c_hyp.replace('_', '') == c_ref.replace('_', ''):
            c_hyp = c_ref

        ref_norm.append(c_ref)
        hyp_norm.append(c_hyp)

    # Tokens were collected back to front.
    return ref_norm[::-1], hyp_norm[::-1]

In [8]:
# Author's note: the word '양념' exists only in ref, which causes trouble in processing.
ref = '나는 어제 양념 치킨을 먹었다'
# The hypothesis drops most spaces and contains a double space.
hyp = '나는어제  치킨을먹었다'

# ref = '음성인식 평가 방법을 소개합니다'
# hyp = '어 음성 인식 평가방버블 소개합니다'
# -> 어 음성인식 평가 방버블 소개합니다

# ref = '음성인식 방법을 소개합니다'
# hyp = '음성 인식 방버블 소개합니다'

In [11]:
# Align the sample pair; ref_n / hyp_n are the two aligned token lists.
ref_n, hyp_n = get_norm_text(ref, hyp)

In [12]:
# Show the aligned hypothesis tokens ('' = reference token with no hypothesis counterpart).
hyp_n

['나', '는', '_어', '제', '', '', '_치', '킨', '을', '_먹', '었', '다']

In [13]:
# Show the aligned reference tokens.
ref_n

['나', '는', '_어', '제', '_양', '념', '_치', '킨', '을', '_먹', '었', '다']

In [8]:
# 'separator'.join(iterable) concatenates the tokens; the '_' word-boundary
# markers are then turned back into spaces.
result = ''.join(str(token) for token in ref_n).replace('_', ' ')
print(result)

나는 어제 양념 치킨을 먹었다


In [9]:
# Same reconstruction for the hypothesis: concatenate the tokens, then turn
# the '_' word-boundary markers back into spaces.
result = ''.join(str(token) for token in hyp_n).replace('_', ' ')
print(result)

나는 어제 치킨을 먹었다


In [14]:
def get_swords(ref, hyp):
    """Return ``hyp`` as a string whose word spacing is normalized against ``ref``.

    The two sentences are aligned character by character with spaces
    ignored. Every hypothesis token that equals its aligned reference token
    (ignoring the '_' word-boundary marker) adopts the reference token, so it
    gains or loses its leading space to agree with ``ref``. Hypothesis tokens
    without a matching reference token keep their own spacing; reference
    tokens without a hypothesis counterpart contribute nothing.

    Args:
        ref: Reference sentence (str).
        hyp: Hypothesis sentence (str).

    Returns:
        The normalized hypothesis sentence (str), with '_' markers rendered
        as spaces.
    """
    refs = char_tokenizer(ref)
    hyps = char_tokenizer(hyp)
    # Space-free strings; index i here corresponds to token i in refs / hyps.
    ref_nospace = ref.replace(' ', '')
    hyp_nospace = hyp.replace(' ', '')
    rlen = len(refs)
    hlen = len(hyps)
    scores = np.zeros((hlen+1, rlen+1), dtype=np.int32)

    # Initialize and fill the distance matrix over characters, ignoring spaces.
    for r in range(rlen+1):
        scores[0, r] = r
    for h in range(1, hlen+1):
        scores[h, 0] = scores[h-1, 0] + 1
        for r in range(1, rlen+1):
            sub_or_cor = scores[h-1, r-1] + (0 if ref_nospace[r-1] == hyp_nospace[h-1] else 1)
            insert = scores[h-1, r] + 1
            delete = scores[h, r-1] + 1
            scores[h, r] = min(sub_or_cor, insert, delete)

    # Traceback from the bottom-right corner and collect the hypothesis tokens.
    h, r = hlen, rlen
    hyp_norm = []

    while r > 0 or h > 0:
        if h == 0:
            # Top border: only reference tokens remain.
            last_h, last_r = h, r - 1
        elif r == 0:
            # Left border: only hypothesis tokens remain.
            last_h, last_r = h - 1, r
        else:
            sub_or_cor = scores[h-1, r-1] + (0 if ref_nospace[r-1] == hyp_nospace[h-1] else 1)
            insert = scores[h-1, r] + 1
            delete = scores[h, r-1] + 1

            # The diagonal move is taken only when it is strictly cheapest.
            if sub_or_cor < min(insert, delete):
                last_h, last_r = h - 1, r - 1
            else:
                last_h, last_r = (h-1, r) if insert < delete else (h, r-1)

        # Every move, border moves included, must emit a token and advance;
        # otherwise the loop would never terminate.
        c_hyp = hyps[last_h] if last_h == h-1 else ''
        c_ref = refs[last_r] if last_r == r-1 else ''
        h, r = last_h, last_r

        # Word-spacing normalization: a matching token adopts ref's '_' marker.
        if c_hyp.replace('_', '') == c_ref.replace('_', ''):
            c_hyp = c_ref

        hyp_norm.append(c_hyp)

    # Tokens were collected back to front; '_' markers become spaces again.
    shyp = ''.join(map(str, hyp_norm[::-1])).replace('_', ' ')
    return shyp

In [15]:
# Same sample pair, returned as one spacing-normalized string.
get_swords(ref, hyp)

'나는 어제 치킨을 먹었다'

In [16]:
# Original reference sentence, for comparison.
ref

'나는 어제 양념 치킨을 먹었다'

In [17]:
# Original hypothesis sentence, for comparison.
hyp

'나는어제  치킨을먹었다'

In [None]:
# Load the evaluation table; later cells read its 'wrd', 'hyp' and 'spk_id' columns.
file_name = 'hyp_label.csv'
hyp_label_df = pd.read_csv(file_name)

In [None]:
# get_norm_text() aligns ONE (ref, hyp) pair of strings, so it has to be
# applied row by row. Passing whole columns would treat each sentence as a
# single token and align the rows against each other.
# Only the second return value (the normalized hypothesis tokens) is kept.
# read_csv loads empty fields as NaN, so missing hypotheses become ''.
hyp_label_df['hyp_n'] = [
    get_norm_text(ref_text, hyp_text)[1]
    for ref_text, hyp_text in zip(hyp_label_df['wrd'], hyp_label_df['hyp'].fillna(''))
]

In [None]:
# Inspect the table after the 'hyp_n' column was added.
hyp_label_df

In [None]:
# Let pandas print up to 100 rows before truncating the display.
pd.set_option('display.max_rows', 100)

In [None]:
# Print the 'hyp_n' column on its own.
print(hyp_label_df['hyp_n'])

In [None]:
# Build one string per row. Calling ''.join on the column itself would
# concatenate every row into a single string and assign that same string to
# all rows, so the join/replace is mapped over the rows instead.
hyp_label_df['hyp_s'] = hyp_label_df['hyp_n'].map(
    lambda tokens: ''.join(map(str, tokens)).replace('_', ' ')
)

In [None]:
# Series.replace('_', ' ') only swaps cells whose ENTIRE value equals '_';
# it does not touch a '_' inside a cell. To turn the word-boundary markers
# into spaces, join and replace within each row.
hyp_label_df['hyp_n'].map(lambda tokens: ''.join(map(str, tokens)).replace('_', ' '))

In [None]:
# wer_details_for_batch() scores sequences of tokens. A raw sentence string
# would be compared character by character (spaces included), so each
# sentence is split into words first to get a word error rate.
# The ids are used as dictionary keys inside SpeechBrain, so repeated values
# (e.g. a speaker id shared by several rows) would collapse rows; the row
# position is used as a key that is unique by construction.
# read_csv loads empty fields as NaN, so missing hypotheses become ''.
wer_test = wer_details_for_batch(
    list(range(len(hyp_label_df))),
    hyp_label_df['wrd'].str.split().tolist(),
    hyp_label_df['hyp'].fillna('').str.split().tolist(),
)
# One 'WER' value per utterance, in row order.
wer_list = [detail['WER'] for detail in wer_test]

In [None]:
# Attach the WER values as a new column (assumes wer_list has one entry per row, in row order).
hyp_label_df['wer'] = wer_list

In [None]:
# Inspect the table after the 'wer' column was added.
hyp_label_df

In [None]:
# sWER 구현하기