In [2]:
from pathlib import Path
import json

In [3]:
data_dir = Path("../../data")
if not data_dir.exists():
    data_dir = Path("./")

In [4]:
asr_data = json.load((data_dir/Path("李昆澤-127844.gasr.json")).open(encoding="UTF-8"))
metadata = json.load((data_dir/Path("metadata.json")).open(encoding="UTF-8"))

In [5]:
asr = asr_data["response"]["results"]
transcript = [x for x in metadata if x["video_id"]=="127844"][0]["transcript"]

In [7]:
ans = (data_dir/Path("李昆澤-127844.turn.ans.txt")).read_text().split(",")
len(ans)

20

## 轉注音

In [8]:
# !pip install pypinyin
import re
from pypinyin import lazy_pinyin, Style
from difflib import SequenceMatcher
from functools import cache

In [9]:
tone_re = re.compile("[ˊˇˋ˙]")

@cache
def pinyin_cache(ch):
    return lazy_pinyin(ch, style=Style.BOPOMOFO)[0]

class CharPhone:
    def __init__(self, ch):
        assert len(ch)==1
        self.ch = ch
        self.zhuyin = pinyin_cache(ch)
        self.phones = tone_re.sub("", self.zhuyin)
    
    def __hash__(self):
        return hash(self.phones)
    
    def __eq__(self, other):        
        return self.phones == other.phones
    
    def __repr__(self):
        return f"<Char: {self.ch} ({self.zhuyin})>"        

def char_mapper(text):
    text = re.sub("[ \u3000，。：）（]", "", text)
    return [CharPhone(x) for x in text]

In [10]:
SequenceMatcher(lambda x: not x.ch in "（）", char_mapper("中視（新聞"), char_mapper("中式）心文")).ratio()

1.0

## Preprocess ASR

In [11]:
from itertools import chain

def to_num(x):
    return float(x.replace("s", ""))

def flatten_asr(asr_obj):    
    asr_starts = []
    asr_ends = []
    asr_words = []
    for entry_x in asr_obj:
        alts = entry_x.get("alternatives", [])
        if not alts: continue
        words = alts[0].get("words", [])        
        asr_starts.extend([
            to_num(x["startTime"]) for x in words])
        asr_ends.extend([
            to_num(x["endTime"]) for x in words])
        asr_words.extend([x["word"] for x in words])
        
    return {
        "starts": asr_starts,
        "ends": asr_ends,
        "words": asr_words
    }

flat_asr = flatten_asr(asr)
assert len(flat_asr["starts"]) == len(flat_asr["words"])

In [12]:
asr[0]

{'alternatives': [{'confidence': 0.8894143,
   'transcript': '謝謝主席，請夏林佳龍部長還很有錢人，家人不算機業局長有沒有局長？',
   'words': [{'endTime': '6s', 'startTime': '1.300s', 'word': '謝'},
    {'endTime': '6.100s', 'startTime': '6s', 'word': '謝'},
    {'endTime': '6.300s', 'startTime': '6.100s', 'word': '主'},
    {'endTime': '6.500s', 'startTime': '6.300s', 'word': '席'},
    {'endTime': '6.800s', 'startTime': '6.500s', 'word': '請'},
    {'endTime': '7.100s', 'startTime': '6.800s', 'word': '夏'},
    {'endTime': '7.400s', 'startTime': '7.100s', 'word': '林'},
    {'endTime': '7.600s', 'startTime': '7.400s', 'word': '佳'},
    {'endTime': '7.600s', 'startTime': '7.600s', 'word': '龍'},
    {'endTime': '7.800s', 'startTime': '7.600s', 'word': '部'},
    {'endTime': '7.900s', 'startTime': '7.800s', 'word': '長'},
    {'endTime': '8.400s', 'startTime': '7.900s', 'word': '還'},
    {'endTime': '8.700s', 'startTime': '8.400s', 'word': '很'},
    {'endTime': '8.800s', 'startTime': '8.700s', 'word': '有'},
    {'endTime': '9s'

## 議事錄逐字稿

In [13]:
import re
from itertools import tee
def preprocess_transcript(txt):
    txt = txt.replace("<br />", "")
    txt = re.split("(\n?.+?：)", txt)    
    txt = [x.replace("\n", "").strip() for x in txt]
    txt = [x for x in txt if x]
    iter_list = [iter(txt)] * 2
    turns = [(a,b) for a,b in zip(*iter_list)]
    turns = [x for x in turns if not x[0].startswith("主席：")]
    return turns

In [14]:
turns = preprocess_transcript(transcript)
turns[:3]

[('李委員昆澤：',
  '（10 時 13 分）部長，您好。遊艇證照弊案傷害政府的威信，也影響到考照的公平性，更是衝擊到航運的安全，這一次的事件整體來看，內部監理出了重大漏洞，這個部分請部長簡單說明一下。'),
 ('林部長佳龍：',
  '不管原因是什麼，我們要阻止類似的事情再發生，所以我在第一時間得知檢調在偵辦，就要求同仁同步清查並全面提供資料，而且要澈底建立防弊的措施，不管是相關的作業程序或者承辦人的職權，整個檢討跟改進。'),
 ('李委員昆澤：',
  '航港局的檢討報告提到，要限縮相關人員的權限，加強勾稽比對，並由科長來核對、覆核，系統要更新，限制成績須整批匯入，發照人員也要定期輪調。我也具體的建議，我們必須要加強內部稽核和不定期查核，請簡單說明。')]

## 兩邊對齊

### 用注音對齊

In [68]:
from difflib import SequenceMatcher
from tqdm.auto import tqdm
import numpy as np
win = 20
offset = 1
asr_words = flat_asr["words"] 
asr_starts = flat_asr["starts"]
asr_ends = flat_asr["ends"]

aligned_turn_start = []
current_idx = 0
for turn_x in tqdm(turns):
    probe = turn_x[1][offset:offset+win]    

    scores = []
    for i in range(len(asr_words)-win):                
        target = "".join(asr_words[i:i+win]) 
        sm = SequenceMatcher(None,
                char_mapper(probe), char_mapper(target))
        scores.append(sm.ratio())
    
    scores = np.array(scores)
    scores[:max(current_idx, 2)] = -1
    align_idx = np.argmax(scores)
    current_idx = max(align_idx, current_idx)
    # print(f"[{max(scores)}]")
    # print("Probe:", probe)
    # print("Target: ", "".join(asr_words[align_idx:align_idx+win]))
    aligned_turn_start.append(asr_starts[align_idx])

  0%|          | 0/20 [00:00<?, ?it/s]

In [69]:
from datetime import datetime, timedelta
print("answers", "aligned")
dt = lambda x: x.strftime("%H:%M:%S")
for i in range(len(turns)):
    ans_dt = datetime.strptime(ans[i], "%H:%M:%S")
    aligned_dt = datetime(1900,1,1)+timedelta(seconds=aligned_turn_start[i])    
    print(dt(ans_dt), dt(aligned_dt), (ans_dt-aligned_dt).total_seconds())

answers aligned
00:00:15 00:00:09 5.7
00:00:40 00:00:42 -2.8
00:01:14 00:01:19 -5.2
00:01:42 00:01:43 -1.1
00:02:41 00:02:41 -0.2
00:03:54 00:03:56 -2.1
00:04:09 00:04:10 -1.9
00:04:33 00:04:33 -0.8
00:06:36 00:06:36 -0.3
00:07:01 00:07:00 0.3
00:08:24 00:08:24 -0.7
00:08:39 00:08:24 14.3
00:08:39 00:08:40 -1.1
00:10:05 00:10:06 -1.2
00:10:46 00:10:46 -0.6
00:11:09 00:11:09 -0.3
00:11:32 00:11:27 4.8
00:11:33 00:11:32 0.3
00:11:54 00:11:52 1.3
00:11:56 00:11:53 3.0


In [70]:
def compute_metric(aligned, ans, tol=3.):
    assert len(aligned) == len(ans)
    n_correct = 0

    for aligned_x, ans_x in zip(aligned, ans):
        t = datetime.strptime(ans_x,"%H:%M:%S")
        ansTD = timedelta(hours=t.hour, minutes=t.minute, seconds=t.second).total_seconds()
        alignTD = timedelta(seconds=aligned_x).total_seconds()

        error = abs(ansTD-alignTD)

        if error < tol:
            n_correct += 1
    metric = n_correct/len(aligned)
    return metric
    

In [72]:
compute_metric(aligned_turn_start, ans, tol=5.)

0.85

In [29]:
for start_time, (speaker, transcript) in zip(aligned_turn_start, turns):
    timestamp = str(timedelta(seconds=start_time)).split(".")[0]
    print("[{}] {} {}".format(timestamp, speaker, transcript))

[0:09:23] 李委員昆澤： （10 時 13 分）部長，您好。遊艇證照弊案傷害政府的威信，也影響到考照的公平性，更是衝擊到航運的安全，這一次的事件整體來看，內部監理出了重大漏洞，這個部分請部長簡單說明一下。
[0:00:42] 林部長佳龍： 不管原因是什麼，我們要阻止類似的事情再發生，所以我在第一時間得知檢調在偵辦，就要求同仁同步清查並全面提供資料，而且要澈底建立防弊的措施，不管是相關的作業程序或者承辦人的職權，整個檢討跟改進。
[0:01:19] 李委員昆澤： 航港局的檢討報告提到，要限縮相關人員的權限，加強勾稽比對，並由科長來核對、覆核，系統要更新，限制成績須整批匯入，發照人員也要定期輪調。我也具體的建議，我們必須要加強內部稽核和不定期查核，請簡單說明。
[0:01:42] 葉局長協隆： 謝謝委員，目前我們已經全面變更並改正系統的設定，還有受理申辦的流程，現在承辦人是沒有辦法做任何資料的更改，在發證前，系統也設定成績必須要合格才能夠發證，在考駕照的控管上，也由科長來控管空白駕照以及流水號，每天都必須要逐日核對，並按月做成紀錄往上呈報並做覆核。另外委員剛剛提醒的，要做實地和外部的稽核部分，我們去年 10 月和去年 12 月已經做了兩次到中部航務中心的外部稽核，今年 6 月和 7 月也分別會針對六大項目來進行進一步的實地查核。
[0:02:41] 李委員昆澤： 好，這個部分，剛才很多委員提到相關的題庫艱深，而且大部分都是船機的相關知識，我們應該強化考照的流程，另外針對題庫的合理性以及相關的適用性，必須提出更具體的檢討和改進，而且航行的安全教育真的是非常重要，也必須要加強。我現在要請教的是針對貨櫃的數量，航港局有沒有掌握？現在因為疫情的影響，全世界的海運快速發展，大家需求的貨櫃數量都大幅增加，甚至有錢可能也買不到貨櫃，相關的狀況，我們看到臺灣是一個重要的進出口國家，海運是我們重要的經濟支柱，相對的，海運如果缺乏貨櫃的話，恐怕對經濟會造成重大的影響，這部分請局長簡單說明一下。
[0:03:55] 葉局長協隆： 跟委員報告，全球海運缺櫃的現象是受到新冠肺炎疫情的影響，造成港口塞港以致貨櫃的回流速度減緩，造成整個缺櫃的情形。
[0:04:10] 李委員昆澤： 局長，我們針對缺櫃這樣一個嚴重的狀況，有沒有具體的改善重點？甚至我們有沒有成立協調的工作小組去盤點相關的廠商出

### 用字對齊

In [82]:
ch_win = 10
ch_offset = 2
asr_words = flat_asr["words"] 
asr_starts = flat_asr["starts"]
is_punct = lambda x: x in "，。：）（"

ch_aligned_turn_start = []
current_idx = 0
for turn_x in tqdm(turns):
    probe = turn_x[1][ch_offset:ch_offset+ch_win]

    scores = []
    for i in range(len(asr_words)-ch_win):                
        target = "".join(asr_words[i:i+ch_win]) 
        sm = SequenceMatcher(is_punct,
                probe, target)
        scores.append(sm.ratio())

    scores = np.array(scores)
    scores[:max(current_idx, 2)] = -1
    align_idx = np.argmax(scores)
    current_idx = max(align_idx, current_idx)
    
    # print(f"[{max(scores)}]")
    # print("Probe:", probe)
    # print("Target: ", "".join(asr_words[align_idx:align_idx+20]))
    ch_aligned_turn_start.append(asr_starts[align_idx])

  0%|          | 0/20 [00:00<?, ?it/s]

In [83]:
from datetime import datetime, timedelta
print("answers", "aligned")
dt = lambda x: x.strftime("%H:%M:%S")
for i in range(len(turns)):
    ans_dt = datetime.strptime(ans[i], "%H:%M:%S")
    aligned_dt = datetime(1900,1,1)+timedelta(seconds=aligned_turn_start[i])    
    print(dt(ans_dt), dt(aligned_dt), (ans_dt-aligned_dt).total_seconds())

answers aligned
00:00:15 00:00:09 5.7
00:00:40 00:00:42 -2.8
00:01:14 00:01:19 -5.2
00:01:42 00:01:43 -1.1
00:02:41 00:02:41 -0.2
00:03:54 00:03:56 -2.1
00:04:09 00:04:10 -1.9
00:04:33 00:04:33 -0.8
00:06:36 00:06:36 -0.3
00:07:01 00:07:00 0.3
00:08:24 00:08:24 -0.7
00:08:39 00:08:24 14.3
00:08:39 00:08:40 -1.1
00:10:05 00:10:06 -1.2
00:10:46 00:10:46 -0.6
00:11:09 00:11:09 -0.3
00:11:32 00:11:27 4.8
00:11:33 00:11:32 0.3
00:11:54 00:11:52 1.3
00:11:56 00:11:53 3.0


In [84]:
compute_metric(ch_aligned_turn_start, ans, tol=5.)

0.85

In [None]:
data = [
      {"timestamp": 10.2, "speaker": "", "transcript": ""},
    {"timestamp": 10.2, "speaker": "", "transcript": ""},
    {"timestamp": 10.2, "speaker": "", "transcript": ""},
    {"timestamp": 10.2, "speaker": "", "transcript": ""},
    ...    
]

import json
with open("") as fout:
    json.dump(data,fout)