In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# (except those excluded via %aimport) before each cell executes, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Make the sibling ``src`` directory importable. The membership guard keeps
# the entry from being appended again when this cell is re-run.
# NOTE(review): presumably this is where the ``weave`` package lives; the
# path is relative to the notebook's working directory — confirm.
if "../src" not in sys.path:
    sys.path.append("../src")
    
import pandas as pd
from pathlib import Path
from praatio import textgrid
from itertools import chain

In [3]:
from weave import Utterance, Word, Character, Phone
from weave import BuildUtt

In [12]:
# Read the character-boundary dictionary from the tab-separated syllable table.
# Rows are grouped by their ``ipa`` string; ``GroupBy.first`` keeps the first
# non-null value of each remaining column, the ``word`` column is discarded,
# and the result is a nested dict of the form {ipa: {column: value, ...}}.
word_syll_map = (
    pd.read_csv(Path("../data/mandarin_taiwan_mfa_retrovar.syll.tsv"), sep="\t")
    .groupby("ipa")
    .first()
    .drop(columns="word")
    .to_dict(orient="index")
)

In [4]:
# Annotation TextGrid shipped with the project data (empty intervals are
# not included when reading).
tg = textgrid.openTextgrid(
    "../data/CN_GY_label.TextGrid",
    includeEmptyIntervals=False,
)

# TextGrid read from the per-user MFA data directory, again without empty
# intervals. NOTE(review): presumably the forced-alignment output — confirm.
mfa_data_dir = Path("~/lago/mfa_data/").expanduser()
fa = textgrid.openTextgrid(
    mfa_data_dir / "aligned_retro/CN_GY/CN_GY_003.TextGrid",
    includeEmptyIntervals=False,
)

In [5]:
# Tiers of the annotation TextGrid: the utterance / word / character levels ...
annot_utt_tier = tg.getTier("utt")
annot_words_tier = tg.getTier("word")
annot_chars_tier = tg.getTier("ch")
# ... and the two extra annotation layers that get aligned onto them later.
note_tier = tg.getTier("note")
rz_tier = tg.getTier("realization")

# Tiers of the second TextGrid (``fa``): words and their phones.
fa_phones = fa.getTier("phones")
fa_words = fa.getTier("words")

In [6]:
# Build the utterance list from the utt / word / ch annotation tiers
# (the printed repr shows an Utterance -> Word -> Character nesting).
utts = BuildUtt.build_utts_words_chars(
    annot_utt_tier,
    annot_words_tier,
    annot_chars_tier,
)

In [7]:
# Align the "note" tier and then the "realization" tier onto the utterances,
# in that order; each pass receives the result of the previous one.
for attr_name, attr_tier in (("note", note_tier), ("realization", rz_tier)):
    utts = BuildUtt.align_annots(utts, attr_name, attr_tier)

In [8]:
# Display the utterance at index 3 to inspect its word/character structure.
utts[3]

<Utterance: [35.45-47.70] 對 啊 可能  也 沒有  就 而且 歐洲  歐洲 我 會 覺得 是 一 個 比較  mm  要 比較 有錢 啊 然後 比較 崇高 的 地方 可能 是 要 等 以後  自己 賺錢 還是 什麼 的 你 才 會 去 吧>
 <Word: [35.45-35.55] 對>
  <Character: [35.45-35.55] 對>
 <Word: [35.55-35.68] 啊>
  <Character: [35.55-35.68] 啊>
 <Word: [35.68-36.13] 可能>
  <Character: [35.68-35.82] 可>
  <Character: [35.82-36.13] 能>
 <Word: [36.22-36.39] 也>
  <Character: [36.22-36.39] 也>
 <Word: [36.39-36.76] 沒有>
  <Character: [36.39-36.53] 沒>
  <Character: [36.53-36.76] 有>
 <Word: [37.11-37.33] 就>
  <Character: [37.11-37.33] 就>
 <Word: [37.33-37.59] 而且>
  <Character: [37.33-37.46] 而>
  <Character: [37.46-37.59] 且>
 <Word: [37.59-38.04] 歐洲>
  <Character: [37.59-37.74] 歐>
  <Character: [37.74-38.04] 洲 (w/)>
 <Word: [38.14-38.41] 歐洲>
  <Character: [38.14-38.28] 歐>
  <Character: [38.28-38.41] 洲>
 <Word: [38.41-38.47] 我>
  <Character: [38.41-38.47] 我>
 <Word: [38.47-38.61] 會>
  <Character: [38.47-38.61] 會>
 <Word: [38.61-38.84] 覺得>
  <Character: [38.61-38.72] 覺>
  <Character: [38.72-38.84]

## Align annotations to the forced alignment

In [13]:
# Combine the ``words`` and ``phones`` tiers of the ``fa`` TextGrid.
# NOTE(review): presumably yields word objects that own their phones —
# confirm against BuildUtt.build_words_phones.
mfa_wlist = BuildUtt.build_words_phones(fa_words, fa_phones)
# Add the character level using the character-boundary dictionary
# (``word_syll_map``, keyed by IPA string).
mfa_words = BuildUtt.build_characters(mfa_wlist, word_syll_map)

In [16]:
# Wrap the ``fa``-derived words in a single Utterance and merge it with the
# annotated utterance at index 3.
mfa_utt = BuildUtt.merge_utterance(
    Utterance.from_words(mfa_words),
    utts[3],
)

In [17]:
# Display the merged utterance (Utterance -> Word -> Character -> Phone).
mfa_utt

<Utterance: [35.45-47.65] 對 啊 可能 也 沒有 就 而且 歐洲 歐洲 我 會 覺得 是 一 個 比較 mm 要 比較 有錢 啊 然後 比較 崇高 的 地方 可能 是 要 等 以後 自己 賺錢 還是 什麼 的 你 才 會 去 吧>
 <Word: [35.45-35.52] 對>
  <Character: [35.45-35.52] 對>
   <Phone: [35.45-35.46] t>
   <Phone: [35.46-35.47] w>
   <Phone: [35.47-35.52] ej˥˩>
 <Word: [35.52-35.66] 啊>
  <Character: [35.52-35.66] 啊>
   <Phone: [35.52-35.56] ʔ>
   <Phone: [35.56-35.66] a˥˥>
 <Word: [35.66-36.09] 可能>
  <Character: [35.66-35.89] 可>
   <Phone: [35.66-35.76] kʰ>
   <Phone: [35.76-35.81] o˨˩˦>
   <Phone: [35.81-35.89] n>
  <Character: [35.89-36.09] 能>
   <Phone: [35.89-36.04] o˧˥>
   <Phone: [36.04-36.09] ŋ>
 <Word: [36.22-36.38] 也>
  <Character: [36.22-36.38] 也>
   <Phone: [36.22-36.33] j>
   <Phone: [36.33-36.38] e˨˩˦>
 <Word: [36.38-36.71] 沒有>
  <Character: [36.38-36.53] 沒>
   <Phone: [36.38-36.50] m>
   <Phone: [36.50-36.53] ej˧˥>
  <Character: [36.53-36.71] 有>
   <Phone: [36.53-36.57] j>
   <Phone: [36.57-36.71] ow˨˩˦>
 <Word: [37.09-37.32] 就>
  <Character: [37.09-37.32] 就>
  