In [1]:
# Notebook extensions. autoreload is left disabled; uncomment the next two lines to pick up
# edits to local modules without restarting the kernel. watermark provides the `%watermark`
# magic used in the final cells.
# %load_ext autoreload
# %autoreload 2
%load_ext watermark

In [2]:
import sys
if "../src" not in sys.path:
    sys.path.append("../src")

import hashlib
import json
import subprocess
from itertools import chain
from pathlib import Path
from typing import cast, Dict

import pandas as pd
from praatio import textgrid

from weave import Speech, Utterance, Word, Character, Phone
from weave import BuildUtt

## Aligning annotations and MFA

In [3]:
import logging

# Root-logger setup for this notebook:
#   * every record (DEBUG and above) goes to a log file that is overwritten on each run;
#   * only WARNING and above is echoed to the notebook output.
logger = logging.getLogger()

# Re-running this cell must not stack duplicate handlers (which would duplicate file records
# and leak DEBUG output into the notebook), so drop the ones a previous run installed.
# Handlers are recognised by name, leaving any handler installed by other code untouched.
_handler_names = ("align_annot_file", "align_annot_stream")
for stale in [h for h in logger.handlers if h.name in _handler_names]:
    logger.removeHandler(stale)
    stale.close()

# Configure each handler through its own variable rather than via logger.handlers[i]:
# positional lookup breaks as soon as the root logger already carries another handler.
file_handler = logging.FileHandler("../data/fon_tg/align_annot.log", "w")
file_handler.set_name("align_annot_file")
file_handler.setFormatter(logging.Formatter('[%(levelname)s] (%(name)s) %(asctime)s: %(message)s'))

stream_handler = logging.StreamHandler()
stream_handler.set_name("align_annot_stream")
stream_handler.setLevel(logging.WARNING)

logger.addHandler(file_handler)
logger.addHandler(stream_handler)
logger.setLevel(logging.DEBUG)

In [4]:
### Read character-boundary dictionary
# The TSV is grouped by its `ipa` column, keeping the first non-null value of every other
# column per group; the `word` column is dropped. The result is a dict keyed by `ipa`
# whose values map the remaining column names to their values.
word_syll_map = cast(
    Dict[str, Dict[str, str]],
    (
        pd.read_csv(Path("../data/mandarin_taiwan_mfa_retrovar.syll.tsv"), sep="\t")
        .groupby("ipa")
        .first()
        .drop(columns="word")
        .to_dict(orient="index")
    ),
)

In [5]:
# Directory holding the annotated TextGrids; the aligned outputs are written next to them.
tg_dir = Path("../data/fon_tg")
# MFA alignment results, one sub-directory per speaker (`~` is expanded to the home directory).
mfa_data_dir = Path("~/lago/mfa_data/aligned_retro").expanduser()

In [6]:
# Align one speaker's MFA output with the manual annotation, then export the result
# as a TextGrid and as JSON next to the annotation file.
speaker_id = "CN_GY"
tg_path = tg_dir / f"{speaker_id}_label.TextGrid"
# NOTE(review): the second argument is presumably the list of extra tier names to read;
# confirm against Speech.from_textgrid_fon.
speech = Speech.from_textgrid_fon(tg_path, ["note", "realization"])

mfa_speaker_dir = mfa_data_dir / speaker_id

# Output locations, named once so that the write, log and checksum steps agree.
mfa_tg_path = tg_path.with_suffix(".mfa.TextGrid")
mfa_json_path = tg_path.with_suffix(".mfa.json")

# One MFA TextGrid is expected per annotated utterance, named <speaker_id>_<3-digit index>.
mfa_utt_list = []
for utt_idx, annot_utt in enumerate(speech.utterances):
  mfa_tg_x = mfa_speaker_dir / f"{speaker_id}_{utt_idx:03d}.TextGrid"
  if not mfa_tg_x.exists():
    # Best effort: an utterance without MFA output is reported and left out of the result.
    logger.warning("Not found: %s", str(mfa_tg_x))
    continue

  fa = textgrid.openTextgrid(str(mfa_tg_x), False)
  fa_words = fa.getTier("words")
  fa_phones = fa.getTier("phones")
  # Build words from the two MFA tiers, split them into characters with the
  # syllable-boundary dictionary, then align the utterance with the annotated one.
  mfa_wlist = BuildUtt.build_words_phones(fa_words, fa_phones)  #type: ignore
  mfa_words = BuildUtt.build_characters(mfa_wlist, word_syll_map)  #type: ignore
  mfa_utt = Utterance.from_words(mfa_words)
  mfa_utt = BuildUtt.align_with_utterance_cyy_annot(mfa_utt, annot_utt)
  mfa_utt_list.append(mfa_utt)

mfa_speech = Speech.from_utterances(mfa_utt_list)
mfa_speech.to_textgrid(mfa_tg_path)
mfa_json_path.write_text(json.dumps(mfa_speech.to_dict()))
logger.info("Done: %s", str(mfa_tg_path))
logger.info("Done: %s", str(mfa_json_path))

# Print a SHA-1 per output file so a re-run can be compared against the recorded output.
# hashlib keeps this portable (no external `sha1sum` binary needed); the line layout
# "<digest>  <path>" is the one `sha1sum` prints.
for out_path in (mfa_tg_path, mfa_json_path):
  print(f"{hashlib.sha1(out_path.read_bytes()).hexdigest()}  {out_path}")

Not found: /Users/seantyh/lago/mfa_data/aligned_retro/CN_GY/CN_GY_077.TextGrid


56751c02a187be3d0cf6da36aca7e9304f8202e5  ../data/fon_tg/CN_GY_label.mfa.TextGrid
4f60869e2ac99eaf774de638af4d9807deb96995  ../data/fon_tg/CN_GY_label.mfa.json


## Test loading

In [7]:

# Reload the JSON export and rebuild a Speech object from it.
# The path is rebuilt here from the speaker id rather than taken from the alignment cell.
speaker_id = "CN_GY"
tg_dir = Path("../data/fon_tg")
tg_path = tg_dir / f"{speaker_id}_label.TextGrid"

with tg_path.with_suffix(".mfa.json").open() as fin:
    indict = json.load(fin)
mfa_loaded = Speech.from_dict(indict)

In [8]:
# Bare expression so the notebook shows the Speech repr (utterance count and a duration in seconds).
mfa_loaded

<Speech: 305 utterances, 1830.06 sec(s)>

In [9]:
# Spot-check the first word of the first utterance; its repr nests characters and phones with their time spans.
mfa_loaded.utterances[0].words[0]

<Word: [2.64-3.09] 沒有>
 <Character: [2.64-2.96] 沒>
  <Phone: [2.64-2.76] m>
  <Phone: [2.76-2.96] ej˧˥>
 <Character: [2.96-3.09] 有>
  <Phone: [2.96-3.05] j>
  <Phone: [3.05-3.09] ow˨˩˦>

## Watermarks

In [10]:
# Record when and where the notebook was run: timestamp, Python/IPython versions and machine details.
%watermark

Last updated: 2023-05-10T11:18:57.480827+02:00

Python implementation: CPython
Python version       : 3.10.10
IPython version      : 8.12.0

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [11]:
# Record the versions of the modules imported in this notebook.
%watermark --iversions

logging: 0.5.1.2
sys    : 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]
pandas : 1.5.2
json   : 2.0.9
praatio: 6.0.0

