In [1]:
# Development aid (disabled): uncomment the two lines below to auto-reload edited
# project modules without restarting the kernel.
# %load_ext autoreload
# %autoreload 2
# Load the `watermark` extension; it is used at the end of the notebook to record
# the execution environment and package versions.
%load_ext watermark

In [2]:
import hashlib
import json
import logging
import subprocess
import sys
from itertools import chain
from pathlib import Path
from typing import cast, Dict

import pandas as pd
from praatio import textgrid
from tqdm.auto import tqdm

# The project-local `weave` package lives in ../src; make it importable first.
if "../src" not in sys.path:
    sys.path.append("../src")

from weave import Speech, Utterance, Word, Character, Phone
from weave import BuildUtt

## Aligning annotations and MFA

In [3]:
# Root of the MFA alignment output; read below as
# <mfa_aligned_dir>/<speaker_id>/<speaker_id>_<NNN>.TextGrid (one file per utterance).
mfa_aligned_dir = Path("~/lago/mfa_data/aligned_retro_redel").expanduser()
# Directory holding the manual annotation files (<speaker_id>_label.TextGrid).
# The log file and the merged *.mfa.TextGrid / *.mfa.json outputs are written here too.
tg_dir = Path("../data/retro_labels")

In [4]:
# Root-logger setup: the full DEBUG trace goes to a log file next to the annotations,
# while only WARNING and above is echoed into the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Re-running this cell must not stack duplicate handlers (every message would then be
# emitted once per stacked handler), so remove and close the ones a previous run installed.
_HANDLER_NAMES = ("align_annot.file", "align_annot.stream")
for _stale in [h for h in logger.handlers if h.get_name() in _HANDLER_NAMES]:
    logger.removeHandler(_stale)
    _stale.close()

# File handler: truncates the log on every run ("w") and carries the detailed format.
# Messages contain CJK text, so the encoding is pinned instead of relying on the locale.
log_path = str(tg_dir / "align_annot.log")
file_handler = logging.FileHandler(log_path, "w", encoding="utf-8")
file_handler.set_name(_HANDLER_NAMES[0])
file_handler.setFormatter(logging.Formatter('[%(levelname)s] (%(name)s) %(asctime)s: %(message)s'))

# Stream handler: unformatted, warnings and errors only.
stream_handler = logging.StreamHandler()
stream_handler.set_name(_HANDLER_NAMES[1])
stream_handler.setLevel(logging.WARNING)

# Handlers are configured through their own references rather than by position in
# `logger.handlers`, which is only correct when the root logger starts out empty.
logger.addHandler(file_handler)
logger.addHandler(stream_handler)

In [5]:
# Read the character-boundary dictionary.
# The table is grouped by its "ipa" column, the first entry of every remaining column
# is kept per group, the "word" column is discarded, and the result is turned into a
# nested dict keyed by the IPA string.
word_syll_map = cast(
    Dict[str, Dict[str, str]],
    (
        pd.read_csv(Path("../data/mandarin_taiwan_mfa_redelvar.syll.tsv"), sep="\t")
        .groupby("ipa")
        .first()
        .drop(columns="word")
        .to_dict(orient="index")
    ),
)

In [6]:
# Speaker ids are derived from the annotation files named "<speaker_id>_label.TextGrid";
# TextGrids in the directory without that marker are ignored.
label_suffix = "_label"
annot_files = list(tg_dir.glob("*.TextGrid"))
speaker_ids = [
    # Strip only the trailing marker: str.replace() would also delete "_label"
    # occurring inside an id, and the file could then no longer be located from the id.
    f.stem.removesuffix(label_suffix)
    for f in annot_files
    if f.stem.endswith(label_suffix)
]
speaker_ids[:5]

['LJH_GY', 'CJH_GY', 'KCZX_GY', 'XHY_GY', 'HSK_GY']

In [7]:
# Display the annotation directory the following loop reads from and writes to.
tg_dir

PosixPath('../data/retro_labels')

In [8]:
# For every speaker: load the manual annotation, rebuild each utterance from its MFA
# alignment, align it with the annotated utterance of the same index, then write the
# result as "<speaker_id>_label.mfa.TextGrid" and "<speaker_id>_label.mfa.json".
for speaker_id in tqdm(speaker_ids):
  tg_path = tg_dir / f"{speaker_id}_label.TextGrid"
  speech = Speech.from_textgrid_fon(tg_path, ["note", "realization"])
  mfa_speaker_dir = mfa_aligned_dir / speaker_id

  # Output locations, derived once and reused for writing, logging and checksumming.
  mfa_tg_path = tg_path.with_suffix(".mfa.TextGrid")
  mfa_json_path = tg_path.with_suffix(".mfa.json")

  mfa_utt_list = []
  # MFA files are looked up by annotation index (not by listing the directory) so that
  # each one stays paired with its annotated utterance even when some are missing.
  for utt_idx in range(len(speech.utterances)):
    mfa_tg_x = mfa_speaker_dir / f"{speaker_id}_{utt_idx:03d}.TextGrid"
    if not mfa_tg_x.exists():
      # No MFA output for this utterance: report it and leave it out of the result.
      logger.warning("Not found: %s", str(mfa_tg_x))
      continue

    fa = textgrid.openTextgrid(str(mfa_tg_x), includeEmptyIntervals=False)
    fa_words = fa.getTier("words")
    fa_phones = fa.getTier("phones")
    # Build the utterance from the MFA "words"/"phones" tiers; `word_syll_map` is
    # passed to the character-building step (presumably to split words into characters).
    mfa_wlist = BuildUtt.build_words_phones(fa_words, fa_phones)  #type: ignore
    mfa_words = BuildUtt.build_characters(mfa_wlist, word_syll_map)  #type: ignore
    mfa_utt = Utterance.from_words(mfa_words, utt_id=mfa_tg_x.stem)
    mfa_utt = BuildUtt.align_with_utterance_cyy_annot(mfa_utt, speech.utterances[utt_idx])
    mfa_utt_list.append(mfa_utt)

  mfa_speech = Speech.from_utterances(mfa_utt_list)
  mfa_speech.to_textgrid(mfa_tg_path)
  mfa_json_path.write_text(json.dumps(mfa_speech.to_dict()))
  logger.info("Done: %s", str(mfa_tg_path))
  logger.info("Done: %s", str(mfa_json_path))

  # Print SHA-1 checksums in `sha1sum` layout ("<digest>  <path>") so runs can be compared.
  # Computed with hashlib so the cell does not depend on an external `sha1sum` binary.
  for out_path in (mfa_tg_path, mfa_json_path):
    print(f"{hashlib.sha1(out_path.read_bytes()).hexdigest()}  {out_path}")

  0%|          | 0/15 [00:00<?, ?it/s]

64e094defecf566f9da917794ade71e9b8c292d2  ../data/retro_labels/LJH_GY_label.mfa.TextGrid
884e36d949f0b6eea8e2a6de46fda875901c9818  ../data/retro_labels/LJH_GY_label.mfa.json
f2c7d4681a9d99076f91d839faa2bf33bb3eec7b  ../data/retro_labels/CJH_GY_label.mfa.TextGrid
78efb589441fdf17eabb16a84e4d53fd6b978c8d  ../data/retro_labels/CJH_GY_label.mfa.json


Not found: /Users/seantyh/lago/mfa_data/aligned_retro_redel/KCZX_GY/KCZX_GY_333.TextGrid


2c30f620f51a1758362a8b1230318f44bea1b59e  ../data/retro_labels/KCZX_GY_label.mfa.TextGrid
57b6680085857e7e9fa26a055bac614c5885c4fd  ../data/retro_labels/KCZX_GY_label.mfa.json


Not found: /Users/seantyh/lago/mfa_data/aligned_retro_redel/XHY_GY/XHY_GY_087.TextGrid
Not found: /Users/seantyh/lago/mfa_data/aligned_retro_redel/XHY_GY/XHY_GY_088.TextGrid


95ec9bc1aed6122ee8ad4d1454adcfe3def7a71e  ../data/retro_labels/XHY_GY_label.mfa.TextGrid
8950d9ce769588d6a6325358c133f51e46105ab0  ../data/retro_labels/XHY_GY_label.mfa.json


The labels are not matching, skipped: mm, 好
The labels are not matching, skipped: 說, mm
The labels are not matching, skipped: 國語, 說
The labels are not matching, skipped: 跟, 國語
The labels are not matching, skipped: 說, 跟
The labels are not matching, skipped: 台語, 說
The labels are not matching, skipped: hon, 台語
The labels are not matching, skipped: 我, hoN
The labels are not matching, skipped: 覺得, 我
The labels are not matching, skipped: 要, 覺得
The labels are not matching, skipped: 看, 要
The labels are not matching, skipped: 我, 看
The labels are not matching, skipped: 跟, 我
The labels are not matching, skipped: 誰, 跟
The labels are not matching, skipped: 在一起, 誰
The labels are not matching, skipped: 一般, 在一起
The labels are not matching, skipped: 來, 一般
The labels are not matching, skipped: 講, 來
The labels are not matching, skipped: hon, 講
The labels are not matching, skipped: 我, hoN
The labels are not matching, skipped: 跟, 我
The labels are not matching, skipped: 工廠, 跟
The labels are not matching, sk

992ff2158e1a9f3fcf3f8ec63834f4f3827fd2a0  ../data/retro_labels/HSK_GY_label.mfa.TextGrid
681ae089130d7aae160e01db9dbf34fe9215386f  ../data/retro_labels/HSK_GY_label.mfa.json
e09eeeddd8007be6313ed0810857f3f3e2c0e5dc  ../data/retro_labels/WJL_GY_label.mfa.TextGrid
b2213539711800761e441587d652704ef7e4137a  ../data/retro_labels/WJL_GY_label.mfa.json


Not found: /Users/seantyh/lago/mfa_data/aligned_retro_redel/GYX_GY/GYX_GY_061.TextGrid


da4864163fac74f66841c85c66213db4ec86e430  ../data/retro_labels/GYX_GY_label.mfa.TextGrid
3333f3054fc1194f9246ea3d82c724cf373fa06b  ../data/retro_labels/GYX_GY_label.mfa.json
8f3ba463f3c1ef4108f369c90d4d2476ec709eff  ../data/retro_labels/CZX_GY_label.mfa.TextGrid
475cca712d78c0c6bcedce094e69338133aac9ff  ../data/retro_labels/CZX_GY_label.mfa.json


Not found: /Users/seantyh/lago/mfa_data/aligned_retro_redel/CN_GY/CN_GY_077.TextGrid


ad75756d03135984adf0a82b55df8359f9061000  ../data/retro_labels/CN_GY_label.mfa.TextGrid
16d409de1d230e0bd54a6085e0ab2acb6c6dbb07  ../data/retro_labels/CN_GY_label.mfa.json
663404293013fdc945f325a0e11ad74ed757e400  ../data/retro_labels/JXW_GY_label.mfa.TextGrid
5854cda5e927dfa78e1b349682cb419817ae9d5c  ../data/retro_labels/JXW_GY_label.mfa.json
5fcfac53037841d390086a4291f4c4955d34f817  ../data/retro_labels/LZW_GY_label.mfa.TextGrid
0a53d69af1debcfae50534e3e10413d01f9363b0  ../data/retro_labels/LZW_GY_label.mfa.json


The labels are not matching, skipped: siun, siun kong


3270c8eaf4a0c5c1a5eca92f02359dcb43684e48  ../data/retro_labels/YYS_GY_label.mfa.TextGrid
062d8c2efc86964bf90b15234690f5ed03a597ae  ../data/retro_labels/YYS_GY_label.mfa.json


Not found: /Users/seantyh/lago/mfa_data/aligned_retro_redel/LRL_GY/LRL_GY_016.TextGrid


911b3196928465a58a696a47cbf1826c956a5b21  ../data/retro_labels/LRL_GY_label.mfa.TextGrid
3184516605edcd1c9ed4aad2c64d230bc934fb39  ../data/retro_labels/LRL_GY_label.mfa.json
cfd55cad0ee939f6919cbe5a8244e32fa2094932  ../data/retro_labels/XHR_GY_label.mfa.TextGrid
9c5972579fb78bdf0455197b8267d361f441caa9  ../data/retro_labels/XHR_GY_label.mfa.json


Not found: /Users/seantyh/lago/mfa_data/aligned_retro_redel/GWH_GY/GWH_GY_054.TextGrid


70598e0e8fedfafa747d2f0fb80716cdcd581b1c  ../data/retro_labels/GWH_GY_label.mfa.TextGrid
9ecea5aa6c2c462f4a66eb04dad3b4088f288413  ../data/retro_labels/GWH_GY_label.mfa.json


## Watermarks

In [9]:
# Record when and on which interpreter / platform this notebook was last executed.
%watermark

Last updated: 2023-05-21T16:56:24.913475+02:00

Python implementation: CPython
Python version       : 3.10.10
IPython version      : 8.13.1

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [10]:
# Record the versions of the modules imported in this notebook.
%watermark --iversions

sys    : 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]
logging: 0.5.1.2
praatio: 6.0.0
pandas : 2.0.1
json   : 2.0.9

