In [1]:
# %load_ext autoreload
# %autoreload 2
%load_ext watermark

In [2]:
import sys
if "../src" not in sys.path:
    sys.path.append("../src")
    
import json
import pandas as pd
from pathlib import Path
from itertools import chain
from typing import cast, Dict
import subprocess
from tqdm.auto import tqdm
from praatio import textgrid
from weave import Speech, Utterance, Word, Character, Phone
from weave import BuildUtt

## Aligning annotations and MFA

In [3]:
mfa_aligned_dir = Path("~/lago/mfa_data/aligned_retro_redel").expanduser()
tg_dir = Path("../data/retro_labels")

In [4]:
import logging

logger = logging.getLogger()
log_path = str(tg_dir / "align_annot.log")
logger.addHandler(logging.FileHandler(log_path, "w"))
logger.addHandler(logging.StreamHandler())
logger.handlers[1].setLevel(logging.WARNING)
logger.setLevel(logging.DEBUG)
logger.handlers[0].setFormatter(logging.Formatter('[%(levelname)s] (%(name)s) %(asctime)s: %(message)s'))

In [5]:
### Read charac-boundary dictionary
word_syll_map = pd.read_csv(Path("../data/mandarin_taiwan_mfa_redelvar.syll.tsv"), sep="\t")\
                  .groupby("ipa").first().drop("word", axis=1)\
                  .to_dict(orient="index")
word_syll_map = cast(Dict[str, Dict[str, str]], word_syll_map)

In [9]:
annot_files = list(tg_dir.glob("*.TextGrid"))
speaker_ids = [f.stem.replace("_label", "") 
               for f in annot_files
               if f.stem.endswith("_label")]
speaker_ids[:5]

['LJH_GY',
 'CJH_GY',
 'KCZX_GY',
 'XHY_GY',
 'HSK_GY',
 'WJL_GY',
 'GYX_GY',
 'CZX_GY',
 'CN_GY',
 'JXW_GY',
 'LZW_GY',
 'YYS_GY',
 'LRL_GY',
 'XHR_GY',
 'GWH_GY']

In [7]:
tg_dir

PosixPath('../data/retro_labels')

In [8]:
for speaker_id in tqdm(speaker_ids):
  tg_path = tg_dir / f"{speaker_id}_label.TextGrid"
  speech = Speech.from_textgrid_fon(tg_path, ["note", "realization"])

  mfa_speaker_dir = mfa_aligned_dir / speaker_id
  mfa_speaker_paths = list(mfa_speaker_dir.glob("*.TextGrid"))
  mfa_speaker_paths.sort()

  mfa_utt_list = []
  # for mfa_tg_x in mfa_speaker_paths:
  for utt_idx in range(len(speech.utterances)):
    mfa_tg_x = mfa_speaker_dir / f"{speaker_id}_{utt_idx:03d}.TextGrid"
    if not mfa_tg_x.exists():
      logger.warning("Not found: %s", str(mfa_tg_x))
      continue
    
    fa = textgrid.openTextgrid(str(mfa_tg_x), False)
    fa_words = fa.getTier("words")
    fa_phones = fa.getTier("phones")
    mfa_wlist = BuildUtt.build_words_phones(fa_words, fa_phones)  #type: ignore
    mfa_words = BuildUtt.build_characters(mfa_wlist, word_syll_map)  #type: ignore
    mfa_utt = Utterance.from_words(mfa_words)
    mfa_utt = BuildUtt.align_with_utterance_cyy_annot(mfa_utt, speech.utterances[utt_idx])
    mfa_utt_list.append(mfa_utt)
    
  mfa_speech = Speech.from_utterances(mfa_utt_list)
  mfa_speech.to_textgrid(tg_path.with_suffix(".mfa.TextGrid"))
  tg_path.with_suffix(".mfa.json").write_text(json.dumps(mfa_speech.to_dict()))
  mfa_tg_path = tg_path.with_suffix(".mfa.TextGrid")
  mfa_json_path = tg_path.with_suffix(".mfa.json")
  logger.info("Done: %s", str(mfa_tg_path))
  logger.info("Done: %s", str(mfa_json_path))

  print(subprocess.run(["sha1sum", str(mfa_tg_path)], capture_output=True).stdout.decode().strip())
  print(subprocess.run(["sha1sum", str(mfa_json_path)], capture_output=True).stdout.decode().strip())

  0%|          | 0/30 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '../data/retro_labels/CN_GY.mfa_label.TextGrid'

## Watermarks

In [None]:
%watermark

Last updated: 2023-05-16T11:58:31.397853+02:00

Python implementation: CPython
Python version       : 3.10.10
IPython version      : 8.13.1

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [None]:
%watermark --iversions

json   : 2.0.9
pandas : 2.0.1
logging: 0.5.1.2
praatio: 6.0.0
sys    : 3.10.10 | packaged by conda-forge | (main, Mar 24 2023, 20:12:31) [Clang 14.0.6 ]

