## Data dependencies

`d896d31474decf727af64d30c704a4195b51ce9c  ../../data/fst/utt_rzs_xianzai.json`

In [1]:
# Input locations, relative to this notebook: a directory of annotation files
# and a JSON file of per-utterance data.
annot_seq_dir = "../../data/fst/xianzai-seq-annot"
utt_rzs_path = "../../data/fst/utt_rzs_xianzai.json"
# Fingerprint the inputs so a reader can confirm they are using the same data:
# first a SHA-1 over the list of per-file SHA-1 sums of the annotation directory,
# then the SHA-1 of the JSON file itself.
!sha1sum $annot_seq_dir/* | sha1sum
!sha1sum $utt_rzs_path

5b561827a93acfb268819ca4fa12b4ead197831b  -
d896d31474decf727af64d30c704a4195b51ce9c  ../../data/fst/utt_rzs_xianzai.json


## Load Data

In [2]:
import json
from pathlib import Path
from praatio import textgrid

# Per-utterance data keyed by utterance id. Decode explicitly as UTF-8: the
# phone strings contain IPA symbols, so the platform default encoding must not
# be relied upon.
utt_rzs = json.loads(Path(utt_rzs_path).read_text(encoding="utf-8"))

# Map each TextGrid's file stem to the labels of its "phone" tier, joined by
# single spaces (empty intervals are included when the file is read).
annot_seq = {}
# glob() yields paths in arbitrary order; sorting makes the dict order (and
# therefore the tie order of later frequency tables) reproducible.
for path in sorted(Path(annot_seq_dir).glob("*.TextGrid")):
    tg = textgrid.openTextgrid(path, includeEmptyIntervals=True)
    phone_tier = tg.getTier("phone")
    annot_seq[path.stem] = " ".join(x.label for x in phone_tier.entries)


In [3]:
# Attach the Kaldi sequences to every annotated utterance, turning
#   annot_seq[utt] -> "<phone string>"
# into
#   annot_seq[utt] -> {"annot": ..., "kaldi_post": ..., "kaldi_ori": ...}
# The cell rewrites annot_seq in place. Without the guard below, running it a
# second time would nest the already-built record under "annot" and break the
# cells that expect a string there.
for utt, annot_rzs in annot_seq.items():
    if isinstance(annot_rzs, dict):
        # Already merged by an earlier run of this cell; leave it untouched.
        continue
    annot_seq[utt] = {
        "annot": annot_rzs,
        "kaldi_post": utt_rzs[utt]["post"],
        "kaldi_ori": utt_rzs[utt]["ori"],
    }

In [4]:
from collections import Counter 
# Tally how often each distinct phone string occurs: once for the manual
# annotation and once for the "kaldi_post" sequence.
annot_freq, mfa_freq = Counter(), Counter()
for utt, item_x in annot_seq.items():
    annot_freq.update([item_x["annot"]])
    mfa_freq.update([item_x["kaldi_post"]])

# Number of utterances, followed by the five most frequent forms of each kind.
(len(annot_seq), annot_freq.most_common(5), mfa_freq.most_common(5))

(60,
 [('ɕ j. aɪ.', 9),
  ('ɕ j. e. n ts aɪ', 7),
  ('ɕ j. e.', 4),
  ('ɕ j. e. aɪ.', 4),
  ('ɕ j. a.', 3)],
 [('ɕ e n ts aj', 11),
  ('ɕ j a aj', 7),
  ('ɕ j a', 6),
  ('ɕ e n aj', 4),
  ('ɕ e aj', 4)])

In [17]:
import re
from difflib import SequenceMatcher
from collections import Counter

# Edit-operation statistics between the manual annotation (sequence a) and the
# "kaldi_post" phone sequence (sequence b), accumulated over all utterances.
del_freq, ins_freq = Counter(), Counter()  # token runs only in a / only in b
sub_map = {}  # token run in a -> list of the token runs in b that replaced it

for utt, item_x in annot_seq.items():
    seq_a = item_x["annot"].strip().split()
    seq_b = item_x["kaldi_post"].strip().split()
    # Tokenisation sanity check: no token may be a bare space.
    assert " " not in seq_a, seq_a
    m = SequenceMatcher(None, seq_a, seq_b)

    for act, ai, aj, bi, bj in m.get_opcodes():
        toks_a, toks_b = tuple(seq_a[ai:aj]), tuple(seq_b[bi:bj])
        # "equal" opcodes fall through every branch and are not recorded.
        if act == "replace":
            sub_map.setdefault(toks_a, []).append(toks_b)
        elif act == "insert":
            ins_freq[toks_b] += 1
        elif act == "delete":
            del_freq[toks_a] += 1


In [31]:
# Complete distribution of the manually annotated phone strings (every form,
# not only the most frequent ones).
Counter([entry["annot"] for entry in annot_seq.values()])

Counter({'ɕ j. aɪ.': 9,
         'ɕ j. e. n ts aɪ': 7,
         'ɕ j. e.': 4,
         'ɕ j. e. aɪ.': 4,
         'ɕ j. a.': 3,
         'ɕ ɪ. aɪ.': 2,
         'ɕ j eɪ.': 2,
         'ɕ ɪ. n ts a': 2,
         'ʑ j. a.': 2,
         'ɕ j e n ts aɪ': 2,
         'ɕ j. e. n': 2,
         'ɕ n̩ aɪ': 1,
         'ɕ ɪ. n d. aɪ': 1,
         'ɕ ɪ. n a': 1,
         'ɕ j e. n d. aɪ': 1,
         'pɕ aɪ.': 1,
         'e. a.': 1,
         'ɕ eɪ.': 1,
         'ɕ ɪ n ts aɪ': 1,
         'ɕ e. n ts aɪ': 1,
         'ts e. w': 1,
         'ɕ j. e. n aɪ': 1,
         'ɕ ɪ. n ts. a': 1,
         'ts ɨ. a.': 1,
         'ɕ j. e. a. b.': 1,
         'ɕ j. e. n ts. aɪ.': 1,
         'ɕ j e. aɪ.': 1,
         'ɕ j. eɪ.': 1,
         'ɕ j e. n aɪ.': 1,
         'ɕ i. n a w': 1,
         'ɕ ɪ. n ts aɪ': 1,
         'ɕ j a': 1})