In [1]:
%load_ext watermark

In [2]:
from pathlib import Path
import re
from itertools import product
from typing import List

## Read reference dict

In [3]:
base_dir = Path("/Users/seantyh/Documents/MFA/pretrained_models/dictionary/")
ref_dict = base_dir / "mandarin_taiwan_mfa.dict"

In [4]:
pdict_list = Path(ref_dict).read_text()
pdict_list = pdict_list.strip().split("\n")
pdict_list = [x.split("\t") for x in pdict_list]
pdict_list = [x for x in pdict_list if len(x)>0]

In [5]:
pdict_list[:5]

[['<eps>', '1.0', '0.0', '0.0', '0.0', 'sil'],
 ['<unk>', '0.99', '0.27', '1.86', '0.86', 'spn'],
 ['㐌', '0.99', '0.26', '1.0', '1.0', 'i˧˥'],
 ['㐖', '0.99', '0.26', '1.0', '1.0', 'ɕ j e˧˥'],
 ['㐖毒', '0.99', '0.26', '1.0', '1.0', 'ɕ j e˧˥ t u˧˥']]

## Set mapping rules

In [6]:
mappings = {
    "ʈʂ ʐ̩": "ts z̩",
    "ʈʂʰ ʐ̩": "tsʰ z̩",
    "ʂ ʐ̩": "s z̩",
    "ts z̩": "ʈʂ ʐ̩",
    "tsʰ z̩": "ʈʂʰ ʐ̩",
    "s z̩": "ʂ ʐ̩",
    "ʈʂ ": "ts ",
    "ʈʂʰ ": "tsʰ ",
    "ʂ ": "s ",    
    "ts ": "ʈʂ ",
    "tsʰ ": "ʈʂʰ ",
    "s ": "ʂ "
}

pat = re.compile("|".join(mappings.keys()))

In [7]:
tone_pat = re.compile("[\u02e5-\u02e9]+")
def find_onsets(pho_toks: List[str]):
    nuclueus_idxs = []
    onset_idxs = []
    for i, phone_x in enumerate(pho_toks):
        if tone_pat.findall(phone_x):
            nuclueus_idxs.append(i)
        else:
            continue
        
        # test previdx is not coda or another nucleus
        if i-1 < 0: continue
        if i-1 in nuclueus_idxs: continue
        if pho_toks[i-1] in "jwɥ": 
            # it may be a glide. Try the i-2 idx
            if (i-2 < 0 
                or i-2 in nuclueus_idxs
                or pho_toks[i-2] in "nŋɻ"):
                # the glide is the onset, like /ɥ e˧˥ n/
                onset_idxs.append(i-1)
            else:
                # there is a consonant before the glide. It is the onset.
                onset_idxs.append(i-2)
        elif pho_toks[i-1] not in "nŋɻ":
            onset_idxs.append(i-1)
        else:
            # there is no onset
            pass
    return onset_idxs
# find_onsets("ʈʂ ə˥˥ n ʂ ʐ̩˧˥".split())
find_onsets("tɕʰ j e˧˥ n".split())

[0]

### Find onset tests

In [8]:
assert find_onsets("tɕʰ j e˧˥ n".split()) == [0]
assert find_onsets("ʈʂ ə˥˥ n ʂ ʐ̩˧˥".split()) == [0,3]
assert find_onsets("tʰ aj˥˩ ɕ ɥ e˧˥ n".split()) == [0,2]

In [9]:
def make_deletion_variants(ori_pron: str, debug=False):
    pho_toks = ori_pron.split(" ")
    onset_idxs = find_onsets(pho_toks)
    variants = []
    _print = print if debug else lambda *x: ...
    _print("Origin:", ori_pron)

    for comb in product(*[[0, 1]]*len(onset_idxs)):                    
        var_toks = list(pho_toks)
        for idx, flag in enumerate(comb):            
            if flag == 1:
                var_toks[onset_idxs[idx]] = ""
        var_toks = [x for x in var_toks if x]
        var_pron = " ".join(var_toks)
        _print(comb, var_pron)
        variants.append(var_pron)
    return variants

In [10]:
make_deletion_variants("tɕʰ j e˧˥ n", debug=True)

Origin: tɕʰ j e˧˥ n
(0,) tɕʰ j e˧˥ n
(1,) j e˧˥ n


['tɕʰ j e˧˥ n', 'j e˧˥ n']

In [11]:
make_deletion_variants("ʈʂ ə˥˥ n ʂ ʐ̩˧˥", debug=True)

Origin: ʈʂ ə˥˥ n ʂ ʐ̩˧˥
(0, 0) ʈʂ ə˥˥ n ʂ ʐ̩˧˥
(0, 1) ʈʂ ə˥˥ n ʐ̩˧˥
(1, 0) ə˥˥ n ʂ ʐ̩˧˥
(1, 1) ə˥˥ n ʐ̩˧˥


['ʈʂ ə˥˥ n ʂ ʐ̩˧˥', 'ʈʂ ə˥˥ n ʐ̩˧˥', 'ə˥˥ n ʂ ʐ̩˧˥', 'ə˥˥ n ʐ̩˧˥']

In [12]:
def make_retro_variants(ori_pron: str, matches: List[re.Match], debug=False):
    spans = [x.span() for x in matches] 
    variants = []
    _print = print if debug else lambda *x: ...
    _print("Origin:", ori_pron)

    for comb in product(*[[0, 1]]*len(matches)):    
        # first segment before first match
        new_pron = ori_pron[:spans[0][0]]
        for idx, flag in enumerate(comb):
            ori_phone = matches[idx].group()
            if flag == 0:
                new_pron += ori_phone
            else:
                new_pron += mappings[ori_phone]
            if idx < len(comb)-1:
                # there is a following match
                new_pron += ori_pron[spans[idx][1]:spans[idx+1][0]]
            else:
                # this is the last match
                new_pron += ori_pron[spans[idx][1]:]
        _print(comb, new_pron)
        variants.append(new_pron)
    return variants

## Main Loop

In [13]:
vardict_list = []
for item_x in pdict_list:    
    ori_pron = item_x[5]
    matches = list(pat.finditer(ori_pron))
    n_matches = len(matches)                  
    if n_matches == 0:
        onsetdel_variants = make_deletion_variants(ori_pron)
        variants = onsetdel_variants
    else:
        retro_variants = make_retro_variants(ori_pron, matches)
        onsetdel_variants = make_deletion_variants(ori_pron)
        variants = retro_variants + onsetdel_variants[1:]  # the first element is always the original one
    word = item_x[0]
    ori_prior = float(item_x[1])
    pron_prior = str(round(ori_prior/len(variants),2))
    sil_weight = str(0.5)
    vardict_list.extend([word, pron_prior, sil_weight, sil_weight, sil_weight, var_x]
                         for var_x in variants)

In [14]:
len(pdict_list), len(vardict_list)

(82661, 616597)

## Write file

In [15]:

var_dict_path = base_dir / "mandarin_taiwan_mfa_redelvar.dict"
with var_dict_path.open("w") as f:
    for item_x in vardict_list:
        f.write("\t".join(item_x))
        f.write("\n")

In [16]:
!cd {var_dict_path.parent} && sha1sum {var_dict_path.name}

8df503bbfa3770b629e2c0a9401bfdc2e4cf4ddf  mandarin_taiwan_mfa_redelvar.dict


## Watermarks

In [17]:
%watermark

Last updated: 2023-05-15T11:26:52.023084+02:00

Python implementation: CPython
Python version       : 3.10.10
IPython version      : 8.12.0

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.1.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [18]:
%watermark --iversions

re: 2.2.1

