In [1]:
import re
from utils import apply

In [2]:
# Preprocessing step, run in a shell: convert every AA to OH, except before R and at the end of a word.
# macOS:
# find . -name  '*.yaml' -exec sed -i '' -e "s/AA\([012]\) \([^R]\)/OH\1 \2/g" {} \;
# Linux:
# find . -name  '*.yaml' -exec sed -i -e "s/AA\([012]\) \([^R]\)/OH\1 \2/g" {} \;

In [3]:
# The OH phoneme with each stress digit (0, 1, 2); find_palm_vowel looks for these in a pronunciation.
lot = tuple(f"OH{stress_digit}" for stress_digit in "012")

In [9]:
# Hand-written pronunciations: find_palm_vowel returns these verbatim for the listed words.
overrides = {
    # Cases that cannot be detected automatically.
    "BOTSWANA": "B OH0 T S W AA1 N AX",

    # Entries that were simply wrong.
    "PHOTOCALL": "F OW1 T OW0 K AO2 L",
    "GENITALIA": "JH EH2 N AX T EY1 L Y AX",
    "BEACHBALL": "B IY1 CH B AO2 L",
    "BEFALLEN": "B AX F AO1 L AX N",
    "CAUSALITY": "K AO2 Z AE1 L IH0 T IY0",
    "CONSONANTAL": "K OH2 N S AX N AE1 N T AX L",
    "HALLMARK": "HH AO1 L M AA2 R K",
    "NAPALM": "N EY1 P AA2 M",
    "PAINTBALL": "P EY1 N T B AO2 L",
    # NOTE(review): the apostrophe is a separate token in this pronunciation — confirm that is intended.
    "YA'LL": "Y AO1 ' L",
}

# Words for which find_palm_vowel returns True without inspecting the pronunciation.
exclude = [
    "BAROMETER",
    "PHARMACOLOGIST",
    "PHARMACOLOGY",
    "WORKAHOLIC",
    "YACHT",
    "YACHTSMAN",
]

def _candidate_a_indices(word):
    """Return the letter positions of every "A" in ``word`` that is a replacement candidate.

    The digraphs SH, TH, WH and CH are first collapsed to single placeholder
    characters so that letter positions line up more closely with phoneme
    positions. An "A" counts only if it is not preceded by O, E, U, W or WH
    and not followed by O, I, E, U or W.
    """
    collapsed = word.replace("SH", "ʃ").replace("TH", "θ").replace("WH", "ʍ").replace("CH", "ʒ")
    # (pattern, offset) pairs; the offset is added to the match start so the index points at the "A".
    # Currently disabled alternatives: ("WA(?![UWL])", 1), ("QUA(?![UWL])", 2)
    patterns = (("(?<![OEUWʍ])A(?![OIEUW])", 0),)
    return [
        match.start() + offset
        for pattern, offset in patterns
        for match in re.finditer(pattern, collapsed)
    ]


def find_palm_vowel(word, pronun, dictionary):
    """Turn OH phonemes back into AA where they line up with a letter "A" in the spelling.

    Args:
        word: The upper-case headword.
        pronun: Its pronunciation as whitespace-separated phoneme tokens.
        dictionary: Unused here; kept because the caller passes it.

    Returns:
        The pronunciation to store (a ``str``) when the word has an entry in
        ``overrides`` or when at least one phoneme was changed; otherwise
        ``True``.
        NOTE(review): ``True`` appears to mean "leave the entry as it is" to
        ``utils.apply`` — confirm against that helper.

    Each occurrence of an OH phoneme is judged on its own position. Previously
    only the first occurrence of a given stress was position-checked and then
    *all* occurrences were replaced, so a repeated vowel could be changed (or
    kept) in the wrong place. Whole tokens are matched, so a vowel name that
    merely appears inside a longer token no longer raises ``ValueError``.
    """
    if word in overrides:
        return overrides[word]

    if word in exclude:
        return True

    a_indices = _candidate_a_indices(word)
    if not a_indices:  # no suitable letters found
        return True

    # Maximum allowed distance between a letter index and a phoneme index.
    # NOTE(review): this used to read `1 if <exactly one vowel> else 1`, i.e. both
    # branches were 1; a stricter value for words with several vowels may have been
    # intended — confirm before tightening.
    leeway = 1

    # Keep each token's span so that the original whitespace survives the rewrite.
    phonemes = list(re.finditer(r"\S+", pronun))
    edits = []
    for vowel in lot:
        stress = vowel[-1]
        for sound_index, phoneme in enumerate(phonemes):
            if phoneme.group() != vowel:
                continue
            if min(abs(a_index - sound_index) for a_index in a_indices) > leeway:
                print(f"probably no candiate '{word}', letter indices: {a_indices}, sound index: {sound_index}")
                continue
            edits.append((phoneme.start(), phoneme.end(), f"AA{stress}"))

    if not edits:
        return True

    # Apply right-to-left so earlier spans stay valid even if token lengths differ.
    new_pronun = pronun
    for start, end, replacement in sorted(edits, reverse=True):
        new_pronun = new_pronun[:start] + replacement + new_pronun[end:]

    print(f"change '{word.lower().replace('a', 'A')}', new pronun: '{new_pronun.lower().replace('aa', 'AA')}'")
    return new_pronun

In [10]:
# Run find_palm_vowel through the project helper `apply` from utils.
# NOTE(review): save_result=True presumably writes the changed pronunciations back to disk — confirm in utils.apply.
apply(find_palm_vowel, save_result=True, only_first_file=False, mode="transform")

change 'AAchen', new pronun: 'AA1 k ax n'
change 'AArhus', new pronun: 'AA2 hh uw1 s'
change 'AbbAs', new pronun: 'ax b AA1 s'
probably no candiate 'ABDOMINAL', letter indices: [0, 7], sound index: 3
change 'AbidjAn', new pronun: 'ae0 b ih0 jh AA1 n'
change 'AbkhAziA', new pronun: 'ae0 b k AA1 z y ax'
change 'AbkhAziAn', new pronun: 'ae0 b k AA1 z iy0 ax n'
change 'Abo', new pronun: 'AA1 b ow0'
probably no candiate 'ABOLISH', letter indices: [0], sound index: 2
probably no candiate 'ABOLISHES', letter indices: [0], sound index: 2
probably no candiate 'ABOMINABLE', letter indices: [0, 6], sound index: 2
probably no candiate 'ABOMINATION', letter indices: [0, 6], sound index: 2
probably no candiate 'ABSCOND', letter indices: [0], sound index: 4
probably no candiate 'ABSCONDED', letter indices: [0], sound index: 4
probably no candiate 'ABSOLVE', letter indices: [0], sound index: 3
probably no candiate 'ABSOLVED', letter indices: [0], sound index: 3
probably no candiate 'ABSOLVING', letter

In [17]:
# Scratch check of the lookarounds: both A's in "WAUSHWA" directly follow a W, so nothing matches.
[match.start() for match in re.finditer("(?<![W])A(?![UW])", "WAUSHWA")]

[]

In [None]:
# TODO: fix the pronunciations of these entries:
# ENTOURAGE:
# - OH2 N T UH0 R OH1 ZH
# ENTOURAGES:
# - OH2 N T UH0 R OH1 ZH IH0 Z