In [1]:
import re
from utils import apply

In [2]:
# Consonant phonemes that, when they directly follow a THOUGHT vowel, make
# that vowel a candidate for the CLOTH rewrite.
cloth_environs = tuple("TH F S NG G K N SH".split())
# The THOUGHT vowel with each stress digit (0, 1, 2) appended.
thought = tuple(f"AO{stress}" for stress in range(3))

In [5]:
# Words whose existing pronunciation is just wrong: the value is the complete
# replacement pronunciation for the word used as key.
overrides = {
    "BLOGOSPHERE": "B L OO1 G AX S F IH1 R",
    "OCCASION": "AX K EY1 ZH AX N",
    "OCCASIONAL": "AX K EY1 ZH AX N AX L",
    "QUASH": "K W AA1 SH",
}

# Words that must be left untouched even if they look like candidates.
exclude = [
    "COAUTHOR",
]

def change_to_cloth_vowel(word, pronun, dictionary):
    """Rewrite THOUGHT vowels (``AO``) as CLOTH vowels (``OO``) where they qualify.

    An ``AO0`` / ``AO1`` / ``AO2`` phoneme is rewritten to ``OO`` with the same
    stress digit when both of these hold:

    * the phoneme immediately after it is one of ``cloth_environs``;
    * its position in the phoneme list lies within ``leeway`` of the position
      of a letter that can spell the vowel: an ``O``, or the ``A`` of ``WA`` /
      ``QUA`` when that ``A`` is not followed by ``U``, ``W`` or ``L``.

    Each occurrence is judged on its own. An ``AO`` that is not followed by a
    CLOTH consonant (for example ``AO1 R``) is never rewritten, even if another
    ``AO`` with the same stress elsewhere in the word is.

    Args:
        word: Spelling of the entry. The letter patterns and the keys of
            ``overrides`` / ``exclude`` are upper-case, so an upper-case
            spelling is expected.
        pronun: Pronunciation as whitespace-separated phoneme symbols.
        dictionary: Not used here; accepted because the function is handed to
            ``apply`` as a callback (presumably a fixed callback signature --
            confirm against ``utils.apply``).

    Returns:
        The replacement pronunciation (``str``) when the word has an override
        or at least one vowel was rewritten; a rewritten pronunciation is
        joined with single spaces. ``True`` when nothing needs to change
        (presumably "keep the entry as is" for ``apply`` -- confirm against
        ``utils.apply``).
    """
    if word in overrides:
        return overrides[word]

    if word in exclude:
        return True

    # Replace digraphs with single letters so that letter positions line up
    # more reliably with phoneme positions.
    word_with_better_letters = (
        word.replace("SH", "ʃ").replace("TH", "θ").replace("WH", "ʍ").replace("CH", "ʒ")
    )
    # Positions of the letters that can spell the vowel; the offset points at
    # the "A" inside the "WA" / "QUA" matches.
    o_indices = []
    for combo, offset in (("O", 0), ("WA(?![UWL])", 1), ("QUA(?![UWL])", 2)):
        o_indices += [m.start() + offset for m in re.finditer(combo, word_with_better_letters)]

    if not o_indices:  # no suitable letters found
        return True

    # Work on phoneme tokens rather than on the raw string so that exactly the
    # qualifying occurrences are inspected and rewritten.
    # NOTE(review): an earlier draft skipped the "SH" environment; it is
    # currently treated like every other entry of ``cloth_environs``.
    phonemes = pronun.split()
    candidates = [
        index
        for index, (phoneme, following) in enumerate(zip(phonemes, phonemes[1:]))
        if phoneme in thought and following in cloth_environs
    ]
    if not candidates:
        return True

    # If there is more than one candidate AO, be stricter so that they are not
    # matched against each other's letters.
    leeway = 2 if len(candidates) == 1 else 1

    changed_anything = False
    for ao_index in candidates:
        if min(abs(o_index - ao_index) for o_index in o_indices) > leeway:
            # No O/WA/QUA letter close enough: probably spelled differently.
            continue
        stress = phonemes[ao_index][-1]
        phonemes[ao_index] = f"OO{stress}"
        changed_anything = True
    if not changed_anything:
        return True

    new_pronun = " ".join(phonemes)
    print(f"change '{word.lower().replace('o', 'O')}', new pronun: '{new_pronun.lower().replace('oo', 'OO')}'")
    return new_pronun

In [6]:
# Run the CLOTH-vowel rewrite through the project helper. The recorded output
# shows one "change ..." line per rewritten word and per-file summaries such
# as "changed 25 entries in E.yaml".
apply(
    change_to_cloth_vowel,
    mode="transform",
    only_first_file=False,
    save_result=True,
)

change 'abendrOth', new pronun: 'ae1 b ih0 n d r OO0 th'
change 'absecOn', new pronun: 'ae1 b s ax k OO0 n'
change 'accOst', new pronun: 'ax k OO1 s t'
change 'accOsted', new pronun: 'ax k OO1 s t ih0 d'
change 'acOg', new pronun: 'ax k OO1 g'
change 'acOsta', new pronun: 'ax k OO1 s t ax'
change 'acrOss', new pronun: 'ax k r OO1 s'
change 'adOn', new pronun: 'aa0 d OO1 n'
change 'agOstinO', new pronun: 'aa0 g OO0 s t iy1 n ow0'
change 'ahOskie', new pronun: 'ax hh OO1 s k iy0'
change 'airlOck', new pronun: 'eh1 r l OO2 k'
change 'alfOnse', new pronun: 'ae1 l f OO0 n s'
change 'alfOnsO', new pronun: 'ae2 l f OO1 n s ow0'
change 'alfOnzO', new pronun: 'ae2 l f OO1 n z ow0'
change 'allbrittOn', new pronun: 'ao1 l b r ih0 t OO0 n'
change 'alOft', new pronun: 'ax l OO1 f t'
change 'alOng', new pronun: 'ax l OO1 ng'
change 'alOngside', new pronun: 'ax l OO1 ng s ay1 d'
change 'altamOnt', new pronun: 'aa1 l t ax m OO2 n t'
change 'alteOn', new pronun: 'aa1 l t iy0 OO0 n'
change 'althOff', ne

change 'cOffeehOuse', new pronun: 'k OO1 f iy0 hh aw2 s'
change 'cOffeehOuses', new pronun: 'k OO1 f iy0 hh aw2 s ih0 z'
change 'cOffer', new pronun: 'k OO1 f axr'
change 'cOffin', new pronun: 'k OO1 f ih0 n'
change 'cOffing', new pronun: 'k OO1 f ih0 ng'
change 'cOffman', new pronun: 'k OO1 f m ax n'
change 'cOg', new pronun: 'k OO1 g'
change 'cOgnOscenti', new pronun: 'k aa2 g n OO0 s eh1 n t iy0'
change 'cOintreau', new pronun: 'k OO2 n t r ow1'
change 'cOlOn', new pronun: 'k ow1 l OO0 n'
change 'cOlOssal', new pronun: 'k ax l OO1 s ax l'
change 'cOlOssus', new pronun: 'k ax l OO1 s ax s'
change 'cOmpOstela', new pronun: 'k ao2 m p OO0 s t eh1 l ax'
change 'cOncentrate', new pronun: 'k OO1 n s ax n t r ey2 t'
change 'cOncentrated', new pronun: 'k OO1 n s ax n t r ey2 t ih0 d'
change 'cOncentrating', new pronun: 'k OO1 n s ax n t r ey2 t ih0 ng'
change 'cOncentratiOn', new pronun: 'k OO2 n s ax n t r ey1 sh ax n'
change 'cOng', new pronun: 'k OO1 ng'
change 'cOnga', new pronun: 'k OO

change 'eOff', new pronun: 'ey1 OO0 f'
change 'eOn', new pronun: 'iy1 OO0 n'
change 'eOs', new pronun: 'iy1 OO0 s'
change 'eOsinOphilia', new pronun: 'iy2 OO0 s ih2 n ax f ih1 l iy0 ax'
change 'eOsinOphilic', new pronun: 'iy2 OO0 s ih2 n ax f ih1 l ih0 k'
change 'epilOgue', new pronun: 'eh2 p ih0 l OO1 g'
change 'epOnymOus', new pronun: 'ih0 p OO1 n ih0 m ax s'
change 'eriOn', new pronun: 'ih0 r iy0 OO1 n'
change 'evanOff', new pronun: 'eh1 v ax n OO2 f'
change 'exOn', new pronun: 'eh2 k s OO1 n'
changed 25 entries in E.yaml
change 'falkOwski', new pronun: 'f ax l k OO1 f s k iy0'
change 'fallOff', new pronun: 'f ao1 l OO2 f'
change 'fearOn', new pronun: 'f ih1 r OO0 n'
change 'firefOx', new pronun: 'f ay1 axr f OO2 k s'
change 'flintOff', new pronun: 'f l ih1 n t OO0 f'
change 'flOssie', new pronun: 'f l OO1 s iy0'
change 'fOg', new pronun: 'f OO1 g'
change 'fOgarty', new pronun: 'f OO1 g aa2 r t iy0'
change 'fOndle', new pronun: 'f OO1 n d ax l'
change 'fOndled', new pronun: 'f OO1 n

change 'jankOwski', new pronun: 'y ax ng k OO1 f s k iy0'
change 'janOff', new pronun: 'jh ae1 n OO0 f'
change 'janOfsky', new pronun: 'jh ax n OO1 f s k iy0'
change 'janOwski', new pronun: 'y ax n OO1 f s k iy0'
change 'jarmOn', new pronun: 'y aa0 r m OO1 n'
change 'jOcelyn', new pronun: 'jh OO1 s l ih2 n'
change 'jOcelyne', new pronun: 'jh OO1 s l ih2 n'
change 'jOcOse', new pronun: 'jh OO2 k ow1 z'
change 'jOffrey', new pronun: 'jh OO1 f r iy0'
change 'jOg', new pronun: 'jh OO1 g'
change 'jOgged', new pronun: 'jh OO1 g d'
change 'jOgger', new pronun: 'jh OO1 g axr'
change 'jOgging', new pronun: 'jh OO1 g ih0 ng'
change 'jOnbenet', new pronun: 'jh OO1 n b ax n ey1'
change 'jOnbenet's', new pronun: 'jh OO1 n b ax n ey1 ' s'
change 'jOng', new pronun: 'jh OO1 ng'
change 'jOshua', new pronun: 'jh OO1 sh uw0 ax'
change 'jOshua's', new pronun: 'jh OO1 sh uw2 ax ' z'
change 'jOske's', new pronun: 'jh OO1 s k iy0 ' z'
change 'jOslyn', new pronun: 'jh OO1 s l ih0 n'
change 'jOss', new pronun

change 'madOff', new pronun: 'm ae1 d OO2 f'
change 'mahlOn', new pronun: 'm ey1 l OO2 n'
change 'makOwski', new pronun: 'm ax k OO1 f s k iy0'
change 'malathiOn', new pronun: 'm ax l ae1 th iy0 OO0 n'
change 'malinOwski', new pronun: 'm ax l ih0 n OO1 f s k iy0'
change 'malkOwski', new pronun: 'm ax l k OO1 f s k iy0'
change 'maniOn', new pronun: 'm aa0 n y OO1 n'
change 'mankOwski', new pronun: 'm ax ng k OO1 f s k iy0'
change 'manOff', new pronun: 'm ae1 n OO0 f'
change 'manOn', new pronun: 'm aa0 n OO1 n'
change 'marcinkOwski', new pronun: 'm axr ch ih0 ng k OO1 f s k iy0'
change 'markOff', new pronun: 'm aa1 r k OO2 f'
change 'markOwski', new pronun: 'm axr k OO1 f s k iy0'
change 'marOc', new pronun: 'm aa2 r OO1 k'
change 'marOn', new pronun: 'm aa0 r OO1 n'
change 'maslOwski', new pronun: 'm ax s l OO1 f s k iy0'
change 'mastOdOn', new pronun: 'm ae1 s t ax d OO2 n'
change 'matalOn', new pronun: 'm aa0 t aa0 l OO1 n'
change 'mayOn', new pronun: 'm ey0 OO1 n'
change 'mazOn', new

change 'OstentatiOn', new pronun: 'OO2 s t eh0 n t ey1 sh ax n'
change 'OsteOpOrOsis', new pronun: 'OO2 s t iy0 aa2 p axr ow1 s ih0 s'
change 'Osterreichische', new pronun: 'OO1 s t axr r ay2 k ih0 sh iy0'
change 'OsthOff', new pronun: 'aa1 s t hh OO0 f'
change 'Ostracism', new pronun: 'OO1 s t r ax s ih2 z ax m'
change 'Ostracize', new pronun: 'OO1 s t r ax s ay2 z'
change 'Ostracized', new pronun: 'OO1 s t r ax s ay2 z d'
change 'Ostrich', new pronun: 'OO1 s t r ih0 ch'
change 'OstrOff', new pronun: 'OO1 s t r OO0 f'
change 'OstrOwski', new pronun: 'ax s t r OO1 f s k iy0'
change 'OwOssO', new pronun: 'ow0 OO1 s ow0'
changed 75 entries in O.yaml
change 'pabOn', new pronun: 'p aa0 b OO1 n'
change 'paczkOwski', new pronun: 'p ax ch k OO1 f s k iy0'
change 'papOn', new pronun: 'p ae1 p OO0 n'
change 'pawlikOwski', new pronun: 'p aa0 v l ih0 k OO1 f s k iy0'
change 'pawlOwski', new pronun: 'p aa0 v l OO1 f s k iy0'
change 'payOff', new pronun: 'p ey1 OO2 f'
change 'pentecOst', new pronun

change 'salOnga', new pronun: 's ax l OO1 ng g ax'
change 'sandOn', new pronun: 's ae1 n d OO2 n'
change 'saratOv', new pronun: 's eh1 r ax t OO2 f'
change 'sarnOff', new pronun: 's aa1 r n OO0 f'
change 'sarOng', new pronun: 's axr OO1 ng'
change 'saskatchewan', new pronun: 's ae0 s k ae1 ch ax w OO2 n'
change 'saviOn', new pronun: 's ae1 v iy0 OO0 n'
change 'savOn', new pronun: 's ey1 v OO0 n'
change 'schlOss', new pronun: 'sh l OO1 s'
change 'schlOssberg', new pronun: 'sh l OO1 s b axr g'
change 'schlOsser', new pronun: 'sh l OO1 s axr'
change 'schlOssman', new pronun: 'sh l OO1 s m ax n'
change 'schOff', new pronun: 'sh OO1 f'
change 'schOffstall', new pronun: 'sh OO1 f s t ax l'
change 'schrOff', new pronun: 'sh r OO1 f'
change 'schrOth', new pronun: 'sh r OO1 th'
change 'schwan', new pronun: 'sh w OO1 n'
change 'schwandt', new pronun: 'sh w OO1 n t'
change 'schwanz', new pronun: 'sh w OO1 n s'
change 'scOff', new pronun: 's k OO1 f'
change 'scOwcrOft', new pronun: 's k ow1 k r OO

change 'wander', new pronun: 'w OO1 n d axr'
change 'wanderer', new pronun: 'w OO1 n d axr axr'
change 'wanderlust', new pronun: 'w OO1 n d axr l ah2 s t'
change 'wangerin', new pronun: 'w OO1 ng g axr ih0 n'
change 'wansley', new pronun: 'w OO1 n z l iy0'
change 'want', new pronun: 'w OO1 n t'
change 'wanted', new pronun: 'w OO1 n t ih0 d'
change 'wantOn', new pronun: 'w OO1 n t ax n'
change 'warlOck', new pronun: 'w ao1 r l OO2 k'
change 'wascO', new pronun: 'w OO1 s k ow0'
change 'wash', new pronun: 'w OO1 sh'
change 'washable', new pronun: 'w OO1 sh ax b ax l'
change 'washbasin', new pronun: 'w OO1 sh b ey2 s ax n'
change 'washbOard', new pronun: 'w OO1 sh b ao2 r d'
change 'washburn', new pronun: 'w OO1 sh b er2 n'
change 'washburne', new pronun: 'w OO1 sh b er2 n'
change 'washclOth', new pronun: 'w OO1 sh k l OO2 th'
change 'washer', new pronun: 'w OO1 sh axr'
change 'washes', new pronun: 'w OO1 sh ih0 z'
change 'washingtOn', new pronun: 'w OO1 sh ih0 ng t ax n'
change 'washingtO

In [None]:
# Scratch list of words to remove; not referenced by any other cell visible here.
to_remove = "\ncoutant\n"

In [118]:
# Scratch input for trying change_to_cloth_vowel by hand.
# Alternative input: w = 'xBLOGOSPHERE', p = 'B L AO1 G AO0 S F IH1 R'
w, p = 'AWFOFF', 'AO1 F AO2 F'

In [119]:
# NOTE(review): the output recorded under this cell contains messages the
# function defined above does not print, so it comes from an older definition;
# re-run after a kernel restart to refresh it.
change_to_cloth_vowel(word=w, pronun=p, dictionary={})

probably no candiate 'AWFOFF', indices: [3], 0
possibly a candiate  'AWFOFF', new pronun: 'AO1 F OA2 F'


True

In [103]:
# Scratch check: start offset of every 'o' in 'blogo'.
[hit.span()[0] for hit in re.finditer('o', 'blogo')]

[2, 4]

In [8]:
# Scratch check: the negative lookahead rejects the "WA" that is followed by "U".
[hit.span()[0] for hit in re.finditer("WA(?![UW])", "WAUSHWA")]

[5]

In [21]:
# Scratch check: a double-quoted f-string can embed a single-quoted literal.
f"{'ds'}"

'ds'