In [1]:
from __future__ import annotations
from typing import Union, List, Optional, Dict, Tuple

In [2]:
from pathlib import Path
from collections import OrderedDict
from strictyaml import load, MapPattern, Str, Seq

In [3]:
# Validation schema handed to strictyaml's load(): the document is a mapping
# with string keys, and each value is one of
#   * a single string,
#   * a mapping of string -> string, or
#   * a sequence of strings.
schema = MapPattern(
    Str(),
    Str() | MapPattern(Str(), Str()) | Seq(Str()),
)

In [4]:
def read_yaml_file(fpath: Path):
    """Read the YAML file at ``fpath`` and parse it against ``schema``.

    Args:
        fpath: Path of the YAML file to read.

    Returns:
        The object produced by strictyaml's ``load`` for the file's text.
        Callers in this notebook use its ``.data`` attribute, item deletion
        and ``.as_yaml()``.
    """
    # read_text() opens, reads and closes the file in one call; the earlier
    # fpath.open().read() left the handle open until it was garbage-collected.
    raw_data = fpath.read_text()
    return load(raw_data, schema)

In [5]:
# File names processed by this notebook: "A.yaml" through "Z.yaml" in
# alphabetical order, followed by "_other.yaml".
YAML_FILES: List[str] = [
    f"{chr(code_point)}.yaml" for code_point in range(ord("A"), ord("Z") + 1)
] + ["_other.yaml"]

In [6]:
def check_existence(dictionary, word, pronunciation):
    """Tell whether ``word`` is listed in ``dictionary`` with ``pronunciation``.

    The value stored for a word may be a single pronunciation string, a list
    of pronunciation strings, or an ``OrderedDict`` whose values are
    pronunciation strings (its keys are ignored here).

    Args:
        dictionary: Mapping from word to one of the value shapes above.
        word: Key to look up.
        pronunciation: Pronunciation string to search for.

    Returns:
        True if any pronunciation stored for ``word`` equals
        ``pronunciation``; False if the word is missing (or maps to None) or
        no stored pronunciation matches.

    Raises:
        ValueError: If the stored value is none of the supported types.
    """
    value = dictionary.get(word)
    if value is None:
        return False

    # Normalise the three supported shapes into one iterable of candidates.
    if isinstance(value, str):
        candidates = [value]
    elif isinstance(value, list):
        candidates = value
    elif isinstance(value, OrderedDict):
        candidates = value.values()
    else:
        raise ValueError(f"unexpected type: {type(value)}")

    return any(pronunciation == candidate for candidate in candidates)

In [48]:
def is_regular_form(dictionary, word, pronun):
    """Decide whether ``word`` is a regularly inflected form of another entry.

    Three patterns are recognised (spelling suffix -> final phoneme):

    * possessive:            ``'S`` -> ``S`` or ``Z``
    * plural / third person: ``S``  -> ``S`` or ``Z``
    * simple past:           ``ED`` -> ``D`` or ``T``

    The form counts as regular only if the base word (spelling suffix
    removed) exists in ``dictionary`` with the base pronunciation (final
    phoneme removed) and, for the endings ``S`` and ``T``, the last phoneme of
    the base is one of the consonants listed for that ending below.

    Args:
        dictionary: Mapping passed through to ``check_existence``.
        word: Spelling of the candidate; suffixes are matched in upper case.
        pronun: Phonemes of ``word`` separated by single spaces.

    Returns:
        True if ``word`` is a regular form of an existing base entry, False
        otherwise. When the base entry exists but the consonant check fails, a
        "not a regular form" line is printed before returning False.
    """
    ends_in_s_or_z = pronun.endswith((" S", " Z"))
    # The possessive test has to come first: anything ending in "'S" also ends
    # in "S" and would otherwise be treated as a plural.
    # endswith() is used instead of word[-1] so an empty word yields False
    # rather than raising IndexError.
    if word.endswith("'S") and ends_in_s_or_z:
        base_word = word[:-2]
    elif word.endswith("S") and ends_in_s_or_z:
        base_word = word[:-1]
    elif word.endswith("ED") and pronun.endswith((" D", " T")):
        base_word = word[:-2]
    else:
        return False

    # Every recognised ending is a space followed by a one-letter phoneme.
    base_pronun = pronun[:-2]
    if not check_existence(dictionary, base_word, base_pronun):
        return False

    # Phonemes that must precede the endings "S" and "T"; the endings "Z" and
    # "D" are accepted without this check.
    required_before = {
        "S": ("F", "K", "P", "T", "TH"),
        "T": ("F", "K", "P", "S", "SH", "CH", "TH"),
    }.get(pronun[-1])
    if required_before is not None:
        # Compare whole phonemes rather than a two-character slice, so a base
        # made of a single one-letter phoneme (e.g. "T" in "T S") is handled.
        last_base_phoneme = base_pronun.split(" ")[-1]
        if last_base_phoneme not in required_before:
            print(f"not a regular form: {word} ({pronun}) (base form: {base_word})")
            return False
    return True

In [49]:
def remove_regular_forms(yaml_dict):
    """Delete every entry of ``yaml_dict`` that has a regular-form pronunciation.

    An entry is deleted as soon as one of its pronunciations is accepted by
    ``is_regular_form``; its remaining pronunciations are not examined.

    Args:
        yaml_dict: Object exposing ``.data`` (a mapping of word to a string,
            a list of strings, or an ``OrderedDict`` of strings) and
            supporting ``del yaml_dict[word]``.

    Returns:
        The number of entries deleted.

    Raises:
        ValueError: If an entry's value is none of the supported types.
    """
    # Iteration and base-form lookups use the mapping obtained once here,
    # while deletions are applied to yaml_dict itself.
    # NOTE(review): this presumes `.data` is detached from yaml_dict, so that
    # deleting during the loop does not disturb the iteration - confirm.
    dictionary = yaml_dict.data
    removed = 0
    for word, value in dictionary.items():
        if isinstance(value, str):
            pronunciations = [value]
        elif isinstance(value, list):
            pronunciations = value
        elif isinstance(value, OrderedDict):
            pronunciations = list(value.values())
        else:
            raise ValueError(f"unexpected type: {type(value)}")

        # any() stops at the first regular pronunciation, so each word is
        # deleted (and counted) at most once.
        if any(is_regular_form(dictionary, word, candidate) for candidate in pronunciations):
            del yaml_dict[word]
            removed += 1
    return removed

In [50]:
# Rewrite each dictionary file in place with its regular forms removed.
base_path = Path(".") / "dictionary"
for yaml_file in YAML_FILES:
    yaml_path = base_path / yaml_file
    yaml_dict = read_yaml_file(yaml_path)
    num_removed = remove_regular_forms(yaml_dict)
    print(f"removed {num_removed} entries in {yaml_file}")
    # write_text() closes the file before returning, so the content is flushed
    # to disk; the earlier open("w").write(...) never closed the handle.
    yaml_path.write_text(yaml_dict.as_yaml())

not a regular form: AGNES (AE1 G N IH0 S) (base form: AGNE)
not a regular form: ALAMITOS (AE2 L AX M IY1 T OW0 S) (base form: ALAMITO)
not a regular form: ALMOS (AA1 L M OW0 S) (base form: ALMO)
not a regular form: ANAS (AE1 N AX S) (base form: ANA)
not a regular form: APARTHEID'S (AX P AA1 R T AY2 D S) (base form: APARTHEID)
not a regular form: ASTROS (AE1 S T R OW0 S) (base form: ASTRO)
not a regular form: ATLAS (AE1 T L AX S) (base form: ATLA)
removed 0 entries in A.yaml
not a regular form: BERETTAS (B AXR EH1 T AX S) (base form: BERETTA)
not a regular form: BIDCOS (B IH1 D K OW0 S) (base form: BIDCO)
not a regular form: BIMBOS (B IH1 M B OW0 S) (base form: BIMBO)
not a regular form: BODEGAS (B OW0 D EY1 G AX S) (base form: BODEGA)
not a regular form: BONS (B AA1 N S) (base form: BON)
not a regular form: BRAS (B R AA1 S) (base form: BRA)
not a regular form: BUENOS (B W EY1 N OW0 S) (base form: BUENO)
not a regular form: BURRITOS (B AXR IY1 T OW0 S) (base form: BURRITO)
removed 1 ent