In [1]:
from __future__ import annotations
from typing import Union, List, Optional, Dict, Tuple

In [2]:
from pathlib import Path
from collections import OrderedDict
from strictyaml import load, MapPattern, Str, Seq

In [3]:
# Validation schema handed to strictyaml's load(): a mapping with string keys
# (the words) whose values take one of three shapes -- a single string, a
# nested string-to-string mapping, or a sequence of strings.
schema = MapPattern(Str(), Str() | MapPattern(Str(), Str()) | Seq(Str()))

In [4]:
def read_yaml_file(fpath: Path):
    """Read the YAML file at ``fpath`` and validate it against ``schema``.

    Args:
        fpath: Location of the YAML file to read.

    Returns:
        The object produced by ``load`` for the file's text and the
        module-level ``schema``.
    """
    # read_text() opens, reads and closes the file in one call, so no file
    # handle is left dangling the way a bare ``open().read()`` leaves one.
    raw_data = fpath.read_text()
    return load(raw_data, schema)

In [5]:
# File names processed by this notebook: "A.yaml" through "Z.yaml" in
# alphabetical order, followed by "_other.yaml".
YAML_FILES: List[str] = [
    *(f"{letter}.yaml" for letter in map(chr, range(ord("A"), ord("Z") + 1))),
    "_other.yaml",
]

In [6]:
def check_existence(dictionary, word, pronunciation):
    """Tell whether ``word`` is listed in ``dictionary`` with ``pronunciation``.

    An entry may be a single pronunciation string, a list of pronunciation
    strings, or an ``OrderedDict`` whose values are pronunciation strings.

    Args:
        dictionary: Mapping from word to its entry.
        word: Key to look up.
        pronunciation: Pronunciation that must equal one of the entry's
            pronunciations.

    Returns:
        ``True`` if the word is present and one of its pronunciations equals
        ``pronunciation``; ``False`` if the word is missing or nothing matches.

    Raises:
        ValueError: If the entry is of any other type.
    """
    entry = dictionary.get(word)
    if entry is None:
        return False

    # Normalise the three supported entry shapes into one iterable of candidates.
    if isinstance(entry, str):
        candidates = (entry,)
    elif isinstance(entry, list):
        candidates = entry
    elif isinstance(entry, OrderedDict):
        candidates = entry.values()
    else:
        raise ValueError(f"unexpected type: {type(entry)}")

    for candidate in candidates:
        if pronunciation == candidate:
            return True
    return False

In [16]:
def is_regular_form(dictionary, word, pronun):
    """Report whether ``word`` looks like a regular inflection of another entry.

    The word/pronunciation pair is matched against a fixed list of suffix
    patterns (possessive, plural / third person, simple past, "-ing").  When a
    pattern matches, the suffix is stripped from both the spelling and the
    pronunciation and the remainder is looked up with ``check_existence``.

    Args:
        dictionary: Mapping from word to its pronunciation(s), in the shape
            ``check_existence`` accepts.
        word: Spelling to test.  The suffix patterns are upper-case; an empty
            string is accepted and simply matches nothing.
        pronun: Space-separated phoneme string for ``word``.

    Returns:
        ``True`` if a pattern matched, the base form exists with the stripped
        pronunciation, and the final consonant checks pass; ``False`` otherwise.

    Side effects:
        Prints one line for every candidate whose base form was found, whether
        it ends up accepted or rejected.
    """
    # str.endswith is used throughout instead of indexing so that an empty
    # ``word`` falls through to ``return False`` rather than raising IndexError.

    # The sibilant possessive must be tested first: its pronunciation also ends
    # in " Z", so the generic possessive branch would otherwise claim it.
    if word.endswith("'S") and pronun.endswith(" IH0 Z"):
        base_word, base_pronun = word[:-2], pronun[:-6]
        form_name = "possessive after sibilant"
    # less specific check for possessives
    elif word.endswith("'S") and pronun.endswith((" S", " Z")):
        base_word, base_pronun = word[:-2], pronun[:-2]
        form_name = "possessive"
    # plural forms and third-person present verbs
    elif word.endswith("S") and pronun.endswith((" S", " Z")):
        base_word, base_pronun = word[:-1], pronun[:-2]
        form_name = "plural/third-person"
    elif word.endswith("ED") and pronun.endswith((" D", " T")):
        base_word, base_pronun = word[:-2], pronun[:-2]
        form_name = "simple past"
    elif word.endswith("ING") and pronun.endswith(" IH0 NG"):
        base_word, base_pronun = word[:-3], pronun[:-7]
        form_name = "-ing"
    else:
        return False

    # Nothing is regular unless the stripped base form is itself an entry.
    if not check_existence(dictionary, base_word, base_pronun):
        return False

    # The two characters in front of the final phoneme.  One-letter phonemes are
    # therefore compared together with their leading space (" F"), two-letter
    # ones without it ("TH").
    preceding = pronun[-4:-2]

    # A final " S" is only accepted after one of the consonants listed here.
    if pronun.endswith(" S") and preceding not in (" F", " K", " P", " T", "TH"):
        print(f"not a regular form: {word} ({pronun}) (base form: {base_word})")
        return False
    # Likewise a final " T" is only accepted after one of these consonants.
    if pronun.endswith(" T") and preceding not in (" F", " K", " P", " S", "SH", "CH", "TH"):
        print(f"not a regular form: {word} ({pronun}) (base form: {base_word})")
        return False

    print(f"found {form_name} form: {word} (base form: {base_word})")
    return True

In [17]:
def remove_regular_forms(yaml_dict):
    """Delete every entry of ``yaml_dict`` that ``is_regular_form`` accepts.

    Iteration runs over ``yaml_dict.data`` while deletions are applied to
    ``yaml_dict`` itself (presumably ``.data`` is a detached plain-data copy,
    so deleting does not disturb the loop -- confirm against strictyaml).

    An entry with several pronunciations is removed as soon as one of them is
    judged regular; the remaining pronunciations are not examined.

    Args:
        yaml_dict: Object exposing ``.data`` (mapping from word to a string,
            a list of strings, or an ``OrderedDict`` of strings) and supporting
            ``del yaml_dict[word]``.

    Returns:
        The number of entries that were deleted.

    Raises:
        ValueError: If an entry is neither a string, a list nor an ``OrderedDict``.
    """
    entries = yaml_dict.data
    removed = 0
    for word, entry in entries.items():
        # Bring all three entry shapes down to a list of pronunciation strings.
        if isinstance(entry, str):
            candidates = [entry]
        elif isinstance(entry, list):
            candidates = entry
        elif isinstance(entry, OrderedDict):
            candidates = list(entry.values())
        else:
            raise ValueError(f"unexpected type: {type(entry)}")

        # any() stops at the first regular pronunciation, so the word is
        # deleted (and counted) at most once.
        if any(is_regular_form(entries, word, candidate) for candidate in candidates):
            del yaml_dict[word]
            removed += 1
    return removed

In [19]:
# Strip regular forms from every listed dictionary file and rewrite each file
# in place with the remaining entries.
base_path = Path("..") / "dictionary"
for yaml_file in YAML_FILES:
    yaml_path = base_path / yaml_file
    yaml_dict = read_yaml_file(yaml_path)
    num_removed = remove_regular_forms(yaml_dict)
    print(f"removed {num_removed} entries in {yaml_file}")
    # write_text() closes the file before returning, so the rewritten content
    # is flushed deterministically instead of whenever an unreferenced handle
    # from ``open("w").write(...)`` happens to be collected.
    yaml_path.write_text(yaml_dict.as_yaml())

found possessive after sibilant form: ABRAMS'S (base form: ABRAMS)
found possessive after sibilant form: ACTRESS'S (base form: ACTRESS)
found possessive after sibilant form: ADDIDAS'S (base form: ADDIDAS)
found possessive after sibilant form: ADIDAS'S (base form: ADIDAS)
found possessive after sibilant form: ADVANCE'S (base form: ADVANCE)
found possessive after sibilant form: AEROSPACE'S (base form: AEROSPACE)
found possessive after sibilant form: AGACHE'S (base form: AGACHE)
found possessive after sibilant form: AGE'S (base form: AGE)
found possessive after sibilant form: AIRBUS'S (base form: AIRBUS)
found possessive after sibilant form: AJAJ'S (base form: AJAJ)
found possessive after sibilant form: AJAX'S (base form: AJAX)
found possessive after sibilant form: ALDRICH'S (base form: ALDRICH)
found possessive after sibilant form: ALDUS'S (base form: ALDUS)
found possessive after sibilant form: ALEX'S (base form: ALEX)
found possessive after sibilant form: ALICE'S (base form: ALICE)
fou

found possessive after sibilant form: GAS'S (base form: GAS)
found possessive after sibilant form: GAVRAS'S (base form: GAVRAS)
found possessive after sibilant form: GENEX'S (base form: GENEX)
found possessive after sibilant form: GEORGE'S (base form: GEORGE)
found possessive after sibilant form: GIBBS'S (base form: GIBBS)
found possessive after sibilant form: GINGRICH'S (base form: GINGRICH)
found possessive after sibilant form: GLASS'S (base form: GLASS)
found possessive after sibilant form: GONZALEZ'S (base form: GONZALEZ)
found possessive after sibilant form: GOODRICH'S (base form: GOODRICH)
found possessive after sibilant form: GOOSE'S (base form: GOOSE)
found possessive after sibilant form: GRACE'S (base form: GRACE)
found possessive after sibilant form: GREECE'S (base form: GREECE)
found possessive after sibilant form: GREENPEACE'S (base form: GREENPEACE)
found possessive after sibilant form: GREENWICH'S (base form: GREENWICH)
found possessive after sibilant form: GROSS'S (base 

found possessive after sibilant form: PACE'S (base form: PACE)
found possessive after sibilant form: PACKAGE'S (base form: PACKAGE)
found possessive after sibilant form: PAGE'S (base form: PAGE)
found possessive after sibilant form: PALACE'S (base form: PALACE)
found possessive after sibilant form: PARIBAS'S (base form: PARIBAS)
found possessive after sibilant form: PARIS'S (base form: PARIS)
not a regular form: PARRETTI'S (P AXR EH1 T IY0 S) (base form: PARRETTI)
found possessive after sibilant form: PATLEX'S (base form: PATLEX)
found possessive after sibilant form: PERES'S (base form: PERES)
found possessive after sibilant form: PHELPS'S (base form: PHELPS)
found possessive after sibilant form: PHOENIX'S (base form: PHOENIX)
found possessive after sibilant form: PIECH'S (base form: PIECH)
found possessive after sibilant form: PIERCE'S (base form: PIERCE)
found possessive after sibilant form: PLACE'S (base form: PLACE)
found possessive after sibilant form: PLUS'S (base form: PLUS)
fou