In [1]:
from __future__ import annotations
from typing import Union, List, Optional, Dict, Tuple

In [4]:
from pathlib import Path
from collections import OrderedDict
from strictyaml import load, MapPattern, Str, Seq
from strictyaml.dumper import StrictYAMLDumper
import ruamel

In [5]:
# Validation schema for the YAML files read by this notebook: a mapping with string keys
# whose values are a single string, a string-to-string mapping, or a sequence of strings.
schema = MapPattern(Str(), Str() | MapPattern(Str(), Str()) | Seq(Str()))

In [6]:
def dump(yaml_dict):
    """
    Render the YAML node and its subnodes as a string.

    Args:
        yaml_dict: Object exposing ``as_marked_up()``; in this notebook it is
            the value returned by ``read_yaml_file``.

    Returns:
        Whatever ``ruamel.yaml.dump`` returns for the marked-up node when no
        output stream is given (the serialized YAML text).
    """
    # Import the submodule explicitly: a bare ``import ruamel`` does not load
    # ``ruamel.yaml``, so the attribute access below would only work if some
    # other module happened to import it first.
    import ruamel.yaml

    return ruamel.yaml.dump(
        yaml_dict.as_marked_up(),
        Dumper=StrictYAMLDumper,
        allow_unicode=True,  # write non-ASCII characters as-is instead of escaping them
        width=1000,  # very large line width, presumably to keep long values on one line
    )

In [7]:
def read_yaml_file(fpath: Path):
    """
    Parse a YAML file and validate it against the module-level ``schema``.

    Args:
        fpath: Path of the YAML file to read.

    Returns:
        The object returned by strictyaml's ``load``. Callers in this notebook
        read its ``.data`` attribute, delete keys from it and pass it to
        ``dump``.
    """
    # read_text closes the file handle as soon as the content is read, and the
    # pinned encoding keeps the result independent of the platform's locale.
    raw_data = fpath.read_text(encoding="utf-8")
    return load(raw_data, schema)

In [8]:
# File names processed by the driver loop: one per letter A-Z, then "_other.yaml".
YAML_FILES: List[str] = [f"{letter}.yaml" for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
YAML_FILES.append("_other.yaml")

In [9]:
def check_existence(dictionary, word, pronunciation):
    """
    Tell whether ``word`` is stored in ``dictionary`` with exactly ``pronunciation``.

    The value stored under ``word`` may be a single string, a list of strings
    or an ``OrderedDict`` (only its values are compared).

    Args:
        dictionary: Mapping supporting ``.get``.
        word: Key to look up.
        pronunciation: Pronunciation string that has to match exactly.

    Returns:
        True if one of the stored pronunciations equals ``pronunciation``;
        False if none does or if ``word`` is missing (or maps to None).

    Raises:
        ValueError: If the stored value has any other type.
    """
    entry = dictionary.get(word)
    if entry is None:
        return False

    # Normalise the three accepted shapes to one iterable of candidates.
    if isinstance(entry, str):
        candidates = [entry]
    elif isinstance(entry, list):
        candidates = entry
    elif isinstance(entry, OrderedDict):
        candidates = entry.values()
    else:
        raise ValueError(f"unexpected type: {type(entry)}")

    return any(pronunciation == candidate for candidate in candidates)

In [22]:
def is_regular_form(dictionary, word, pronun):
    """
    Decide whether ``word`` looks like a regular derived form of another entry.

    Each rule pairs a spelling suffix with one or more pronunciation suffixes.
    The first rule matching both ``word`` and ``pronun`` yields a candidate
    base word and base pronunciation, obtained by cutting characters off the
    end of each. The word counts as a regular form only if that base word is
    stored in ``dictionary`` with exactly that base pronunciation and the
    final-consonant check at the end of the function passes.

    Args:
        dictionary: Mapping handed to ``check_existence`` for the base lookup.
        word: Spelling of the entry; the suffix literals are upper-case.
        pronun: Pronunciation of ``word`` as a space-separated string.

    Returns:
        True if a rule matched, the base entry exists and the ending is
        accepted; False otherwise. An empty ``word`` matches no rule and
        returns False.

    Side effects:
        Prints one line for every accepted form and one for every candidate
        rejected by the final-consonant check.
    """
    # (spelling suffix, pronunciation suffixes, chars cut from word,
    #  chars cut from pronunciation, label used in the printed message)
    # Order matters: the first matching rule wins, so the more specific
    # rules are listed before the more general ones.
    rules = (
        ("'S", (" IH0 Z",), 2, 6, "possessive after sibilant"),
        ("'S", (" S", " Z"), 2, 2, "possessive"),
        ("S", (" S", " Z"), 1, 2, "plural/third-person"),
        ("ED", (" D", " T"), 2, 2, "simple past"),
        ("ING", (" IH0 NG",), 3, 7, "-ing"),
        # Only "LY" / " IY0" are cut here, so the base keeps its final "L" / " L".
        ("LLY", (" L IY0",), 2, 4, "adverb (merged)"),
        ("LY", (" L IY0",), 2, 6, "adverb"),
    )

    # endswith is safe on empty strings, unlike indexing with word[-1].
    for word_suffix, pronun_suffixes, word_cut, pronun_cut, form_name in rules:
        if word.endswith(word_suffix) and pronun.endswith(pronun_suffixes):
            base_word = word[:-word_cut]
            base_pronun = pronun[:-pronun_cut]
            break
    else:
        return False

    if not check_existence(dictionary, base_word, base_pronun):
        return False

    # The two characters in front of the final phoneme identify the preceding
    # phoneme: either a two-letter symbol or a space plus a one-letter symbol.
    # A final " S" is accepted only after F, K, P, T or TH, and a final " T"
    # only after F, K, P, S, SH, CH or TH.
    preceding = pronun[-4:-2]
    bad_s = pronun.endswith(" S") and preceding not in (" F", " K", " P", " T", "TH")
    bad_t = pronun.endswith(" T") and preceding not in (" F", " K", " P", " S", "SH", "CH", "TH")
    if bad_s or bad_t:
        print(f"not a regular form: {word} ({pronun}) (base form: {base_word})")
        return False

    print(f"found {form_name} form: {word} (base form: {base_word})")
    return True

In [23]:
def remove_regular_forms(yaml_dict):
    """
    Delete every entry of ``yaml_dict`` that ``is_regular_form`` recognises.

    Lookups and iteration run over ``yaml_dict.data``, read once at the start;
    deletions are applied to ``yaml_dict`` itself. An entry with several
    pronunciations is deleted as soon as one of them is a regular form.

    NOTE(review): this relies on ``.data`` not being a live view of
    ``yaml_dict``; otherwise deleting during the iteration would fail. Confirm.

    Args:
        yaml_dict: Object exposing ``.data`` (a mapping from word to a string,
            a list of strings or an ``OrderedDict``) and supporting ``del``.

    Returns:
        The number of entries that were deleted.

    Raises:
        ValueError: If an entry's value has an unexpected type.
    """
    lookup = yaml_dict.data
    removed = 0
    for word, entry in lookup.items():
        # Bring every accepted value shape into a plain list of pronunciations.
        if isinstance(entry, str):
            candidates = [entry]
        elif isinstance(entry, list):
            candidates = entry
        elif isinstance(entry, OrderedDict):
            candidates = list(entry.values())
        else:
            raise ValueError(f"unexpected type: {type(entry)}")

        # any() stops at the first hit, so each word is deleted at most once.
        if any(is_regular_form(lookup, word, candidate) for candidate in candidates):
            del yaml_dict[word]
            removed += 1
    return removed

In [24]:
# Strip regular forms from every entries file and rewrite the file in place.
base_path = Path("..") / "entries"
for yaml_file in YAML_FILES:
    yaml_path = base_path / yaml_file
    yaml_dict = read_yaml_file(yaml_path)
    num_removed = remove_regular_forms(yaml_dict)
    print(f"removed {num_removed} entries in {yaml_file}")
    # Serialize before touching the file: opening it for writing truncates it,
    # so a failure inside dump() after that point would leave it empty.
    rendered = dump(yaml_dict)
    # write_text closes the handle; UTF-8 is pinned because dump() emits
    # non-ASCII characters unescaped (allow_unicode=True).
    yaml_path.write_text(rendered, encoding="utf-8")

found adverb (merged) form: ABNORMALLY (base form: ABNORMAL)
found adverb form: ABRUPTLY (base form: ABRUPT)
found adverb form: ABSURDLY (base form: ABSURD)
found adverb form: ABUNDANTLY (base form: ABUNDANT)
found adverb (merged) form: ABYSMALLY (base form: ABYSMAL)
found adverb (merged) form: ACCIDENTALLY (base form: ACCIDENTAL)
found adverb form: ACCIDENTLY (base form: ACCIDENT)
found adverb form: ACCUMULATIVELY (base form: ACCUMULATIVE)
found adverb form: ACCURATELY (base form: ACCURATE)
found adverb form: ACCUSINGLY (base form: ACCUSING)
found adverb form: ACHINGLY (base form: ACHING)
found adverb form: ACIDLY (base form: ACID)
found adverb form: ACKERLY (base form: ACKER)
found adverb form: ACTIVELY (base form: ACTIVE)
found adverb (merged) form: ACTUALLY (base form: ACTUAL)
found adverb form: ACUTELY (base form: ACUTE)
found adverb form: ADAMANTLY (base form: ADAMANT)
found adverb (merged) form: ADDITIONALLY (base form: ADDITIONAL)
found adverb form: ADEQUATELY (base form: ADEQU

found adverb (merged) form: DALLY (base form: DAL)
found adverb form: DANGEROUSLY (base form: DANGEROUS)
found adverb form: DANLY (base form: DAN)
found adverb form: DARKLY (base form: DARK)
found adverb form: DEADLY (base form: DEAD)
found adverb form: DEALY (base form: DEA)
found adverb form: DEARLY (base form: DEAR)
found adverb form: DEATHLY (base form: DEATH)
found adverb form: DECENTLY (base form: DECENT)
found adverb form: DECEPTIVELY (base form: DECEPTIVE)
found adverb form: DECIDEDLY (base form: DECIDED)
found adverb form: DECISIVELY (base form: DECISIVE)
found adverb form: DECREPITLY (base form: DECREPIT)
found adverb form: DEDUCTIVELY (base form: DEDUCTIVE)
found adverb form: DEELY (base form: DEE)
found adverb form: DEEPLY (base form: DEEP)
found adverb form: DEFENSIVELY (base form: DEFENSIVE)
found adverb form: DEFIANTLY (base form: DEFIANT)
found adverb form: DEFINITELY (base form: DEFINITE)
found adverb form: DEFINITIVELY (base form: DEFINITIVE)
found adverb form: DEFTLY

found adverb form: GAILY (base form: GAI)
found adverb (merged) form: GAINFULLY (base form: GAINFUL)
found adverb form: GALLANTLY (base form: GALLANT)
found adverb form: GAMELY (base form: GAME)
found adverb form: GANGLY (base form: GANG)
found adverb form: GATELY (base form: GATE)
found adverb (merged) form: GENERALLY (base form: GENERAL)
found adverb (merged) form: GENERATIONALLY (base form: GENERATIONAL)
found adverb form: GENEROUSLY (base form: GENEROUS)
found adverb form: GENTLEMANLY (base form: GENTLEMAN)
found adverb form: GENTLY (base form: GENT)
found adverb form: GENUINELY (base form: GENUINE)
found adverb (merged) form: GEOGRAPHICALLY (base form: GEOGRAPHICAL)
found adverb form: GHOSTLY (base form: GHOST)
found adverb form: GHOULISHLY (base form: GHOULISH)
found adverb (merged) form: GILLY (base form: GIL)
found adverb form: GINGERLY (base form: GINGER)
found adverb form: GIRLISHLY (base form: GIRLISH)
found adverb form: GLADLY (base form: GLAD)
found adverb form: GLARINGLY 

found adverb form: JEALOUSLY (base form: JEALOUS)
found adverb form: JOINTLY (base form: JOINT)
found adverb form: JOKINGLY (base form: JOKING)
found adverb form: JOLY (base form: JO)
found adverb (merged) form: JOYFULLY (base form: JOYFUL)
found adverb (merged) form: JUDICIALLY (base form: JUDICIAL)
found adverb form: JUSTLY (base form: JUST)
removed 7 entries in J.yaml
found adverb form: KEALY (base form: KEA)
found adverb form: KEELY (base form: KEE)
found adverb form: KEENLY (base form: KEEN)
found adverb form: KEITHLY (base form: KEITH)
found adverb (merged) form: KENNELLY (base form: KENNEL)
found adverb form: KENNERLY (base form: KENNER)
found adverb form: KIMBERLY (base form: KIMBER)
found adverb form: KINDLY (base form: KIND)
found adverb form: KINGLY (base form: KING)
found adverb form: KIRALY (base form: KIRA)
found adverb form: KNIGHTLY (base form: KNIGHT)
removed 11 entries in K.yaml
found adverb form: LABORIOUSLY (base form: LABORIOUS)
found adverb (merged) form: LALLY (b

found adverb (merged) form: PAINFULLY (base form: PAINFUL)
found adverb form: PAINLESSLY (base form: PAINLESS)
found adverb form: PAINSTAKINGLY (base form: PAINSTAKING)
found adverb form: PAINTERLY (base form: PAINTER)
found adverb (merged) form: PARADOXICALLY (base form: PARADOXICAL)
found adverb (merged) form: PARENTHETICALLY (base form: PARENTHETICAL)
found adverb (merged) form: PARTIALLY (base form: PARTIAL)
found adverb form: PARTICULARLY (base form: PARTICULAR)
found adverb form: PARTLY (base form: PART)
found adverb form: PASSIONATELY (base form: PASSIONATE)
found adverb form: PASSIVELY (base form: PASSIVE)
found adverb form: PATENTLY (base form: PATENT)
found adverb form: PATIENTLY (base form: PATIENT)
found adverb (merged) form: PEACEFULLY (base form: PEACEFUL)
found adverb form: PECULIARLY (base form: PECULIAR)
found adverb (merged) form: PERENNIALLY (base form: PERENNIAL)
found adverb form: PERILOUSLY (base form: PERILOUS)
found adverb (merged) form: PERIODICALLY (base form:

found adverb form: TACITLY (base form: TACIT)
found adverb (merged) form: TACTFULLY (base form: TACTFUL)
found adverb (merged) form: TACTICALLY (base form: TACTICAL)
found adverb (merged) form: TANGENTIALLY (base form: TANGENTIAL)
found adverb form: TANTALIZINGLY (base form: TANTALIZING)
found adverb form: TARTLY (base form: TART)
found adverb (merged) form: TASTEFULLY (base form: TASTEFUL)
found adverb (merged) form: TEARFULLY (base form: TEARFUL)
found adverb (merged) form: TECHNOLOGICALLY (base form: TECHNOLOGICAL)
found adverb form: TEDIOUSLY (base form: TEDIOUS)
found adverb (merged) form: TELLY (base form: TEL)
found adverb (merged) form: TEMPERAMENTALLY (base form: TEMPERAMENTAL)
found adverb (merged) form: TEMPORALLY (base form: TEMPORAL)
found adverb form: TENACIOUSLY (base form: TENACIOUS)
found adverb form: TENDERLY (base form: TENDER)
found adverb form: TENSELY (base form: TENSE)
found adverb form: TENTATIVELY (base form: TENTATIVE)
found adverb (merged) form: TERMINALLY (b