In [1]:
from __future__ import annotations
from typing import Union, List, Optional, Dict, Tuple

In [2]:
from pathlib import Path
from collections import OrderedDict
from strictyaml import load, MapPattern, Str, Seq, as_document
from strictyaml.dumper import StrictYAMLDumper

In [3]:
schema = MapPattern(Str(), Str() | MapPattern(Str(), Str()) | Seq(Str()))

In [4]:
import ruamel
def dump(yaml_dict):
    """
    Serialize a StrictYAML node (and everything below it) to a YAML string.

    The node is first converted with ``as_marked_up()`` and then handed to
    ``ruamel.yaml.dump`` using ``StrictYAMLDumper``.

    Parameters
    ----------
    yaml_dict
        Object exposing ``as_marked_up()`` (e.g. the result of
        ``strictyaml.load`` or ``strictyaml.as_document``).

    Returns
    -------
    The value returned by ``ruamel.yaml.dump`` (the rendered YAML text).
    """
    marked_up = yaml_dict.as_marked_up()
    # allow_unicode keeps non-ASCII characters unescaped; the large width is
    # presumably there to stop long values from being wrapped -- TODO confirm.
    dump_options = dict(Dumper=StrictYAMLDumper, allow_unicode=True, width=1000)
    return ruamel.yaml.dump(marked_up, **dump_options)

In [5]:
def read_yaml_file(fpath: Path):
    """Read a YAML file and parse it against the module-level ``schema``.

    Parameters
    ----------
    fpath
        Path of the YAML file to read.

    Returns
    -------
    Whatever ``strictyaml.load`` returns for the file's text; callers in this
    notebook use its ``.data`` and ``.as_marked_up()`` members and delete
    entries from it by key.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with fpath.open() as yaml_file:
        raw_data = yaml_file.read()
    return load(raw_data, schema)

In [6]:
# Names of the entry files to process: "A.yaml" through "Z.yaml" in
# alphabetical order, followed by "_other.yaml".
YAML_FILES: List[str] = [
    f"{chr(code_point)}.yaml" for code_point in range(ord("A"), ord("Z") + 1)
] + ["_other.yaml"]

In [7]:
# Load the word list: each line's text before the first tab is taken as the
# word; anything after the tab is ignored.
with open("../count_1w.txt", "r") as f:
    # Iterating the file object yields the same lines as readlines() without
    # first materialising them all in a separate list.
    most_common_words = [line.partition("\t")[0] for line in f]
# Optional cut-off to keep only the first 200,000 words:
# most_common_words = most_common_words[:200_000]
len(most_common_words)

333333

In [8]:
def is_derived_word(dictionary, word):
    """Decide whether ``word`` looks like a derived form of another entry.

    A word counts as derived when it either contains an apostrophe, or ends in
    one of the upper-case suffixes ``S``, ``ED``, ``ING`` or ``LY`` and at
    least one candidate base form is a key of ``dictionary``.

    Parameters
    ----------
    dictionary
        Container supporting ``in`` lookups for base forms.
    word
        The word to classify. Suffixes are matched in upper case only.

    Returns
    -------
    bool
        ``True`` if the word is treated as derived, ``False`` otherwise.
        An empty ``word`` yields ``False`` instead of raising ``IndexError``.
    """
    # Guard: indexing the last character of an empty string would raise.
    if not word:
        return False
    # Any word with an apostrophe (possessives, contractions) is kept
    # unconditionally, without consulting the dictionary.
    if "'" in word:
        return True

    if word.endswith("S"):
        # plural / third person: drop the final "S"
        base_words = [word[:-1]]
    elif word.endswith("ED"):
        # simple past: drop "D" (BAKED -> BAKE), drop "ED" (WALKED -> WALK),
        # or drop "ED" plus one more letter (STOPPED -> STOP)
        base_words = [word[:-1], word[:-2], word[:-3]]
    elif word.endswith("ING"):
        # -ing form: drop "ING" (WALKING -> WALK), replace it by "E"
        # (MAKING -> MAKE), or drop "ING" plus one more letter (RUNNING -> RUN)
        base_words = [word[:-3], word[:-3] + "E", word[:-4]]
    elif word.endswith("LY"):
        # -ly form: drop "LY", or drop "LY" plus one more letter
        base_words = [word[:-2], word[:-3]]
    else:
        return False

    return any(base_word in dictionary for base_word in base_words)

In [9]:
def remove_rare_words(yaml_dict, deleted_words):
    """Delete entries whose lower-cased key is not in ``most_common_words``.

    Entries that ``is_derived_word`` classifies as derived are kept (and a
    message is printed for each). Every removed entry is copied into
    ``deleted_words`` before it is deleted from ``yaml_dict``.

    Parameters
    ----------
    yaml_dict
        Parsed YAML document; must expose ``.data`` and support ``del`` by key.
        It is modified in place.
    deleted_words
        Mutable mapping that receives ``word -> value`` for every removed entry.

    Returns
    -------
    int
        Number of entries removed from ``yaml_dict``.
    """
    # Build a set once per call: membership tests against the original list
    # scan it linearly for every single entry.
    common_words = set(most_common_words)

    # NOTE(review): this relies on ``.data`` being detached from ``yaml_dict``
    # so that deleting below does not disturb the iteration -- confirm against
    # strictyaml. Derived-word checks use this same object, so they do not see
    # the deletions made during this call.
    dictionary = yaml_dict.data
    counter = 0
    for word in dictionary:
        if word.lower() in common_words:
            continue
        if is_derived_word(dictionary, word):
            print(f"would have removed {word}, but it is a derived word")
            continue
        deleted_words[word] = dictionary[word]
        del yaml_dict[word]
        counter += 1
    return counter

In [10]:
# Prune rare words from every entries file in place and collect everything
# that was removed into a single ../rare_words.yaml file.
deleted_words = OrderedDict()
base_path = Path("..") / "entries"
for yaml_file in YAML_FILES:
    yaml_path = base_path / yaml_file
    yaml_dict = read_yaml_file(yaml_path)
    num_removed = remove_rare_words(yaml_dict, deleted_words)
    print(f"removed {num_removed} entries in {yaml_file}")
    # write_text opens, writes and closes the file; dump() is evaluated before
    # the file is opened, so a failing dump no longer leaves it truncated.
    yaml_path.write_text(dump(yaml_dict))

deleted_yaml = as_document(deleted_words)
# Last expression of the cell: displays the number of characters written.
(Path("..") / "rare_words.yaml").write_text(dump(deleted_yaml))

would have removed ABRAM'S, but it is a regular form
would have removed ABRIDGES, but it is a regular form
would have removed ACCOR'S, but it is a regular form
would have removed ACCUMULATIVELY, but it is a regular form
would have removed ADAMES, but it is a regular form
would have removed ADMINISTRATOR'S, but it is a regular form
would have removed AFFINED, but it is a regular form
would have removed AFFINING, but it is a regular form
would have removed AFTERIMAGES, but it is a regular form
would have removed AGACHES, but it is a regular form
would have removed AGANS, but it is a regular form
would have removed AGINS, but it is a regular form
would have removed AGONIZES, but it is a regular form
would have removed AIN'T, but it is a regular form
would have removed AINSWORTH'S, but it is a regular form
would have removed AIRCAL'S, but it is a regular form
would have removed AIRFORCES, but it is a regular form
would have removed AIRPLANE'S, but it is a regular form
would have removed AI

would have removed CEREBRALLY, but it is a regular form
would have removed CHAMBERS'S, but it is a regular form
would have removed CHANNELL'S, but it is a regular form
would have removed CHARLES'S, but it is a regular form
would have removed CHAUS, but it is a regular form
would have removed CHEELY, but it is a regular form
would have removed CHEVIOTS, but it is a regular form
would have removed CHILES'S, but it is a regular form
would have removed CHIROPRACTOR'S, but it is a regular form
would have removed CHLORINATING, but it is a regular form
would have removed CINCINNATI'S, but it is a regular form
would have removed CIRCULARIZING, but it is a regular form
would have removed CITYFED, but it is a regular form
would have removed CLAMMED, but it is a regular form
would have removed CLINES'S, but it is a regular form
would have removed COARTICULATED, but it is a regular form
would have removed COARTICULATING, but it is a regular form
would have removed COBERLY, but it is a regular form

would have removed EMLING, but it is a regular form
would have removed ENFRANCHISES, but it is a regular form
would have removed ENGELKING, but it is a regular form
would have removed ENGINEER'S, but it is a regular form
would have removed ENSING, but it is a regular form
would have removed ENTOURAGES, but it is a regular form
would have removed EQUIVOCATING, but it is a regular form
would have removed ESTERLY, but it is a regular form
would have removed ETHNOS'S, but it is a regular form
would have removed ETLING, but it is a regular form
would have removed EUCHRED, but it is a regular form
would have removed EURODEPOSITS, but it is a regular form
would have removed EUROTUNNEL'S, but it is a regular form
would have removed EVANS'S, but it is a regular form
would have removed EVENHANDEDLY, but it is a regular form
would have removed EVERDING, but it is a regular form
would have removed EVERLING, but it is a regular form
would have removed EVERYBODY'D, but it is a regular form
would hav

would have removed HOLBROOKS, but it is a regular form
would have removed HOMEFED, but it is a regular form
would have removed HONDA'S, but it is a regular form
would have removed HORATIO'S, but it is a regular form
would have removed HORNET'S, but it is a regular form
would have removed HORSELY, but it is a regular form
would have removed HOTHOUSES, but it is a regular form
would have removed HOUGHS, but it is a regular form
would have removed HOVERFLY, but it is a regular form
would have removed HOW'D, but it is a regular form
would have removed HOW'RE, but it is a regular form
would have removed HSIUNG'S, but it is a regular form
would have removed HYDROGENATING, but it is a regular form
would have removed HYPHENATING, but it is a regular form
would have removed HYPNOTIZES, but it is a regular form
removed 1172 entries in H.yaml
would have removed I'D, but it is a regular form
would have removed I'LL, but it is a regular form
would have removed I'M, but it is a regular form
would ha

would have removed MICROMANAGED, but it is a regular form
would have removed MICROMANAGES, but it is a regular form
would have removed MIDGETT'S, but it is a regular form
would have removed MIERAS, but it is a regular form
would have removed MIGHT'VE, but it is a regular form
would have removed MIGHTN'T, but it is a regular form
would have removed MILLIRONS, but it is a regular form
would have removed MILLS'S, but it is a regular form
would have removed MINICARS, but it is a regular form
would have removed MINIMILLS, but it is a regular form
would have removed MINNESOTA'S, but it is a regular form
would have removed MINORCO'S, but it is a regular form
would have removed MISALLOCATED, but it is a regular form
would have removed MISALLOCATING, but it is a regular form
would have removed MISAPPROPRIATES, but it is a regular form
would have removed MISCHARACTERIZES, but it is a regular form
would have removed MISCHARACTERIZING, but it is a regular form
would have removed MISCHARGED, but it

would have removed PETIPA'S, but it is a regular form
would have removed PHARISS, but it is a regular form
would have removed PHILHARMONIC'S, but it is a regular form
would have removed PHILIPS'S, but it is a regular form
would have removed PHILLIPS'S, but it is a regular form
would have removed PICKENS'S, but it is a regular form
would have removed PIES'S, but it is a regular form
would have removed PILLAGES, but it is a regular form
would have removed PINNACLE'S, but it is a regular form
would have removed PITTS'S, but it is a regular form
would have removed PLAINTIFF'S, but it is a regular form
would have removed PLATEAUING, but it is a regular form
would have removed PLURALIZED, but it is a regular form
would have removed PLURALIZES, but it is a regular form
would have removed PLURALIZING, but it is a regular form
would have removed POLYGRAM'S, but it is a regular form
would have removed POLYMERIZES, but it is a regular form
would have removed PONTIFICATED, but it is a regular form

would have removed SHULTZ'S, but it is a regular form
would have removed SHUPING, but it is a regular form
would have removed SIDLING, but it is a regular form
would have removed SIELING, but it is a regular form
would have removed SIMERLY, but it is a regular form
would have removed SIMMONS'S, but it is a regular form
would have removed SINGAPORE'S, but it is a regular form
would have removed SITTERLY, but it is a regular form
would have removed SITZES, but it is a regular form
would have removed SIVERLING, but it is a regular form
would have removed SIXTEEN'S, but it is a regular form
would have removed SMEETING, but it is a regular form
would have removed SMITHSONIAN'S, but it is a regular form
would have removed SNIFFILY, but it is a regular form
would have removed SNUGGING, but it is a regular form
would have removed SOBBINGLY, but it is a regular form
would have removed SODOMIZES, but it is a regular form
would have removed SOOTED, but it is a regular form
would have removed SPEA

would have removed WHEN'LL, but it is a regular form
would have removed WHERE'D, but it is a regular form
would have removed WHERE'RE, but it is a regular form
would have removed WHERE'VE, but it is a regular form
would have removed WHICH'RE, but it is a regular form
would have removed WHICH'S, but it is a regular form
would have removed WHITLING, but it is a regular form
would have removed WHITLY, but it is a regular form
would have removed WHO'D, but it is a regular form
would have removed WHO'LL, but it is a regular form
would have removed WHO'RE, but it is a regular form
would have removed WHO'VE, but it is a regular form
would have removed WHY'D, but it is a regular form
would have removed WHY'RE, but it is a regular form
would have removed WICKES'S, but it is a regular form
would have removed WIDDLED, but it is a regular form
would have removed WIDING, but it is a regular form
would have removed WILDCATTING, but it is a regular form
would have removed WILES'S, but it is a regular

638724