# @author Scott Dobbins
# @version 0.5
# @date 2018-01-09 18:00
### ACID
# contains a pluralizer, singularizer, and lemmatizer
# the lemmatizer "digests" words down into their
# simplest root form automatically, without any need
# to supply part of speech information
# lemmatizer also available in R version
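# quick illustration (a sketch only; see the digest_words() documentation below):
# example input: digest_words(["shouldn't've", "re-doing"])
# example output: ["shall", "do"]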
import functools | |
import re | |
import numpy as np | |
def non_empty_string(string): | |
return isinstance(string, str) and string != ""
def reduce_concat(stuff): | |
return functools.reduce(lambda x, y: x + y, stuff) | |
def collapse_bar(strings): | |
return functools.reduce(lambda x, y: str(x) + '|' + str(y), strings) | |
def paste0(*lists): | |
return list(map(lambda x: reduce_concat(x), zip(*lists))) | |
def any_of(strings): | |
return "(" + collapse_bar(strings) + ")" | |
def flat_concat(ls): | |
return reduce_concat([subls if type(subls) is list else [subls] for subls in ls]) | |
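# illustrative values for the helpers above (a sketch of expected behavior):
# any_of(["cat", "dog"])          -> "(cat|dog)"
# paste0(["a", "b"], ["1", "2"])  -> ["a1", "b2"]
# flat_concat([["a", "b"], "c"])  -> ["a", "b", "c"]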
### Constants --------------------------------------------------------------- | |
English_invariant_words = ["bison", "buffalo", "cannon", "carp", "cod", "deer", "fish", "hi", "moose", "pike", "salmon", "sheep", "shrimp", "squid", "swine", "trout"] | |
English_uncountable_words = ["chaos", "chassis", "molasses", "news", "precis", "rendezvous", "series", "species"] | |
English_ie_singulars = ["anomie", "baddie", "beastie", "biggie", "birdie", "boogie", "bootie", "brownie", "calorie", "camaraderie", "charcuterie", "collie", "commie", "cookie", "cootie", "cowrie", "dearie", "doggie", "dougie", "foodie", "genie", "goalie", "goodie", "groupie", "hippie", "hoodie", "hottie", "junkie", "kiddie", "kittie", "magpie", "meanie", "movie", "newbie", "potpie", "sweetiepie", "sweetypie", "patisserie", "pixie", "prarie", "premie", "quickie", "reverie", "rookie", "roomie", "rotisserie", "smoothie", "softie", "sweetie", "hogtie", "necktie", "talkie", "toughie", "townie", "veggie", "wheelie", "yuppie", "zombie"] | |
English_ie_singulars_plurals = [string + "s" for string in English_ie_singulars] | |
English_oe_singulars_string = "\\b(?:al|ob|r|sh|t|w)oes$" | |
English_zz_singulars = ["buzz", "fizz", "frizz", "fuzz", "jazz", "razz"] | |
English_zz_singulars_plurals = [string + "es" for string in English_zz_singulars] | |
English_s_singulars = ["alias", "apparatus", "asbestos", "atlas", "bias", "bonus", "campus", "canvas", "caucus", "citrus", "loris", "mucus", "octopus", "oops", "pancreas", "pelvis", "porticullis", "ruckus", "status", "trellis", "tuckus", "virus"] | |
English_s_singulars_plurals = [string + "es" for string in English_s_singulars] | |
English_f_to_ves_singulars = ["calf", "dwarf", "elf", "half", "hoof", "leaf", "loaf", "scarf", "self", "shelf", "thief", "wolf"] | |
English_f_to_ves_plurals = [re.sub(string = string, pattern = "f$", repl = "ves") for string in English_f_to_ves_singulars] | |
English_fe_to_ves_singulars = ["knife", "life", "wife"] | |
English_fe_to_ves_plurals = [re.sub(string = string, pattern = "fe$", repl = "ves") for string in English_fe_to_ves_singulars] | |
English_us_plurals = ["bayous", "caribous", "emus", "gnus", "menus", "tiramisus", "tutus"] | |
English_is_plurals = ["khakis", "skis", "taxis"] | |
English_normal_oses_plurals = ["brownnoses", "bullnoses", "hardnoses", "hooknoses", "shovelnoses", "arabinoses", "flavinoses", "furanoses", "manoses", "pyranoses", "heptoses", "lactoses", "maltoses", "pentoses"] | |
English_es_to_e_plurals = ["backaches", "bellyaches", "headaches", "stomachaches", "toothaches", "caches", "moustaches", "panaches", "pistaches", "bastes", "castes", "gestes", "hastes", "mostes", "pastes", "pistes", "tastes", "wastes", "chastes", "trystes", "artistes", "batistes", "ripostes", "langoustes"] | |
plain_plural_singulars = ["canto", "hereto", "kimono", "photo", "piano", "portico", "pro", "quarto", "zero"] | |
plain_plural_plurals = [string + "s" for string in plain_plural_singulars] | |
Latin_us_to_i_singulars = ["alumnus", "cactus", "focus", "fungus", "succubus", "syllabus", "terminus", "uterus"] | |
Latin_us_to_i_plurals = ["alumni", "cacti", "foci", "fungi", "succubi", "syllabi", "termini", "uteri"] | |
Latin_us_to_a_plurals = ["addenda", "auditoria", "collisea", "compendia", "media", "memoranda", "millennia", "ova", "referenda", "spectra", "stadia", "strata", "symposia"] | |
Latin_a_to_ae_singulars = ["alga", "alumna", "antenna", "fauna", "fistula", "flora", "formula", "fovea", "hernia", "larva", "trachea"] | |
Latin_is_to_es_singulars = ["crisis", "genesis", "kinesis", "nemesis", "nosis", "oasis", "testis", "thesis", "tosis"] #* bases could be base or basis (asbestoses could be asbestosis but more likely asbestos) | |
Latin_is_to_es_plurals = [re.sub(string = string, pattern = "is$", repl = "es") for string in Latin_is_to_es_singulars] | |
English_ses_to_s_plurals = flat_concat([English_s_singulars_plurals, [string + "es" for string in Latin_us_to_i_singulars]]) | |
Japanese_words_in_English = ["bento", "katana", "kimono", "ninja", "otaku", "samurai", "sushi", "tsunami"] | |
Maori_words_in_English = ["kakapo", "kiwi", "waka"] | |
other_foreign_is_plurals = [string for string in [string + "s" for string in flat_concat([Japanese_words_in_English, Maori_words_in_English])] if string.endswith("is")] | |
all_is_plurals = flat_concat([English_is_plurals, other_foreign_is_plurals]) | |
### Singulars and Plurals --------------------------------------------------- | |
def singularize(words): | |
# invariants | |
is_invariant = np.any([words.endswith(string) for string in English_invariant_words], axis = 0) | \ | |
np.any([words.endswith(string) for string in English_uncountable_words], axis = 0) | \ | |
np.any([words.endswith(string) for string in Japanese_words_in_English], axis = 0) | \ | |
np.any([words.endswith(string) for string in Maori_words_in_English], axis = 0) | \ | |
words.endswith("nese") | |
# Anglo-Saxon oddities | |
is_person = words.endswith("people") | |
remove_last3 = words.endswith("children") | |
is_brother = words.endswith("brethren") | |
is_man = words.endswith("men") & ~(np.any([words.endswith(string) for string in ("abdomen", "acumen", "albumen", "bitumen", "foramen", "hymen", "lumen", "ramen", "regimen", "rumen", "semen", "specimen", "stamen")], axis = 0) | np.array([bool(re.search("\\b[ao]men$", word)) for word in words])) | |
is_oo = np.any([words.endswith(string) for string in ("teeth", "feet", "geese")], axis = 0) | |
is_ouse = np.any([words.endswith(string) for string in ("booklice", "headlice", "dormice", "fieldmice", "shrewmice", "titmice")], axis = 0) | np.array([bool(re.search("\\b[lm]ice$", word)) for word in words])
remove_last2 = np.array([bool(re.search("\\boxen$", word)) for word in words]) | |
is_die = np.array([bool(re.search("\\bdice$", word)) for word in words]) | |
rule_not_found = np.logical_not(np.any((is_invariant, is_person, remove_last3, is_brother, is_man, is_oo, is_ouse, remove_last2, is_die), axis = 0)) | |
# foreign language rules | |
remove_last = np.any([words.endswith(string) for string in ("kobzari", "oblasti", "eaux", "ae")], axis = 0) & rule_not_found | |
need_o = np.any([words.endswith(string) for string in ("kniazhestva", "celli")], axis = 0) & rule_not_found | |
rule_not_found = rule_not_found & np.logical_not(np.any((remove_last, need_o), axis = 0)) | |
need_itis = words.endswith("itides") & rule_not_found | |
rule_not_found = rule_not_found & ~need_itis | |
need_on = np.any([words.endswith(string) for string in ("automata", "criteria", "hedra", "mena")], axis = 0) & rule_not_found | |
rule_not_found = rule_not_found & ~need_on | |
remove_last2 = remove_last2 | (np.any([words.endswith(string) for string in ("im", "mata")], axis = 0) & rule_not_found) | |
need_ah = words.endswith("ot") & rule_not_found | |
rule_not_found = rule_not_found & np.logical_not(np.any((remove_last2, need_ah), axis = 0)) | |
need_ma = words.endswith("mata") & rule_not_found | |
need_us = words.endswith("i") & rule_not_found | |
need_us_special = np.any([words.endswith(string) for string in ("corpora", "genera", "viscera")], axis = 0) & rule_not_found | |
rule_not_found = rule_not_found & np.logical_not(np.any((need_ma, need_us, need_us_special), axis = 0)) | |
need_um = words.endswith("a") & rule_not_found | |
rule_not_found = rule_not_found & ~need_um | |
need_is_latin = (np.any([words.endswith(string) for string in Latin_is_to_es_plurals], axis = 0) | np.array([bool(re.search("\\baxes$", word)) for word in words])) & ~(np.any([words.endswith(string) for string in English_normal_oses_plurals], axis = 0) | np.array([bool(re.search("\\bnoses$", word)) for word in words])) & rule_not_found
rule_not_found = rule_not_found & ~need_is_latin | |
need_ex = np.any([words.endswith(string) for string in ("codices", "cortices", "indices", "vortices")], axis = 0) & rule_not_found | |
need_ix = np.any([words.endswith(string) for string in ("radices", "trices")], axis = 0) & rule_not_found | |
need_is_greek = words.endswith("eis") & ~(words.endswith("senseis") | np.array([bool(re.search("\\bleis$", word)) for word in words])) & rule_not_found | |
rule_not_found = rule_not_found & np.logical_not(np.any((need_ex, need_ix, need_is_greek), axis = 0)) | |
need_f = np.any([words.endswith(string) for string in English_f_to_ves_plurals], axis = 0) & rule_not_found | |
need_fe = np.any([words.endswith(string) for string in English_fe_to_ves_plurals], axis = 0) & rule_not_found | |
need_y = words.endswith("ies") & ~(np.any([words.endswith(string) for string in English_ie_singulars_plurals], axis = 0) | np.array([bool(re.search("\\b[lpt]ies$", word)) for word in words])) & rule_not_found | |
rule_not_found = rule_not_found & np.logical_not(np.any((need_f, need_fe, need_y), axis = 0)) | |
remove_last3 = remove_last3 | ((np.any([words.endswith(string) for string in ("busses", "gasses")], axis = 0) | \ | |
(words.endswith("zzes") & \ | |
~np.any([words.endswith(string) for string in English_zz_singulars_plurals], axis = 0))) & \ | |
rule_not_found) | |
rule_not_found = rule_not_found & ~remove_last3 | |
remove_last = remove_last | (((np.any([words.endswith(string) for string in English_ie_singulars_plurals], axis = 0) | np.array([bool(re.search("\\b[lpt]ies$", word)) for word in words])) | \ | |
np.array([bool(re.search(English_oe_singulars_string, word)) for word in words]) | \ | |
np.array([bool(re.search("[aeiouy][^aeioux]es$", word)) for word in words]) | \ | |
words.endswith("mmes") | \ | |
np.any([words.endswith(string) for string in English_es_to_e_plurals], axis = 0) | \ | |
np.array([bool(re.search("(?:[bcdfglprstz][glr]|l[csv]|n[cgrs]|p[s]|r[cgsv]|s[c]|tt|u|\\bach)es$", word)) for word in words])) & \ | |
~np.any([words.endswith(string) for string in English_ses_to_s_plurals], axis = 0) & \ | |
rule_not_found) | |
rule_not_found = rule_not_found & ~remove_last | |
remove_last2 = remove_last2 | (np.array([bool(re.search("[^e]es$", word)) for word in words]) & rule_not_found) | |
rule_not_found = rule_not_found & ~remove_last2 | |
remove_last = remove_last | (words.endswith("s") & rule_not_found) | |
# fix English rules | |
words[is_person] = [string[:-4] + "rson" for string in words[is_person]] | |
words[is_brother] = [string[:-6] + "other" for string in words[is_brother]] | |
words[is_man] = [string[:-2] + "an" for string in words[is_man]] | |
words[is_oo] = [re.sub(string = string, pattern = "ee([a-z]{1,2})$", repl = "oo\\1") for string in words[is_oo]]
words[is_ouse] = [string[:-3] + "ouse" for string in words[is_ouse]] | |
words[is_die] = [string[:-3] + "ie" for string in words[is_die]] | |
words[need_f] = [string[:-3] + "f" for string in words[need_f]] | |
words[need_fe] = [string[:-3] + "fe" for string in words[need_fe]] | |
words[need_y] = [string[:-3] + "y" for string in words[need_y]] | |
# fix foreign rules | |
words[need_o] = [string[:-1] + "o" for string in words[need_o]] | |
words[need_itis] = [string[:-6] + "itis" for string in words[need_itis]] | |
words[need_ah] = [string[:-2] + "ah" for string in words[need_ah]] | |
words[need_ma] = [string[:-2] + "ma" for string in words[need_ma]] | |
words[need_on] = [string[:-1] + "on" for string in words[need_on]] | |
words[need_us] = [string[:-1] + "us" for string in words[need_us]] | |
words[need_us_special] = [string[:-3] + "us" for string in words[need_us_special]] | |
words[need_um] = [string[:-1] + "um" for string in words[need_um]] | |
words[need_ex] = [string[:-4] + "ex" for string in words[need_ex]] | |
words[need_ix] = [string[:-4] + "ix" for string in words[need_ix]] | |
words[need_is_greek] = [string[:-3] + "is" for string in words[need_is_greek]] | |
words[need_is_latin] = [string[:-2] + "is" for string in words[need_is_latin]] | |
# fix generic rules | |
words[remove_last3] = [string[:-3] for string in words[remove_last3]] | |
words[remove_last2] = [string[:-2] for string in words[remove_last2]] | |
words[remove_last] = [string[:-1] for string in words[remove_last]] | |
return words | |
def make_singular(words): | |
can_be_made_singular = ~is_singular(words) | |
if any(can_be_made_singular): | |
words[can_be_made_singular] = singularize(words[can_be_made_singular]) | |
return words | |
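# illustrative example (a sketch; these functions expect a numpy char array of lower-case words):
# example input: make_singular(np.char.array(["wolves", "children", "indices", "fish"]))
# example output: ["wolf", "child", "index", "fish"]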
### Number Testers ---------------------------------------------------------- | |
def is_singular(words): | |
is_singular_with_s = (np.any([words.endswith(string) for string in English_s_singulars], axis = 0) | np.array([bool(re.search("\\b(?:bu|ga|ibi|len|ye)s$", word)) for word in words])) | \ | |
(np.array([bool(re.search("[^e]iu?s$", word)) for word in words]) & ~np.any([words.endswith(string) for string in all_is_plurals], axis = 0)) | \ | |
np.any([words.endswith(string) for string in Latin_us_to_i_singulars], axis = 0) | \ | |
np.any([words.endswith(string) for string in ("corpus", "genus", "viscus")], axis = 0) | \ | |
(np.any([words.endswith(string) for string in Latin_is_to_es_singulars], axis = 0) | np.array([bool(re.search("\\baxis$", word)) for word in words])) | \ | |
words.endswith("itis") | \ | |
words.endswith("ss") | \ | |
(words.endswith("us") & ~np.any([words.endswith(string) for string in English_us_plurals], axis = 0) & ~words.endswith("eaus")) | |
is_plural_without_s = words.endswith("people") | \ | |
np.any([words.endswith(string) for string in ("brethren", "children")], axis = 0) | \ | |
(words.endswith("men") & ~(np.any([words.endswith(string) for string in ("abdomen", "acumen", "albumen", "bitumen", "foramen", "hymen", "lumen", "ramen", "regimen", "rumen", "semen", "specimen", "stamen")], axis = 0) | np.array([bool(re.search("\\b[ao]men$", word)) for word in words]))) | \ | |
np.any([words.endswith(string) for string in ("teeth", "feet", "geese")], axis = 0) | \ | |
(np.any([words.endswith(string) for string in ("booklice", "headlice", "dormice", "fieldmice", "shrewmice", "titmice")], axis = 0) | np.array([bool(re.search("\\b[lm]ice$", word)) for word in words])) | \ | |
np.array([bool(re.search("\\boxen$", word)) for word in words]) | \ | |
np.array([bool(re.search("\\bdice$", word)) for word in words]) | \ | |
np.any([words.endswith(string) for string in ("kobzari", "oblasti")], axis = 0) | \ | |
words.endswith("eaux") | \ | |
words.endswith("ae") | \ | |
words.endswith("kniazhestva") | \ | |
words.endswith("celli") | \ | |
np.any([words.endswith(string) for string in ("cherubim", "kibbutz", "seraph")], axis = 0) | \ | |
words.endswith("matzot") | \ | |
np.any([words.endswith(string) for string in ("hedra", "mata", "mena", "ria")], axis = 0) | \ | |
np.any([words.endswith(string) for string in ("genera", "viscera", "corpora")], axis = 0) | \ | |
np.any([words.endswith(string) for string in Latin_us_to_i_plurals], axis = 0) | \ | |
np.any([words.endswith(string) for string in Latin_us_to_a_plurals], axis = 0) | |
is_indeterminate = np.any([words.endswith(string) for string in English_invariant_words], axis = 0) | \ | |
np.any([words.endswith(string) for string in English_uncountable_words], axis = 0) | \ | |
np.any([words.endswith(string) for string in Japanese_words_in_English], axis = 0) | \ | |
np.any([words.endswith(string) for string in Maori_words_in_English], axis = 0) | \ | |
words.endswith("nese") | |
is_singular = is_indeterminate | \ | |
is_singular_with_s | \ | |
~(words.endswith("s") | is_plural_without_s) | |
return is_singular | |
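# illustrative example (a sketch; expects a numpy char array):
# example input: is_singular(np.char.array(["virus", "alumni", "sheep", "theses"]))
# example output: [True, False, True, False]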
### Lemmatizer -------------------------------------------------------------- | |
English_ly_nouns = ["ally", "anomaly", "assembly", "belly", "bully", "butterfly", "contumely", "doily", "dragonfly", "gadfly", "family", "filly", "firefly", "fly", "folly", "gully", "holly", "homily", "horsefly", "housefly", "jelly", "lily", "melancholy", "monopoly", "oligopoly", "panoply", "rally", "sandfly", "tally"] | |
English_ly_verbs = ["apply", "bely", "bully", "comply", "dally", "dilly-dally", "imply", "multiply", "ply", "rally", "rely", "reply", "sally", "shilly-shally", "supply", "tally"] | |
English_ly_adjectives = ["billy", "dilly", "early", "filly", "holy", "likely", "nilly", "only", "silly", "smily", "willy"] | |
English_ly_keepers = list(set(flat_concat([English_ly_nouns, English_ly_verbs, English_ly_adjectives]))) | |
English_ly_to_le_words = ["doubly", "cycly", "muscly", "crackly", "crinkly", "fickly", "knuckly", "sparkly", "tinkly", "wrinkly", "crumply", "dimply", "druply", "riply", "rumply", "simply", "triply", "tuply", "bristly", "gently", "gristly", "rattly", "subtly", "thistly"] | |
English_anti_keepers = ["anticipat", "antidote", "antilog", "antimony", "anting", "antiquari", "antiquary", "antiquat", "antique", "antiqui", "antiquit", "antistrophe"] | |
English_dis_keepers = ["discreet", "discret(?:e|ion)", "discrepan", "discriminat", "disk", "dish", "display", "dismay", "dismal", "dismiss", "dispel", "discern", "discipl", "dispute", "distribu", "disrupt", "disturb", "discus", "diss", "dispos", "disgust", "dismiss", "distill", "disdain", "distort", "disease", "disco$", "discograph", "discover", "district", "distinct", "distinguish", "distan", "disten", "distress"] | |
English_imbmp_keepers = ["imbib", "imbitter", "imbolden", "imbecil", "imblaz", "imbroglio", "imbue", "immediat", "imp$", "impair", "impal", "impeach", "imped", "imperitive", "impertinent", "import", "implement", "imply", "implic", "impregnat", "improp", "impuls", "impresario", "impose", "imposit", "impetuous", "imperil", "imperial", "impact", "implod", "implos", "impress", "imprint", "imput", "impel", "impromptu", "implant", "impish", "impound", "impunit", "improv", "implor", "impuls", "imping", "immanenc", "immigrat", "immun", "immur", "immers", "immanent", "immens"] | |
English_in_keepers = ["in$", "inside$", "into$", "inane", "inanit", "inaug", "inbound", "inbre", "inch", "incas", "incens", "incentiv", "incept", "incid", "incis", "incit", "inclin", "inclos", "includ", "inclus", "incom[ei]", "increas", "increment", "incub", "inculca", "incur", "indeed", "indemn", "indent", "index", "india", "indic", "indie", "indig", "individual", "induc", "indulg", "industr", "indy", "inert", "infant", "inertia", "infatua", "infect", "infer", "infest", "infix", "inflat", "inflect", "inflict", "influen", "info", "infra", "infring", "infus", "ingest", "ingot", "ingrain", "ingrati", "ingredient", "ingroup", "inhabit", "inhal", "inherent", "inherit", "inhibit", "initia", "inject", "injure", "ink", "inlay", "inmate", "inn", "inositol", "input", "inquir", "insert", "insid", "insinuat", "insip", "insist", "insinuat", "inspect", "inspir", "install", "instan", "instat", "instead", "instigat", "instill", "instruct", "instrum", "institut", "insul", "insur", "intact", "integ", "intell", "inten", "inter", "intestin", "intimat", "intomb", "intro", "intru", "intubat", "intuit", "inundat", "inur", "invad", "invas", "invent", "invers", "invert", "invest", "invit", "invok", "invoc", "involv", "inward"] | |
English_mis_keepers = ["missile", "mission", "miser", "mischiev", "miscible", "misceg", "miscell", "misses", "miss$", "missed", "missing", "mishap", "mist", "miso", "mississippi"] | |
English_sub_keepers = ["sub$", "submit", "submar", "subtl", "subb(?:ed|ing)", "subject", "suburb", "subdu(?:e|ing)", "subway", "subsequent", "subvene", "subpena", "subduce", "subvert", "subsidy", "subside", "subsist", "sublime", "subtend", "submer[gs]e", "subtract", "substan[ct]", "subscri[bp]", "substitut", "subsidiar", "substrate"]#*** | |
English_super_keepers = ["super$", "superfluous", "superior", "superlativ"] | |
English_un_keepers = ["uncle", "union", "unif", "univer", "unilat", "uniloc", "unifol", "uniform", "unit", "unival", "univar", "univoc", "unicycl", "uniling", "unilin", "unicam", "uniplan", "unipot", "unicol", "unitar", "unicorn", "uniax", "unique", "unison", "uniface", "unisex", "unless", "until"] | |
English_under_keepers = ["under$", "underneath$", "understand", "understood"] | |
English_other_keepers = ["anti$", "hyper$", "hypo$", "hypothe", "over$", "overly$", "under$", "underwh"] | |
English_prefix_keepers = flat_concat([English_anti_keepers, English_dis_keepers, English_imbmp_keepers, English_in_keepers, English_mis_keepers, English_sub_keepers, English_super_keepers, English_un_keepers, English_under_keepers, English_other_keepers]) | |
English_iable_keepers = ["amiable", "liable", "viable"] | |
English_able_keepers = flat_concat(["able", "available", "cable", "fable", "gable", "horrible", "parable", "probable", "reliable", "stable", "table", "timetable", "vegetable", "vulnerable", English_iable_keepers]) | |
English_ible_keepers = ["bible", "compatible", "eligible", "feasible", "horrible", "possible", "responsible", "terrible"] | |
English_eal_keepers = ["anneal", "appeal", "conceal", "congeal", "deal", "\\bmeal", "ordeal", "\\breal", "repeal", "reveal", "seal", "squeal", "steal"] | |
English_ial_keepers = ["artificial", "axial", "colloquial", "congenial", "cordial", "crucial", "jovial", "judicial", "material", "nubial", "social", "special", "superficial", "trial", "trivial", "venial", "vivial"] | |
English_ual_keepers = ["actual", "casual", "dual", "equal", "eventual", "individual", "lingual", "manual", "menstrual", "mutual", "ritual", "usual", "victual", "visual"] | |
English_al_keepers = flat_concat(["aboriginal", "animal", "arsenal", "capital", "cardinal", "carnival", "cathedral", "charcoal", "chemical", "coal", "crystal", "decimal", "\\bdent", "eternal", "federal", "final", "fiscal", "funeral", "general", "hospital", "integral", "international", "interval", "journal", "lateral", "legal", "liberal", "literal", "local", "loyal", "mammal", "marital", "medieval", "mental", "mineral", "moral", "municipal", "naval", "normal", "numeral", "\\boval", "plural", "primeval", "principal", "radical", "rival", "rural", "scandal", "secular", "several", "spectrum", "spiral", "temporal", "thermal", "total", "vassal", "vertical", "virtual", "vital", "vocal", English_eal_keepers, English_ial_keepers, English_ual_keepers]) #*** integral to integrate? | |
English_ist_keepers = ["assist", "artist", "checklist", "chemist", "cist", "consist", "dentist", "enlist", "exist", "feist", "fist", "foist", "gist", "heist", "hoist", "insist", "list", "joist", "mist", "moist", "persist", "playlist", "protist", "resist", "schist", "shist", "twist", "wishlist", "wrist"] #, "florist" | |
English_ism_keepers = ["animism", "atheism", "autism", "baptism", "catechism", "deism", "fascism", "sadism", "sophism", "theism"] | |
English_ian_keepers = ["lesbian", "thespian"] | |
English_age_removers = ["acreage", "anchorage", "appendage", "baronage", "binage", "bondage", "breakage", "cellarage", "coinage", "corkage", "cousinage", "coverage", "creepage", "drainage", "factorage", "flowerage", "footage", "frontage", "fruitage", "gallonage", "graftage", "harborage", "herbage", "hermitage", "innage", "layerage", "leafage", "leakage", "layerage", "lighterage", "linkage", "meltage", "meterage", "mileage", "moorage", "orphanage", "package", "parentage", "passage", "patronage", "percentage", "pilotage", "portage", "porterage", "postage", "poundage", "pressage", "quarterage", "reportage", "roughage", "seepage", "sewerage", "shortage", "shrinkage", "signage", "siphonage", "spillage", "soilage", "steerage", "stowage", "surplusage", "tankage", "tillage", "tinage", "towage", "tutorage", "voltage", "wagonage", "wattage", "wharfage", "yardage"] | |
English_ish_keepers = ["abolish", "blish", "blemish", "burnish", "dish", "fish", "fetish", "finish", "flourish", "foolish", "garish", "guish", "hashish", "lavish", "monish", "parish", "perish", "plish", "plenish", "polish", "publish", "quish", "ravish", "relish", "wish"] | |
English_ment_keepers = ["parliament", "tournament", "testament", "ornament", "torment", "armament", "garment", "element", "plement", "department", "environment", "segment", "aliment", "moment", "comment", "condiment", "experiment", "ndiment", "pliment", "regiment", "sediment", "sentiment", "triment", "argument", "document", "instrument", "monument"] | |
English_ize_keepers = ["baptize", "braize", "maize", "ognize", "organize", "ostracize", "prize", "seize", "size"] | |
English_able_double_consonants = [thing * 2 for thing in ["b", "d", "g", "m", "n", "p", "r", "t"]] | |
English_doubled_consonants_able = [string + "able" for string in English_able_double_consonants] | |
English_ism_double_consonants = [thing * 2 for thing in ["b", "d", "g", "l", "n", "p", "t", "z"]] | |
English_doubled_consonants_ism = [string + "ism" for string in English_ism_double_consonants] | |
English_er_double_consonants = [thing * 2 for thing in ["b", "d", "g", "m", "n", "p", "t"]] | |
English_doubled_consonants_er = [string + "er" for string in English_er_double_consonants] | |
English_est_double_consonants = [thing * 2 for thing in ["b", "d", "g", "m", "n", "p", "t"]] | |
English_doubled_consonants_est = [string + "est" for string in English_est_double_consonants] | |
English_ed_double_consonants = [thing * 2 for thing in ["b", "d", "g", "l", "m", "n", "p", "r", "t", "v", "z"]] | |
English_doubled_consonants_ed = [string + "ed" for string in English_ed_double_consonants] | |
English_ing_double_consonants = [thing * 2 for thing in ["b", "d", "g", "l", "m", "n", "p", "r", "t", "v", "z"]] | |
English_doubled_consonants_ing = [string + "ing" for string in English_ing_double_consonants] | |
English_eer_keepers = ["beer", "career", "cheer", "deer", "domineer", "engineer", "killdeer", "jeer", "leer", "peer", "pioneer", "queer", "reindeer", "schmeer", "sheer", "sneer", "steer", "veer", "veneer", "volunteer"] | |
English_ier_keepers = ["brier", "cashier", "cavalier", "chandelier", "courier", "frontier", "glacier", "\\bpier", "premier", "soldier", "\\bspier", "\\btier"] | |
English_er_keepers = flat_concat(["under", "whether", "\\bever", "whenever", "wherever", "whichever", "whoever", "whomever", "however", "whatever", "whatsoever", "forever", "either", "neither", "after", "\\bnever", "\\bher", "differ", "number", "tower", "crater", "dinner", "matter", "trouser", "mister", "minister", "amber", "customer", "harbinger", "monger", "\\banger", "manger", "ganger", "\\bother", "another", "paper", "(?:head)?quarter", "helicopter", "over", "member", "water", "fiber", "wonder", "ancester", "cloister", "confer", "corner", "enter", "per", "luster", "neuter", "scepter", "order", "deliver", "prefer", "defer", "foster", "cluster", "murder", "chamber", "september", "october", "november", "december", "register", "weather", "together", "letter", "newsletter", "chapter", "better", "poker", "further", "farther", "remember", "river", "silver", "rather", "summer", "winter", "super", "cancer", "answer", "transfer", "filter", "consider", "partner", "character", "father", "mother", "brother", "sister", "daughter", "leather", "upper", "lower", "laser", "theater", "gender", "soccer", "proper", "refer", "master", "meter", "rubber", "monster", "mester", "prefer", "latter", "tiger", "finger", "danger", "powder", "integer", "pepper", "cover", "spider", "cyber", "shelter", "suffer", "beaver", "trigger", "fever", "butler", "timber", "gather", "roster", "encounter", "hammer", "cylinder", "boulder", "thunder", "ester", "render", "after", "monomer", "dimer", "trimer", "tetramer", "polymer", "bitter", "usher", "ginger", "carpenter", "clever", "alzheimer", "lavender", "eager", "surrender", "lumber", "diaper", "jupiter", "sweater", "minister", "litter", "panther", "pewter", "clutter", "bladder", "lever", "feather", "burger", "ledger", "lobster", "slaughter", "glitter", "garner", "oyster", "clover", "power", "conquer", "badger", "butcher", "register", "kosher", "viper", "whisper", "flower", "utter", "cater", "doppler", "snooker", "juniper", "cucumber", "deter", "infer", "ether", "caliber", "center", "hooker", "cider", "splinter", "chapter", "batter", "sober", "sinister", "otter", "slender", English_eer_keepers, English_ier_keepers]) | |
English_iest_keepers = ["priest"] | |
English_est_keepers = flat_concat(["\\bbest", "digest", "earnest", "(?:\\b|gab|love|slug|song)fest", "harvest", "honest", "\\bjest", "\\blest", "manifest", "\\bnest", "\\bpest", "(?:\\b|arm|head)rest", "\\btest", "\\bvest", "(?:\\b|mid|north|south)west", "\\bzest", "arbalest", "arrest", "attest", "\\bchest", "contest", "crest", "forest", "(?:\\b|house)guest", "infest", "invest", "interest", "protest", "(?:\\b|ac|be|con|in|re)quest", "suggest", "tempest", English_iest_keepers]) | |
English_ed_keepers = ["\\bbed", "bred", "\\bfed", "hundred", "infrared", "naked", "need", "\\bred", "sacred", "\\bshed", "watershed", "\\bwed", "\\bzed"] | |
English_ing_keepers = ["bring", "ceiling", "\\bcling", "darling", "\\bding", "\\bduring", "evening", "\\bfling", "\\bking", "lightning", "morning", "\\bpending", "\\bping", "\\bring", "sibling", "\\bsing", "(?:\\b|un|war)sling", "spring", "sterling", "\\bsting", "string", "swing", "(?:\\b|any|every|no|some)?thing", "(?:\\b|hind|fore)wing", "\\bwring", "\\bzing"] | |
English_s_keepers = ["always", "perhaps", "whereas", "has", "is", "was"] | |
# rules for what kinds of word endings require an ultimate "e" | |
general_e_rules = "(?:(?:\\b|[^aieou]|ll)[aeiouy][bcfgkmsvz])" | |
ce_rules = "(?:[lnrs]c)" | |
de_rules = "(?:(?:[^aeiou][aeiou]|ui)d)" | |
ge_rules = "(?:(?:[dlr]|(?:(?:r|ch|str)a|(?:ll|v)e|(?:b|h|cr)i|(?:c|sp)o|(?:l|p|pl|scro)u)n)g)" | |
le_rules = "(?:(?:(?:imp|wholes|sc|wh)a|(?:(?:\\b|de)f|p|\\b[prt]|rev|sm)i|(?:cond|h|par|\\bp|recons|\\bt)o|(?:r|sched)u|y|[bcdfgkpstz])l)" | |
ne_rules = "(?:(?:[^aeiou][aiu]|(?:\\b(?:[bchtz]|cl|dr)|chaper|(?:de|im|post|pro)p|ph|thr|[as]t)o)n)" | |
oe_rules = "(?:(?:\\bh|(?:\\b|tip(py)?)t|sh)o)" | |
pe_rules = "(?:(?:[^aeiou][aeiuy]|(?:\\b(?:[cdhmr]|el)|gr|sc)o)p)" | |
re_rules = "(?:(?:[^aeiou][aiu]|(?:\\b(?:[bcgps]|ad|ch|depl|enc|expl|ign|impl|rest|sh|sp|st|wh)|sc|sn)o|qui)r)" | |
se_rules = "(?:(?:(?:ai|au|ea|ee|oi|oo|(?:(?:\\b|[^l])[^l]|\\bl)ou)|ui|[lnrw])s)" | |
te_rules = "(?:(?:(?:[^eo]|cre|ide)a|(?:comp|compl|del|excr)e|(?:(?:\\b|[^abeiou])b|(?:\\b|[^i])c|ign|ind|inv|sm|sp|qu|un|wh|wr|xc)i|(?:\\b[cdntv]|m|qu|[^i]v)o|(?:[^aeiou]|\\bro)u|[bhptw]as)t)" | |
ue_rules = "(?:u)" | |
ve_rules = "(?:(?:ai|ea|ee|ei|ie|[lr])v)" | |
ye_rules = "(?:(?:\\b|cross|hawk)ey)" | |
ze_rules = "(?:[^tz]z)" | |
che_rules = "(?:(?:(?:\\b|back|belly|head|stomach|tooth)a|ca)ch)" | |
e_rules = "(?:" + any_of(flat_concat([general_e_rules, ce_rules, de_rules, ge_rules, le_rules, ne_rules, oe_rules, pe_rules, re_rules, se_rules, te_rules, ue_rules, ve_rules, ye_rules, ze_rules, che_rules])) + ")" | |
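# illustrative sketch of how e_rules is meant to be applied to a suffix-stripped stem
# (anchored at the end of the stem to decide whether a trailing "e" should be restored):
# bool(re.search(e_rules + "$", "bak"))  -> True   (so "baking" can be restored to "bake")
# bool(re.search(e_rules + "$", "sing")) -> False  (so "singing" stays "sing", not "singe")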
def digest_words(words): | |
# description: Used for "stemming" or "lemmatizing" words for Natural Language Processing. | |
# description: Works by removing prefixes and suffixes in the appropriate order. | |
# description: | |
# description: It's more accurate than typical stemming approaches: | |
# description: (fewer confabulated results and more correctly connected results, | |
# description: because it tests for and handles many special cases). | |
# description: | |
# description: It's more user-friendly than typical lemmatizing approaches: | |
# description: (you don't need to worry about parts of speech, | |
# description: and it automatically goes to the most basic form). | |
# | |
# details: Uses the companion digested_word_dictionary(words) function to create a dictionary | |
# details: of the unique input words (as the element name/key) | |
# details: and their digested outputs (as the element value). | |
# details: Read the comments in digested_word_dictionary(words) for more information. | |
# details: It relies on rules when there are rules (so it often works on made-up words), | |
# details: but the rest is hard-coded (and there are admittedly still plenty of | |
# details: gaps in coverage for special cases). | |
# details: Uses the companion make_singular(words), is_plural(words), and singularize(words) | |
# details: functions for handling plural (especially foreign/Greek/Latin/unusual plural) forms. | |
# details: See the documentation of these functions for more information. | |
# | |
# input: a character vector of lower-case English words to "digest" into their core lemmas (most meaningful lexical components) | |
# | |
# input specs: None and "" elements are acceptable and do not cause error or warning.
# input specs: Empty inputs are acceptable and do not cause error or warning. | |
# input specs: Words containing contractions are acceptable and handled properly. | |
# input specs: It also properly handles the last components of hyphenated words, | |
# input specs: ignoring preceding components (unless they're prefixes, in which case they're removed).
# input specs: Proper nouns are currently *NOT* masked or handled properly, | |
# input specs: so don't expect them to be returned unchanged. | |
# | |
# output: a character vector of the "digested" words | |
# | |
# output specs: None elements are returned as None; "" elements are returned as "".
# output specs: Elements are returned in the same order (in a vector of the same length). | |
# output specs: Nouns are returned in singular (non-plural) form. | |
# output specs: Verbs are returned in infinitive form. | |
# output specs: All negations (non-/un-/in-/dis-/anti-) are dropped. | |
# output specs: Stopwords are returned unchanged--handle them on your own. | |
# | |
# example input: digest_words("antidisestablishmentarianismesquely") | |
# example output: "establish" | |
# | |
# example input: digest_words("supercalifragilisticexpialidocious") | |
# example output: "califragilisticexpialidocious") | |
# | |
# example input: digest_words("shouldn't've") | |
# example output: "shall" | |
# | |
# example input: digest_words("can't-believe-it's-not-butterific") | |
# example output: "can't-believe-it's-not-butter" | |
# | |
# example input: digest_words("re-doing") | |
# example output: "do" | |
# | |
# notes: This could be used in the future for grammatical approaches, | |
# notes: as it breaks down words by part of speech-related suffixes. | |
# notes: In future may separate contractions into component words. | |
# notes: In future may handle co-, en-, inter-, intra-, semi- prefixes. | |
# | |
if type(words) is list: | |
words = [word if type(word) is str else "" for word in words] | |
results = np.char.array(words, itemsize = (len(max(words, key = len))) + 2) | |
digest_dict = digested_word_dictionary(words) | |
valid = results.nonzero() | |
results[valid] = [digest_dict[word] for word in results[valid]] | |
if type(words) is list: | |
return [result for result in results] | |
else: | |
return results | |
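# illustrative example (a sketch, echoing the documented examples above):
# example input: digest_words(["antidisestablishmentarianismesquely", "re-doing"])
# example output: ["establish", "do"]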
def digested_word_dictionary(words): | |
### process only unique non-blank and non-NA values | |
# (avoids redundant computation) | |
original_words = np.unique(np.char.array(list(filter(non_empty_string, words))))
words = np.char.array(original_words, itemsize = (original_words.itemsize + 2)) | |
### simplify number (singular/plural) to singular case | |
# (obviates checking optional s on some suffixes--avoids unnecessary computation)
# has desired side effect of simplifying number (plurals and singulars alike all end up as singular) | |
# has desired side effect of taking "ies" verb forms to "y" as well, further simplifying things | |
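# e.g. "studies" is reduced to "study" here, so later suffix handling never has to consider "-ies" separately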
can_be_made_singular = np.array([not(bool(re.search("\\b" + any_of(English_s_keepers) + "$", word))) for word in words]) | |
if any(can_be_made_singular): | |
words[can_be_made_singular] = make_singular(words[can_be_made_singular]) | |
### handle contractions | |
# contractions block the ending of words (hiding endings in endswith() checks), so they must be removed
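# e.g. "shouldn't've" becomes "should" at this step (and is mapped to "shall" later on)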
subset_scope = np.array([bool(re.search("'", word)) for word in words]) | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
subset = [re.sub(string = string, pattern = "\\bwon't\\b", repl = "will") for string in subset] | |
subset = [re.sub(string = string, pattern = "\\bmight've\\b", repl = "might") for string in subset] | |
subset = [re.sub(string = string, pattern = "(?:n't|'ve|'ll|'re|')+$", repl = "") for string in subset] | |
words[subset_scope] = subset | |
### handle irregular words | |
# irregular past participles ending in "dden" | |
subset_scope = words.endswith("dden") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "downtrodden" -> "downtread" | |
odden_to_ead = subset.endswith("trodden") | |
subset[odden_to_ead] = [string[:-5] + "ead" for string in subset[odden_to_ead]] | |
# e.g. "forbidden" -> "forbid" | |
delete_den = np.any([subset.endswith(string) for string in ("adden", "bidden", "edden")], axis = 0) | |
subset[delete_den] = [string[:-3] for string in subset[delete_den]] | |
# e.g. "hidden" -> "hide" | |
idden_to_ide = subset.endswith("idden") & ~np.any([subset.endswith(string) for string in ("midden", "swidden")], axis = 0) | |
subset[idden_to_ide] = [string[:-3] + "e" for string in subset[idden_to_ide]] | |
words[subset_scope] = subset | |
# irregular past participles ending in "tten" | |
subset_scope = words.endswith("tten") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "written" -> "write" | |
itten_to_ite = subset.endswith("itten") & ~np.any([subset.endswith(string) for string in ("kitten", "mitten")], axis = 0) | |
subset[itten_to_ite] = [string[:-3] + "e" for string in subset[itten_to_ite]] | |
# e.g. "rotten" -> "rot" | |
delete_ten = np.any([subset.endswith(string) for string in ("atten", "otten")], axis = 0) | |
subset[delete_ten] = [string[:-3] for string in subset[delete_ten]] | |
words[subset_scope] = subset | |
# irregular past participles ending in "en" (and a few adjectives) | |
subset_scope = words.endswith("en") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "eaten" -> "eat" (also "been" to "be") | |
delete_en = np.any([subset.endswith(string) for string in ("ashen", "been", "drunken", "earthen", "eaten", "fallen", "olden", "silken", "swollen", "wooden", "woolen")], axis = 0) | |
subset[delete_en] = [string[:-2] for string in subset[delete_en]] | |
# e.g. "broken" -> "broke" (later to "break") | |
delete_n = np.any([subset.endswith(string) for string in ("aken", "chosen", "iven", "oken", "olen", "oven", "risen", "rozen", "seen")], axis = 0) & ~(subset.endswith("kraken") | np.array([bool(re.search("\\boven$", word)) for word in subset])) | |
subset[delete_n] = [string[:-1] for string in subset[delete_n]] | |
words[subset_scope] = subset | |
# irregular past participles ending in "n" | |
subset_scope = words.endswith("n") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "worn" -> "wore" (later to "wear") | |
n_to_e = np.any([subset.endswith(string) for string in ("born", "torn", "worn")], axis = 0) & ~np.any([subset.endswith(string) for string in ("stubborn", "attorn")], axis = 0) | |
subset[n_to_e] = [string[:-1] + "e" for string in subset[n_to_e]] | |
# e.g. "lain" -> "lie" | |
ain_to_ay = np.array([bool(re.search("\\blain$", word)) for word in subset]) | |
subset[ain_to_ay] = [string[:-3] + "ie" for string in subset[ain_to_ay]] | |
# e.g. "shorn" -> "shear" | |
orn_to_ear = subset.endswith("shorn") | |
subset[orn_to_ear] = [string[:-3] + "ear" for string in subset[orn_to_ear]] | |
# e.g. "drawn" -> "draw" | |
delete_n = np.array([bool(re.search("\\b" + any_of(["blown", "drawn", "grown", "known", "sewn", "shaken", "shown", "sown", "thrown"]) + "$", word)) for word in subset]) | |
subset[delete_n] = [string[:-1] for string in subset[delete_n]] | |
words[subset_scope] = subset | |
# irregular past participles ending in "t" | |
subset_scope = words.endswith("t") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "burnt" -> "burn" | |
delete_t = np.any([subset.endswith(string) for string in ("burnt", "dealt", "dreamt", "learnt", "meant")], axis = 0) | |
subset[delete_t] = [string[:-1] for string in subset[delete_t]] | |
# e.g. "built" -> "build" | |
t_to_d = np.any([subset.endswith(string) for string in ("built", "spent")], axis = 0) | np.array([bool(re.search("\\b" + any_of(["bent", "lent", "sent"]) + "$", word)) for word in subset]) | |
subset[t_to_d] = [string[:-1] + "d" for string in subset[t_to_d]] | |
# e.g. "lost" -> "lose" | |
t_to_e = subset.endswith("lost") | |
subset[t_to_e] = [string[:-1] + "e" for string in subset[t_to_e]] | |
# e.g. "left" -> "leave" | |
eft_to_eave = np.any([subset.endswith(string) for string in ("bereft", "left")], axis = 0) | |
subset[eft_to_eave] = [string[:-2] + "ave" for string in subset[eft_to_eave]] | |
words[subset_scope] = subset | |
#*** prevents spurious edits later on | |
#*** make common irregular words get fixed even if not at end of word phrase | |
# common irregular words | |
reasonable_slice = np.array([bool(re.search("\\ban$", word)) for word in words]) | |
words[reasonable_slice] = [string[:-1] for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("am", "are", "is", "was", "were")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["am", "are", "been", "is", "was", "were"]) + "$", repl = "be") for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("did", "done")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["did", "done"]) + "$", repl = "do") for string in words[reasonable_slice]] | |
reasonable_slice = np.array([bool(re.search("\\bha[ds]$", word)) for word in words]) | |
words[reasonable_slice] = [string[:-1] + "ve" for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("went", "gone")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["went", "gone"]) + "$", repl = "go") for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("ate", "edible", "edibly")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["eats", "ate", "eaten", "eating", "edible", "edibly"]) + "$", repl = "eat") for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("cannot", "could")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["cannot", "could"]) + "$", repl = "can") for string in words[reasonable_slice]] | |
reasonable_slice = words.endswith("should") | |
words[reasonable_slice] = [string[:-4] + "all" for string in words[reasonable_slice]] | |
reasonable_slice = words.endswith("might") | |
words[reasonable_slice] = [string[:-4] + "ay" for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("bore", "borne")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["bore", "born", "borne"]) + "$", repl = "bear") for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("better", "best")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["better", "best"]) + "$", repl = "good") for string in words[reasonable_slice]] | |
reasonable_slice = np.any([words.endswith(string) for string in ("worse", "worst")], axis = 0) | |
words[reasonable_slice] = [re.sub(string = string, pattern = "\\b" + any_of(["worse", "worst"]) + "$", repl = "bad") for string in words[reasonable_slice]] | |
reasonable_slice = words.endswith("these") | |
words[reasonable_slice] = [string[:-3] + "is" for string in words[reasonable_slice]] | |
reasonable_slice = words.endswith("those") | |
words[reasonable_slice] = [string[:-3] + "at" for string in words[reasonable_slice]] | |
# irregular verbs without much pattern | |
# handle irregulars ending in "d" | |
subset_scope = words.endswith("d") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
subsubset_scope = subset.endswith("ed") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
ed_to_ead = np.array([bool(re.search("\\bled$", word)) for word in subsubset]) | |
subsubset[ed_to_ead] = [string[:-1] + "ad" for string in subsubset[ed_to_ead]] | |
ed_to_ee = np.array([bool(re.search("\\bfled$", word)) for word in subsubset]) | |
subsubset[ed_to_ee] = [string[:-1] + "e" for string in subsubset[ed_to_ee]] | |
ed_to_eed = np.array([bool(re.search("\\b" + any_of(["bled", "bred", "fed", "sped"]) + "$", word)) for word in subsubset]) | |
subsubset[ed_to_eed] = [string[:-1] + "ed" for string in subsubset[ed_to_eed]] | |
subset[subsubset_scope] = subsubset | |
subsubset_scope = subset.endswith("id") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
id_to_ide = subsubset.endswith("slid") | np.array([bool(re.search("\\bhid$", word)) for word in subsubset]) | |
subsubset[id_to_ide] = [string + "e" for string in subsubset[id_to_ide]] | |
aid_to_ay = np.any([subsubset.endswith(string) for string in ("laid", "paid", "said")], axis = 0) & ~subsubset.endswith("plaid") | |
subsubset[aid_to_ay] = [string[:-2] + "y" for string in subsubset[aid_to_ay]] | |
subset[subsubset_scope] = subsubset | |
subsubset_scope = subset.endswith("ld") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
eld_to_old = subsubset.endswith("held") | |
subsubset[eld_to_old] = [string[:-3] + "old" for string in subsubset[eld_to_old]] | |
old_to_ell = np.any([subsubset.endswith(string) for string in ("sold", "told")], axis = 0) | |
subsubset[old_to_ell] = [string[:-3] + "ell" for string in subsubset[old_to_ell]] | |
subset[subsubset_scope] = subsubset | |
ound_to_ind = np.any([subset.endswith(string) for string in ("bound", "found")], axis = 0) # "ground", "wound" (these are also unrelated nouns) | |
subset[ound_to_ind] = [string[:-4] + "ind" for string in subset[ound_to_ind]] | |
subsubset_scope = subset.endswith("od") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
od_to_ead = subsubset.endswith("trod") | |
subsubset[od_to_ead] = [string[:-2] + "ead" for string in subsubset[od_to_ead]] | |
ood_to_and = subsubset.endswith("stood") | |
subsubset[ood_to_and] = [string[:-3] + "and" for string in subsubset[ood_to_and]] | |
subset[subsubset_scope] = subsubset | |
eard_to_ear = subset.endswith("heard") | |
subset[eard_to_ear] = [string[:-1] for string in subset[eard_to_ear]] | |
words[subset_scope] = subset | |
# handle irregulars ending in "e" | |
subset_scope = words.endswith("e") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
subsubset_scope = subset.endswith("de") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
ade_to_ake = subsubset.endswith("made") & ~np.any([subsubset.endswith(string) for string in ("amade", "omade")], axis = 0) | |
subsubset[ade_to_ake] = [string[:-2] + "ke" for string in subsubset[ade_to_ake]] | |
ade_to_id = subsubset.endswith("forbade") | np.array([bool(re.search("\\bbade$", word)) for word in subsubset]) | |
subsubset[ade_to_id] = [string[:-3] + "id" for string in subsubset[ade_to_id]] | |
ode_to_ide = np.any([subsubset.endswith(string) for string in ("joyrode", "outrode", "overrode", "strode")], axis = 0) | np.array([bool(re.search("\\brode$", word)) for word in subsubset]) | |
subsubset[ode_to_ide] = [string[:-3] + "ide" for string in subsubset[ode_to_ide]] | |
subset[subsubset_scope] = subsubset | |
subsubset_scope = subset.endswith("ke") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
oke_to_ake = subsubset.endswith("woke") | |
subsubset[oke_to_ake] = [string[:-3] + "ake" for string in subsubset[oke_to_ake]] | |
oke_to_eak = np.any([subsubset.endswith(string) for string in ("broke", "spoke")], axis = 0) | |
subsubset[oke_to_eak] = [string[:-3] + "eak" for string in subsubset[oke_to_eak]] | |
subset[subsubset_scope] = subsubset | |
ole_to_eal = subset.endswith("stole") | |
subset[ole_to_eal] = [string[:-3] + "eal" for string in subset[ole_to_eal]] | |
ame_to_ome = subset.endswith("came") | |
subset[ame_to_ome] = [string[:-3] + "ome" for string in subset[ame_to_ome]] | |
one_to_ine = subset.endswith("shone") | |
subset[one_to_ine] = [string[:-3] + "ine" for string in subset[one_to_ine]] | |
ore_to_ear = np.any([subset.endswith(string) for string in ("tore", "wore")], axis = 0) & ~np.any([subset.endswith(string) for string in ("atore", "store")], axis = 0) | |
subset[ore_to_ear] = [string[:-3] + "ear" for string in subset[ore_to_ear]] | |
subsubset_scope = subset.endswith("se") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
ose_to_ise = np.array([bool(re.search("\\brose$", word)) for word in subsubset]) | |
subsubset[ose_to_ise] = [string[:-3] + "ise" for string in subsubset[ose_to_ise]] | |
ose_to_oose = subsubset.endswith("chose") | |
subsubset[ose_to_oose] = [string[:-2] + "ose" for string in subsubset[ose_to_oose]] | |
subset[subsubset_scope] = subsubset | |
ote_to_ite = np.any([subset.endswith(string) for string in ("smote", "wrote")], axis = 0) | |
subset[ote_to_ite] = [string[:-3] + "ite" for string in subset[ote_to_ite]] | |
subsubset_scope = subset.endswith("ve") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
ave_to_ive = subsubset.endswith("gave") & ~subsubset.endswith("agave") | |
subsubset[ave_to_ive] = [string[:-3] + "ive" for string in subsubset[ave_to_ive]] | |
ove_to_eave = subsubset.endswith("wove") | |
subsubset[ove_to_eave] = [string[:-3] + "eave" for string in subsubset[ove_to_eave]] | |
ove_to_ive = np.any([subsubset.endswith(string) for string in ("drove", "strove", "throve")], axis = 0) | np.array([bool(re.search("\\bdove$", word)) for word in subsubset]) | |
subsubset[ove_to_ive] = [string[:-3] + "ive" for string in subsubset[ove_to_ive]] | |
subset[subsubset_scope] = subsubset | |
oze_to_eeze = subset.endswith("froze") | |
subset[oze_to_eeze] = [string[:-3] + "eeze" for string in subset[oze_to_eeze]] | |
words[subset_scope] = subset | |
# handle irregulars ending in "g" | |
subset_scope = words.endswith("g") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
aong_to_ing = np.any([subset.endswith(string) for string in ("rang", "sang", "song", "sprang", "strang", "swang", "wrang")], axis = 0) | |
subset[aong_to_ing] = [string[:-3] + "ing" for string in subset[aong_to_ing]] | |
# handle "ung" irregulars | |
subsubset_scope = subset.endswith("ung") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
ung_to_ang = subsubset.endswith("hung") | |
subsubset[ung_to_ang] = [string[:-3] + "ang" for string in subsubset[ung_to_ang]] | |
ung_to_ing = np.any([subsubset.endswith(string) for string in ("clung", "flung", "rung", "slung", "sprung", "strung", "stung", "sung", "swung", "wrung")], axis = 0) | |
subsubset[ung_to_ing] = [string[:-3] + "ing" for string in subsubset[ung_to_ing]] | |
subset[subsubset_scope] = subsubset | |
ug_to_ig = subset.endswith("dug") | |
subset[ug_to_ig] = [string[:-2] + "ig" for string in subset[ug_to_ig]] | |
words[subset_scope] = subset | |
# handle irregulars ending in "k" | |
subset_scope = words.endswith("k") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
subsubset_scope = subset.endswith("ve") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
uck_to_ick = subsubset.endswith("stuck") | |
subsubset[uck_to_ick] = [string[:-3] + "ick" for string in subsubset[uck_to_ick]] | |
uck_to_ike = subsubset.endswith("struck") | |
subsubset[uck_to_ike] = [string[:-3] + "ike" for string in subsubset[uck_to_ike]] | |
subset[subsubset_scope] = subsubset | |
aunk_to_ink = np.any([subset.endswith(string) for string in ("drank", "drunk", "sank", "sunk", "slank", "slunk", "stank", "stunk")], axis = 0) | |
subset[aunk_to_ink] = [string[:-3] + "ink" for string in subset[aunk_to_ink]] | |
ook_to_ake = np.any([subset.endswith(string) for string in ("forsook", "shook", "took")], axis = 0) | |
subset[ook_to_ake] = [string[:-3] + "ake" for string in subset[ook_to_ake]] | |
words[subset_scope] = subset | |
# handle irregulars ending in "ll" | |
subset_scope = words.endswith("ll") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
ell_to_all = subset.endswith("fell") | |
subset[ell_to_all] = [string[:-3] + "all" for string in subset[ell_to_all]] | |
oll_to_ell = subset.endswith("swoll") | |
subset[oll_to_ell] = [string[:-3] + "ell" for string in subset[oll_to_ell]] | |
words[subset_scope] = subset | |
aum_to_im = np.any([words.endswith(string) for string in ("swam", "swum")], axis = 0) | |
words[aum_to_im] = [string[:-2] + "im" for string in words[aum_to_im]] | |
# handle irregulars ending in "n" | |
subset_scope = words.endswith("n") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
an_to_un = np.any([subset.endswith(string) for string in ("foreran", "reran", "outran", "overran")], axis = 0) | np.array([bool(re.search("\\bran$", word)) for word in subset]) | |
subset[an_to_un] = [string[:-2] + "un" for string in subset[an_to_un]] | |
on_to_in = subset.endswith("won") | |
subset[on_to_in] = [string[:-2] + "in" for string in subset[on_to_in]] | |
aun_to_in = np.any([subset.endswith(string) for string in ("began", "begun", "spun")], axis = 0) | |
subset[aun_to_in] = [string[:-2] + "in" for string in subset[aun_to_in]] | |
own_to_y = subset.endswith("flown") | |
subset[own_to_y] = [string[:-3] + "y" for string in subset[own_to_y]] | |
words[subset_scope] = subset | |
# handle irregulars ending in "t" | |
subset_scope = words.endswith("t") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
at_to_it = np.any([subset.endswith(string) for string in ("sat", "spat")], axis = 0) | |
subset[at_to_it] = [string[:-2] + "it" for string in subset[at_to_it]] | |
et_to_eet = np.array([bool(re.search("\\bmet$", word)) for word in subset]) | |
subset[et_to_eet] = [string[:-1] + "et" for string in subset[et_to_eet]] | |
# irregular verbs ending in "aught" or "ought" | |
subsubset_scope = subset.endswith("ught") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
ought_to_ing = subsubset.endswith("brought") | |
subsubset[ought_to_ing] = [string[:-5] + "ing" for string in subsubset[ought_to_ing]] | |
ought_to_uy = subsubset.endswith("bought") | |
subsubset[ought_to_uy] = [string[:-5] + "uy" for string in subsubset[ought_to_uy]] | |
ought_to_eek = subsubset.endswith("sought") | |
subsubset[ought_to_eek] = [string[:-5] + "eek" for string in subsubset[ought_to_eek]] | |
ought_to_ight = subsubset.endswith("fought") | |
subsubset[ought_to_ight] = [string[:-5] + "ight" for string in subsubset[ought_to_ight]] | |
ought_to_ink = subsubset.endswith("thought") | |
subsubset[ought_to_ink] = [string[:-5] + "ink" for string in subsubset[ought_to_ink]] | |
aught_to_atch = subsubset.endswith("caught") | |
subsubset[aught_to_atch] = [string[:-5] + "atch" for string in subsubset[aught_to_atch]] | |
aught_to_each = subsubset.endswith("taught") | |
subsubset[aught_to_each] = [string[:-5] + "each" for string in subsubset[aught_to_each]] | |
subset[subsubset_scope] = subsubset | |
it_to_ight = subset.endswith("lit") & ~np.any([subset.endswith(string) for string in ("llit", "slit", "split")], axis = 0) | |
subset[it_to_ight] = [string[:-1] + "ght" for string in subset[it_to_ight]] | |
it_to_ite = np.any([subset.endswith(string) for string in ("frostbit", "snakebit")], axis = 0) | np.array([bool(re.search("\\bbit$", word)) for word in subset]) | |
subset[it_to_ite] = [string[:-2] + "ite" for string in subset[it_to_ite]] | |
elt_to_eel = np.any([subset.endswith(string) for string in ("felt", "knelt")], axis = 0) | |
subset[elt_to_eel] = [string[:-2] + "el" for string in subset[elt_to_eel]] | |
ept_to_eep = np.any([subset.endswith(string) for string in ("crept", "kept", "slept", "swept", "wept")], axis = 0) | |
subset[ept_to_eep] = [string[:-2] + "ep" for string in subset[ept_to_eep]] | |
ot_to_et = np.any([subset.endswith(string) for string in ("begot", "forgot")], axis = 0) | np.array([bool(re.search("\\bgot$", word)) for word in subset]) | |
subset[ot_to_et] = [string[:-2] + "et" for string in subset[ot_to_et]] | |
ot_to_oot = np.any([subset.endswith(string) for string in ("countershot", "outshot", "overshot", "reshot", "upshot", "troubleshot")], axis = 0) | np.array([bool(re.search("\\bshot$", word)) for word in subset]) | |
subset[ot_to_oot] = [string[:-1] + "ot" for string in subset[ot_to_oot]] | |
words[subset_scope] = subset | |
# handle irregulars ending in "w" | |
subset_scope = words.endswith("w") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
aw_to_ee = np.any([subset.endswith(string) for string in ("foresaw", "oversaw", "resaw", "sightsaw")], axis = 0) | np.array([bool(re.search("\\bsaw$", word)) for word in subset]) | |
subset[aw_to_ee] = [string[:-2] + "ee" for string in subset[aw_to_ee]] | |
# irregular verbs ending in "ew" | |
subsubset_scope = subset.endswith("ew") | |
subsubset = subset[subsubset_scope] | |
if len(subsubset) != 0: | |
ew_to_aw = subsubset.endswith("drew") | |
subsubset[ew_to_aw] = [string[:-2] + "aw" for string in subsubset[ew_to_aw]] | |
ew_to_y = subsubset.endswith("flew") | |
subsubset[ew_to_y] = [string[:-2] + "y" for string in subsubset[ew_to_y]] | |
ew_to_ay = subsubset.endswith("slew") | |
subsubset[ew_to_ay] = [string[:-2] + "ay" for string in subsubset[ew_to_ay]] | |
ew_to_ow = np.any([subsubset.endswith(string) for string in ("blew", "grew", "knew", "threw")], axis = 0) | |
subsubset[ew_to_ow] = [string[:-2] + "ow" for string in subsubset[ew_to_ow]] | |
subset[subsubset_scope] = subsubset | |
words[subset_scope] = subset | |
# ay_to_ie <- words %like% "\\blay$" | |
# words[ay_to_ie] <- replace_last_n_chars_with(words[ay_to_ie], 2, "ie") | |
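# The commented-out lines above reference replace_last_n_chars_with(), a helper from
# the R version of this lemmatizer that is not defined in this Python file. A minimal
# sketch of an equivalent (an assumption, shown here only to document the intent):
# def replace_last_n_chars_with(strings, n, replacement):
#     return [string[:-n] + replacement for string in strings]
# e.g. replace_last_n_chars_with(["lay"], 2, "ie") would return ["lie"]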
### handle prefixes | |
# decelerate/devolve ~ accelerate/evolve | |
# handled before most prefixes because otherwise "de" would be handled incorrectly | |
de_to_ac = np.array([bool(re.search("\\bdecel", word)) for word in words]) | |
words[de_to_ac] = [re.sub(string = string, pattern = "\\bde", repl = "ac") for string in words[de_to_ac]] | |
de_to_e = np.array([bool(re.search("\\bdevol", word)) for word in words]) | |
words[de_to_e] = [re.sub(string = string, pattern = "\\bd", repl = "") for string in words[de_to_e]] | |
# prevent removal of prefix-like forms that actually aren't acting as prefixes | |
has_keepable_prefix = np.array([bool(re.search("^" + any_of(English_prefix_keepers), word)) for word in words]) | |
# removes multiple (nested) prefixes | |
# excludes a few difficult cases for further processing below | |
delete_prefix = ~has_keepable_prefix & np.array([bool(re.search("\\b(?:(?:(?:a|de|ex|post|pre|re|semi|un|well)-)|((anti|dis|im[bmp]|hyper|hypo|in|mis|non|over|sub|super|under|un)-?))", word)) for word in words]) & np.array([not(bool(re.search("\\b(?:none($|theless)|im(?:migra|pov|prop))", word))) for word in words]) | |
words[delete_prefix] = [re.sub(string = string, pattern = "\\b(?:(?:a|de|ex|post|pre|re|semi|un|well)-|(?:anti|dis|im[bmp]|hyper|hypo|in|mis|non|over|sub|super|under|un)-?)(?:(?:a|de|ex|post|pre|re|semi|un|well)-|(?:anti|dis|im[bmp]|hyper|hypo|in|mis|non|over|sub|super|under|un)-?)*", repl = "") for string in words[delete_prefix]] | |
# needs to be separate because the above rule would have taken immigrate/improper to igrate/roper | |
delete_im_prefix = np.array([bool(re.search("\\bim(?:migra|pov|prop)", word)) for word in words]) | |
words[delete_im_prefix] = [re.sub(string = string, pattern = "\\bim", repl = "") for string in words[delete_im_prefix]] | |
#* could add "ir" to normal prefix set above if a list of English_ir_keepers is made and used | |
delete_ir_prefix = np.array([bool(re.search("\\birr", word)) for word in words]) & np.array([not(bool(re.search("\\birrigat", word))) for word in words]) | |
words[delete_ir_prefix] = [re.sub(string = string, pattern = "\\bir", repl = "") for string in words[delete_ir_prefix]] | |
#* could add "ab" to normal prefix set above if a list of English_ab_keepers is made and used | |
delete_ab_prefix = np.array([bool(re.search("\\babnormal", word)) for word in words]) | |
words[delete_ab_prefix] = [re.sub(string = string, pattern = "\\bab", repl = "") for string in words[delete_ab_prefix]] | |
#* could add "mal" to normal prefix set above if a list of English_mal_keepers is made and used | |
delete_mal_prefix = np.array([bool(re.search("\\bmal", word)) for word in words]) & np.array([not(bool(re.search("\\bmal(?:ady|ari|ark|e(?:$|s|ness)|efa|efi|evo|ici|ign|ing|l(?:$|[aeiou])|m|(?:$|t[aeiou]))", word))) for word in words]) | |
words[delete_mal_prefix] = [re.sub(string = string, pattern = "\\bmal", repl = "") for string in words[delete_mal_prefix]] | |
### handle first batch of generic noun and adjective suffixes | |
# handle "ly" prefix | |
subset_scope = words.endswith("ly") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# sometimes "lly" -> "ly" | |
ly_to_l = np.any([subset.endswith(string) for string in ("billy", "hilly", "frilly")], axis = 0) | np.array([bool(re.search("\\bfully$", word)) for word in subset]) | |
subset[ly_to_l] = [string[:-1] for string in subset[ly_to_l]] | |
# e.g. "ably" -> "able" | |
bly_to_ble = subset.endswith("bly") | |
subset[bly_to_ble] = [string[:-1] + "e" for string in subset[bly_to_ble]] | |
ly_keeper_mask = np.array([not(bool(re.search("\\b" + any_of(English_ly_keepers) + "$", word))) for word in subset]) | |
# e.g. "happily" -> "happy" | |
ily_to_y = subset.endswith("ily") & ly_keeper_mask | |
subset[ily_to_y] = [string[:-3] + "y" for string in subset[ily_to_y]] | |
# e.g. "subtly" -> "subtle" | |
ly_to_le = np.any([subset.endswith(string) for string in English_ly_to_le_words], axis = 0) & ly_keeper_mask | |
subset[ly_to_le] = [string[:-1] + "e" for string in subset[ly_to_le]] | |
# e.g. "truly" -> "true" | |
ly_to_e = np.any([subset.endswith(string) for string in ("uly", "wholly")], axis = 0) | |
subset[ly_to_e] = [string[:-2] + "e" for string in subset[ly_to_e]] | |
# general rule--remove suffix | |
delete_ly = subset.endswith("ly") & ly_keeper_mask | |
subset[delete_ly] = [string[:-2] for string in subset[delete_ly]] | |
words[subset_scope] = subset | |
# ("especially" ->) "especial" -> "special" | |
is_especial = words.endswith("especial") | |
words[is_especial] = [re.sub(string = string, pattern = "\\bespecial$", repl = "special") for string in words[is_especial]] | |
# handle "ness" suffix | |
subset_scope = words.endswith("ness") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "cleanliness" -> "clean" | |
delete_liness = subset.endswith("liness") | |
subset[delete_liness] = [string[:-6] for string in subset[delete_liness]] | |
# e.g. "happiness" -> "happy" | |
iness_to_y = subset.endswith("iness") & ~subset.endswith("business") | |
subset[iness_to_y] = [string[:-5] + "y" for string in subset[iness_to_y]] | |
# general rule--remove suffix | |
delete_ness = subset.endswith("ness") & ~subset.endswith("business") | |
subset[delete_ness] = [string[:-4] for string in subset[delete_ness]] | |
words[subset_scope] = subset | |
# handle "ity" suffix | |
subset_scope = words.endswith("ity") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "mobility" -> "mobile" | |
bility_to_bile = np.array([bool(re.search("(?:\\bla|mo|nu)bility$", word)) for word in subset]) | |
subset[bility_to_bile] = [string[:-3] + "e" for string in subset[bility_to_bile]] | |
# e.g. "ability" -> "able" | |
bility_to_ble = subset.endswith("bility") | |
subset[bility_to_ble] = [string[:-5] + "le" for string in subset[bility_to_ble]] | |
# e.g. "activity" -> "active" | |
ity_to_e = np.any([subset.endswith(string) for string in ("antiquity", "purity", "ivity")], axis = 0) | |
subset[ity_to_e] = [string[:-3] + "e" for string in subset[ity_to_e]] | |
# e.g. "credulity" -> "credulous" | |
ulity_to_ulous = subset.endswith("ulity") | |
subset[ulity_to_ulous] = [string[:-3] + "ous" for string in subset[ulity_to_ulous]] | |
# e.g. "hilarity" -> "hilarious" | |
arity_to_arious = subset.endswith("hilarity") | |
subset[arity_to_arious] = [string[:-2] + "ous" for string in subset[arity_to_arious]] | |
# e.g. "clarity" -> "clear" | |
arity_to_ear = subset.endswith("clarity") | |
subset[arity_to_ear] = [string[:-5] + "ear" for string in subset[arity_to_ear]] | |
# general rule--leave suffix unless ends with "al", "ic", or "lar" | |
delete_ity = (np.any([subset.endswith(string) for string in ("ality", "icity", "larity")], axis = 0) & ~np.any([subset.endswith(string) for string in ("complicity", "felicity", "quality")], axis = 0)) | np.array([bool(re.search(any_of([string + "ity" for string in English_al_keepers]) + "$", word)) for word in subset]) | |
subset[delete_ity] = [string[:-3] for string in subset[delete_ity]] | |
words[subset_scope] = subset | |
# remove other "ty" suffixes | |
delete_ty = np.any([words.endswith(string) for string in ("certainty", "nicety")], axis = 0) | |
words[delete_ty] = [string[:-2] for string in words[delete_ty]] | |
# handle "esque" suffix | |
subset_scope = words.endswith("esque") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "statuesque" -> "statue" | |
esque_to_e = np.any([subset.endswith(string) for string in ("uesque", "uresque")], axis = 0) | |
subset[esque_to_e] = [string[:-4] for string in subset[esque_to_e]] | |
# general rule--remove suffix | |
delete_esque = subset.endswith("esque") & ~np.any([subset.endswith(string) for string in ("burlesque", "grotesque")], axis = 0) | |
subset[delete_esque] = [string[:-5] for string in subset[delete_esque]] | |
words[subset_scope] = subset | |
# handle "ish" suffix | |
subset_scope = words.endswith("ish") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "(im)poverish" -> "poverty" | |
ish_to_ty = subset.endswith("poverish") | |
subset[ish_to_ty] = [string[:-3] + "ty" for string in subset[ish_to_ty]] | |
# e.g. "piggish" -> "pig" | |
delete_ish_letter = np.any([subset.endswith(string) for string in ("cattish", "doggish", "hottish", "piggish")], axis = 0) | |
subset[delete_ish_letter] = [string[:-4] for string in subset[delete_ish_letter]] | |
# e.g. "brutish" -> "brute" | |
ish_to_e = np.any([subset.endswith(string) for string in ("vampirish", "vulturish", "brutish", "ttish", "dovish", "voguish", "purplish", "ylish")], axis = 0) | |
subset[ish_to_e] = [string[:-3] + "e" for string in subset[ish_to_e]] | |
# general rule--remove suffix | |
delete_ish = subset.endswith("ish") & (~np.any([subset.endswith(string) for string in English_ish_keepers], axis = 0) | subset.endswith("oafish")) | |
subset[delete_ish] = [string[:-3] for string in subset[delete_ish]] | |
words[subset_scope] = subset | |
# handle "able" suffixes | |
subset_scope = words.endswith("able") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
able_keeper_mask = np.array([not(bool(re.search("\\b" + any_of(English_able_keepers) + "$", word))) for word in subset]) | |
# e.g. "reliable" -> "rely" | |
iable_to_y = subset.endswith("iable") & np.array([not(bool(re.search("\\b" + any_of(English_iable_keepers) + "$", word))) for word in subset]) | |
subset[iable_to_y] = [string[:-5] + "y" for string in subset[iable_to_y]] | |
# e.g. "despicable" -> "despise" | |
icable_to_ise = subset.endswith("spicable") | |
subset[icable_to_ise] = [string[:-5] + "se" for string in subset[icable_to_ise]] | |
# e.g. "irritable" -> "irritate" | |
able_to_ate = np.any([subset.endswith(string) for string in ("approximable", "culable", "gulable", "irritable", "operable", "icable")], axis = 0) & able_keeper_mask | |
subset[able_to_ate] = [string[:-3] + "te" for string in subset[able_to_ate]] | |
# e.g. "(inde)fatigable" -> "fatigue" | |
able_to_ue = subset.endswith("fatigable") | |
subset[able_to_ue] = [string[:-4] + "ue" for string in subset[able_to_ue]] | |
# e.g. "memorable" -> "memory" | |
able_to_y = np.any([subset.endswith(string) for string in ("charitable", "memorable")], axis = 0) | |
subset[able_to_y] = [string[:-4] + "y" for string in subset[able_to_y]] | |
# e.g. "flammable" -> "flame | |
able_letter_to_e = subset.endswith("flammable") | |
subset[able_letter_to_e] = [string[:-5] + "e" for string in subset[able_letter_to_e]] | |
# e.g. "transferrable" -> "transfer" | |
delete_able_letter = np.any([subset.endswith(string) for string in English_doubled_consonants_able], axis = 0) & able_keeper_mask | |
subset[delete_able_letter] = [string[:-5] for string in subset[delete_able_letter]] | |
# e.g. "sharable" -> "share" | |
able_to_e = np.array([bool(re.search((e_rules + "able$"), word)) for word in subset]) & able_keeper_mask | |
subset[able_to_e] = [string[:-4] + "e" for string in subset[able_to_e]] | |
# general rule--remove suffix | |
delete_able = subset.endswith("able") & able_keeper_mask | |
subset[delete_able] = [string[:-4] for string in subset[delete_able]] | |
words[subset_scope] = subset | |
# handle "ible" suffixes | |
subset_scope = words.endswith("ible") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
ible_keeper_mask = np.array([not(bool(re.search("\\b" + any_of(English_ible_keepers) + "$", word))) for word in subset]) | |
# e.g. "(in)visible" -> "vision" | |
ible_to_ion = subset.endswith("visible") | |
subset[ible_to_ion] = [string[:-3] + "on" for string in subset[ible_to_ion]] | |
# e.g. "(in)credible" -> "credit" | |
ible_to_ent = subset.endswith("credible") | |
subset[ible_to_ent] = [string[:-4] + "ent" for string in subset[ible_to_ent]] | |
# e.g. "sensible" -> "sense" | |
ible_to_e = np.array([bool(re.search((e_rules + "ible$"), word)) for word in subset]) & ible_keeper_mask | |
subset[ible_to_e] = [string[:-4] + "e" for string in subset[ible_to_e]] | |
# general rule--remove suffix | |
delete_ible = subset.endswith("ible") & ible_keeper_mask | |
subset[delete_ible] = [string[:-4] for string in subset[delete_ible]] | |
words[subset_scope] = subset | |
# handle "hood" suffix | |
subset_scope = words.endswith("hood") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "livelihood" -> "live" | |
delete_lihood = subset.endswith("livelihood") | |
subset[delete_lihood] = [string[:-6] for string in subset[delete_lihood]] | |
# e.g. "likelihood" -> "likely" | |
ihood_to_y = subset.endswith("ihood") | |
subset[ihood_to_y] = [string[:-5] + "y" for string in subset[ihood_to_y]] | |
# general rule--remove suffix | |
delete_hood = subset.endswith("hood") | |
subset[delete_hood] = [string[:-4] for string in subset[delete_hood]] | |
words[subset_scope] = subset | |
# handle "ship" suffix | |
subset_scope = words.endswith("ship") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
delete_ship = ~(np.any([subset.endswith(string) for string in ("airship", "battleship", "fireship", "gunship", "longship", "mediumship", "midship", "motorship", "relationship", "spaceship", "steamship", "tankship", "tranship", "transship", "warship", "worship")], axis = 0) | np.array([bool(re.search("\\bship$", word)) for word in subset])) | |
subset[delete_ship] = [string[:-4] for string in subset[delete_ship]] | |
words[subset_scope] = subset | |
### handle other oddities | |
# e.g. "unison" -> "unity" | |
ison_to_ity = words.endswith("unison") | |
words[ison_to_ity] = [string[:-3] + "ty" for string in words[ison_to_ity]] | |
# e.g. "comparison" -> "compare" | |
ison_to_e = words.endswith("comparison") | |
words[ison_to_e] = [string[:-4] + "e" for string in words[ison_to_e]] | |
# e.g. "legalese" -> "legal" | |
delete_ese = words.endswith("ese") & ~np.any([words.endswith(string) for string in ("diocese", "eese", "manganese", "obese", "these")], axis = 0) | |
words[delete_ese] = [string[:-3] for string in words[delete_ese]] | |
# e.g. "programme" -> "program" | |
amme_to_am = words.endswith("amme") | |
words[amme_to_am] = [string[:-2] for string in words[amme_to_am]] | |
# e.g. "theatre" -> "theater" | |
re_to_er = np.any([words.endswith(string) for string in ("bre", "tre")], axis = 0) | |
words[re_to_er] = [string[:-2] + "er" for string in words[re_to_er]] | |
# e.g. "wowser" -> "wow" | |
delete_ser = words.endswith("wowser") | |
words[delete_ser] = [string[:-3] for string in words[delete_ser]] | |
# e.g. "lawyer" -> "law" | |
delete_yer = np.any([words.endswith(string) for string in ("bowyer", "lawyer", "sawyer")], axis = 0) | |
words[delete_yer] = [string[:-3] for string in words[delete_yer]] | |
# e.g. "western" -> "west" | |
delete_ern = np.any([words.endswith(string) for string in ("eastern", "northern", "southern", "western")], axis = 0) | |
words[delete_ern] = [string[:-3] for string in words[delete_ern]] | |
# e.g. "cowardice" -> "coward" | |
delete_ice = words.endswith("cowardice") | |
words[delete_ice] = [string[:-3] for string in words[delete_ice]] | |
# e.g. "hatred" -> "hate" | |
red_to_e = words.endswith("hatred") | |
words[red_to_e] = [string[:-3] + "e" for string in words[red_to_e]] | |
# e.g. "elder" -> "old" | |
eld_to_old = np.array([bool(re.search("\\beld(?:er|est)?$", word)) for word in words]) | |
words[eld_to_old] = [re.sub(string = string, pattern = "\\beld(?:er|est)?$", repl = "old") for string in words[eld_to_old]] | |
# handle "estry" and "istry" suffixes | |
subset_scope = words.endswith("stry") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "ancestry" -> "ancester" | |
estry_to_est = np.any([subset.endswith(string) for string in ("ancestry", "forestry")], axis = 0) | |
subset[estry_to_est] = [string[:-2] for string in subset[estry_to_est]] | |
# e.g. "registry" -> "register" | |
istry_to_ter = np.any([subset.endswith(string) for string in ("ministry", "registry")], axis = 0) | |
subset[istry_to_ter] = [string[:-2] + "er" for string in subset[istry_to_ter]] | |
# e.g. "artistry" -> "artist" | |
istry_to_ist = np.any([subset.endswith(string) for string in ("artistry", "baptistry", "chemistry", "dentistry", "sophistry")], axis = 0) | |
subset[istry_to_ist] = [string[:-2] for string in subset[istry_to_ist]] | |
words[subset_scope] = subset | |
### fix final set of generic noun and adjective suffixes | |
# e.g. "opportunistic" -> "opportunism" | |
istic_to_ism = words.endswith("istic") & np.array([not(bool(re.search("\\bstatistic", word))) for word in words]) | |
words[istic_to_ism] = [string[:-3] + "m" for string in words[istic_to_ism]] | |
# e.g. "opportunist" -> "opportunism" | |
# some words are changed knowing they'll be handled below (e.g. "therapist" -> "therapism" -> "therapy") | |
ist_to_ism = words.endswith("ist") & np.array([not(bool(re.search("\\b" + any_of(English_ist_keepers) + "$", word))) for word in words]) | |
words[ist_to_ism] = [string[:-1] + "m" for string in words[ist_to_ism]] | |
# handle "ism" suffix | |
subset_scope = words.endswith("ism") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "conservatism" -> "conservative" | |
ism_to_ive = subset.endswith("rvatism") | |
subset[ism_to_ive] = [string[:-3] + "ive" for string in subset[ism_to_ive]] | |
# e.g. "scientism" -> "science" | |
tism_to_ce = subset.endswith("scientism") | |
subset[tism_to_ce] = [string[:-4] + "ce" for string in subset[tism_to_ce]] | |
# e.g. "cosmopolitism" -> "cosmopolitan" | |
ism_to_an = subset.endswith("cosmopolitism") | |
subset[ism_to_an] = [string[:-3] + "an" for string in subset[ism_to_an]] | |
# e.g. "(bi)linguism" (or "linguist") -> "lingual" | |
ism_to_al = subset.endswith("linguism") | |
subset[ism_to_al] = [string[:-3] + "al" for string in subset[ism_to_al]] | |
# e.g. "metabolism" -> "metabolic" | |
ism_to_ic = np.any([subset.endswith(string) for string in ("abolism", "barism", "mechanism", "ntrism")], axis = 0) | |
subset[ism_to_ic] = [string[:-2] + "c" for string in subset[ism_to_ic]] | |
# e.g. "therapism" (or "therapist") -> "therapy" | |
ism_to_y = np.any([subset.endswith(string) for string in ("economism", "jurism", "pharmacism", "quism", "rgism", "therapism")], axis = 0) | |
subset[ism_to_y] = [string[:-3] + "y" for string in subset[ism_to_y]] | |
# e.g. "activism" -> "active" | |
ism_to_e = np.any([subset.endswith(string) for string in ("activism", "biblism", "chromism", "chronism", "communism", "cubism", "elitism", "flutism", "imagism", "itism", "nudism", "oboism", "purism", "racism", "rapism", "titlism", "tropism", "typism", "vism")], axis = 0)
subset[ism_to_e] = [string[:-3] + "e" for string in subset[ism_to_e]] | |
# e.g. "snobbism" -> "snob" | |
delete_ism_letter = np.any([subset.endswith(string) for string in English_doubled_consonants_ism], axis = 0) | |
subset[delete_ism_letter] = [string[:-4] for string in subset[delete_ism_letter]] | |
# general rule--remove suffix | |
delete_ism = subset.endswith("ism") & np.array([not(bool(re.search("\\b" + any_of(English_ism_keepers) + "$", word))) for word in subset]) | |
subset[delete_ism] = [string[:-3] for string in subset[delete_ism]] | |
words[subset_scope] = subset | |
# handle "al" suffix | |
subset_scope = words.endswith("al") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# handle "ial" suffix | |
# e.g. "serial" -> "series" | |
is_serial = subset.endswith("serial") | |
subset[is_serial] = [string[:-2] + "es" for string in subset[is_serial]] | |
# e.g. "imperial" -> "empire" | |
is_imperial = subset.endswith("imperial") | |
subset[is_imperial] = [string[:-8] + "empire" for string in subset[is_imperial]] | |
# e.g. "beneficial" -> "benefit" | |
cial_to_t = subset.endswith("beneficial") | |
subset[cial_to_t] = [string[:-4] + "t" for string in subset[cial_to_t]] | |
# e.g. "ceremonial" -> "ceremony" | |
ial_to_y = np.any([subset.endswith(string) for string in ("ceremonial", "colonial", "custodial", "memorial", "mercurial", "monial", "territorial", "trial", "versial")], axis = 0) | |
subset[ial_to_y] = [string[:-3] + "y" for string in subset[ial_to_y]] | |
# e.g. "bacterial" -> "bacterium" | |
ial_to_ium = np.any([subset.endswith(string) for string in ("bacterial", "cranial", "ennial", "fluvial", "sporial", "stadial")], axis = 0) | |
subset[ial_to_ium] = [string[:-2] + "um" for string in subset[ial_to_ium]] | |
# e.g. "essential" -> "essence" | |
tial_to_ce = np.any([subset.endswith(string) for string in ("essential", "influential", "spatial")], axis = 0) | |
subset[tial_to_ce] = [string[:-4] + "ce" for string in subset[tial_to_ce]] | |
# e.g. "financial" -> "finance" | |
ial_to_e = np.array([bool(re.search("(?:[aeiou][bcs]|[nr]c)ial$", word)) for word in subset]) & np.array([not(bool(re.search(any_of(English_ial_keepers) + "$", word))) for word in subset]) | |
subset[ial_to_e] = [string[:-3] + "e" for string in subset[ial_to_e]] | |
# general "ial" rule--remove suffix | |
delete_ial = subset.endswith("ial") & np.array([not(bool(re.search(any_of(English_ial_keepers) + "$", word))) for word in subset]) | |
subset[delete_ial] = [string[:-3] for string in subset[delete_ial]] | |
# handle "ical" suffix | |
# e.g. "cyclical" -> "cycle" | |
lical_to_le = np.any([subset.endswith(string) for string in ("blical", "clical")], axis = 0) | |
subset[lical_to_le] = [string[:-4] + "e" for string in subset[lical_to_le]] | |
# e.g. "surgical" -> "surgery" | |
ical_to_ery = subset.endswith("surgical") | |
subset[ical_to_ery] = [string[:-4] + "ery" for string in subset[ical_to_ery]] | |
# e.g. "identical" -> "identity" | |
ical_to_ity = subset.endswith("identical") | |
subset[ical_to_ity] = [string[:-3] + "ty" for string in subset[ical_to_ity]] | |
# e.g. "chemical" -> "chemist" | |
ical_to_ist = subset.endswith("chemical") | |
subset[ical_to_ist] = [string[:-3] + "st" for string in subset[ical_to_ist]] | |
# general "ical" rule is to follow general "al" rule (remove "al") | |
# handle "ual" suffix | |
# e.g. "annual" -> "annum" | |
ual_to_um = subset.endswith("annual") | |
subset[ual_to_um] = [string[:-2] + "m" for string in subset[ual_to_um]] | |
# e.g. "sensual" -> "sense" | |
ual_to_e = np.any([subset.endswith(string) for string in ("gradual", "sensual")], axis = 0) | |
subset[ual_to_e] = [string[:-3] + "e" for string in subset[ual_to_e]] | |
# e.g. "continual" -> "continue" | |
ual_to_ue = np.any([subset.endswith(string) for string in ("accrual", "continual", "residual", "tissual", "virtual")], axis = 0) | |
subset[ual_to_ue] = [string[:-2] + "e" for string in subset[ual_to_ue]] | |
# e.g. "central" -> "center" | |
tral_to_ter = np.any([subset.endswith(string) for string in ("ancestral", "central", "cloistral", "lustral", "neutral", "sceptral")], axis = 0) | |
subset[tral_to_ter] = [string[:-3] + "er" for string in subset[tral_to_ter]] | |
# general "ual" rule--remove suffix | |
delete_ual = subset.endswith("ual") & np.array([not(bool(re.search(any_of(English_ual_keepers) + "$", word))) for word in subset]) | |
subset[delete_ual] = [string[:-3] for string in subset[delete_ual]] | |
# handle "inal" suffix | |
# e.g. "longitudinal" -> "longitude" | |
tudinal_to_tude = subset.endswith("tudinal") | |
subset[tudinal_to_tude] = [string[:-4] + "e" for string in subset[tudinal_to_tude]] | |
# e.g. "criminal" -> "crime" | |
inal_to_e = subset.endswith("criminal") | |
subset[inal_to_e] = [string[:-4] + "e" for string in subset[inal_to_e]] | |
# e.g. "maternal" -> "mater" | |
#* could change this to "mother"/"father" later | |
delete_nal = np.any([subset.endswith(string) for string in ("maternal", "paternal")], axis = 0) | |
subset[delete_nal] = [string[:-3] for string in subset[delete_nal]] | |
# general "inal" rule is to follow general "al" rule (remove "al") | |
# handle "tal" suffix | |
# e.g. "horizontal" -> "horizon" | |
delete_tal = subset.endswith("horizontal") | |
subset[delete_tal] = [string[:-3] for string in subset[delete_tal]] | |
# general "tal" rule is to follow general "al" rule (remove "al") | |
# handle plain "al" suffix | |
# e.g. "referral" -> "refer" | |
delete_al_letter = subset.endswith("referral") | |
subset[delete_al_letter] = [string[:-3] for string in subset[delete_al_letter]] | |
# e.g. "larval" -> "larva" | |
delete_l = np.any([subset.endswith(string) for string in ("caval", "gingival", "larval", "orchestral", "vaginal")], axis = 0) | |
subset[delete_l] = [string[:-1] for string in subset[delete_l]] | |
# e.g. "peripheral" -> "periphery" | |
al_to_y = np.any([subset.endswith(string) for string in ("peripheral", "societal")], axis = 0) | |
subset[al_to_y] = [string[:-2] + "y" for string in subset[al_to_y]] | |
# e.g. "neural" -> "neuron" | |
al_to_on = subset.endswith("neural") | |
subset[al_to_on] = [string[:-2] + "on" for string in subset[al_to_on]] | |
# e.g. "spectral" -> "spectrum" | |
al_to_um = np.any([subset.endswith(string) for string in ("poreal", "spectral", "minimal", "maximal", "optimal", "cerebral")], axis = 0) | |
subset[al_to_um] = [string[:-2] + "um" for string in subset[al_to_um]] | |
# e.g. "viral" -> "virus" | |
al_to_us = np.any([subset.endswith(string) for string in ("colossal", "focal", "terminal", "viral")], axis = 0) | |
subset[al_to_us] = [string[:-2] + "us" for string in subset[al_to_us]] | |
# e.g. "global" -> "globe" | |
al_to_e = np.any([subset.endswith(string) for string in ("communal", "global", "tribal", "practical", "bridal", "tribunal", "brutal", "ral", "sal", "val")], axis = 0) & ~(np.any([subset.endswith(string) for string in ("behavioral", "doctoral", "electoral", "medieval", "naval", "floral", "primeval")], axis = 0) | np.array([bool(re.search("\\b(?:o|ri)val$", word)) for word in subset])) & np.array([not(bool(re.search(any_of(English_al_keepers) + "$", word))) for word in subset]) | |
subset[al_to_e] = [string[:-2] + "e" for string in subset[al_to_e]] | |
# e.g. "reciprocal" -> "reciprocate" | |
al_to_ate = subset.endswith("reciprocal") | |
subset[al_to_ate] = [string[:-1] + "te" for string in subset[al_to_ate]] | |
# general rule--remove suffix | |
delete_al = subset.endswith("al") & np.array([not(bool(re.search(any_of(English_al_keepers) + "$", word))) for word in subset]) | |
subset[delete_al] = [string[:-2] for string in subset[delete_al]] | |
words[subset_scope] = subset | |
# handle "ian" suffix | |
subset_scope = words.endswith("ian") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "libertarian" -> "liberty" | |
arian_to_y = np.any([subset.endswith(string) for string in ("ilarian", "itarian", "rtarian", "stinarian")], axis = 0) | |
subset[arian_to_y] = [string[:-5] + "y" for string in subset[arian_to_y]] | |
# e.g. "sectarian" -> "sect" | |
delete_arian = np.any([subset.endswith(string) for string in ("fruitarian", "sectarian")], axis = 0) | |
subset[delete_arian] = [string[:-5] for string in subset[delete_arian]] | |
# e.g. "civilian" -> "civil" | |
ian_to_e = np.any([subset.endswith(string) for string in ("avian", "esian", "ilian")], axis = 0) & ~subset.endswith("civilian") | |
subset[ian_to_e] = [string[:-3] + "e" for string in subset[ian_to_e]] | |
# e.g. "comedian" -> "comedy" | |
ian_to_y = np.any([subset.endswith(string) for string in ("arian", "comedian", "custodian", "torian", "tregedian", "ovarian")], axis = 0) | |
subset[ian_to_y] = [string[:-3] + "y" for string in subset[ian_to_y]] | |
# general rule--remove suffix | |
delete_ian = subset.endswith("ian") & ~np.any([subset.endswith(string) for string in English_ian_keepers], axis = 0) | |
subset[delete_ian] = [string[:-3] for string in subset[delete_ian]] | |
words[subset_scope] = subset | |
# handle "ary" suffix | |
subset_scope = words.endswith("ary") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "disciplinary" -> "discipline" | |
ary_to_e = np.any([subset.endswith(string) for string in ("antiquary", "disciplinary", "primary")], axis = 0) | |
subset[ary_to_e] = [string[:-3] + "e" for string in subset[ary_to_e]] | |
# e.g. "legendary" -> "legend" | |
delete_ary = np.any([subset.endswith(string) for string in ("dietary", "legendary", "ionary", "mentary", "parliamentary", "secondary")], axis = 0) | |
subset[delete_ary] = [string[:-3] for string in subset[delete_ary]] | |
words[subset_scope] = subset | |
# handle "ment" suffix | |
subset_scope = words.endswith("ment") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "judgment" -> "judge" | |
ment_to_e = subset.endswith("dgment") | |
subset[ment_to_e] = [string[:-4] + "e" for string in subset[ment_to_e]] | |
# e.g. "merriment" -> "merry" | |
iment_to_y = subset.endswith("iment") & ~np.any([subset.endswith(string) for string in English_ment_keepers], axis = 0) | |
subset[iment_to_y] = [string[:-5] + "y" for string in subset[iment_to_y]] | |
# general rule--remove suffix | |
delete_ment = subset.endswith("ment") & ~np.any([subset.endswith(string) for string in English_ment_keepers], axis = 0) | |
subset[delete_ment] = [string[:-4] for string in subset[delete_ment]] | |
words[subset_scope] = subset | |
# handle "ic" suffix | |
subset_scope = words.endswith("ic") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "technic" -> "technique" | |
ic_to_ique = subset.endswith("technic") | |
subset[ic_to_ique] = [string[:-1] + "que" for string in subset[ic_to_ique]] | |
# e.g. "cortic(al)" -> "cortex" | |
ic_to_ex = np.any([subset.endswith(string) for string in ("cortic", "vortic")], axis = 0) | |
subset[ic_to_ex] = [string[:-2] + "ex" for string in subset[ic_to_ex]] | |
# handle "ific" suffix | |
# e.g. "scientific" -> "science" | |
tific_to_ce = subset.endswith("scientific") | |
subset[tific_to_ce] = [string[:-5] + "ce" for string in subset[tific_to_ce]] | |
# e.g. "specific" -> "specify" | |
ific_to_ify = np.any([subset.endswith(string) for string in ("cific", "rific")], axis = 0) | |
subset[ific_to_ify] = [string[:-3] + "fy" for string in subset[ific_to_ify]] | |
# handle "tic" suffixes | |
# e.g. # "hypnotic" -> "hypnosis" | |
tic_to_sis = np.any([subset.endswith(string) for string in ("hypnotic", "hypothetic")], axis = 0) | |
subset[tic_to_sis] = [string[:-3] + "sis" for string in subset[tic_to_sis]] | |
atic_to_e = subset.endswith("chromatic") | |
subset[atic_to_e] = [string[:-4] + "e" for string in subset[atic_to_e]] | |
delete_atic = np.any([subset.endswith(string) for string in ("informatic", "symptomatic")], axis = 0) | |
subset[delete_atic] = [string[:-4] for string in subset[delete_atic]] | |
# handle "ric" suffix | |
# e.g. "cylindric" -> "cylinder" | |
ric_to_er = np.any([subset.endswith(string) for string in ("ndric", "ntric", "theatric")], axis = 0) | |
subset[ric_to_er] = [string[:-3] + "er" for string in subset[ric_to_er]] | |
# handle general "ic" suffix | |
# e.g. "spheric" -> "sphere" | |
ic_to_e = np.any([subset.endswith(string) for string in ("spheric", "typic")], axis = 0) | |
subset[ic_to_e] = [string[:-2] + "e" for string in subset[ic_to_e]] | |
# e.g. "toxic" -> "toxin" | |
ic_to_in = subset.endswith("toxic") | |
subset[ic_to_in] = [string[:-1] + "n" for string in subset[ic_to_in]] | |
# e.g. "euphoric" -> "euphoria" | |
ic_to_ia = np.any([subset.endswith(string) for string in ("dysphoric", "euphoric")], axis = 0) | |
subset[ic_to_ia] = [string[:-1] + "a" for string in subset[ic_to_ia]] | |
# e.g. "graphic" -> "graph" | |
delete_ic = np.any([subset.endswith(string) for string in ("alphabetic", "graphic", "gymnastic", "istic", "phoric", "xic")], axis = 0) & np.array([not(bool(re.search("\\bstatistic", word))) for word in subset]) | |
subset[delete_ic] = [string[:-2] for string in subset[delete_ic]] | |
# e.g. "botanic" -> "botany" | |
ic_to_y = np.any([subset.endswith(string) for string in ("archic", "botanic", "categoric", "metric", "nomic", "ologic", "pacific", "phic", "storic")], axis = 0) | |
subset[ic_to_y] = [string[:-2] + "y" for string in subset[ic_to_y]] | |
# general "ic" rule is to leave it | |
words[subset_scope] = subset | |
# handle "ous" suffix | |
subset_scope = words.endswith("ous") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "multitudinous" -> "multitude" | |
inous_to_e = np.any([subset.endswith(string) for string in ("multitudinous", "vicissitudinous")], axis = 0) | |
subset[inous_to_e] = [string[:-5] + "e" for string in subset[inous_to_e]] | |
# e.g. "extraneous" -> "extra" | |
delete_ous_letters = subset.endswith("extraneous") | |
subset[delete_ous_letters] = [string[:-5] for string in subset[delete_ous_letters]] | |
# e.g. "incestuous" -> "incest" | |
delete_ous_letter = np.any([subset.endswith(string) for string in ("censorious", "incestuous", "tortious")], axis = 0) | |
subset[delete_ous_letter] = [string[:-4] for string in subset[delete_ous_letter]] | |
# e.g. "famous" -> "fame" | |
ous_to_e = np.any([subset.endswith(string) for string in ("famous", "nervous", "porous", "prestigious", "rapturous")], axis = 0) | |
subset[ous_to_e] = [re.sub(string = string, pattern = "(i|in)?ous$", repl = "e") for string in subset[ous_to_e]] | |
# e.g. "monstrous" -> "monster" | |
trous_to_ter = np.any([subset.endswith(string) for string in ("brous", "strous", "xtrous")], axis = 0) | |
subset[trous_to_ter] = [string[:-4] + "er" for string in subset[trous_to_ter]] | |
# e.g. "anxious" -> "anxiety" | |
ous_to_ety = subset.endswith("anxious") | |
subset[ous_to_ety] = [string[:-3] + "ety" for string in subset[ous_to_ety]] | |
# # e.g. "credulous" -> "" | |
# ulous_to_ <- endsWith(subset, "credulous") | |
# subset[ulous_to_] <- replace_last_n_chars_with(subset[ulous_to_], L, "") | |
# e.g. "tenacious" -> "tenacity" | |
ous_to_ty = np.any([subset.endswith(string) for string in ("atrocious", "capacious", "ferocious", "loquacious", "rapacious", "salacious", "tenacious")], axis = 0) | |
subset[ous_to_ty] = [string[:-3] + "ty" for string in subset[ous_to_ty]] | |
# e.g. "rebellious" -> "rebellion" | |
ous_to_on = np.any([subset.endswith(string) for string in ("rebellious", "gious", "tious")], axis = 0) & ~np.any([subset.endswith(string) for string in ("facetious", "litigious", "prodigious")], axis = 0) | |
subset[ous_to_on] = [string[:-2] + "n" for string in subset[ous_to_on]] | |
# e.g. "decorous" -> "decorum" | |
ous_to_um = np.any([subset.endswith(string) for string in ("decorous", "delirious", "tedious", "vacuous")], axis = 0) | |
subset[ous_to_um] = [string[:-3] + "um" for string in subset[ous_to_um]] | |
# e.g. "envious" -> "envy" | |
ious_to_y = np.any([subset.endswith(string) for string in ("efficacious", "envious", "fallacious", "furious", "glorious", "luxurious", "melodious", "onious", "prodigious", "various")], axis = 0) | |
subset[ious_to_y] = [string[:-4] + "y" for string in subset[ious_to_y]] | |
# e.g. "gracious" -> "grace" | |
cious_to_ce = subset.endswith("cious") | |
subset[cious_to_ce] = [string[:-4] + "e" for string in subset[cious_to_ce]] | |
# e.g. "felonous" -> "felony" | |
ous_to_y = np.any([subset.endswith(string) for string in ("adulterous", "felonous", "gamous", "lecherous", "usurous")], axis = 0) | |
subset[ous_to_y] = [string[:-3] + "y" for string in subset[ous_to_y]] | |
# e.g. "hazardous" -> "hazard" | |
delete_ous = np.any([subset.endswith(string) for string in ("advantageous", "amorous", "circuitous", "courageous", "feverous", "hazardous", "joyous", "nymous", "ponderous", "solicitous", "sulfurous", "tuberous", "ulcerous", "valorous", "vaporous", "verminous", "viperous", "vomitous", "zealous")], axis = 0) #*** assume delete all but use ous keepers instead | |
subset[delete_ous] = [string[:-3] for string in subset[delete_ous]] | |
# general "ous" rule is to leave it | |
words[subset_scope] = subset | |
# handle "ful" suffix | |
subset_scope = words.endswith("ful") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "beautiful" -> "beauty" | |
iful_to_y = subset.endswith("iful") | |
subset[iful_to_y] = [string[:-4] + "y" for string in subset[iful_to_y]] | |
# general rule--remove suffix | |
delete_ful = subset.endswith("ful") & np.array([not(bool(re.search("\\b(?:aw|grate)ful$", word))) for word in subset]) | |
subset[delete_ful] = [string[:-3] for string in subset[delete_ful]] | |
words[subset_scope] = subset | |
# handle "less" suffix | |
subset_scope = words.endswith("less") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "penniless" -> "penny" | |
iless_to_y = subset.endswith("iless") | |
subset[iless_to_y] = [string[:-5] + "y" for string in subset[iless_to_y]] | |
# general rule--remove suffix | |
delete_less = subset.endswith("less") & np.array([not(bool(re.search("\\b(?:b|hap|(?:never|none)the|un)?less$", word))) for word in subset]) | |
subset[delete_less] = [string[:-4] for string in subset[delete_less]] | |
words[subset_scope] = subset | |
# handle "ar" suffix | |
subset_scope = words.endswith("ar") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# handle "ular" suffix | |
# e.g. "angular" -> "angle" | |
ular_to_le = np.any([subset.endswith(string) for string in ("angular", "circular", "clavicular", "corpuscular", "cuticular", "follicular", "miracular", "ocular", "oracular", "singular", "spectacular", "tabular", "tabernacular", "tentacular", "vehicular", "ventricular")], axis = 0) | |
subset[ular_to_le] = [string[:-4] + "le" for string in subset[ular_to_le]] | |
# e.g. "cellular" -> "cell" | |
delete_ular = np.any([subset.endswith(string) for string in ("glandular", "cellular")], axis = 0) | |
subset[delete_ular] = [string[:-4] for string in subset[delete_ular]] | |
# general "ular" rule--remove suffix | |
ular_to_ule = subset.endswith("ular") & ~np.any([subset.endswith(string) for string in ("particular", "popular", "regular")], axis = 0) | |
subset[ular_to_ule] = [string[:-2] + "e" for string in subset[ular_to_ule]] | |
# handle "iar" suffix | |
# e.g. "liar" -> "lie" | |
iar_to_ie = np.array([bool(re.search("\\bliar$", word)) for word in subset]) | |
subset[iar_to_ie] = [string[:-2] + "e" for string in subset[iar_to_ie]] | |
# e.g. "familiar" -> "family" | |
iar_to_y = subset.endswith("familiar") | |
subset[iar_to_y] = [string[:-3] + "y" for string in subset[iar_to_y]] | |
# handle general "ar" suffix | |
# e.g. "scholar" -> "school" | |
delete_ar_school = subset.endswith("scholar") | |
subset[delete_ar_school] = [string[:-3] + "ol" for string in subset[delete_ar_school]] | |
# general "ar" rule is to leave it | |
words[subset_scope] = subset | |
# e.g. "congruence" -> "congruent" | |
ence_to_ent = words.endswith("ence") & ~(np.any([words.endswith(string) for string in ("audience", "defence", "essence", "experience", "influence", "licence", "sentence")], axis = 0) | np.array([bool(re.search("\\b(?:[fhp]|sci|sp|th|wh)ence$", word)) for word in words])) # np.any([words.endswith(string) for string in ("abhorrence", "absence", "accidence", "congruence", "diligence", "evidence", "immanence", "indolence", "inherence", "insistence", "nascence", "opulence", "patience", "permanence", "potence", "presence", "prudence", "quence", "residence", "reticence", "reverence", "salience", "tangence", "transcience", "valence", "violence")], axis = 0) | |
words[ence_to_ent] = [string[:-2] + "t" for string in words[ence_to_ent]] | |
# e.g. "abundance" -> "abundant" | |
ance_to_ant = np.any([words.endswith(string) for string in ("abundance", "clairvoyance", "distance", "ificance", "malignance", "norance", "performance", "pursuance", "resistance")], axis = 0) | |
words[ance_to_ant] = [string[:-2] + "t" for string in words[ance_to_ant]] | |
# handle "ant" suffix | |
subset_scope = words.endswith("ant") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "significant" -> "signify" | |
ificant_to_y = subset.endswith("ificant") | |
subset[ificant_to_y] = [string[:-5] + "y" for string in subset[ificant_to_y]] | |
# e.g. "reductant" -> "reduce" | |
ctant_to_ce = subset.endswith("reductant") | |
subset[ctant_to_ce] = [string[:-4] + "e" for string in subset[ctant_to_ce]] | |
# e.g. "oxidant" -> "oxide" | |
ant_to_e = np.any([subset.endswith(string) for string in ("ignorant", "oxidant", "piquant", "pleasant", "pursuant")], axis = 0) | |
subset[ant_to_e] = [string[:-3] + "e" for string in subset[ant_to_e]] | |
# e.g. "reactant" -> "react" | |
delete_ant = np.any([subset.endswith(string) for string in ("colorant", "formant", "infestant", "inhabitant", "malignant", "reactant", "relaxant", "resistant", "toxicant")], axis = 0) | |
subset[delete_ant] = [string[:-3] for string in subset[delete_ant]] | |
# e.g. "participant" -> "participate" | |
#*** or make this general rule? | |
ant_to_ate = np.any([subset.endswith(string) for string in ("administrant", "participant", "supplicant")], axis = 0) | |
subset[ant_to_ate] = [string[:-2] + "te" for string in subset[ant_to_ate]] | |
# general "ant" rule is to leave it | |
words[subset_scope] = subset | |
# handle "ent" suffix | |
subset_scope = words.endswith("ent") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "student" -> "study" | |
ent_to_y = subset.endswith("student") | |
subset[ent_to_y] = [string[:-3] + "y" for string in subset[ent_to_y]] | |
# e.g. "emergent" -> "emerge" | |
ent_to_e = subset.endswith("ergent") | |
subset[ent_to_e] = [string[:-2] for string in subset[ent_to_e]] | |
# e.g. "credent" (from "credence") -> "credit" | |
ent_to_it = subset.endswith("credent") | |
subset[ent_to_it] = [string[:-3] + "it" for string in subset[ent_to_it]] | |
# e.g. "recurrent" -> "recur" | |
delete_ent_letter = np.any([subset.endswith(string) for string in ("deterrent", "incurrent", "occurrent", "recurrent")], axis = 0) | |
subset[delete_ent_letter] = [string[:-4] for string in subset[delete_ent_letter]] | |
# e.g. "different" -> "differ" | |
delete_ent = np.any([subset.endswith(string) for string in ("different", "conferent", "existent", "insistent", "preferent", "referent")], axis = 0) | |
subset[delete_ent] = [string[:-3] for string in subset[delete_ent]] | |
# general "ent" rule is to leave it | |
words[subset_scope] = subset | |
# handle "ive" suffix | |
subset_scope = words.endswith("ive") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# handle "ative" suffix | |
# e.g. "affirmative" -> "affirm" | |
delete_ative = np.any([subset.endswith(string) for string in ("ulcerative", "ntative", "rmative", "rtative")], axis = 0) | |
subset[delete_ative] = [string[:-5] for string in subset[delete_ative]] | |
# e.g. "comparative" -> "compare" | |
ative_to_e = np.any([subset.endswith(string) for string in ("comparative", "curative")], axis = 0) | |
subset[ative_to_e] = [string[:-5] + "e" for string in subset[ative_to_e]] | |
# e.g. "relative" -> "relate" | |
ative_to_ate = subset.endswith("ative") & ~(subset.endswith("putative") | np.array([bool(re.search("\\b[dn]ative$", word)) for word in subset])) | |
subset[ative_to_ate] = [string[:-3] + "e" for string in subset[ative_to_ate]] | |
# handle "itive" suffix | |
# e.g. "sensitive" -> "sensate" (-> "sense" later on) | |
itive_to_ate = subset.endswith("sensitive") | |
subset[itive_to_ate] = [string[:-5] + "ate" for string in subset[itive_to_ate]] | |
# handle "ctive" suffix | |
# e.g. "deductive" -> "deduce" | |
ctive_to_ce = subset.endswith("ductive") | |
subset[ctive_to_ce] = [string[:-4] + "e" for string in subset[ctive_to_ce]] | |
# general "ctive" rule--remove suffix (e.g. "detective" -> "detect") | |
delete_ive = subset.endswith("ctive") & ~np.any([subset.endswith(string) for string in ("adjective", "objective", "subjective")], axis = 0) | |
subset[delete_ive] = [string[:-3] for string in subset[delete_ive]] | |
# handle "ptive" suffix | |
# e.g. "captive" -> "capture" | |
ptive_to_pture = subset.endswith("captive") | |
subset[ptive_to_pture] = [string[:-3] + "ure" for string in subset[ptive_to_pture]] | |
# e.g. "presumptive" -> "presume" | |
mptive_to_me = subset.endswith("mptive") | |
subset[mptive_to_me] = [string[:-5] + "e" for string in subset[mptive_to_me]] | |
# e.g. "absorptive" -> "absorb" | |
rptive_to_b = subset.endswith("rptive") | |
subset[rptive_to_b] = [string[:-5] + "b" for string in subset[rptive_to_b]] | |
# e.g. "prescriptive" -> "prescribe" | |
ptive_to_be = subset.endswith("scriptive") | |
subset[ptive_to_be] = [string[:-5] + "be" for string in subset[ptive_to_be]] | |
# e.g. "adaptive" -> "adapt" | |
ptive_to_pt = np.any([subset.endswith(string) for string in ("acceptive", "adaptive", "adoptive", "ruptive")], axis = 0) | |
subset[ptive_to_pt] = [string[:-3] for string in subset[ptive_to_pt]] | |
# e.g. "interruptive" -> "interrupt" | |
delete_ptive = subset.endswith("interruptive") | |
subset[delete_ptive] = [string[:-3] for string in subset[delete_ptive]]
# general "ptive" rule--convert "ptive" to "ive" (e.g. "receptive" -> "receive")
ptive_to_ive = subset.endswith("ptive") | |
subset[ptive_to_ive] = [string[:-5] + "ive" for string in subset[ptive_to_ive]] | |
# handle general "ive" suffix | |
# e.g. "iterative" -> "iterate" | |
ive_to_e = np.any([subset.endswith(string) for string in ("decorative", "defensive", "iterative", "locative", "offensive")], axis = 0) | |
subset[ive_to_e] = [string[:-3] + "e" for string in subset[ive_to_e]] | |
# e.g. "assertive" -> "assert" | |
delete_ive = np.any([subset.endswith(string) for string in ("adoptive", "adventive", "appointive", "assertive", "attractive", "detective", "ejective", "erective", "eruptive", "excessive", "exeptive", "exertive", "preventive", "reactive", "reflective", "selective", "transitive", "vomitive")], axis = 0) | |
subset[delete_ive] = [string[:-3] for string in subset[delete_ive]] | |
# general "ive" rule is to leave it | |
words[subset_scope] = subset | |
# e.g. "celebratory" -> "celebrate" | |
atory_to_ate = words.endswith("atory") & ~words.endswith("oratory") | |
words[atory_to_ate] = [string[:-3] + "e" for string in words[atory_to_ate]] | |
# e.g. "messenger" -> "message" | |
enger_to_age = np.any([words.endswith(string) for string in ("messenger", "passenger")], axis = 0) | |
words[enger_to_age] = [string[:-5] + "age" for string in words[enger_to_age]] | |
# handle "age" suffix | |
subset_scope = words.endswith("age") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "verbiage" -> "verb" | |
delete_iage = subset.endswith("verbiage") | |
subset[delete_iage] = [string[:-4] for string in subset[delete_iage]] | |
# e.g. "marriage" -> "marry" | |
iage_to_y = subset.endswith("rriage") | |
subset[iage_to_y] = [string[:-4] + "y" for string in subset[iage_to_y]] | |
# e.g. "assemblage" -> "assembly" | |
age_to_y = subset.endswith("blage") | |
subset[age_to_y] = [string[:-3] + "y" for string in subset[age_to_y]] | |
# e.g. "dosage" -> "dose" | |
age_to_e = np.any([subset.endswith(string) for string in ("chaperonage", "cleavage", "dosage", "pipage", "storage", "usage")], axis = 0) | |
subset[age_to_e] = [string[:-3] + "e" for string in subset[age_to_e]] | |
# remove suffix if example in list (e.g. "wattage" -> "watt") | |
delete_age = np.any([subset.endswith(string) for string in English_age_removers], axis = 0) | |
subset[delete_age] = [string[:-3] for string in subset[delete_age]] | |
# general "age" rule is to leave it | |
words[subset_scope] = subset | |
# handle "tion" suffix | |
subset_scope = words.endswith("tion") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# handle "ication" suffix | |
# e.g. "amplification" -> "amplify" | |
ication_to_y = subset.endswith("ification") | |
subset[ication_to_y] = [string[:-7] + "y" for string in subset[ication_to_y]] | |
# e.g. "publication" -> "publish" | |
ication_to_ish = subset.endswith("publication") | |
subset[ication_to_ish] = [string[:-6] + "sh" for string in subset[ication_to_ish]] | |
# handle "faction" suffix | |
# e.g. "satisfaction" -> "satisfy" | |
faction_to_fy = subset.endswith("faction") | |
subset[faction_to_fy] = [string[:-6] + "y" for string in subset[faction_to_fy]] | |
# handle "ation" suffix | |
# e.g. "pronunciation" -> "pronounce" | |
nunciation_to_nounce = subset.endswith("nunciation") | |
subset[nunciation_to_nounce] = [string[:-9] + "ounce" for string in subset[nunciation_to_nounce]] | |
# e.g. "filtration" -> "filter" | |
tration_to_ter = np.any([subset.endswith(string) for string in ("filtration", "istration")], axis = 0) | |
subset[tration_to_ter] = [string[:-6] + "er" for string in subset[tration_to_ter]] | |
# e.g. "cancellation" -> "cancel" | |
delete_lation = subset.endswith("cancellation") | |
subset[delete_lation] = [string[:-6] for string in subset[delete_lation]] | |
# e.g. "invitation" -> "invite" | |
ation_to_e = np.any([subset.endswith(string) for string in ("compilation", "invitation")], axis = 0) | |
subset[ation_to_e] = [string[:-5] + "e" for string in subset[ation_to_e]] | |
# e.g. "consideration" -> "consider" | |
delete_ation = np.any([subset.endswith(string) for string in ("accreditation", "adaptation", "consideration", "distillation", "installation", "instillation", "ntation", "recommendation", "transformation")], axis = 0) | |
subset[delete_ation] = [string[:-5] for string in subset[delete_ation]] | |
# e.g. "colonization" -> "colonize" | |
iszation_to_ize = np.any([subset.endswith(string) for string in ("isation", "ization")], axis = 0) | |
subset[iszation_to_ize] = [string[:-6] + "ze" for string in subset[iszation_to_ize]] | |
# e.g. "expectation" -> "expect" | |
delete_ation = np.any([subset.endswith(string) for string in ("expectation", "formation", "foundation", "information", "transportation")], axis = 0) | |
subset[delete_ation] = [string[:-5] for string in subset[delete_ation]] | |
# e.g. "sanitation" -> "sanitary" | |
ation_to_ary = subset.endswith("sanitation") | |
subset[ation_to_ary] = [string[:-4] + "ry" for string in subset[ation_to_ary]] | |
# e.g. "celebration" -> "celebrate" (general "ation" rule) | |
ation_to_ate = subset.endswith("ation") & ~np.any([subset.endswith(string) for string in ("nation", "occupation", "ration", "station", "vocation")], axis = 0) | |
subset[ation_to_ate] = [string[:-3] + "e" for string in subset[ation_to_ate]] | |
# handle "ition" and "ution" suffixes | |
# e.g. "practicioner" -> "practice" | |
ition_to_ice = np.any([subset.endswith(string) for string in ("practition", "practitioner")], axis = 0) | |
subset[ition_to_ice] = [re.sub(string = string, pattern = "tion(er)?$", repl = "ce") for string in subset[ition_to_ice]] | |
# e.g. "solution" -> "solve" | |
ution_to_ve = subset.endswith("olution") | |
subset[ution_to_ve] = [string[:-5] + "ve" for string in subset[ution_to_ve]] | |
# handle "ption" suffix | |
# e.g. "redemption" -> "redeem" | |
mption_to_em = subset.endswith("redemption") | |
subset[mption_to_em] = [string[:-6] + "em" for string in subset[mption_to_em]] | |
# e.g. "consumption" -> "consume" | |
mption_to_me = subset.endswith("mption") & ~subset.endswith("exemption") | |
subset[mption_to_me] = [string[:-6] + "me" for string in subset[mption_to_me]] | |
# e.g. "conception" -> "conceive" | |
eption_to_eive = subset.endswith("eption") & ~np.any([subset.endswith(string) for string in ("exception", "interception")], axis = 0) | |
subset[eption_to_eive] = [string[:-5] + "ive" for string in subset[eption_to_eive]] | |
# e.g. "transcription" -> "transcribe" | |
iption_to_ibe = subset.endswith("iption") | |
subset[iption_to_ibe] = [string[:-5] + "be" for string in subset[iption_to_ibe]] | |
# e.g. "absorption" -> "absorb" | |
orption_to_orb = subset.endswith("orption") | |
subset[orption_to_orb] = [string[:-5] + "b" for string in subset[orption_to_orb]] | |
# handle "ction" suffix | |
# e.g. "destruction" -> "destroy" | |
uction_to_oy = subset.endswith("destruction") | |
subset[uction_to_oy] = [string[:-6] + "oy" for string in subset[uction_to_oy]] | |
# e.g. "introduction" -> "introduce" | |
ction_to_ce = np.any([subset.endswith(string) for string in ("introduction", "reduction", "reproduction", "seduction")], axis = 0) | |
subset[ction_to_ce] = [string[:-4] + "e" for string in subset[ction_to_ce]] | |
# handle general "ion" suffix | |
# e.g. "depiction" -> "depict" | |
delete_ion = np.any([subset.endswith(string) for string in ("ction", "ption")], axis = 0) & ~np.any([subset.endswith(string) for string in ("caption", "duration", "auction", "diction", "fiction", "fraction", "function", "junction", "sanction", "surrection")], axis = 0) | |
subset[delete_ion] = [string[:-3] for string in subset[delete_ion]] | |
# general "ion" rule is to leave it | |
words[subset_scope] = subset | |
# e.g. "compression" -> "compress" | |
delete_ion = words.endswith("ession") & ~np.any([words.endswith(string) for string in ("cession", "session")], axis = 0) | |
words[delete_ion] = [string[:-3] for string in words[delete_ion]] | |
# handle "ery" suffix | |
subset_scope = words.endswith("ery") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "thuggery" -> "thug" | |
delete_ery_letter = np.any([subset.endswith(string) for string in ("blubbery", "buggery", "cutlery", "doggery", "gunnery", "jewellery", "nunnery", "piggery", "pottery", "robbery", "shrubbery", "snobbery", "thuggery")], axis = 0) | |
subset[delete_ery_letter] = [string[:-4] for string in subset[delete_ery_letter]] | |
# e.g. "brewery" -> "brew" | |
delete_ery = np.any([subset.endswith(string) for string in ("bitchery", "brewery", "butchery", "cookery", "creamery", "crockery", "crookery", "deanery", "demagoguery", "distillery", "eatery", "fishery", "foolery", "fuckery", "greenery", "joinery", "mockery", "monkery", "printery", "quackery", "rookery", "smithery", "trickery")], axis = 0) | |
subset[delete_ery] = [string[:-3] for string in subset[delete_ery]] | |
# e.g. "bribery" -> "bribe" | |
delete_ry = np.any([subset.endswith(string) for string in ("bribery", "bakery", "bravery", "cyclery", "drapery", "fakery", "finery", "forgery", "grotesquery", "imagery", "machinery", "missilery", "mopery", "nursery", "pedlery", "perfumery", "refinery", "rocketry", "roguery", "savagery", "scenery", "slavery", "winery")], axis = 0) | |
subset[delete_ry] = [string[:-2] for string in subset[delete_ry]] | |
# e.g. "watery" -> "water" | |
delete_y = np.any([subset.endswith(string) for string in ("beery", "butlery", "buttery", "cheery", "delivery", "discovery", "flowery", "grocery", "jittery", "leathery", "leery", "mastery", "mothery", "papery", "quivery", "recovery", "rubbery", "silvery", "sneery", "spidery", "watery", "wintery")], axis = 0) | |
subset[delete_y] = [string[:-1] for string in subset[delete_y]] | |
words[subset_scope] = subset | |
# handle "y" suffix | |
subset_scope = words.endswith("y") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "(in)finity" -> "finite" | |
y_to_e = np.any([subset.endswith(string) for string in ("finity", "injury")], axis = 0) & ~subset.endswith("affinity") | |
subset[y_to_e] = [string[:-1] + "e" for string in subset[y_to_e]] | |
# e.g. "advisory" -> "advisor" | |
delete_y = np.any([subset.endswith(string) for string in ("archy", "complicity", "visory")], axis = 0) | |
subset[delete_y] = [string[:-1] for string in subset[delete_y]] | |
words[subset_scope] = subset | |
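# Worked examples for the "y" rules above (hypothetical inputs): "infinity" ->
# "infinite", "monarchy" -> "monarch", "supervisory" -> "supervisor", while
# "affinity" is explicitly left alone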
# handle "it" suffix | |
subset_scope = words.endswith("it") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# "(in)evit(able)" -> "evade" | |
evit_to_evade = np.array([bool(re.search("\\bevit$", word)) for word in subset]) | |
subset[evit_to_evade] = [string[:-2] + "ade" for string in subset[evit_to_evade]] | |
# "implicit" -> "imply" | |
mplicit_to_mply = subset.endswith("mplicit") | |
subset[mplicit_to_mply] = [string[:-4] + "y" for string in subset[mplicit_to_mply]] | |
words[subset_scope] = subset | |
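# Worked examples for the "it" rules above: the bare stem "evit" -> "evade"
# and "implicit" -> "imply", while "explicit" is untouched because it does not
# end in "mplicit"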
# handle "itude" suffix | |
subset_scope = words.endswith("itude") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "amplitude" -> "amplify" | |
itude_to_ify = np.any([subset.endswith(string) for string in ("amplitude", "certitude", "fortitude", "gratitude", "magnitude")], axis = 0) | |
subset[itude_to_ify] = [string[:-4] + "fy" for string in subset[itude_to_ify]] | |
# e.g. "similitude" -> "similar" | |
itude_to_ar = subset.endswith("similitude") | |
subset[itude_to_ar] = [string[:-5] + "ar" for string in subset[itude_to_ar]] | |
# e.g. "servitude" -> "serve" | |
itude_to_e = subset.endswith("servitude") | |
subset[itude_to_e] = [string[:-5] + "e" for string in subset[itude_to_e]] | |
# e.g. "plentitude" -> "plenty" | |
itude_to_y = subset.endswith("plentitude") | |
subset[itude_to_y] = [string[:-5] + "y" for string in subset[itude_to_y]] | |
# e.g. "decrepitude" -> "decrepit" | |
itude_to_it = np.any([subset.endswith(string) for string in ("decrepitude", "solicitude")], axis = 0) | |
subset[itude_to_it] = [string[:-3] for string in subset[itude_to_it]] | |
# e.g. "(in)finitude" -> "finite" | |
itude_to_ite = subset.endswith("finitude") | |
subset[itude_to_ite] = [string[:-3] + "e" for string in subset[itude_to_ite]] | |
# e.g. "exactitude" -> "exact" | |
delete_itude = np.any([subset.endswith(string) for string in ("aptitude", "correctitude", "crassitude", "eptitude", "exactitude", "vastitude")], axis = 0) | |
subset[delete_itude] = [string[:-5] for string in subset[delete_itude]] | |
words[subset_scope] = subset | |
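# Worked examples for the "itude" rules above (hypothetical inputs):
# "gratitude" -> "gratify", "servitude" -> "serve", "infinitude" -> "infinite",
# "crassitude" -> "crass"; "itude" words not in any list (e.g. "attitude",
# "latitude") fall through unchanged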
# handle "ysis" suffix | |
subset_scope = words.endswith("ysis") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
# e.g. "lysis" -> "lyse" | |
ysis_to_yse = np.array([bool(re.search("\\blysis$", word)) for word in subset]) | |
subset[ysis_to_yse] = [string[:-3] + "se" for string in subset[ysis_to_yse]] | |
# e.g. "hydrolysis" -> "hydrolyze" | |
ysis_to_yze = subset.endswith("ysis") | |
subset[ysis_to_yze] = [string[:-3] + "ze" for string in subset[ysis_to_yze]] | |
words[subset_scope] = subset | |
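# A minimal standalone sketch of the "ysis" rules above (assumes `words` is a
# numpy chararray, as the element-wise .endswith() calls in this file suggest;
# kept commented out so it never runs as part of the lemmatizer):
#   demo = np.char.asarray(["lysis", "analysis", "hydrolysis"])
#   yse = np.array([bool(re.search("\\blysis$", w)) for w in demo])
#   demo[yse] = [w[:-3] + "se" for w in demo[yse]]
#   yze = demo.endswith("ysis")          # bare "lysis" is already "lyse" here
#   demo[yze] = [w[:-3] + "ze" for w in demo[yze]]
#   # demo is now ["lyse", "analyze", "hydrolyze"]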
### handle comparative/doer ("er"), superlative ("est"), past tense ("ed"), and progressive tense ("ing") endings | |
#* note carried over from the R version of this lemmatizer: nested back-references don't work in R regex
# handle "er" suffix | |
subset_scope = words.endswith("er") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
er_keeper_mask = np.array([not(bool(re.search(any_of(English_er_keepers) + "$", word))) for word in subset]) | |
# e.g. "controller" -> "control" | |
delete_er_letter = ((np.any([subset.endswith(string) for string in flat_concat([English_doubled_consonants_er, "awfuller", "compeller", "controller", "traveller", "quizzer", "frolicker", "mimicker", "mosaicker", "panicker", "picnicker", "politicker", "trafficker", "laughter", "remainder"])], axis = 0) & np.array([not(bool(re.search("((([aiu]|\\b([fhjstwy]|bests|dw|kn|kv|qu|sh|sm|sp|sw)e|((\\b|en)r)o)ll)|(\\bodd))er$", word))) for word in subset])) | (subset.endswith("eer") & np.array([not(bool(re.search(any_of(English_eer_keepers) + "$", word))) for word in subset]) & ~np.any([subset.endswith(string) for string in ("decreer", "fleer", "freer", "seer")], axis = 0))) & er_keeper_mask | |
subset[delete_er_letter] = [string[:-3] for string in subset[delete_er_letter]] | |
# e.g. "carrier" -> "carry" | |
ier_to_y = subset.endswith("ier") & np.array([not(bool(re.search(any_of(English_ier_keepers) + "$", word))) for word in subset]) & ~(np.any([subset.endswith(string) for string in ("taxier", "waterskier")], axis = 0) | np.array([bool(re.search("\\bskier$", word)) for word in subset])) | |
subset[ier_to_y] = [string[:-3] + "y" for string in subset[ier_to_y]] | |
er_keeper_mask = ~delete_er_letter & np.array([not(bool(re.search(any_of(English_er_keepers) + "$", word))) for word in subset]) | |
# e.g. "(over)seer" -> "see" | |
delete_r = (np.array([bool(re.search((e_rules + "er$"), word)) for word in subset]) | np.any([subset.endswith(string) for string in ("decreer", "fleer", "freer", "seer")], axis = 0)) & er_keeper_mask | |
subset[delete_r] = [string[:-1] for string in subset[delete_r]] | |
# general rule--remove suffix (e.g. "talker" -> "talk") | |
delete_er = subset.endswith("er") & er_keeper_mask | |
subset[delete_er] = [string[:-2] for string in subset[delete_er]] | |
words[subset_scope] = subset | |
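# Worked examples for the "er" rules above (hypothetical inputs; assumes none
# of these words appear in English_er_keepers / English_ier_keepers, which are
# defined elsewhere in this file): "controller" -> "control", "trafficker" ->
# "traffic", "happier" -> "happy", "freer" -> "free", "talker" -> "talk"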
# handle "est" suffix | |
subset_scope = words.endswith("est") | |
subset = words[subset_scope] | |
if len(subset) != 0: | |
est_keeper_mask = np.array([not(bool(re.search(any_of(English_est_keepers) + "$", word))) for word in subset]) |