In [1]:
import os
import sys
import requests
import numpy as np
from collections import defaultdict, Counter
from string import ascii_lowercase
from typing import List, Set, Dict
# Maps each lowercase ASCII letter to its 0-based index in the alphabet.
# filter_words only tests membership in this map, to tell real letters
# apart from placeholder characters in its `pos_match` argument.
char_map = {c:i for i,c in enumerate(ascii_lowercase)}

In [2]:
def get_words(timeout: float = 10.0):
    """Download the valid-Wordle-words list and return it as a list of strings.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The non-empty lines of the downloaded file, with surrounding
        whitespace removed, in file order.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within `timeout`.
        AssertionError: if any entry is not exactly 5 characters long.
    """
    # https://gist.github.com/subhrm/5362767af06597bd1e216c59b760f6cb
    url="https://gist.githubusercontent.com/subhrm/5362767af06597bd1e216c59b760f6cb/raw/6bfa15d263d6d5b63840a8e5b64e04b382fdb079/valid-wordle-words.txt"
    # Without an explicit timeout, requests may wait on a stalled connection forever.
    resp = requests.get(url, timeout=timeout)
    print(f"{resp.status_code=}")
    # Fail loudly on an HTTP error instead of parsing an error page as words.
    resp.raise_for_status()
    # splitlines() + strip() tolerate CRLF line endings and stray whitespace,
    # which would otherwise make valid words look 6 characters long.
    word_list = [w.strip() for w in resp.text.splitlines() if w.strip()]
    print(f"{len(word_list)=}")
    for word in word_list:
        assert len(word) == 5, f"{word} is not a 5 letter word"
    return word_list

# Download the vocabulary once; the scoring and filtering cells below reuse `word_list`.
word_list = get_words()

resp.status_code=200
len(word_list)=14855


In [3]:
def compute_word_scores(word_list):
    """Score each word by how frequent its distinct letters are in `word_list`.

    Every letter gets the score log(log(freq / min_freq)), where `freq` is the
    number of times the letter occurs across all words and `min_freq` is one
    less than the smallest letter frequency. A word's score is the sum of the
    scores of its distinct letters, so repeated letters earn nothing extra.

    Also prints the frequency bounds, the (up to) 10 best words and the
    (up to) 10 worst words.

    Args:
        word_list: Iterable collection of words (strings).

    Returns:
        Dict mapping each word to its score. Words made of no letters at all
        (and therefore an empty `word_list`) yield a score of 0 / an empty dict.
    """
    # first compute character frequency
    freqs = Counter()
    for word in word_list:
        freqs.update(word)

    if not freqs:
        # No letters to rank (empty list or only empty strings): min() below
        # would raise, so return the trivial scores directly.
        return {w: 0 for w in word_list}

    lowest = min(freqs.values())
    # Using (lowest - 1) keeps freq / min_freq > 1 so the inner log stays
    # positive. When the rarest letter occurs only once that would be a
    # division by zero, so fall back to 0.5, which preserves the same property.
    min_freq = lowest - 1 if lowest > 1 else 0.5
    max_freq = freqs.most_common(1)
    print(f"{min_freq=} {max_freq=}")

    # Score each letter once instead of recomputing two logs per letter per word.
    char_scores = {c: np.log(np.log(n / min_freq)) for c, n in freqs.items()}
    word_scores = {w: sum(char_scores[c] for c in set(w)) for w in word_list}

    sorted_scores = sorted(word_scores.items(), key=lambda x : x[1])
    print(f"""Top 10 words""")
    # Slice rather than index so that fewer than 10 words does not raise IndexError.
    for entry in reversed(sorted_scores[-10:]):
        print(entry)
    print(f"Worst 10 words : {sorted_scores[:10] }")

    return word_scores

# Score the whole vocabulary once; the resulting dict is passed as `rank` to filter_words below.
word_scores = compute_word_scores(word_list)

min_freq=144 max_freq=[('e', 7455)]
Top 10 words
('soare', 6.629943478527169)
('arose', 6.629943478527169)
('aeros', 6.629943478527169)
('seria', 6.580337774890249)
('serai', 6.580337774890249)
('reais', 6.580337774890249)
('raise', 6.580337774890249)
('arise', 6.580337774890249)
('aesir', 6.580337774890249)
('alose', 6.564553681105904)
Worst 10 words : [('qajaq', -3.7568220130505012), ('maqam', -2.5753305084447145), ('queue', -2.497785825012791), ('quiff', -1.8755691536277401), ('quaff', -1.7423285320267206), ('quouk', -1.6768515281941763), ('piqui', -1.6027805679818905), ('quipu', -1.6027805679818905), ('queek', -1.5817869015724828), ('queyu', -1.463388299493958)]


In [4]:
def filter_words(vocab: List[str], rank: Dict, including: str, excluding: str, pos_match: List, not_pos_match: List):
    """Print the highest-ranked words in `vocab` that satisfy all given constraints.

    Args:
        vocab: Candidate words.
        rank: Mapping from word to score; every surviving word must be a key.
        including: Letters that must all occur somewhere in the word.
            Repeated letters are treated as a single requirement.
        excluding: Letters that must not occur anywhere in the word.
        pos_match: Sequence whose i-th item is the letter required at
            position i. Items that are not keys of `char_map` (for example
            '*') place no constraint on that position. Only the first 5
            positions are considered.
        not_pos_match: Sequence whose i-th item holds the letters that must
            not appear at position i. Only the first 5 positions are considered.

    Returns:
        None. Prints the number of remaining words followed by up to 80 of
        them, best score first (ties broken in reverse alphabetical order).

    Raises:
        KeyError: if a surviving word is missing from `rank`.
    """
    res = vocab
    if len(excluding) > 0:
        banned = set(excluding)
        res = [w for w in res if banned.isdisjoint(w)]

    # Compare against the *distinct* required letters. Counting the raw
    # length of `including` made an input such as "ee" impossible to satisfy,
    # because a set intersection can never contain the same letter twice.
    required = set(including)
    if required:
        res = [w for w in res if required.issubset(w)]

    for i in range(min(5, len(pos_match))):
        c = pos_match[i]
        # Anything that is not a lowercase letter acts as a wildcard.
        if c in char_map:
            res = [w for w in res if w[i] == c]

    for i in range(min(5, len(not_pos_match))):
        for c in not_pos_match[i]:
            res = [w for w in res if w[i] != c]

    print(f"Number of words left : {len(res)}")
    # Sort (score, word) pairs so the best-scoring candidates are printed first.
    final_result = sorted(((rank[w], w) for w in res), reverse=True)
    for score, word in final_result[:80]:
        print(word)
    return

In [11]:
# Narrow the vocabulary with the constraints collected so far and print the
# surviving words, best score first.
filter_words(
    vocab=word_list,
    rank=word_scores,
    including="stae",  # every one of these letters must occur in the word
    excluding="lonirmpg",  # none of these letters may occur in the word
    pos_match="sta*e",  # required letter per position; '*' is not in char_map, so that position is unconstrained
    not_pos_match=["", "a", "", "", ""],  # per position, letters that must not appear there
)

Number of words left : 4
stade
stake
stave
state
