## Updated with the following:
1. Idea of finding the word that maximizes exclusions
2. A game class
3. Interactivity

## Imports

In [1]:
import os
import sys
import requests
import math
import random
import numpy as np
from collections import defaultdict, Counter
from string import ascii_lowercase
from typing import List, Set, Dict
# Lookup table from each lowercase ASCII letter to its 0-based alphabet index.
char_map = dict(zip(ascii_lowercase, range(len(ascii_lowercase))))

## Download Word List

In [2]:
def get_words(url=""):
    """Download a plain-text word list and return its five-letter words.

    The HTTP status code and the number of words read are printed as a
    progress report.

    Args:
        url: Address of a text file containing one word per line.

    Returns:
        The words in file order, with surrounding whitespace removed and
        blank lines dropped.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
        ValueError: If any entry is not exactly five characters long.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(url, timeout=30)
    print(f"{resp.status_code=}")
    # Fail here rather than trying to parse an error page as a word list.
    resp.raise_for_status()

    # splitlines() + strip() also copes with CRLF line endings, which a plain
    # split("\n") would leave attached to every word as a trailing "\r".
    stripped = (line.strip() for line in resp.text.splitlines())
    word_list = [w for w in stripped if w]
    print(f"{len(word_list)=}")

    for word in word_list:
        # Explicit raise: an `assert` would be skipped when Python runs with -O.
        if len(word) != 5:
            raise ValueError(f"{word} is not a 5 letter word")
    return word_list

# Master word list (valid-wordle-words.txt), taken from this gist:
# https://gist.github.com/subhrm/5362767af06597bd1e216c59b760f6cb
URL = (
    "https://gist.githubusercontent.com/subhrm/5362767af06597bd1e216c59b760f6cb"
    "/raw/6bfa15d263d6d5b63840a8e5b64e04b382fdb079/valid-wordle-words.txt"
)
word_list = get_words(url=URL)

resp.status_code=200
len(word_list)=14855


## Download valid wordle answers

In [3]:
ANSWERS_URL = (
    "https://gist.githubusercontent.com/cfreshman/a03ef2cba789d8cf00c08f767e0fad7b"
    "/raw/45c977427419a1e0edee8fd395af1e0a4966273b/wordle-answers-alphabetical.txt"
)
# Stored as a set (despite the name) so membership tests are cheap.
answers_list = set(get_words(url=ANSWERS_URL))

### Sanity check: every answer word must also be in the master word list
assert answers_list.issubset(word_list)
assert answers_list.intersection(word_list) == answers_list

resp.status_code=200
len(word_list)=2315


## Define scoring method

In [4]:
def score_words(words, answers=None):
    """Score each word by how common its letters are at their positions.

    A word's score is the sum of three parts:

    * 1.0 if the word consists of five distinct letters,
    * 2.0 if the word is in ``answers``,
    * for every position, ``log10`` of the number of words in ``words``
      that have the same letter at that position.

    Args:
        words: Words to score; letter frequencies are computed over this
            same collection. Each word may have at most five letters.
        answers: Container of words that earn the answer bonus. When omitted,
            the notebook-level ``answers_list`` is used.

    Returns:
        A list of float scores in the same order as ``words``.
    """
    if answers is None:
        # Default to the notebook-level set of valid answers.
        answers = answers_list

    # position_counts[i][c] == number of words with letter c at index i.
    position_counts = [Counter() for _ in range(5)]
    for word in words:
        for i, c in enumerate(word):
            position_counts[i][c] += 1

    scores = []
    for word in words:
        score = 1.0 if len(set(word)) == 5 else 0.0
        if word in answers:
            score += 2.0
        for i, c in enumerate(word):
            # Each count is at least 1 (the word itself), so log10 is >= 0.
            score += math.log10(position_counts[i][c])
        scores.append(score)
    return scores

# Smoke test: display the scores for a few hand-picked strings
# (the resulting values appear in the cell output below).
score_words(["hollo", "world", "aaalo", "bbbbo"])


[1.255272505103306, 3.778151250383644, 0.9542425094393249, 0.47712125471966244]

In [5]:
def filter_words(vocab: List[str],
                 including: str,
                 excluding: str,
                 pos_match: List,
                 not_pos_match: List):
    """Keep the words that satisfy every known constraint and rank them.

    Args:
        vocab: Candidate words; each is indexed at positions 0-4.
        including: Letters that must all occur in a word. Any iterable of
            single letters works (a string or a set); repeats are ignored.
        excluding: Letters that must not occur in a word (a string, list or
            set of single letters).
        pos_match: Five entries; entry ``i`` is the letter required at
            position ``i``. Anything that is not a key of ``char_map``
            (for example ``""`` or ``"*"``) leaves that position free.
        not_pos_match: Five entries; entry ``i`` holds the letters that must
            not appear at position ``i``.

    Returns:
        A list of ``(score, word)`` tuples sorted in descending order, with
        the scores computed by ``score_words`` over the surviving words only.
    """
    required = set(including)
    banned = set(excluding)
    # Resolve the positional requirements once instead of once per word.
    fixed = [(i, pos_match[i]) for i in range(5) if pos_match[i] in char_map]

    def is_candidate(word):
        letters = set(word)
        # Subset test instead of comparing against len(including): a letter
        # repeated in `including` must not inflate the number of required
        # letters, which previously rejected every word.
        if not required <= letters or letters & banned:
            return False
        if any(word[i] != c for i, c in fixed):
            return False
        return not any(word[i] in not_pos_match[i] for i in range(5))

    res = [w for w in vocab if is_candidate(w)]

    scores = score_words(res)
    return sorted(zip(scores, res), reverse=True)

In [7]:
## Test
# Rank the master-list words that contain "h" and none of the excluded
# letters. "*" is not a key of char_map, so pos_match="*****" leaves every
# position unconstrained.
filter_words(
    vocab=word_list,
    including="h",
    excluding="abfgijklmpqrsuvwxyz",
    pos_match="*****",
    not_pos_match=["", "", "", "", ""],
)

[(7.4593924877592315, 'notch'),
 (7.345883212931713, 'conch'),
 (7.082641778157132, 'tooth'),
 (6.8327643049405316, 'hence'),
 (6.769377326076139, 'tenth'),
 (6.702430536445526, 'teeth'),
 (5.94546858513182, 'tench'),
 (5.811575005870593, 'deoch'),
 (5.702430536445526, 'dench'),
 (5.6635124704151565, 'doeth'),
 (5.65667304588485, 'honed'),
 (5.521974471987394, 'cohen'),
 (5.45502768235678, 'cooch'),
 (5.367877506637881, 'hooch'),
 (5.288696260590256, 'hoten'),
 (5.199755177253475, 'cotch'),
 (5.1126050015345745, 'hotch'),
 (5.054613054556888, 'hench'),
 (4.904120213476199, 'chode'),
 (4.852967691028819, 'cheth'),
 (4.714664992862537, 'hotte'),
 (4.65667304588485, 'hohed'),
 (4.598681098907163, 'tehee'),
 (4.45502768235678, 'chott'),
 (4.434824296268494, 'thete'),
 (4.362482474751175, 'docht'),
 (4.345883212931713, 'cohoe'),
 (4.33008894574848, 'cheet'),
 (4.230704313612569, 'theed'),
 (4.029058950084499, 'choco'),
 (4.003460532109506, 'oohed'),
 (3.7280289544205183, 'choon'),
 (3.51054

## Define Game Class

In [None]:
class Game:
    """Accumulate Wordle feedback and propose the next guess.

    Feedback strings use one character per guessed letter:
    "g" (right letter, right position), "y" (letter present, wrong position)
    and "b" (no match at this position).
    """

    def __init__(self, debug=False):
        """Start with no knowledge about the answer.

        Args:
            debug: When true, ``update_result`` prints the constraints it
                has collected after every call.
        """
        # Letters known to occur somewhere in the answer.
        self.including = set()
        # Letters known to be absent, in the order they were learned.
        self.excluding = []
        # Confirmed letter for each position ("" while unknown).
        self.pos_match = [""]*5
        # For each position, the letters known NOT to be there.
        self.not_pos_match = [[] for _ in range(5)]
        # The most recently proposed word.
        self.guess = ""
        self.debug = debug

    def get_words(self):
        """Return the ``(score, word)`` candidates matching the current constraints."""
        return filter_words(
            vocab=word_list,
            including=self.including,
            excluding=self.excluding,
            pos_match=self.pos_match,
            not_pos_match=self.not_pos_match
        )

    def start(self):
        """Pick a random opening word that contains at least two vowels.

        Two random vowels are required only for this lookup; the word is
        drawn at random from the 20 best-scoring matches.

        Returns:
            The chosen word, which is also stored in ``self.guess``.
        """
        vowels = list("aeiou")
        random.shuffle(vowels)

        # Add the vowels only temporarily and afterwards remove exactly the
        # ones added here, so letters learned earlier are not wiped.
        added = set(vowels[:2]) - self.including
        self.including |= added
        try:
            words = self.get_words()[:20]
        finally:
            self.including -= added

        self.guess = random.choice(words)[1]
        return self.guess

    def _ban_at(self, i, c):
        """Record that letter ``c`` cannot be at position ``i`` (no duplicates)."""
        if c not in self.not_pos_match[i]:
            self.not_pos_match[i].append(c)

    def update_result(self, res):
        """Fold the feedback for ``self.guess`` into the known constraints.

        Args:
            res: Feedback string with at least 5 characters; characters other
                than "g", "y" and "b" carry no information and are skipped.

        Raises:
            IndexError: If ``res`` or ``self.guess`` has fewer than 5
                characters. Nothing is modified in that case.
        """
        # Check up front so that a short string cannot leave the constraints
        # half-updated.
        if len(res) < 5 or len(self.guess) < 5:
            raise IndexError("feedback and current guess must each have 5 characters")

        # Pass 1: greens and yellows, so that pass 2 knows which letters are
        # present before deciding what to exclude.
        for i in range(5):
            r = res[i]
            c = self.guess[i]

            if r == "y":
                self.including.add(c)
                self._ban_at(i, c)
            elif r == "g":
                self.including.add(c)
                self.pos_match[i] = c

        # Pass 2: a "b" always rules the letter out at this position, but it
        # only rules the letter out entirely when no other feedback showed
        # the letter to be present.
        for i, (c, r) in enumerate(zip(self.guess, res)):
            if r != "b":
                continue
            self._ban_at(i, c)
            if c not in self.including and c not in self.excluding:
                self.excluding.append(c)

        if self.debug:
            print(f"{self.including=} {self.excluding=}")
            print(f"{self.pos_match=} {self.not_pos_match=}")

    def play(self, debug=False):
        """Choose the best-scoring remaining candidate as the next guess.

        Args:
            debug: Unused; kept so existing callers keep working.

        Returns:
            A ``(number_of_candidates, guess)`` tuple.

        Raises:
            IndexError: If no word satisfies the collected constraints.
        """
        candidates = self.get_words()
        if not candidates:
            raise IndexError("no word matches the feedback collected so far")
        self.guess = candidates[0][1]
        return len(candidates), self.guess


## Play a Game

In [None]:
# Interactive session: the solver proposes a word, the user types back the
# colours Wordle showed for it (g / y / b, one character per letter).
g = Game(debug=True)
pred = g.start()
print(f"Starting word is :  {pred}")
while True:
    res = input("what is the result:  ").strip().lower()
    if res in ("", "ggggg", "stop", "ok", "done"):
        print("bye")
        break
    # Reject malformed feedback before it reaches the solver: a wrong length
    # would crash update_result, and stray characters would be ignored
    # silently.
    if len(res) != 5 or set(res) - set("gyb"):
        print("Please enter exactly 5 characters using only g, y and b")
        continue
    g.update_result(res)
    try:
        cnt, pred = g.play()
    except IndexError:
        # play() indexes the first candidate, so an empty candidate list
        # (contradictory feedback) surfaces as IndexError.
        print("No word matches the feedback entered so far")
        break
    if cnt > 1:
        print(f"=== The next predicted word is : {pred} === out of *{cnt}* remaining ones")
    else:
        print(f"#### Found only ONE word : {pred} ####")
        print("The problem should be solved now")
        break



Starting word is :  guard
what is the result:  bbggb
self.including={'r', 'a'} self.excluding=['g', 'u', 'd']
self.pos_match=['', '', 'a', 'r', ''] self.not_pos_match=[['g'], ['u'], [], [], ['d']]
=== The next predicted word is : share === out of *145* remaining ones


KeyboardInterrupt: Interrupted by user

## Test the method on valid Wordle Answers

In [None]:
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
DO_TEST = False # @param {type:"boolean"}
if DO_TEST:
    # Number of guesses needed for every simulated game.
    num_of_steps = []
    for secret in tqdm(answers_list):
        # The opening word is random, so play each secret three times.
        for _ in range(3):
            game_obj = Game(debug=False)
            guess = game_obj.start()
            steps = 0
            while True:
                steps += 1
                # Simulated feedback: "g" for an exact positional match, "y"
                # when the letter occurs anywhere in the secret, "b"
                # otherwise. Repeated letters get no special handling.
                _res = []
                for i, c in enumerate(guess):
                    if c == secret[i]:
                        _res.append("g")
                    elif c in secret:
                        _res.append("y")
                    else:
                        _res.append("b")

                res = "".join(_res)
                if res == "ggggg":
                    num_of_steps.append(steps)
                    break
                game_obj.update_result(res)
                # play() returns (candidate_count, word); only the word is
                # the next guess. Binding the whole tuple made the next
                # iteration fail with a TypeError.
                _, guess = game_obj.play()

    # Show Test Result
    print(len(num_of_steps))
    print(np.min(num_of_steps), np.max(num_of_steps), np.mean(num_of_steps))
    # counts[k] == number of games solved in exactly k guesses.
    counts = np.bincount(num_of_steps)
    print(f"{counts=}")
    # Cumulative percentage of games solved within k guesses.
    print(np.round((np.cumsum(counts) / np.sum(counts))*100.0, decimals=1))
    plt.bar(np.arange(len(counts)), height=counts)