## Wordle 5 set second approach

Date : 23-Jun-2024
Time < 5 Min

## Find the sets of 5 words with no char in common

In [1]:
import os
import sys
import requests
from tqdm import tqdm
import numpy as np
from collections import defaultdict, Counter
from string import ascii_lowercase
from typing import List, Set, Dict
# Letter -> bit position lookup: 'a' maps to 0, 'b' to 1, ... 'z' to 25.
char_map = dict(zip(ascii_lowercase, range(len(ascii_lowercase))))

### Get all words

In [2]:
def get_words(url=None, timeout=30):
    """Download the list of valid Wordle words and return it as a list of strings.

    Parameters
    ----------
    url : str, optional
        Location of a plain-text file with one word per line. Defaults to the
        pinned gist revision used by this notebook.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        The non-empty lines of the file, with surrounding whitespace removed.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    AssertionError
        If any entry is not exactly 5 characters long.
    """
    if url is None:
        # https://gist.github.com/subhrm/5362767af06597bd1e216c59b760f6cb
        url = "https://gist.githubusercontent.com/subhrm/5362767af06597bd1e216c59b760f6cb/raw/6bfa15d263d6d5b63840a8e5b64e04b382fdb079/valid-wordle-words.txt"
    # Without a timeout requests can block forever on a stalled connection.
    resp = requests.get(url, timeout=timeout)
    print(f"{resp.status_code=}")
    # Fail loudly instead of parsing an error page as if it were the word list.
    resp.raise_for_status()
    # splitlines() + strip() also copes with CRLF line endings and stray spaces,
    # which would otherwise make a valid word fail the length check below.
    word_list = [w.strip() for w in resp.text.splitlines() if w.strip()]
    print(f"{len(word_list)=}")
    for word in word_list:
        assert len(word) == 5, f"{word} is not a 5 letter word"
    return word_list

# Fetch the word list once; the following cells read `word_list`.
word_list = get_words()

resp.status_code=200
len(word_list)=14855


### Convert words to numbers to keep track of anagrams

In [3]:
def word_to_num(word):
    """Encode the set of letters in ``word`` as an integer bitmask.

    Bit ``char_map[c]`` is set for every character ``c`` of ``word``, so two
    words get the same number exactly when they use the same set of letters
    (e.g. anagrams), and ``a & b == 0`` means the two words share no letter.

    Raises ``KeyError`` for a character that is not a key of ``char_map``.
    """
    mask = 0
    for c in word:
        # OR, not add: summing would let a repeated letter carry into the
        # next bit ("aa" would collide with "b").
        mask |= 1 << char_map[c]
    return mask

# sanity check: anagrams must collapse to the same number
assert word_to_num("abc") == word_to_num("bac")

# number -> every word (anagram) that encodes to that number
word_map = defaultdict(list)
ignored = 0
for w in word_list:
    if len(set(w)) != 5:
        # a word with a repeated letter is skipped and only counted
        ignored += 1
        continue
    word_map[word_to_num(w)].append(w)

print(f"{len(word_map)=}  {ignored=}")

# one representative number per anagram group, in ascending order
unique_list = sorted(word_map)
n = len(unique_list)

len(word_map)=5650  ignored=5490


## Create graph where the nodes are words and there's an edge if the words share no common letter

In [11]:
# Adjacency map: word-number -> set of word-numbers that come later in
# unique_list and share no bit (letter) with it.
g = defaultdict(set)

for nxt, w1 in enumerate(tqdm(unique_list), start=1):
    disjoint = [w2 for w2 in unique_list[nxt:] if not (w1 & w2)]
    if disjoint:
        # only touch g[w1] when there is a partner, so words without any
        # disjoint partner never become keys
        g[w1].update(disjoint)

print(f"{len(g)=}")

100%|██████████| 5650/5650 [00:08<00:00, 648.25it/s] 


len(two_set_keys)=593720
len(g)=5418


# Solution: 1

Use the 5-clique solution. Iteratively find all complete sub-graphs with 5 nodes. Runs in `< 4 minutes`

In [14]:
# Enumerate 5-cliques of g: at every level keep only the words that are
# neighbours of all words chosen so far, and prune as soon as too few remain
# to complete a set of five.
solutions = []

for w1 in tqdm(unique_list):
    pool1 = g[w1]
    if len(pool1) < 4:
        continue
    for w2 in pool1:
        pool2 = pool1 & g[w2]
        if len(pool2) < 3:
            continue
        for w3 in pool2:
            pool3 = pool2 & g[w3]
            if len(pool3) < 2:
                continue
            for w4 in pool3:
                # whatever survives this last intersection completes the clique
                solutions.extend((w1, w2, w3, w4, w5) for w5 in pool3 & g[w4])

print(f"\n{len(solutions)=}")

100%|██████████| 5650/5650 [03:59<00:00, 23.63it/s] 


len(solutions)=23





# Solution 2 : Smart Brute force

`Time : ~ 7 minutes`

In [17]:
# Same search, but instead of intersecting neighbour sets it carries the OR of
# the letters used so far and filters candidates against that mask.
solutions2 = []
for w1 in tqdm(unique_list):
    partners = g[w1]
    if len(partners) < 4:
        continue
    for w2 in partners:
        used2 = w1 | w2
        rest2 = [w for w in g[w2] if not (used2 & w)]
        if len(rest2) < 3:
            continue
        # rest2/rest3 are plain lists, so slicing past the current position
        # makes sure every later combination is produced only once
        for pos3, w3 in enumerate(rest2, start=1):
            used3 = used2 | w3
            rest3 = [w for w in rest2[pos3:] if not (used3 & w)]
            for pos4, w4 in enumerate(rest3, start=1):
                used4 = used3 | w4
                solutions2.extend(
                    (w1, w2, w3, w4, w5)
                    for w5 in rest3[pos4:]
                    if not (used4 & w5)
                )
print(f"\n{len(solutions2)=}")

100%|██████████| 5650/5650 [06:57<00:00, 13.54it/s] 


len(solutions2)=23





### Index words by lowest char, in other words by the lowest set bit

In [None]:
# lowest set bit -> all word-numbers whose lowest set bit is that bit
lowest_bit_map = defaultdict(list)
for mask in unique_list:
    # mask & -mask keeps only the least-significant set bit of mask
    lowest_bit_map[mask & -mask].append(mask)
print(len(lowest_bit_map))

17


In [None]:
# bucket sizes, smallest bit first
for k in sorted(lowest_bit_map):
    print(f"{k}: {len(lowest_bit_map[k])}")

1: 2289
2: 462
4: 584
8: 526
16: 854
32: 136
64: 192
128: 141
256: 238
512: 19
1024: 59
2048: 50
4096: 35
8192: 31
16384: 28
32768: 4
131072: 2


In [None]:
def get_lowest_unset_bit(num):
    """Return the value of the lowest bit that is not set in ``num``.

    Examples: ``0b0111 -> 0b1000``, ``0b0101 -> 0b0010``, ``0 -> 1``.

    Returns 0 for ``num == -1``: Python's bitwise operators treat negative
    ints as sign-extended two's complement, so -1 has every bit set and there
    is no unset bit to report (a bit-by-bit scan would never terminate).
    """
    # num + 1 clears the trailing run of 1-bits and sets the first 0-bit above
    # it; masking with ~num keeps just that newly set bit.
    return ~num & (num + 1)

In [None]:
from itertools import product
# Expand every numeric solution into actual words: each number may stand for
# several anagrams, so print one line per combination of choices.
for sol in solutions:
    anagram_groups = [word_map[w] for w in sol]
    for p in product(*anagram_groups):
        print(sol, p)

(10562, 656404, 4522017, 17863168, 35668104) ('bling', 'treck', 'waqfs', 'jumpy', 'vozhd')
(17944, 1183750, 4522017, 25206912, 34081088) ('joked', 'crumb', 'waqfs', 'phynx', 'glitz')
(43328, 656404, 4522017, 17830402, 35668104) ('pling', 'treck', 'waqfs', 'jumby', 'vozhd')
(132124, 1069570, 4522017, 25206912, 34081088) ('dreck', 'jumbo', 'waqfs', 'phynx', 'glitz')
(132358, 534608, 4522017, 17863168, 35668104) ('brick', 'glent', 'waqfs', 'jumpy', 'vozhd')
(140368, 559364, 4522017, 17830402, 35668104) ('kreng', 'clipt', 'waqfs', 'jumby', 'vozhd')
(148008, 1057924, 10486034, 17076288, 38275073) ('fjord', 'chunk', 'vibex', 'gymps', 'waltz')
(148008, 1115141, 4460562, 25206912, 34081088) ('fjord', 'quack', 'wembs', 'phynx', 'glitz')
(148008, 1311812, 10486034, 16822400, 38275073) ('fjord', 'gucks', 'vibex', 'nymph', 'waltz')
(148486, 1049176, 4522017, 25206912, 34085120) ('brock', 'judge', 'waqfs', 'phynx', 'miltz')
(151558, 1050136, 4522017, 25206912, 34081088) ('cromb', 'juked', 'waqfs', 