In [215]:
import numpy as np
import requests

Load up some English words
This list was taken from: 

https://github.com/dwyl/english-words/blob/master/words.txt
https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt
https://raw.githubusercontent.com/tabatkins/wordle-list/main/words

In [220]:
r = requests.get("https://raw.githubusercontent.com/tabatkins/wordle-list/main/words")
words = r.content.decode('utf-8').split()
words = [w.lower().strip() for w in words]
words = {w for w in words if w.strip().isalpha() and len(w.strip())==5}
    
len(words)

12947

In [237]:
words_starting_with_a = [w for w in words if w[0] == 'a']
words_starting_with_c = [w for w in words if w[0] == 'c']
words_starting_with_s = [w for w in words if w[0] == 's']

In [235]:
len(words_starting_with_a)

736

In [236]:
len(words_starting_with_c)

920

In [238]:
len(words_starting_with_s)

1560

In [225]:
# Create a 26x5 matrix.  We'll use this to count the occurence of each letter in each of the 5 positions
# index 0 will be 'a', index 1 will 'b'    So we use the ord() function to get the ascii value and subtract off ord('a')
counts = np.zeros((26, 5), int)

# # 'c' count for each position.      ord('c') - ord('a') = 2
counts[2]

array([0, 0, 0, 0, 0])

In [229]:
# Count the occurences for each letter, in each position

for w in words:
    for position, l in enumerate(w):
        letter_val = ord(l) - ord('a')            # so 'a' - 'a' is 0, 'b' - 'a' is 1, etc
        counts[letter_val][position] += 1

In [230]:
# get the order 
test = np.array([4, 2, 7, 1])
test.argsort()   # -> array([3, 1, 0, 2])

# 3 -> 1
# 1 -> 2
# 0 -> 4
# 2 -> 7


# Get the rank
test.argsort().argsort()

array([2, 1, 3, 0])

In [240]:
# Here are the counts for the first letter position
counts[:,0]

array([ 736,  908,  920,  681,  303,  595,  637,  488,  165,  202,  375,
        575,  693,  325,  262,  857,   78,  628, 1560,  815,  189,  242,
        411,   16,  181,  105])

In [241]:
# Here are the ranks for the first position.  
counts[:,0].argsort().argsort()

# a -> 20th
# c -> 24th
# s -> 25th
# t -> 21st
# Looks like 's' is the most popular starting letter

array([20, 23, 24, 18,  9, 15, 17, 13,  3,  6, 11, 14, 19, 10,  8, 22,  1,
       16, 25, 21,  5,  7, 12,  0,  4,  2])

In [278]:
# So make a matrix and compute the ranks
ranks = np.zeros((5, 26))
for i in range(5):
    ranks[i] = counts[:,i].argsort().argsort()

In [283]:
def score_word(counts, word):
    score = 0
    for position, l in enumerate(word):
        letter_val = ord(l) - ord('a')
        score += ranks[position][letter_val]
    return score
    
score_word(counts, "crane")

# c -> 25
# r -> 13
# a -> 16
# n -> 10
# e -> 22

114.0

In [284]:
def score_word(counts, word):
    letter_scores = np.zeros(5)
    for position, l in enumerate(word):
        letter_val = ord(l) - ord('a')            
        letter_scores[position] = ranks[position][letter_val]
        
    weights = np.array([1.5, 1, .75, .75, 1.5])
    letter_scores *= weights
    return letter_scores.sum()
    
score_word(counts, "crane")


126.5

114.0

In [285]:
# Score each word, skip any words with repeated letters
words_with_scores = [(score_word(counts, w), w) for w in words if len(set(w)) == 5]

In [286]:
words_with_scores.sort(key=lambda x: x[0])

In [287]:
words_with_scores[-25:]

[(131.25, 'coals'),
 (131.5, 'cains'),
 (131.5, 'carls'),
 (131.5, 'baits'),
 (131.5, 'cants'),
 (131.5, 'banes'),
 (131.5, 'saine'),
 (131.75, 'certs'),
 (132.0, 'boras'),
 (132.0, 'coate'),
 (132.0, 'cones'),
 (132.0, 'coits'),
 (132.0, 'boats'),
 (132.25, 'pares'),
 (132.25, 'carns'),
 (132.25, 'carte'),
 (132.25, 'cires'),
 (132.25, 'sared'),
 (132.75, 'bores'),
 (133.0, 'canes'),
 (133.5, 'coats'),
 (133.75, 'bares'),
 (133.75, 'carts'),
 (134.25, 'cores'),
 (135.25, 'cares')]