<a href="https://colab.research.google.com/github/ptandon0/wordle_engine/blob/main/simple_wordle_solver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import pandas as pd
import numpy as np
import json
from scipy.stats import entropy

In [2]:
# -nc (no-clobber): skip the download when freq_map.json is already present, so
# re-running this cell does not leave a duplicate freq_map.json.1 behind.
!wget -nc https://raw.githubusercontent.com/3b1b/videos/master/_2022/wordle/data/freq_map.json

--2022-03-24 17:43:30--  https://raw.githubusercontent.com/3b1b/videos/master/_2022/wordle/data/freq_map.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 346629 (339K) [text/plain]
Saving to: ‘freq_map.json’


2022-03-24 17:43:31 (7.93 MB/s) - ‘freq_map.json’ saved [346629/346629]



In [3]:
# Wordle word -> frequency map derived from the Google Books Ngram viewer,
# taken from the 3Blue1Brown (3b1b) GitHub repository.
with open("freq_map.json") as freq_file:  # context manager closes the handle promptly
    freq_dict = json.load(freq_file)
freqDF = pd.DataFrame.from_dict(freq_dict, orient="index", columns=["word_frequency"], dtype="float64")

# The Wordle dictionary is the key set of the frequency map.
words = freqDF.index.tolist()
random.shuffle(words)

# Give every word a "likely answer" probability: lay the frequency-sorted words
# along the x axis of a sigmoid, positioned so that roughly the n_common most
# frequent words fall on the positive side. This technique is also borrowed from
# the 3Blue1Brown video.
freqDF = freqDF.sort_values("word_frequency")  # ascending: rarest word first
sorted_word_array = np.array(freqDF.index.tolist())
width_under_sigmoid = 10  # width of x axis under sigmoid function
n_common = 4000  # number of words which we want to be on positive side
x_width = width_under_sigmoid
c = x_width * (-0.5 + n_common / len(sorted_word_array))  # centerpoint
xs = np.linspace(c - x_width / 2, c + x_width / 2, len(sorted_word_array))
# xs is ascending and freqDF is sorted ascending by frequency, so the two line up
# row by row; the sigmoid is evaluated on the whole array at once.
freqDF["sigmoid_prob"] = 1.0 / (1.0 + np.exp(-xs))

In [4]:
#set up list of all possible color combinations:
def generate_all_colors(n_letters=5, symbols=("g", "y", "G")):
  """Return every feedback pattern of length ``n_letters`` over ``symbols``.

  With the defaults this is the 3**5 = 243 five-character strings made of
  "g" (gray), "y" (yellow) and "G" (green). Patterns are ordered with the first
  character varying slowest and each position cycling through ``symbols`` in
  the order given ("ggggg", "ggggy", "ggggG", "gggyg", ...).

  Args:
    n_letters: length of each pattern; must be >= 0 (0 yields [""]).
    symbols: the single-character feedback codes to combine.

  Returns:
    list of str with len(symbols) ** n_letters entries.

  Raises:
    ValueError: if n_letters is negative.
  """
  if n_letters < 0:
    raise ValueError("n_letters must be non-negative")
  patterns = [""]
  # Each pass appends one more position to every pattern built so far.
  for _ in range(n_letters):
    patterns = [prefix + symbol for prefix in patterns for symbol in symbols]
  return patterns

ALL_COLORS = generate_all_colors()

In [36]:
def wordle_solver(words, freq_df, guesses_x, colors):
    """Filter the dictionary by the feedback received so far and rank what is left.

    Progress messages are printed along the way and three top-10 tables (by
    entropy, by sigmoid_prob, by their sum) are shown with ``display``.

    Parameters
    ----------
    words : iterable of str
        Candidate words.
    freq_df : pandas.DataFrame
        Indexed by word, with ``word_frequency`` and ``sigmoid_prob`` columns.
    guesses_x : list of str
        Guesses played so far (lower-cased here); empty strings are unused slots.
    colors : list of str
        Feedback for the guess at the same position in ``guesses_x``, one
        character per letter: "g" gray, "y" yellow, "G" green.

    Returns
    -------
    pandas.DataFrame
        One row per remaining word, ordered by ``guess``, with columns
        ``guess``, ``sigmoid_prob``, ``entropy`` and ``score``
        (``sigmoid_prob + entropy``). ``entropy`` is NaN for a guess for which
        no feedback pattern leaves any word.
    """
    guesses = list(map(str.lower, guesses_x))

    # Sort every guessed letter into its color bucket in a single pass, keeping
    # guess-then-position order inside each bucket. Slots are (position, letter).
    grays, yellows, greens = [], [], []
    letters_by_color = {"g": grays, "y": yellows, "G": greens}
    slots_by_color = {"g": [], "y": [], "G": []}
    for guess, pattern in zip(guesses, colors):
        for position, (letter, color) in enumerate(zip(guess, pattern)):
            if color in letters_by_color:
                letters_by_color[color].append(letter)
                slots_by_color[color].append((position, letter))

    # A letter can be gray in one slot and yellow/green in another; only letters
    # that are never yellow or green are known to be absent from the answer.
    confirmed = set(yellows) | set(greens)
    grays_filtered = [letter for letter in grays if letter not in confirmed]
    print("filtering out gray characters", ",".join(grays_filtered))
    absent = set(grays_filtered)
    # dict.fromkeys de-duplicates while keeping the caller's word order.
    words_tmp = [word for word in dict.fromkeys(words) if absent.isdisjoint(word)]

    # Slots the answer's letter cannot occupy: every yellow slot, followed by the
    # gray slots of letters that are yellow/green elsewhere.
    banned_slots = slots_by_color["y"] + [
        slot for slot in slots_by_color["g"] if slot[1] in confirmed
    ]

    print("looking for green characters")
    applied_greens = set()
    for position, letter in slots_by_color["G"]:
        # Skip exact repeats only; keying on (position, letter) means a
        # contradictory green at the same position is still enforced.
        if (position, letter) in applied_greens:
            continue
        print("filtering for character", letter, "at index", position)
        words_tmp = [word for word in words_tmp if word[position] == letter]
        applied_greens.add((position, letter))

    if len(words_tmp) > 1:  # if greens fully match the word; then we are done.
        print("begin filtering yellow characters")
        print("select only words which contain:", ",".join(set(yellows)))
        for letter in yellows:
            words_tmp = [word for word in words_tmp if letter in word]
        for position, letter in banned_slots:
            print("filtering for character", letter, "not at at index", position)
            words_tmp = [word for word in words_tmp if word[position] != letter]
    print(len(words_tmp), "words remain in the corpus")

    print("sort remaining corpus by 5-gram frequency")
    # Use the frame that was passed in rather than reaching for a notebook global.
    word_df = freq_df[freq_df.index.isin(words_tmp)].sort_values("sigmoid_prob", ascending=False)
    print("begin_entropy_calculations to sort by information gained for each guess")
    word_weights = (word_df["word_frequency"] / word_df["word_frequency"].sum()).to_dict()
    guesses_strip = list(filter(None, guesses))
    colors_strip = list(filter(None, colors))
    # Only words that have a weight can contribute to a pattern's probability mass.
    corpus = [word for word in words_tmp if word in word_weights]

    rows = []
    for guess, sigmoid_prob in zip(word_df.index, word_df["sigmoid_prob"]):
        # Weight of the words that stay possible under each hypothetical pattern.
        freq_dist = np.array([
            get_freq_dist(corpus, guesses_strip, colors_strip, guess, pattern, word_weights)
            for pattern in ALL_COLORS
        ])
        total = freq_dist.sum()
        # NOTE: scipy.stats.entropy defaults to the natural logarithm, so these
        # values are in nats even though the message below says "bits".
        guess_entropy = entropy(freq_dist / total) if total != 0 else np.nan
        rows.append((guess, sigmoid_prob, guess_entropy))

    out = (
        pd.DataFrame(rows, columns=["guess", "sigmoid_prob", "entropy"])
        .sort_values(["guess", "sigmoid_prob"])
        .reset_index(drop=True)
    )
    out["score"] = out.sigmoid_prob + out.entropy
    print("The top 10 guesses by bits of information are:")
    display(out.sort_values("entropy", ascending=False).head(10))
    print("The top 10 guesses by likelihood of words in english are:")
    display(out.sort_values("sigmoid_prob", ascending=False).head(10))
    print("The top 10 guesses by simply adding entropy and word likelihood are:")
    display(out.sort_values("score", ascending=False).head(10))
    return out

In [37]:
def get_freq_dist(corpus, prev_guesses, prev_colors, target_guess, target_color, weight_dict):
    """Total weight of the corpus words consistent with all feedback plus one more round.

    The previous rounds are extended with ``target_guess`` / ``target_color`` and
    every word of ``corpus`` is checked against the combined constraints:

    * a letter that is gray and never yellow or green must not occur in the word;
    * a green letter must sit at its position;
    * a yellow letter must occur in the word, but not at its position;
    * a gray letter that is yellow/green elsewhere must not sit at its gray position.

    Parameters
    ----------
    corpus : iterable of str
        Candidate words; duplicates are counted once.
    prev_guesses, prev_colors : list of str
        Earlier guesses and their feedback ("g" gray, "y" yellow, "G" green).
    target_guess, target_color : str
        The hypothetical next guess and the feedback pattern being evaluated.
    weight_dict : dict
        Maps each corpus word to its weight.

    Returns
    -------
    The sum of ``weight_dict[word]`` over the consistent words; 0 if none remain.
    """
    # Pair each guess with its own pattern directly instead of round-tripping
    # through a comma-joined string.
    rounds = list(zip(prev_guesses, prev_colors)) + [(target_guess, target_color)]

    gray_letters, yellow_letters, green_letters = set(), set(), set()
    green_slots = set()   # (position, letter) the word must match
    banned_slots = set()  # (position, letter) the word must not match
    for guess, pattern in rounds:
        for position, (letter, color) in enumerate(zip(guess, pattern)):
            if color == "G":
                green_letters.add(letter)
                green_slots.add((position, letter))
            elif color == "y":
                yellow_letters.add(letter)
                banned_slots.add((position, letter))
            elif color == "g":
                gray_letters.add(letter)
                # Harmless for fully absent letters (the word is rejected anyway);
                # required for letters that are yellow/green in another slot.
                banned_slots.add((position, letter))

    absent = gray_letters - yellow_letters - green_letters

    def is_consistent(word):
        if not absent.isdisjoint(word):
            return False
        # Every green slot is enforced, so two different greens at the same
        # position correctly leave no candidates.
        if any(word[position] != letter for position, letter in green_slots):
            return False
        if any(letter not in word for letter in yellow_letters):
            return False
        return all(word[position] != letter for position, letter in banned_slots)

    # dict.fromkeys de-duplicates while keeping corpus order, so the floating
    # point summation order does not depend on set/hash ordering.
    return sum(weight_dict[word] for word in dict.fromkeys(corpus) if is_consistent(word))

In [47]:
# Fill in the guesses played so far (all lowercase, 5-letter words) and, at the
# same position in `colors`, the pattern the game returned for each guess as a
# 5-character string: g = gray, y = yellow, G = green.
# At least one valid guess together with its color pattern is required; the
# unused slots stay as empty strings.
# Runtime: the entropy step currently takes ~3 min in Colab for ~900 guesses
# and is very fast for fewer than 400.

guesses = ["salet", "retro", "write", "", ""]
colors = ["gggyy", "yyygg", "gyyyy", "", ""]
k = wordle_solver(words=words, freq_df=freqDF, guesses_x=guesses, colors=colors)

filtering out gray characters s,a,l,o,w
looking for green characters
begin filtering yellow characters
select only words which contain: r,t,i,e
filtering for character e not at at index 3
filtering for character t not at at index 4
filtering for character r not at at index 0
filtering for character e not at at index 1
filtering for character t not at at index 2
filtering for character r not at at index 1
filtering for character i not at at index 2
filtering for character t not at at index 3
filtering for character e not at at index 4
filtering for character r not at at index 3
1 words remain in the corpus
sort remaining corpus by 5-gram frequency
begin_entropy_calculations to sort by information gained for each guess
The top 10 guesses by bits of information are:




Unnamed: 0,guess,sigmoid_prob,entropy,score
,,,,
0.0,their,0.956177,0.0,0.956177


The top 10 guesses by likelihood of words in english are:


Unnamed: 0,guess,sigmoid_prob,entropy,score
,,,,
0.0,their,0.956177,0.0,0.956177


The top 10 guesses by simply adding entropy and word likelihood are:


Unnamed: 0,guess,sigmoid_prob,entropy,score
,,,,
0.0,their,0.956177,0.0,0.956177
