<a href="https://colab.research.google.com/github/ramayer/google-colab-examples/blob/main/Spark_Wordle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Optimal (for bragging rights) Wordle strategy

* Observations
  * Most Wordle strategy guides try to optimize the average or worst-case number of turns it takes to find the word.
  * In many settings (e.g., showing off to a group), a better metric is maximizing your chance of getting the word on your second guess.



In [25]:
%pip install pyspark >& /tmp/pyspark.out

In [4]:
import requests
from collections import defaultdict

# plagiarized from https://www.kaggle.com/yelbuzz/wordle-second-guess/script

# Wordle uses a couple lists of words.
def getWordleWords():
    """Download the word lists used by Wordle.

    Returns:
        tuple: ``(solutions, guesses)`` -- the values stored under the
        "solutions" and "guesses" keys of the downloaded JSON document.

    Raises:
        requests.exceptions.Timeout: if the server does not respond in time.
        requests.exceptions.HTTPError: if the server answers with an error status.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(
        "https://raw.githubusercontent.com/andrew-t/gaming-wordle/main/words.json",
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
    response.raise_for_status()
    wordledict = response.json()
    solutions = wordledict["solutions"]
    guesses = wordledict["guesses"]
    return solutions, guesses

# Method to calculate the pattern of colors Wordle will return for a pair of words.
def compute_wordle_color(word, solution):
    """Return the color pattern for guessing `word` when the answer is `solution`.

    The result has one character per letter of `word`:
      "g" -- the letter matches the solution at that position,
      "y" -- the letter occurs in the solution at a position not yet matched,
      "_" -- no match.
    Every solution letter is consumed by at most one hint, and exact-position
    matches are assigned before any "y", so repeated letters are not over-counted.

    Args:
        word: the guessed word.
        solution: the answer; must be at least as long as `word`.

    Returns:
        str: the pattern, e.g. ``"g_y_y"`` for ("awake", "aahed").

    Raises:
        IndexError: if `solution` is shorter than `word`.
    """
    # Solution letters that have not been used up by a hint yet.
    unmatched = list(solution)
    # Guess letters still eligible for a "y"; exact matches are blanked with None
    # (not a placeholder character) so the marker can never equal a solution letter.
    pending = list(word)
    # One slot per guessed letter rather than a hard-coded 5.
    pattern = ["_"] * len(word)

    # First pass: exact-position matches.
    for i, c in enumerate(word):
        if c == solution[i]:
            unmatched.remove(c)
            pending[i] = None
            pattern[i] = "g"

    # Second pass: remaining guess letters that occur elsewhere in the solution.
    for i, c in enumerate(pending):
        if c is not None and c in unmatched:
            unmatched.remove(c)
            pattern[i] = "y"

    return "".join(pattern)

# Fetch both word lists; any word in either list may be entered as a guess.
solutions, guesses = getWordleWords()
allWords = solutions + guesses

# Report the size of each list with a five-word sample
# (the solutions sample is sorted, the guesses sample is shown as-is).
print(
    f"""{len(solutions)} valid solution words {",".join(sorted(solutions[:5]))}...""",
    f"""{len(guesses)} valid guess words {",".join(guesses[:5])}...""",
    sep="\n",
)

2315 valid solution words awake,cigar,humph,rebut,sissy...
10657 valid guess words aahed,aalii,aargh,aarti,abaca...


In [5]:
# Spot check: the color pattern for the guess "awake" when the solution is "aahed".
compute_wordle_color("awake","aahed")

'g_y_y'

In [6]:
import pyspark
# Memory budget applied to both the Spark driver and the executors.
MAX_MEMORY = "8g"  # 24 gives OOM here.

# Build (or reuse) the Hive-enabled session used by the rest of the notebook.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.executor.memory", MAX_MEMORY)
    .enableHiveSupport()
    .getOrCreate()
)
spark

In [7]:
# Every word known to Wordle, exposed to Spark SQL as the single-column table `all_words` (column `guess`).
allwords_df = spark.createDataFrame([[word] for word in allWords], 'guess string')
allwords_df.createOrReplaceTempView('all_words')

# Only the possible solution words, exposed as the single-column table `solutions` (column `solution`).
solutions_df = spark.createDataFrame([[word] for word in solutions], 'solution string')
solutions_df.createOrReplaceTempView('solutions')

In [8]:
# create a spark UDF to compute the color of any pair of Wordle words.
import pyspark.sql.functions as psf
import pandas as pd
@psf.pandas_udf('string')
def compute_wordle_color_udf(a:pd.Series,b:pd.Series) -> pd.Series:
  """Apply compute_wordle_color element-wise to two string Series.

  `a` supplies the first argument and `b` the second argument of each
  compute_wordle_color call; the result holds one pattern per input pair.
  """
  patterns = list(map(compute_wordle_color, a.to_list(), b.to_list()))
  return pd.Series(patterns)
# Make the UDF callable from Spark SQL under the name `compute_wordle_color`.
spark.udf.register("compute_wordle_color",compute_wordle_color_udf)

<function __main__.compute_wordle_color_udf(a: pandas.core.series.Series, b: pandas.core.series.Series) -> pandas.core.series.Series>

In [9]:
%%time
# Compute the Wordle colors for every (guess, solution) pair: `all_words` is joined to `solutions`
# without a join condition, so every combination is scored.
# For each guess, the solutions are then grouped by the color pattern that guess would produce:
#   * number_of_unique_hints counts the patterns shared by exactly one solution
#     (seeing such a pattern leaves a single candidate for the second guess);
#   * maybe_a_better_metric is the sum of 1/num_solutions over all of the guess's patterns;
#   * details keeps, per pattern, its solution count and up to 10 of the matching solutions.
# Guesses are ordered by maybe_a_better_metric, highest first, and the result is cached for the cells below.
df = spark.sql("""
 with colors as (
   select guess,
          solution,
          compute_wordle_color(guess,solution) as colors
    from all_words
    join solutions
  ),
  guesses_with_colors as (
      select 
          guess,
          colors,
          count(*)               as num_solutions,
          collect_list(solution) as possible_solutions
        from colors
        group by guess,colors
  )
  select 
          guess,
          count(case when num_solutions = 1 then 1 else null end) as number_of_unique_hints,
          sum(1/num_solutions)                                    as maybe_a_better_metric,
          collect_list(struct(
              colors,
              num_solutions,
              slice(possible_solutions,1,10) as some_solutions
              )) as details
     from guesses_with_colors
     group by guess
     order by maybe_a_better_metric desc
""").cache()
# Display the ten top-ranked guesses as a pandas DataFrame.
df.limit(10).toPandas()

CPU times: user 1.48 s, sys: 203 ms, total: 1.68 s
Wall time: 3min 47s


Unnamed: 0,guess,number_of_unique_hints,maybe_a_better_metric,details
0,laten,41,57.977247,"[(_g_gg, 5, [haven, ramen, oaken, waxen, raven..."
1,caron,40,57.739217,"[(_y_gg, 1, [axion]), (_g__g, 8, [pagan, laden..."
2,filet,37,57.492254,"[(__gyy, 2, [delta, tulle]), (g_y_y, 1, [fatal..."
3,parse,36,57.158766,"[(_yy_g, 25, [grade, crate, argue, trace, brak..."
4,rotan,41,56.912519,"[(_g___, 79, [model, booby, golem, loopy, booz..."
...,...,...,...,...
95,prase,33,51.625273,"[(_y___, 88, [forth, ivory, round, corny, fjor..."
96,relay,34,51.614940,"[(_ggy_, 2, [delta, fella]), (__g_y, 2, [polyp..."
97,groma,39,51.596172,"[(gg__y, 16, [grade, great, grate, grave, grac..."
98,bread,35,51.584356,"[(__g__, 64, [whelp, flesh, siege, sweet, smel..."


In [14]:
# Pull the top-ranked rows of the cached ranking to the driver.
best_5_initial_words = df.take(3)
for row in best_5_initial_words:
  guess = row.guess
  unique_colors = row.number_of_unique_hints
  print(f"""The word "{guess}" will guarantee you to be able to solve the puzzle on your second try for {unique_colors} words""")
  # Keep only the patterns that leave fewer than 4 candidate solutions, listed in pattern order.
  interesting_details = [(clrs,ns,words) for clrs,ns,words in row.details if ns < 4]
  interesting_details.sort(key=lambda detail: detail[0])
  for clrs,ns,words in interesting_details:
    if ns != 1:
      print(f"""   if you guess "{guess}" and the colors are {clrs}, the possible words are {words}""")
    else:
      print(f"""   if you guess "{guess}" and the colors are {clrs}, the only possible word is {words}""")

The word "laten" will guarantee you to be able to solve the puzzle on your second try for 41 words
   if you guess "laten" and the colors are __g_y, the possible words are ['notch', 'nutty', 'intro']
   if you guess "laten" and the colors are __ggg, the only possible word is ['often']
   if you guess "laten" and the colors are __ggy, the possible words are ['inter', 'enter']
   if you guess "laten" and the colors are __gyy, the possible words are ['entry', 'untie']
   if you guess "laten" and the colors are __y_g, the possible words are ['thorn', 'toxin']
   if you guess "laten" and the colors are __ygg, the only possible word is ['token']
   if you guess "laten" and the colors are __yyg, the possible words are ['stein', 'stern']
   if you guess "laten" and the colors are _g_gy, the only possible word is ['saner']
   if you guess "laten" and the colors are _gg_g, the possible words are ['baton', 'satin']
   if you guess "laten" and the colors are _gggg, the only possible word is ['eate