<a href="https://colab.research.google.com/github/ramayer/google-colab-examples/blob/main/Spark_Wordle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install PySpark into the running kernel's environment. The version is not
# pinned, so pip resolves whatever is current (the recorded run got 3.2.1).
%pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 63.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=4aa46a273b510a5968b45b88d3002b462cb673db89b345c742d442b8f85cad03
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [None]:
import requests
from collections import defaultdict

# Adapted from https://www.kaggle.com/yelbuzz/wordle-second-guess/script

def getWordleWords(
    url="https://raw.githubusercontent.com/andrew-t/gaming-wordle/main/words.json",
    timeout=30,
):
    """Download the Wordle word lists.

    Args:
        url: Location of a JSON document with top-level "solutions" and
            "guesses" keys. Defaults to the list this notebook has always used.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A ``(solutions, guesses)`` tuple holding the values stored under the
        "solutions" and "guesses" keys of the downloaded document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the document lacks one of the expected keys.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    wordledict = response.json()
    return wordledict["solutions"], wordledict["guesses"]

def makeColorful(word, solution):
    """Return the Wordle colour pattern for guessing ``word`` against ``solution``.

    The pattern has one character per letter of ``word``:
    "2" = right letter in the right place, "1" = letter occurs elsewhere in
    the solution (each solution letter is used at most once), "0" = no match.

    Args:
        word: The guessed word.
        solution: The hidden word; expected to be at least as long as ``word``
            (normally both have the same length).

    Returns:
        A string of "0"/"1"/"2" characters with the same length as ``word``.

    Raises:
        IndexError: If ``word`` is longer than ``solution``.
    """
    # Solution letters that have not yet been claimed by a green or yellow.
    unmatched = list(solution)
    # Size the result from the guess instead of assuming five letters.
    colors = ["0"] * len(word)

    # Pass 1: exact-position matches. They must be consumed first so that a
    # duplicate letter elsewhere in the guess cannot claim the same letter.
    for i, letter in enumerate(word):
        if letter == solution[i]:
            unmatched.remove(letter)
            colors[i] = "2"

    # Pass 2: misplaced letters. Positions already marked green are skipped
    # explicitly rather than by overwriting them with a sentinel character.
    for i, letter in enumerate(word):
        if colors[i] == "0" and letter in unmatched:
            unmatched.remove(letter)
            colors[i] = "1"

    return "".join(colors)

# Fetch both lists once; allWords is the solutions followed by the guesses.
solutions, guesses = getWordleWords()
allWords = [*solutions, *guesses]

In [None]:
import pyspark

# Memory setting applied to both the Spark driver and the executors.
MAX_MEMORY = "8g"  # 24 gives OOM here.

# Delta Lake options, currently disabled (add as .config(...) calls to enable):
#   "spark.jars.packages"             -> "io.delta:delta-core_2.12:1.0.0"
#   "spark.sql.extensions"            -> "io.delta.sql.DeltaSparkSessionExtension"
#   "spark.sql.catalog.spark_catalog" -> "org.apache.spark.sql.delta.catalog.DeltaCatalog"
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.driver.memory", MAX_MEMORY)
    .config("spark.executor.memory", MAX_MEMORY)
    .enableHiveSupport()
    .getOrCreate()
)
# Leave the session as the cell's last expression so the notebook displays it.
spark

In [None]:
# Wrap each word list in a single-column DataFrame and expose it to Spark SQL
# as a temp view ('solutions' and 'all_words').
solutions_df = spark.createDataFrame([(w,) for w in solutions], 'solution string')
allwords_df = spark.createDataFrame([(w,) for w in allWords], 'guess string')
solutions_df.createOrReplaceTempView('solutions')
allwords_df.createOrReplaceTempView('all_words')

In [None]:
import pyspark.sql.functions as psf
import pandas as pd
@psf.pandas_udf('string')
def make_colorful_udf(a: pd.Series, b: pd.Series) -> pd.Series:
    """Apply makeColorful element-wise to two equally long string Series.

    ``a`` supplies the first argument (the guess) and ``b`` the second (the
    solution) of each makeColorful call; the result holds one colour pattern
    per input row, in order.
    """
    guess_words = a.to_list()
    solution_words = b.to_list()
    patterns = [
        makeColorful(guess, solution)
        for guess, solution in zip(guess_words, solution_words)
    ]
    return pd.Series(patterns)

# Make the UDF callable from Spark SQL under the name "make_colorful".
spark.udf.register("make_colorful", make_colorful_udf)

<function __main__.make_colorful_udf>

In [None]:
%%time
spark.sql("""
 with colors as (
   select guess,
          solution,
          make_colorful(guess,solution) as colors
    from all_words
    join solutions
  ),
  guesses_with_colors as (
      select 
          guess,
          colors,
          count(*)               as num_solutions,
          collect_list(solution) as possible_solutions
        from colors
        group by guess,colors
  )
  select 
          guess,
          count(case when num_solutions = 1 then 1 else null end) as number_of_unique_hints,
          sum(1/num_solutions)                                    as maybe_a_better_score,
          collect_list(struct(
              colors,
              num_solutions,
              slice(possible_solutions,1,10) as some_solutions
              )) as details
     from guesses_with_colors
     group by guess
     order by number_of_unique_hints desc
""").show(20,100)


+-----+----------------------+--------------------+----------------------------------------------------------------------------------------------------+
|guess|number_of_unique_hints|maybe_a_better_score|                                                                                             details|
+-----+----------------------+--------------------+----------------------------------------------------------------------------------------------------+
|rotan|                    41|   56.91251891566862|[{00000, 256, [sissy, humph, blush, helix, whelp, cluck, spike, mimic, flesh, belly]}, {00002, 17...|
|laten|                    41|   57.97724660056382|[{00000, 205, [sissy, humph, mimic, booby, ivory, shrub, civic, spicy, fjord, dowry]}, {00010, 16...|
|tansy|                    40|  51.033341692847586|[{00110, 42, [spend, using, shown, slung, scorn, swine, risen, sheen, siren, snuck]}, {00112, 4, ...|
|maron|                    40|   55.60352062876734|[{00011, 57, [pound, conic, son

In [None]:
# Dump the host's CPU description; presumably recorded as context for the
# %%time measurements in this notebook (confirm intent).
!cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2199.998
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: 4399.99
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 b

In [None]:
%%time
spark.sql("""
 select guess,
          solution,
          make_colorful(guess,solution) as colors
    from all_words
    join solutions
    order by colors
    """).show()

+-----+--------+------+
|guess|solution|colors|
+-----+--------+------+
|cigar|   loopy| 00000|
|cigar|   slosh| 00000|
|cigar|   moult| 00000|
|cigar|   unmet| 00000|
|cigar|   spend| 00000|
|cigar|   model| 00000|
|cigar|   thumb| 00000|
|cigar|   floss| 00000|
|cigar|   tweed| 00000|
|cigar|   whelp| 00000|
|cigar|   steed| 00000|
|cigar|   lusty| 00000|
|cigar|   dozen| 00000|
|cigar|   flesh| 00000|
|cigar|   boozy| 00000|
|cigar|   belly| 00000|
|cigar|   pulpy| 00000|
|cigar|   stout| 00000|
|cigar|   solve| 00000|
|cigar|   bleed| 00000|
+-----+--------+------+
only showing top 20 rows

CPU times: user 635 ms, sys: 90.1 ms, total: 725 ms
Wall time: 2min 13s
