In [2]:
import os
import json
from pprint import pprint
import random
import numpy as np
from matplotlib import pyplot as plt

DATA_PATH = "C:/Users/Russell/Documents/School-Work/CMSC470/crossword_puzzles/data/nyt_crosswords-cleaned"

In [3]:
puzzles_available = []

for year in os.listdir(DATA_PATH):
    if not year.isdecimal(): continue       # skip README.md
    for month in os.listdir(os.path.join(DATA_PATH, year)):
        for day in os.listdir(os.path.join(DATA_PATH, year, month)):
            puzzles_available.append((int(year), int(month), int(day.split(".")[0])))

print("Puzzles Available:", len(puzzles_available))

Puzzles Available: 14545


In [4]:
# Read a random sample of puzzles

sample = random.sample(puzzles_available, 10_000)

clue_answer_pairs = []

for year, month, day in sample:
    path = os.path.join(DATA_PATH, f"{year}", f"{month:02d}", f"{day:02d}.json")
    with open(path, encoding="utf-8") as f:
        puzzle = json.load(f)
        for c, a in zip(
            puzzle["clues"]["across"] + puzzle["clues"]["down"],
            puzzle["answers"]["across"] + puzzle["answers"]["down"]
        ):
            clue = c[c.index(".")+2:]
            clue_answer_pairs.append((clue, a))

clues, answers = zip(*clue_answer_pairs)
clue_answer_pairs

[('"Jabberwocky" start', 'TWAS'),
 ('Female W.W. II-era enlistees', 'WAACS'),
 ('Sorento and Sedona', 'KIAS'),
 ('Brewery fixture', 'OAST'),
 ('Is sporting', 'HASON'),
 ('Just sitting', 'IDLE'),
 ('Zuider Zee sight', 'DIKE'),
 ('Midway alternative', 'OHARE'),
 ('Legal basis of a repo', 'LIEN'),
 ('Bone-related', 'OSTEAL'),
 ('What you might do while driving to an unfamiliar place', 'GETLOST'),
 ('Rich dessert', 'TORTE'),
 ('Jumbo and colossal', 'SIZES'),
 ('Words of woe', 'OHME'),
 ('F equivalent', 'ESHARP'),
 ('Ancient theaters', 'ODEA'),
 ("Vintner's prefix", 'OENO'),
 ('"I ___ Parade"', 'LOVEA'),
 ('What you might do next?', 'FINDAGASSTATION'),
 ('Water nymph', 'NAIAD'),
 ('Garland\'s "cowardly" co-star', 'LAHR'),
 ('Manuscript encl.', 'SASE'),
 ('Seasoned vet', 'OLDPRO'),
 ('Southwest art center', 'TAOS'),
 ('Hardly garrulous', 'TERSE'),
 ('Ph.D. hurdles', 'ORALS'),
 ('What you might do next?', 'BUYAMAP'),
 ('Local lingo', 'PATOIS'),
 ('Melville mariner', 'AHAB'),
 ('Leader of the 

In [5]:
# Split data into train and test
from sklearn.model_selection import train_test_split

clues_train, clues_test, answers_train, answers_test = train_test_split(
    clues, answers,
    test_size=0.001, shuffle=True
)

len(clues_train), len(clues_test)

(847201, 849)

In [6]:
# Vectorize Clues
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents="unicode", stop_words="english")
X_train = vectorizer.fit_transform(np.array(clues_train))
X_train

# Feature Engineering Ideas
# - clue is a pun (ends in "?")
# - clue is a proper noun (words are capitalized)

<847201x76969 sparse matrix of type '<class 'numpy.float64'>'
	with 1760159 stored elements in Compressed Sparse Row format>

In [7]:
# Build a Nearest Neighbors Model
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors().fit(X_train)
model

NearestNeighbors()

In [8]:
# Define a guesser function
def guess(clue, slot_length=None, max_guesses=5, max_guesses_raw=30):
    clue_vector = vectorizer.transform([clue])
    distances, indices = model.kneighbors(clue_vector, n_neighbors=max_guesses_raw)
    raw_guesses = [answers_train[i] for i in indices[0]]
    # print([clues_train[i] for i in indices[0]])

    def valid(g):
        o = True
        if slot_length:
            o &= len(g) == slot_length
        o &= g.lower() not in clue.lower()
        return o
    
    guesses = [g for g in raw_guesses if valid(g)]
    return guesses[:max_guesses]

    # TODO:
    # - include a confidence with each guess
    # - use repeated guesses and distances to determine confidence


In [9]:
i = random.randint(0, len(clues_test))
test_clue, test_answer = clues_test[i], answers_test[i]
print("clue:", test_clue)
print("answer:", test_answer)

# distances, indices = model.kneighbors(vectorizer.transform([test_clue]), n_neighbors=10)
# print("guesses:", [answers_train[i] for i in indices[0]])

guesses = guess(test_clue, slot_length=len(test_answer))
print("guesses:", guesses)

clue: One on a trail, perhaps
answer: SLEUTH
guesses: ['OREGON', 'OREGON']


In [18]:
correct_pairs = []
incorrect_pairs = []

for i, (clue, answer) in enumerate(zip(clues_test, answers_test)):
    if i % 100 == 0: print(i)
    guesses = guess(clue, len(answer), max_guesses=5)
    if answer in guesses:
        correct_pairs.append((clue, answer, guesses))
    else:
        incorrect_pairs.append((clue, answer, guesses))

print(f"Accuracy: {len(correct_pairs) / len(clues_test):0.3%}")
# print(incorrect_pairs)

0
100
200
300
400
500
600
700


KeyboardInterrupt: 

In [None]:
incorrect_pairs

[("Technical writer's target", 'ENDUSER', []),
 ('The "W" in R.W.E.', 'WALDO', ['DRONE', 'UNDER', 'ATBAY', 'GEESE', 'ACCTS']),
 ("G. &amp; S.'s Lord High Everything ___",
  'ELSE',
  ['ISIT', 'ISIT', 'ALAD', 'PEER', 'RULE']),
 ('Rap stars often have them',
  'POSSES',
  ['HIPHOP', 'DEARME', 'KISMET', 'COMERS']),
 ('Lemon appendage', 'ADE', ['TAG', 'ARM', 'DUD', 'RAG', 'DUD']),
 ('Let go of', 'RELEASE', ['RELIEVE']),
 ('Features of some Amerindian embroidery',
  'PORCUPINEQUILLS',
  ['EIGHTTIMESTHREE']),
 ('Inclined', 'TRENDED', ['OFAMIND']),
 ('Old-fashioned farm apparatus',
  'CHURN',
  ['DATED', 'MOSSY', 'DOWDY', 'DATED', 'DATED']),
 ('Foes at Gaugamela', 'ALEXANDERDARIUS', []),
 ('Pods of flax', 'BOLLS', ['CACAO', 'CAROB', 'OKRAS', 'LINEN', 'LINEN']),
 ('Deal a mighty blow',
  'SMITE',
  ['ERUPT', 'ERUPT', 'ERUPT', 'ERUPT', 'GOOFF']),
 ('Round dance', 'ONESTEP', ['SPHERED']),
 ('Ripkin of the Orioles', 'CAL', ['TRY', 'HAH', 'PRO']),
 ('Thomas Moore ballad locale',
  'TARA',
  ['ERIN

In [17]:
guess("bruins")

['UCLA', 'UCLA', 'ORR', 'UCLA', 'UCLA']