In [21]:
import os
import json
from pprint import pprint
import random
import numpy as np
from matplotlib import pyplot as plt

DATA_PATH = r"data\nyt_crosswords-cleaned"

In [22]:
puzzles_available = []

for year in os.listdir(DATA_PATH):
    if not year.isdecimal(): continue       # skip README.md
    for month in os.listdir(os.path.join(DATA_PATH, year)):
        for day in os.listdir(os.path.join(DATA_PATH, year, month)):
            puzzles_available.append((int(year), int(month), int(day.split(".")[0])))

print("Puzzles Available:", len(puzzles_available))

Puzzles Available: 14545


In [23]:
# Read a random sample of puzzles

# sample = random.sample(puzzles_available, 10_000)

# sample puzzles from EVEN days
sample = [puzz for puzz in puzzles_available if puzz[-1]%2==0]

clue_answer_pairs = []

for year, month, day in sample:
    path = os.path.join(DATA_PATH, f"{year}", f"{month:02d}", f"{day:02d}.json")
    with open(path, encoding="utf-8") as f:
        puzzle = json.load(f)
        for c, a in zip(
            puzzle["clues"]["across"] + puzzle["clues"]["down"],
            puzzle["answers"]["across"] + puzzle["answers"]["down"]
        ):
            clue = c[c.index(".")+2:]
            clue_answer_pairs.append((clue, a))

clues, answers = zip(*clue_answer_pairs)
clue_answer_pairs

[('Capital of Ghana', 'ACCRA'),
 ('Shows wear, as a chair cushion', 'SAGS'),
 ('Progeny of an old block', 'CHIP'),
 ('Yugoslav native', 'CROAT'),
 ("Guy Fawkes's forte", 'PLOT'),
 ('Window part', 'PANE'),
 ('Well-known drudge', 'CINDERELLA'),
 ('Status quo ___ bellum', 'ANTE'),
 ('Clocked', 'TIMED'),
 ('Bedroom piece', 'DRESSER'),
 ('Bull-rider of myth', 'EUROPA'),
 ('Thus, to Caesar', 'ITA'),
 ('Prior to', 'ERE'),
 ('Roman 1501', 'MDI'),
 ('Harvest goddess', 'OPS'),
 ('Haul', 'LUG'),
 ('Hallucinogen', 'LSD'),
 ('Word with after or new', 'MATH'),
 ('Condescended', 'STOOPED'),
 ('Alaskan native', 'ALEUT'),
 ('Common-Market initials', 'EEC'),
 ('Kind of transit', 'RAPID'),
 ('Custom-made in London', 'BESPOKE'),
 ('___ chance (no way)', 'NOTA'),
 ('Mil. man', 'SGT'),
 ('Turntable abbr.', 'RPM'),
 ('Holy day: Abbr.', 'SAB'),
 ('Handed or headed', 'RED'),
 ('Kind of text or cursor', 'PRE'),
 ("Bill's partner", 'COO'),
 ('Stomach: Prefix', 'GASTRO'),
 ('Composer Victor', 'HERBERT'),
 ('"Thar

In [24]:
# Split data into train and test
from sklearn.model_selection import train_test_split

clues_train, clues_test, answers_train, answers_test = train_test_split(
    clues, answers,
    test_size=0.001, shuffle=True
)

len(clues_train), len(clues_test)

(602694, 604)

In [25]:
# Vectorize Clues
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(strip_accents="unicode", stop_words="english")
X_train = vectorizer.fit_transform(np.array(clues_train))
X_train

# Feature Engineering Ideas
# - clue is a pun (ends in "?")
# - clue is a proper noun (words are capitalized)

<602694x69152 sparse matrix of type '<class 'numpy.float64'>'
	with 1250061 stored elements in Compressed Sparse Row format>

In [26]:
# Build a Nearest Neighbors Model
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors().fit(X_train)
model

NearestNeighbors()

In [27]:
# Define a guesser function
def guess(clue, slot_length=None, max_guesses=5, max_guesses_raw=30):
    clue_vector = vectorizer.transform([clue])
    distances, indices = model.kneighbors(clue_vector, n_neighbors=max_guesses_raw)
    raw_guesses = [answers_train[i] for i in indices[0]]
    # print([clues_train[i] for i in indices[0]])

    def valid(g):
        o = True
        if slot_length:
            o &= len(g) == slot_length
        o &= g.lower() not in clue.lower()
        return o
    
    guesses = [g for g in raw_guesses if valid(g)]
    return guesses[:max_guesses]

    # TODO:
    # - include a confidence with each guess
    # - use repeated guesses and distances to determine confidence


In [28]:
i = random.randint(0, len(clues_test))
test_clue, test_answer = clues_test[i], answers_test[i]
print("clue:", test_clue)
print("answer:", test_answer)

# distances, indices = model.kneighbors(vectorizer.transform([test_clue]), n_neighbors=10)
# print("guesses:", [answers_train[i] for i in indices[0]])

guesses = guess(test_clue, slot_length=len(test_answer))
print("guesses:", guesses)

clue: Rug, so to speak
answer: WIG
guesses: ['WIG', 'RAG', 'RYA']


In [29]:
correct_pairs = []
incorrect_pairs = []

for i, (clue, answer) in enumerate(zip(clues_test, answers_test)):
    if i % 100 == 0: print(i)
    guesses = guess(clue, len(answer), max_guesses=5)
    if answer in guesses:
        correct_pairs.append((clue, answer, guesses))
    else:
        incorrect_pairs.append((clue, answer, guesses))

print(f"Accuracy: {len(correct_pairs) / len(clues_test):0.3%}")
# print(incorrect_pairs)

0
100


KeyboardInterrupt: 

In [None]:
incorrect_pairs

[('Christmas décor item', 'MISTLETOE', ['WALLPAPER']),
 ('Gives off fumes', 'REEKS', ['STEWS', 'VAPOR', 'STEWS', 'AREEK', 'RAGES']),
 ('Vehicle in a spiritual', 'CHARIOT', ['ORECART']),
 ('Unmanned aircraft', 'DRONES', ['ALBEIT', 'ENGINE']),
 ('Dir. from Milan to Udine', 'ENE', ['SIG', 'TAB', 'EAT', 'OUR', 'PRO']),
 ('Fishing spot for Scots', 'LOCH', ['PIER', 'PIER', 'PIER', 'AULD', 'COVE']),
 ('Interrogate', 'GRILL', ['LEGIT', 'DENSE', 'AMPLE', 'ALIEN', 'ELITE']),
 ('Nose part', 'ALARE', ['NARIS', 'SNOOP', 'SNOOT', 'NASAL', 'SMELL']),
 ('Car part', 'TAILPIPE', ['RADIATOR']),
 ("Gaby's spouse", 'MARI', ['MATE', 'MATE', 'MATE', 'MATE', 'MATE']),
 ('Former Chinese monetary unit',
  'TAEL',
  ['ANNA', 'LIRA', 'LIRA', 'LIRA', 'LIRA']),
 ('Off the leash', 'LOOSE', ['STRAP', 'TWIXT', 'UNION', 'FROWN', 'STILL']),
 ('What a "midnight ride" horse earned', 'ESTEEMOFREVERE', []),
 ('Put money into a sinking fund',
  'AMORTIZE',
  ['DONATETO', 'THEMAINE', 'INTANDEM']),
 ('"Festina lente"', 'MAKEHA

In [30]:
guess("bruins")

['UCLA', 'ORR', 'UCLA', 'ELEVEN', 'UCLA']

In [32]:
# serialize and save model
import pickle

TRAINED_MODEL_PATH = "trained_model.p"

pickle.dump((answers_train, vectorizer, model), open(TRAINED_MODEL_PATH, "wb"))