In [2]:
import os
import json
from pprint import pprint
import random
import numpy as np
from matplotlib import pyplot as plt


DATA_PATH = r"data\nyt_crosswords-cleaned"

In [3]:
puzzles_available = []

for year in os.listdir(DATA_PATH):
    if not year.isdecimal(): continue       # skip README.md
    for month in os.listdir(os.path.join(DATA_PATH, year)):
        for day in os.listdir(os.path.join(DATA_PATH, year, month)):
            puzzles_available.append((int(year), int(month), int(day.split(".")[0])))

print("Puzzles Available:", len(puzzles_available))

Puzzles Available: 14545


In [4]:
# Read a random sample of puzzles

# sample = random.sample(puzzles_available, 10_000)

# sample puzzles from EVEN days
sample = [puzz for puzz in puzzles_available if puzz[-1]%2==0]

clue_answer_pairs = []

for year, month, day in sample:
    path = os.path.join(DATA_PATH, f"{year}", f"{month:02d}", f"{day:02d}.json")
    with open(path, encoding="utf-8") as f:
        puzzle = json.load(f)
        for c, a in zip(
            puzzle["clues"]["across"] + puzzle["clues"]["down"],
            puzzle["answers"]["across"] + puzzle["answers"]["down"]
        ):
            clue = c[c.index(".")+2:]
            clue_answer_pairs.append((clue, a))

clues, answers = zip(*clue_answer_pairs)


In [5]:
# Split data into train and test
from sklearn.model_selection import train_test_split

clues_train, clues_test, answers_train, answers_test = train_test_split(
    clues, answers,
    test_size=0.001, shuffle=True
)

len(clues_train), len(clues_test)

(602694, 604)

In [6]:
# Prepare data
new_clues_train = []
for clue in clues_train:
    clue = clue.replace('\'', '')
    clue = clue.replace('"', '')
    new_clues_train.append([w.lower() for w in clue.split(' ')])

new_clues_test = []
for clue in clues_test:
    clue = clue.replace('\'', '')
    clue = clue.replace('"', '')
    new_clues_test.append([w.lower() for w in clue.split(' ')])

new_clues_test

[['limit'],
 ['opec', 'member,', 'briefly'],
 ['accustomed'],
 ['trapped', 'morays'],
 ['desert', 'plants'],
 ['rulers', 'of', 'yore'],
 ['lock'],
 ['another', 'surgeon', 'had', 'the', 'audience', '___'],
 ['individually'],
 ['beau', '___'],
 ['unwanted', 'overhangs'],
 ['shrink'],
 ['like', 'some', 'legal', 'issues'],
 ['___', 'abner'],
 ['angry', 'look'],
 ['language', 'textbook'],
 ['see', '62-across'],
 ['estate'],
 ['apt', 'rhyme', 'of', 'aahs'],
 ['49ers', 'coach', 'george'],
 ['like', 'some', 'alleles'],
 ['actress', 'caldwell', 'and', 'others'],
 ['like', 'a', 'good', 'egg'],
 ['hit', 'tv', 'series', 'starring', 'gary', 'sinise'],
 ['harsh', 'cry'],
 ['santanas', '___', 'como', 'va'],
 ['womens', 'magazine'],
 ['a', 'role', 'for', 'beverly', 'sills'],
 ['blow', 'a', 'gasket'],
 ['harvest', 'goddess'],
 ['jewish', 'month'],
 ['western', 'eleven'],
 ['toy', 'blowgun'],
 ['dont', 'move', '—', 'ill', 'go', 'for', 'help'],
 ['i', 'smell', '___'],
 ['street', '___', '(gamin)'],
 ['th

In [7]:
import gensim.downloader
import gensim.models
# load word2vec
# gensim_vectors = gensim.downloader.load('word2vec-google-news-300')
# build word2vec
model = gensim.models.word2vec.Word2Vec(new_clues_train, vector_size=100, min_count=1)
#saving the model persistence
model.save('model.bin')
# loading the model(decode error)
#model = gensim.models.KeyedVectors.load_word2vec_format('model.bin') 

# model.wv acts as gensim_vectors

In [8]:
# function average word2vec vector
def avg_feature_vector(words, model, num_features, ind2key_set):
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in ind2key_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec


In [9]:
from scipy import spatial
# define cosine similarity score
def sim_score(v1,v2):
    return 1 - spatial.distance.cosine(v1, v2)

In [10]:
# define vectorizer
def word2vec_vectorizer(data, model,num_features,ind2key_set):
    vec_data = []
    for sentence in data:
        sentence = [word for word in sentence if len(word)>1]
        vec_data.append(avg_feature_vector(sentence,model,num_features,ind2key_set))
    
    return vec_data

In [11]:
# Vectorize Clues
X_train = word2vec_vectorizer(new_clues_train,model.wv,100,set(model.wv.index_to_key))


# Feature Engineering Ideas
# - clue is a pun (ends in "?")
# - clue is a proper noun (words are capitalized)

In [164]:
# Build a Nearest Neighbors Model
from sklearn.neighbors import NearestNeighbors

nn_model = NearestNeighbors().fit(X_train)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
# Define a guesser function
def guess(clue, slot_length=None, max_guesses=5, max_guesses_raw=30):
    clue_vector =  word2vec_vectorizer([clue],model.wv,100,set(model.wv.index_to_key))
    result = cosine_similarity(clue_vector, X_train)[0]
    indices = np.argsort(result)[::-1][:max_guesses_raw]
    raw_guesses = [answers_train[i] for i in indices]
    # print([clues_train[i] for i in indices[0]])

    def valid(g):
        o = True
        if slot_length:
            o &= len(g) == slot_length
        o &= g.lower() not in clue
        return o
    
    guesses = [g for g in raw_guesses if valid(g)]
    return guesses[:max_guesses]

    # TODO:
    # - include a confidence with each guess
    # - use repeated guesses and distances to determine confidence


In [16]:
i = random.randint(0, len(new_clues_test))
test_clue, test_answer = new_clues_test[i], answers_test[i]
print("clue:", test_clue)
print("answer:", test_answer)

# distances, indices = model.kneighbors(vectorizer.transform([test_clue]), n_neighbors=10)
# print("guesses:", [answers_train[i] for i in indices[0]])

guesses = guess(test_clue, slot_length=len(test_answer))
print("guesses:", guesses)

clue: ['downs', 'or', 'salts']
answer: EPSOM
guesses: ['EPSOM', 'EPSOM', 'EPSOM', 'EPSOM', 'EPSOM']


In [18]:
correct_pairs = []
incorrect_pairs = []

for i, (clue, answer) in enumerate(zip(new_clues_test, answers_test)):
    if i % 100 == 0: print(i)
    guesses = guess(clue, len(answer), max_guesses=5)
    if answer in guesses:
        correct_pairs.append((clue, answer, guesses))
    else:
        incorrect_pairs.append((clue, answer, guesses))

print(f"Accuracy: {len(correct_pairs) / len(new_clues_test):0.3%}")
# print(incorrect_pairs)

0
100
200
300
400
500
600
Accuracy: 45.861%


In [170]:
incorrect_pairs

[('Making public', 'BARING', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Ball girl', 'DEB', ['GIL', 'TUT', 'NAG', 'OTT', 'AMT']),
 ('Classmates, e.g.', 'PEERS', ['GENOA', 'MARIO', 'ADAGE', 'PEELS', 'TALON']),
 ('Home wrecker?', 'SLOB', ['ARAL', 'AMSO', 'NERO', 'ISEE', 'THAI']),
 ('Unusually small', 'DWARF', ['GENOA', 'MARIO', 'ADAGE', 'PEELS', 'TALON']),
 ('Classmate, e.g.', 'PEER', ['ARAL', 'AMSO', 'NERO', 'ISEE', 'THAI']),
 ("Priest's garb", 'ALB', ['GIL', 'TUT', 'NAG', 'OTT', 'AMT']),
 ('Confined', 'SHUTIN', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Derisive', 'SNEERY', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Primeval plot', 'EDEN', ['ARAL', 'AMSO', 'NERO', 'ISEE', 'THAI']),
 ("Finish of the 50's", 'CHROME', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('"___ and evening star": Tennyson',
  'SUNSET',
  ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Capek play', 'RUR', ['GIL', 'TUT', 'NAG', 'OTT', 'AMT']),
 ('___ in the right direction',
  'ASTEP',
  ['GENOA', 'MARIO', 'ADA

In [136]:
guess("bruins")

['DONORS', 'GENOA', 'ARAL', 'AMSO', 'MARIO']

In [16]:
# serialize and save model
import pickle

TRAINED_MODEL_PATH = "trained_model.p"

pickle.dump((answers_train, vectorizer, model), open(TRAINED_MODEL_PATH, "wb"))