In [1]:
import os
import json
from pprint import pprint
import random
import numpy as np
from matplotlib import pyplot as plt


DATA_PATH = r"data\nyt_crosswords-cleaned"

In [2]:
puzzles_available = []

for year in os.listdir(DATA_PATH):
    if not year.isdecimal(): continue       # skip README.md
    for month in os.listdir(os.path.join(DATA_PATH, year)):
        for day in os.listdir(os.path.join(DATA_PATH, year, month)):
            puzzles_available.append((int(year), int(month), int(day.split(".")[0])))

print("Puzzles Available:", len(puzzles_available))

Puzzles Available: 14545


In [3]:
# Read a random sample of puzzles

# sample = random.sample(puzzles_available, 10_000)

# sample puzzles from EVEN days
sample = [puzz for puzz in puzzles_available if puzz[-1]%2==0]

clue_answer_pairs = []

for year, month, day in sample:
    path = os.path.join(DATA_PATH, f"{year}", f"{month:02d}", f"{day:02d}.json")
    with open(path, encoding="utf-8") as f:
        puzzle = json.load(f)
        for c, a in zip(
            puzzle["clues"]["across"] + puzzle["clues"]["down"],
            puzzle["answers"]["across"] + puzzle["answers"]["down"]
        ):
            clue = c[c.index(".")+2:]
            clue_answer_pairs.append((clue, a))

clues, answers = zip(*clue_answer_pairs)


In [4]:
# Split data into train and test
from sklearn.model_selection import train_test_split

clues_train, clues_test, answers_train, answers_test = train_test_split(
    clues, answers,
    test_size=0.001, shuffle=True
)

len(clues_train), len(clues_test)

(602694, 604)

In [5]:
# Prepare data
new_clues_train = []
for clue in clues_train:
    clue = clue.replace('\'', '')
    clue = clue.replace('"', '')
    new_clues_train.append([w.lower() for w in clue.split(' ')])

new_clues_test = []
for clue in clues_test:
    clue = clue.replace('\'', '')
    clue = clue.replace('"', '')
    new_clues_test.append([w.lower() for w in clue.split(' ')])


In [62]:
import gensim.downloader
import gensim.models
from gensim.test.utils import common_texts

# load word2vec
w2v_model = gensim.downloader.load('glove-wiki-gigaword-300')
# build word2vec
#gensim_vectors.save('model.bin')
#w2v_model = gensim.models.word2vec.Word2Vec(new_clues_train, vector_size=300, min_count=1)
#saving the model persistence
w2v_model.save('w2v_model.bin')
# loading the model(decode error)
#model = gensim.models.KeyedVectors.load_word2vec_format('model.bin') 

# model.wv acts as gensim_vectors


In [37]:
# function average word2vec vector
def avg_feature_vector(words, model, num_features, ind2key_set):
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in ind2key_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [113]:
# DAN model
import torch
import torch.nn as nn

class DanModel(nn.Module):
    def __init__(self, n_classes, vocab_size, emb_dim=500,n_hidden_units=500, nn_dropout=.5):
        super(DanModel, self).__init__()
        self.n_classes = n_classes
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.n_hidden_units = n_hidden_units
        self.nn_dropout = nn_dropout
        self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=0)
        self.linear1 = nn.Linear(emb_dim, n_hidden_units)
        self.linear2 = nn.Linear(n_hidden_units, n_classes)
        self.classifier = nn.Sequential(self.linear1,nn.ReLU(),nn.Dropout(nn_dropout),self.linear2)
        self._softmax = nn.Softmax()

    def average(self, text_embeddings, text_len):
        #average = []
        #vec = np.zeros((num_features, ), dtype='float32')
        #for qq in  text_embeddings:
        #     vec = np.add(vec,qq)
        #vec = np.divide(vec, text_len)
        #average = torch.tensor(average)
        #return average

        average = []
        for i in range(0,text_len):
            average.append((text_embeddings[i].sum(0)/text_len).tolist())
        average = torch.tensor(average)
        return average

        
    def forward(self,input_text,text_len, is_prob=False):

        logits = torch.LongTensor([0.0] * self.n_classes)

        # Complete the forward funtion.  First look up the word embeddings.
        # Then average them 
        #text_embed = self.embeddings(input_text)
        text_embed = self.embeddings(torch.LongTensor(input_text))
        text_embed = self.average(text_embed, text_len)
        # Before feeding them through the network
        logits = self.classifier(text_embed)

        if is_prob:
            logits = self._softmax(logits)

        return logits

In [121]:
def train(model, train_data_loader, train_data_answer, w2v_model,num_features):
    model.train()
    optimizer = torch.optim.Adamax(model.parameters())
    criterion = nn.CrossEntropyLoss()
    
    for idx, text in enumerate(train_data_loader):
        question_text = [abs(w2v_model[word]) for word in text]
        question_len = len(question_text)
        label = train_data_answer[idx].lower()
        if label in set(w2v_model.index_to_key):
            labels = w2v_model[label]
        else:
            vec = np.zeros((num_features, ), dtype='float32')
            for qq in question_text:
                vec = np.add(vec,qq)
            vec = np.divide(vec, question_len)
            w2v_model.add_vectors(label, vec)
            labels = w2v_model[label]
            
        optimizer.zero_grad()
        preds = model.forward(question_text,question_len)
        loss = criterion(preds,labels)
        loss.backward()
        optimizer.step()



In [122]:
dan_model = DanModel(len(set(answers_train)),300,emb_dim=500,n_hidden_units=500)
for epoch in range(50):
    
    train(dan_model, new_clues_train, answers_train,w2v_model,300)

  logits = torch.LongTensor([0.0] * self.n_classes)
  text_embed = self.embeddings(torch.LongTensor(input_text))


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x300 and 500x500)

In [6]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

corpus = api.load("wiki-english-20171001") # download the corpus and return it opened as an iterable
model = Word2Vec(corpus)
model.train(new_clues_train, total_examples=len(new_clues_train), epochs=5)



(434, 8380390)

In [8]:
from scipy import spatial
# define cosine similarity score
def sim_score(v1,v2):
    return 1 - spatial.distance.cosine(v1, v2)

In [9]:
# define vectorizer
def word2vec_vectorizer(data, model,num_features,ind2key_set):
    vec_data = []
    for sentence in data:
        sentence = [word for word in sentence if len(word)>1]
        vec_data.append(avg_feature_vector(sentence,model,num_features,ind2key_set))
    
    return vec_data

In [30]:
# Vectorize Clues
X_train = word2vec_vectorizer(new_clues_train,model.wv,100,set(model.wv.index_to_key))


# Feature Engineering Ideas
# - clue is a pun (ends in "?")
# - clue is a proper noun (words are capitalized)

In [31]:
# Build a Nearest Neighbors Model
from sklearn.neighbors import NearestNeighbors

nn_model = NearestNeighbors().fit(X_train)

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
# Define a guesser function
def guess(clue, slot_length=None, max_guesses=5, max_guesses_raw=30):
    clue_vector =  word2vec_vectorizer([clue],model.wv,100,set(model.wv.index_to_key))
    result = cosine_similarity(clue_vector, X_train)[0]
    indices = np.argsort(result)[::-1][:max_guesses_raw]
    raw_guesses = [answers_train[i] for i in indices]
    # print([clues_train[i] for i in indices[0]])

    def valid(g):
        o = True
        if slot_length:
            o &= len(g) == slot_length
        o &= g.lower() not in clue
        return o
    
    guesses = [g for g in raw_guesses if valid(g)]
    return guesses[:max_guesses]

    # TODO:
    # - include a confidence with each guess
    # - use repeated guesses and distances to determine confidence


In [36]:
i = random.randint(0, len(new_clues_test))
test_clue, test_answer = new_clues_test[i], answers_test[i]
print("clue:", test_clue)
print("answer:", test_answer)

# distances, indices = model.kneighbors(vectorizer.transform([test_clue]), n_neighbors=10)
# print("guesses:", [answers_train[i] for i in indices[0]])

guesses = guess(test_clue, slot_length=len(test_answer))
print("guesses:", guesses)

clue: ['refrain', 'syllables']
answer: TRALA
guesses: ['TRALA', 'LALAS', 'TRALA', 'EIEIO', 'TRALA']


In [39]:
correct_pairs = []
incorrect_pairs = []

for i, (clue, answer) in enumerate(zip(new_clues_test, answers_test)):
    if i % 250 == 0: 
        guesses = guess(clue, len(answer), max_guesses=5)
        if answer in guesses:
            correct_pairs.append((clue, answer, guesses))
        else:
            incorrect_pairs.append((clue, answer, guesses))

print(f"Accuracy: {len(correct_pairs) / len(new_clues_test):0.3%}")
# print(incorrect_pairs)

Accuracy: 0.142%


In [None]:
incorrect_pairs

[('Making public', 'BARING', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Ball girl', 'DEB', ['GIL', 'TUT', 'NAG', 'OTT', 'AMT']),
 ('Classmates, e.g.', 'PEERS', ['GENOA', 'MARIO', 'ADAGE', 'PEELS', 'TALON']),
 ('Home wrecker?', 'SLOB', ['ARAL', 'AMSO', 'NERO', 'ISEE', 'THAI']),
 ('Unusually small', 'DWARF', ['GENOA', 'MARIO', 'ADAGE', 'PEELS', 'TALON']),
 ('Classmate, e.g.', 'PEER', ['ARAL', 'AMSO', 'NERO', 'ISEE', 'THAI']),
 ("Priest's garb", 'ALB', ['GIL', 'TUT', 'NAG', 'OTT', 'AMT']),
 ('Confined', 'SHUTIN', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Derisive', 'SNEERY', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Primeval plot', 'EDEN', ['ARAL', 'AMSO', 'NERO', 'ISEE', 'THAI']),
 ("Finish of the 50's", 'CHROME', ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('"___ and evening star": Tennyson',
  'SUNSET',
  ['DONORS', 'TEUTON', 'BRAKES', 'PRAWNS']),
 ('Capek play', 'RUR', ['GIL', 'TUT', 'NAG', 'OTT', 'AMT']),
 ('___ in the right direction',
  'ASTEP',
  ['GENOA', 'MARIO', 'ADA

In [None]:
guess("bruins")

['DONORS', 'GENOA', 'ARAL', 'AMSO', 'MARIO']

In [None]:
# serialize and save model
import pickle

TRAINED_MODEL_PATH = "trained_model.p"

pickle.dump((answers_train, vectorizer, model), open(TRAINED_MODEL_PATH, "wb"))