In [15]:
# Start off by replicating Jason Somers's approach.
# The top 50,000 words from glove.42B.300d.zip were written to top_50000.txt.

# Load libraries
import numpy as np
import pandas as pd
from scipy import spatial

In [51]:
# Load all data from the massive embeddings file into a dict that maps each word
# to its float32 vector. You only need to do this once.
embeddings_all = {}
with open("./top_50000.txt", 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline at the end of the file):
            # there is nothing to parse and values[0] would raise IndexError.
            continue
        # First whitespace-separated token is the word, the rest are the
        # vector components.
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_all[word] = vector

In [81]:
# Choose smaller sets of embeddings. 
def embeddings_size(embeddings_size, embeddings_all, extra_words=None):
    """Build a smaller vocabulary from the leading entries of ``embeddings_all``.

    Args:
        embeddings_size: Number of leading entries of ``embeddings_all`` to
            keep (taken in the dict's iteration order). Inside this function
            the name refers to this number, not to the function itself.
        embeddings_all: Mapping from word to embedding vector.
        extra_words: Words that must be present in the result even when they
            fall outside the first ``embeddings_size`` entries. When omitted,
            the notebook-level ``codenames_words`` list is used, which must
            already be defined at call time.

    Returns:
        A new dict mapping every selected word to its vector from
        ``embeddings_all``.

    Raises:
        KeyError: If any extra word has no entry in ``embeddings_all``. The
            message lists every missing word, not just the first one.
    """
    if extra_words is None:
        # Backward-compatible default: fall back to the notebook global.
        extra_words = codenames_words
    # Materialise once because the words are scanned twice below.
    extra_words = list(extra_words)

    # Report all unknown words up front instead of failing on the first lookup.
    missing = [w for w in extra_words if w not in embeddings_all]
    if missing:
        raise KeyError(
            "no embedding available for: " + ", ".join(repr(w) for w in missing)
        )

    subset = {}
    for w in list(embeddings_all)[0:embeddings_size]:
        subset[w] = embeddings_all[w]
    # Add the extra words that did not make it into the leading entries.
    for w in extra_words:
        if w not in subset:
            subset[w] = embeddings_all[w]
    return subset
    
# Working vocabulary used by the scoring functions: the first 1000 entries of
# embeddings_all plus the codenames words.
# NOTE(review): codenames_words is only defined in a later cell of this notebook,
# so that cell has to be run before this one.
embeddings = embeddings_size(1000, embeddings_all)

In [75]:
def distance(word, reference):
    """Return the cosine distance between the embeddings of two words.

    Both words are looked up in the module-level ``embeddings`` mapping.
    """
    word_vector = embeddings[word]
    reference_vector = embeddings[reference]
    return spatial.distance.cosine(word_vector, reference_vector)

def closest_words(reference):
    """Return every key of ``embeddings`` ordered by increasing distance.

    The ordering key is ``distance(word, reference)``, so the words nearest to
    ``reference`` come first.
    """
    def distance_to_reference(candidate):
        return distance(candidate, reference)

    ranked = list(embeddings.keys())
    ranked.sort(key=distance_to_reference)
    return ranked

def goodness(word, answers, bad):
    """Score ``word`` as a clue: far from the bad words, close to the answers.

    Returns -999 when ``word`` is itself one of the answers or bad words.
    Otherwise returns the summed distance to the bad words minus 4.0 times the
    summed distance to the answers.
    """
    board_words = answers + bad
    if word in board_words:
        return -999

    bad_total = sum(distance(word, b) for b in bad)
    answer_total = sum(distance(word, a) for a in answers)
    return bad_total - 4.0 * answer_total

def minimax(word, answers, bad):
    """Score ``word`` by its worst case against the board.

    Returns -999 when ``word`` is itself one of the answers or bad words.
    Otherwise returns the smallest distance to any bad word minus the largest
    distance to any answer.
    """
    board_words = answers + bad
    if word in board_words:
        return -999

    nearest_bad = min(distance(word, b) for b in bad)
    farthest_answer = max(distance(word, a) for a in answers)
    return nearest_bad - farthest_answer

def candidates(answers, bad, size=10, shortlist=250):
    """Return the best clue words for the given answers and bad words.

    Selection is two-stage: every key of the module-level ``embeddings`` is
    ranked by ``goodness`` (highest first), the top ``shortlist`` words are
    re-ranked by ``minimax`` (highest first), and the first ``size`` of those
    are returned.

    Args:
        answers: Words the clue should be close to.
        bad: Words the clue should stay away from.
        size: Number of clue words to return.
        shortlist: Number of top ``goodness`` words passed to the ``minimax``
            re-ranking.

    Returns:
        A list of at most ``size`` words, best first.
    """
    # Negated keys give a descending sort while keeping the tie order of a
    # plain ascending sort.
    by_goodness = sorted(
        embeddings.keys(), key=lambda w: -1 * goodness(w, answers, bad)
    )
    by_minimax = sorted(
        by_goodness[:shortlist], key=lambda w: -1 * minimax(w, answers, bad)
    )
    # Only the words are returned, so no rank/score tuples are built.
    return by_minimax[:size]

In [77]:
# Experiments
from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    """Return an iterator over consecutive ``n``-tuples taken from ``iterable``.

    The final tuple is padded with ``fillvalue`` when the items run out.
    """
    shared_iterator = iter(iterable)
    # The same iterator object repeated n times: each output tuple therefore
    # consumes n successive items.
    columns = (shared_iterator,) * n
    return zip_longest(*columns, fillvalue=fillvalue)

def tabulate(data):
    """Return ``data`` split into rows of ten, as a list of tuples.

    The rows are whatever ``grouper(10, data)`` yields.
    """
    return list(grouper(10, data))

In [78]:
# Read the codenames vocabulary: every non-blank line of the file becomes one
# entry, with surrounding whitespace removed.
codenames_words = []
# Explicit UTF-8 so the result does not depend on the platform default encoding.
with open("./codenames_words.txt", 'r', encoding="utf8") as f:
    for line in f:
        word = line.strip()
        if word:
            # Skip blank lines so an empty string never ends up in the list.
            codenames_words.append(word)

In [79]:
# This function draws random samples from the codenames words.
from random import sample
    
def generate(codenames_words, n_good=3, n_bad=6):
    """Draw a random board of good and bad words with no word in both lists.

    Args:
        codenames_words: Sequence of words to draw from; it must contain at
            least ``n_good + n_bad`` items.
        n_good: Number of good (answer) words to draw.
        n_bad: Number of bad words to draw.

    Returns:
        A ``(good, bad)`` tuple of two lists.

    Raises:
        ValueError: Raised by ``random.sample`` when there are fewer than
            ``n_good + n_bad`` words to draw from.
    """
    # One draw without replacement, then split. Two independent draws could
    # put the same word in both lists.
    drawn = sample(codenames_words, n_good + n_bad)
    good = drawn[:n_good]
    bad = drawn[n_good:]

    return good, bad

# Example draw: the cell displays the returned (good, bad) pair of word lists.
# The words are sampled at random, so the output typically differs between runs.
generate(codenames_words)

(['dress', 'note', 'cross'],
 ['tooth', 'ham', 'truck', 'horseshoe', 'head', 'hole'])

In [82]:
# Draw a random board from the codenames words (3 good, 6 bad), print it, then
# print the first row of clue candidates found in the current `embeddings`
# vocabulary. With the default size of candidates() and the row width used by
# tabulate(), that first row holds the top 10 guesses.
good, bad = generate(codenames_words)
print(good, bad)
clues = tabulate(candidates(good, bad))
print(clues[0])


['death', 'mercury', 'yard'] ['horseshoe', 'england', 'rome', 'teacher', 'net', 'ruler']
('gas', 'parts', 'truck', 'lyrics', 'water', 'car', 'cover', 'engine', 'cause', 'oil')
