In [1]:
# Start off by replicating Jason Somers's approach.
# The top 50000 words from glove.42B.300d.zip were written to top_50000.txt.

# Load libraries
import numpy as np
import pandas as pd
from scipy import spatial

In [2]:
# Load data: parse the embedding text file into a {word: float32 vector} lookup.
# Each non-blank line is "<word> <v1> <v2> ...", separated by whitespace.
embeddings = {}
# Read as UTF-8 explicitly so decoding does not depend on the platform's
# default encoding (the vocabulary may contain non-ASCII tokens).
with open("./top_50000.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): values[0] would raise IndexError.
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word] = vector

In [14]:
def distance(word, reference):
    """Return the cosine distance between the embeddings of two words.

    Both `word` and `reference` are looked up in the module-level
    `embeddings` dict; a word that is not a key raises KeyError.
    """
    word_vector = embeddings[word]
    reference_vector = embeddings[reference]
    return spatial.distance.cosine(word_vector, reference_vector)

def closest_words(reference):
    """Return every key of `embeddings`, sorted by ascending cosine distance to `reference`.

    The result is a new list covering the whole vocabulary, so `reference`
    itself is included.
    """
    def distance_to_reference(candidate):
        return distance(candidate, reference)

    return sorted(embeddings, key=distance_to_reference)

def goodness(word, answers, bad):
    """Score `word` as a clue: far from the `bad` words, close to the `answers`.

    Returns the summed distance to every word in `bad` minus 4.0 times the
    summed distance to every word in `answers`, so higher is better.
    A word that already appears in either list gets the sentinel -999.
    """
    excluded = answers + bad
    if word in excluded:
        return -999

    bad_total = sum(distance(word, b) for b in bad)
    answer_total = sum(distance(word, a) for a in answers)
    return bad_total - 4.0 * answer_total

def minimax(word, answers, bad):
    """Worst-case score for `word` as a clue.

    Returns the smallest distance to any word in `bad` minus the largest
    distance to any word in `answers`, so higher is better.
    A word that already appears in either list gets the sentinel -999.
    An empty `bad` or `answers` list raises ValueError from min()/max().
    """
    excluded = answers + bad
    if word in excluded:
        return -999

    nearest_bad = min(distance(word, b) for b in bad)
    farthest_answer = max(distance(word, a) for a in answers)
    return nearest_bad - farthest_answer

def candidates(answers, bad, size=10, pool=250):
    """Return the `size` best clue words for `answers` while avoiding `bad`.

    Selection happens in two stages:
      1. rank every key of `embeddings` by `goodness` and keep the top `pool`;
      2. re-rank that shortlist by `minimax` and keep the top `size`.

    Args:
        answers: list of words the clue should be close to.
        bad: list of words the clue should be far from.
        size: number of clue words to return.
        pool: size of the goodness shortlist passed to the minimax stage.

    Returns:
        A list of words, best minimax score first.
    """
    # Negate the scores so the default ascending sort puts the highest first.
    by_goodness = sorted(embeddings, key=lambda w: -goodness(w, answers, bad))
    shortlist = by_goodness[:pool]

    by_minimax = sorted(shortlist, key=lambda w: -minimax(w, answers, bad))
    # Only the words are returned, so no rank/score tuples are built and
    # minimax is not re-evaluated for the final results.
    return by_minimax[:size]

In [15]:
# Experiments
from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    """Chunk `iterable` into consecutive n-tuples, padding the last with `fillvalue`.

    Returns the lazy iterator produced by `zip_longest`.
    """
    # The same iterator is passed n times, so each output tuple consumes
    # n successive items from the input.
    shared = iter(iterable)
    return zip_longest(*((shared,) * n), fillvalue=fillvalue)

def tabulate(data):
    """Return `data` as a list of 10-tuples; a short final row is padded with None."""
    return list(grouper(10, data))

In [16]:
# Example board: words the clue should be close to ...
answers = ["iron", "ham", "beijing"]
# ... and words the clue should stay far from.
bad = ["fall", "witch", "note", "cat", "bear", "ambulance"]

# Rank clue candidates (default size of 10) and group them into rows of 10.
clues = tabulate(candidates(answers, bad))
print(clues)

[('tong', 'wok', 'guan', 'kitchenware', 'nippon', 'torino', 'thanh', 'jian', 'bao', 'jia')]


In [20]:
# Load the Codenames word list: one word per line, surrounding whitespace removed.
# Read as UTF-8 explicitly so decoding does not depend on the platform default,
# and drop blank lines so an empty string never ends up in the list.
with open("./codenames_words.txt", 'r', encoding="utf-8") as f:
    codenames_words = [word for word in (line.strip() for line in f) if word]

In [21]:
from random import sample
    
def generate(codenames_words, n_good=3, n_bad=6):
    """Draw and print a random set of target words and words to avoid.

    Args:
        codenames_words: sequence of words to draw from.
        n_good: number of target ("good") words to draw.
        n_bad: number of words to avoid ("bad") to draw.

    Returns:
        True, after printing the good list and then the bad list.

    Raises:
        ValueError: from `sample` when fewer than `n_good + n_bad` words
            are available.
    """
    # Draw once without replacement and split the result: two independent
    # sample() calls could put the same word in both lists.
    drawn = sample(codenames_words, n_good + n_bad)
    good = drawn[:n_good]
    bad = drawn[n_good:]

    print(good)
    print(bad)

    return True

# Demo: print one random draw (the good words, then the bad words);
# the returned True is echoed as the cell's output.
generate(codenames_words)

['hole', 'belt', 'needle']
['knife', 'amazon', 'lab', 'press', 'olive', 'truck']


True