In [24]:
# Start off with replicating Jason Somers 
# Top 50000 words from glove.42B.300d.zip were written to top_50000.txt

# Load libraries
import numpy as np
import pandas as pd
from scipy import spatial

# Load all data from massive embeddings file. You only need to do this once
embeddings_all = {}
with open("./top_50000.txt", 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        if len(word) > 1: 
            vector = np.asarray(values[1:], "float32")
            embeddings_all[word] = vector
        
codenames_words = []
with open("./codenames_words.txt", 'r') as f:
    for line in f:
        codenames_words.append(line.strip())

In [25]:
# Choose smaller sets of embeddings. 
def embeddings_size(embeddings_size, embeddings_all):
    #embeddings_size is the length of the subset of embeddings
    #embeddings_all is the embeddings from 50k top words
    embeddings = {}
    for x in list(embeddings_all)[0:embeddings_size]:
        embeddings[x] = embeddings_all[x]
    
    #adds words from codenames that don't make it in the top XX words
    for item in codenames_words:
        if item not in embeddings:
            embeddings[item] = embeddings_all[item]
            
    return embeddings

In [26]:
embeddings_1000 = embeddings_size(1000, embeddings_all)
embeddings_5000 = embeddings_size(5000, embeddings_all)
embeddings_10000 = embeddings_size(10000, embeddings_all)

In [35]:
# Helper Functions
def distance(embeddings, word, reference):
    return spatial.distance.cosine(embeddings[word], embeddings[reference])

def closest_words(embeddings, reference):
    return sorted(embeddings.keys(), key=lambda w: distance(embeddings, w, reference))

#Changed. definied as wanting to maximize for a good clue
def goodness(embeddings, word, answers):
    if word in answers: return -999
    return sum([distance(embeddings, word, a) for a in answers])

#Changed
def candidates(embeddings, answers, size=10):
    best = sorted(embeddings.keys(), key=lambda w: 1 * goodness(embeddings, w, answers))
    #res = [(str(i + 1), "{0:.2f}".format(minimax(embeddings, w, answers)), w) 
           #for i, w in enumerate(sorted(best[:250], key=lambda w: -1 * minimax(embeddings, w, answers))[:size])]
    
    # Modified to only return the top candidates rather than the scores since that's not important
    print(best)
    return best

In [36]:
good = sample(codenames_words, 3)
print(good)
candidates(embeddings_10000, good)

['washer', 'bug', 'hook']


['bug',
 'hook',
 'washer',
 'lock',
 'fix',
 'pull',
 'bugs',
 'handle',
 'drop',
 'problem',
 'screw',
 'dryer',
 'plug',
 'sure',
 'thread',
 'rod',
 'put',
 "n't",
 'trap',
 'washing',
 'up',
 'need',
 'loop',
 'door',
 'turn',
 'fixed',
 'comes',
 'off',
 'locking',
 'bolt',
 'front',
 'it',
 'when',
 'remove',
 'hole',
 'thing',
 'there',
 'window',
 'machine',
 'out',
 'attach',
 'attached',
 'bottom',
 'back',
 'stuck',
 'fixing',
 'hood',
 'grab',
 'load',
 'another',
 'button',
 'dishwasher',
 'kit',
 'needed',
 'just',
 'tool',
 'some',
 'fixes',
 'large',
 'set',
 'features',
 'hanging',
 'head',
 'loading',
 'wo',
 'pin',
 'wash',
 'box',
 'you',
 'clean',
 'if',
 'that',
 'hose',
 'install',
 'broken',
 'switch',
 'handy',
 'included',
 'feature',
 'wire',
 'combo',
 'catch',
 'tip',
 'use',
 'same',
 'could',
 'screen',
 'stuff',
 'problems',
 'down',
 'got',
 'flat',
 'inside',
 'once',
 'one',
 'then',
 'clip',
 'something',
 'does',
 'so',
 'attachment',
 'push',
 'ha

In [12]:
# Functions to help with word + scores (not necessary for data collection)...
from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(fillvalue=fillvalue, *args)

def tabulate(data):
    data = list(grouper(10, data))
    return data

In [19]:
from random import sample
    
def generate_single_example(codenames_words, embeddings):
    # Generates a single example (good words, bad words, top ten clues)
    good = sample(codenames_words, 3)
    clues = candidates(embeddings, good)

    return good, clues

def write_n_examples(codenames_words, embeddings, n, filename): 
    # Generates n examples and writes them to filename
    output_file = open(filename, 'w')
    
    for i in range(n): 
        good, clues = generate_single_example(codenames_words, embeddings)
        word_string = ', '.join(good + clues) + '\n'
        output_file.write(str(i) + '.' + word_string)
    
    output_file.close()
    
# write_n_examples(codenames_words, embeddings_1000, 10, 'test_examples.txt')

In [20]:
def read_n_examples(filename):
    '''Reads filename of good, bad, clues examples into a three-dimensional array. 
    To access the 0th example: data[0]
    To access the good words (in list form) from the 1st example: data[1][0]
        good words list - 0
        bad words list - 1
        clue words list - 2
    To access the first clue word from the 1st example: data[1][2][0]'''
    file = open(filename, 'r')
    
    data = []

    for line in file: 
        full_list = line.strip('\n').split(',')
        good = full_list[:3]
        clues = full_list[9:]
        data.append([good, bad, clues])
        
    file.close()
    return data

In [21]:
write_n_examples(codenames_words, embeddings_1000, 100, 'examples_1000_vocab_nobadwords.txt')

In [22]:
#write_n_examples(codenames_words, embeddings_5000, 100, 'examples_5000_vocab_nobadwords.txt')
write_n_examples(codenames_words, embeddings_10000, 10, 'examples_10000_vocab_nobadwords.txt')