In [None]:
# Load libraries
import numpy as np
import pandas as pd
from scipy import spatial
from random import sample
import copy
import re
import nltk

# Fetch the NLTK resources used by this notebook: the 'punkt' tokenizer models
# and the averaged-perceptron POS tagger model needed by nltk.pos_tag.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

In [None]:
# The top 50000 words from glove.42B.300d.zip were written to top_50000.txt.

# Load all data from the massive embeddings file. You only need to do this once.
# Only words tagged as nouns (NN) or proper nouns (NNP) that are longer than a
# single character are added to embeddings_all.
embeddings_all = {}
with open("./top_50000.txt", 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        # The word is tagged in isolation; [0][1] is the tag of that single token.
        tag = nltk.pos_tag([word])[0][1]
        if len(word) == 1 or tag not in ('NN', 'NNP'):
            continue
        # Everything after the word itself is the embedding vector.
        embeddings_all[word] = np.asarray(values[1:], "float32")

# Load the list of codenames words: one entry per line, surrounding whitespace removed.
with open("./codenames_words.txt", 'r') as f:
    codenames_words = [line.strip() for line in f]

In [None]:
# Function for choosing smaller sets of embeddings. 
def embeddings_size(embeddings_size, embeddings_all):
    """Build a smaller embeddings dict from the first entries of embeddings_all.

    Parameters:
        embeddings_size: number of leading entries of embeddings_all to keep.
        embeddings_all: dict mapping word -> vector (the full loaded vocabulary).

    Returns:
        A new dict with the first `embeddings_size` entries of embeddings_all
        (in dict order), plus every word of the module-level `codenames_words`
        list that has a vector in embeddings_all.

    Side effect:
        Words in the module-level `codenames_words` list that have no vector in
        embeddings_all are removed from that list in place, so that later
        sampling from it only yields words that can be looked up.
    """
    embeddings = {}
    for x in list(embeddings_all)[0:embeddings_size]:
        embeddings[x] = embeddings_all[x]

    # Add words from codenames that don't make it in the top XX words.
    # Iterate over a snapshot: removing from the list that is being iterated
    # makes the loop skip the element following each removal, which would
    # leave some unembeddable words behind in codenames_words.
    for item in list(codenames_words):
        if item in embeddings:
            continue
        try:
            embeddings[item] = embeddings_all[item]
        except KeyError:
            codenames_words.remove(item)

    return embeddings

In [None]:
# Build the vocabulary subsets used below. Smaller subsets (1000, 5000, 10000)
# can be produced the same way with embeddings_size(<n>, embeddings_all).
embeddings_25000, embeddings_50000 = [
    embeddings_size(limit, embeddings_all) for limit in (25000, 50000)
]

In [None]:
# Helper Functions
def distance(embeddings, word, reference):
    """Return the cosine distance between the vectors of `word` and `reference`.

    Both words are looked up in `embeddings`; a missing word raises KeyError.
    """
    word_vector = embeddings[word]
    reference_vector = embeddings[reference]
    return spatial.distance.cosine(word_vector, reference_vector)

def closest_words(embeddings, reference):
    """Return every word in `embeddings`, ordered from nearest to farthest from `reference`.

    Ordering uses distance(); ties keep the dict's own key order.
    """
    def distance_to_reference(candidate):
        return distance(embeddings, candidate, reference)

    ranked = list(embeddings.keys())
    ranked.sort(key=distance_to_reference)
    return ranked

# Scoring function for generating clues based on sum of cosine distance
def goodness(embeddings, word, answers):
    """Score `word` as a clue for `answers`; lower scores are better.

    The score is the sum of distance() from `word` to each answer. A word that
    is itself one of the answers gets the sentinel score -999.
    """
    if word in answers:
        return -999
    return sum(distance(embeddings, word, answer) for answer in answers)

# Chooses top candidates
def candidates(embeddings, answers, size=40):
    """Return all words in `embeddings`, sorted by ascending goodness() for `answers`.

    `size` is accepted but currently not applied: the full ranked list is
    returned and callers slice it themselves. (A minimax re-ranking of the top
    250 words limited to `size` was sketched here earlier but is disabled.)
    """
    def score(word):
        return goodness(embeddings, word, answers)

    best = list(embeddings.keys())
    best.sort(key=score)
    return best

In [None]:
# Functions to help with word + scores (not necessary for data collection)...
from itertools import zip_longest

def grouper(n, iterable, fillvalue=None):
    """Yield consecutive n-tuples from `iterable`, padding the last one with `fillvalue`."""
    # The same iterator object is repeated n times, so each output tuple
    # consumes n consecutive items from it.
    shared = iter(iterable)
    slots = (shared,) * n
    return zip_longest(*slots, fillvalue=fillvalue)

def tabulate(data):
    """Split `data` into a list of 10-item tuples (the last one padded by grouper)."""
    return list(grouper(10, data))

In [None]:
from itertools import combinations 
    
# Generates a single example (good words, top ten clues)
def generate_single_example(codenames_words, embeddings):
    """Generate one example: three random 'good' words and their top ten clues.

    Returns a tuple (good, clues): `good` is a list of 3 words sampled from
    `codenames_words`, `clues` is the first 10 words ranked by candidates()
    once the good words themselves have been excluded.
    """
    # A minimum pairwise-distance threshold for the good words was sketched
    # here earlier but is disabled; the words are sampled without any filter.
    good = sample(codenames_words, 3)

    ranked = candidates(embeddings, good)

    # A clue must not be one of the words it is hinting at.
    clues = [clue for clue in ranked if clue not in good]

    return good, clues[:10]

# Generates n examples and writes them to filename
def write_n_examples(codenames_words, embeddings, n, filename):
    """Generate `n` examples and write them to `filename`, one per line.

    Each line has the form "<i>.<word>, <word>, ..." where <i> is the
    zero-based example index, followed by the good words and then the clue
    words returned by generate_single_example(). An existing file is
    overwritten.
    """
    # The context manager closes the file even if example generation raises
    # part-way through (e.g. a sampled word missing from `embeddings`).
    with open(filename, 'w') as output_file:
        for i in range(n):
            good, clues = generate_single_example(codenames_words, embeddings)
            word_string = ', '.join(good + clues) + '\n'
            output_file.write(str(i) + '.' + word_string)
    
# write_n_examples(codenames_words, embeddings_1000, 10, 'test_examples.txt')

In [None]:
# Not necessary yet...
def read_n_examples(filename):
    '''Reads filename of good, bad, clues examples into a three-dimensional array.

    Each line is split on ',' and interpreted positionally:
        fields 0-2  -> good words
        fields 3-8  -> bad words
        fields 9+   -> clue words
    Fields are not whitespace-stripped. Note that files produced by
    write_n_examples contain no bad words, use ', ' as separator and prefix
    each line with "<i>.", so they do not follow this layout as-is.

    To access the 0th example: data[0]
    To access the good words (in list form) from the 1st example: data[1][0]
        good words list - 0
        bad words list - 1
        clue words list - 2
    To access the first clue word from the 1st example: data[1][2][0]'''
    data = []

    # The context manager guarantees the file is closed even on error.
    with open(filename, 'r') as file:
        for line in file:
            full_list = line.strip('\n').split(',')
            good = full_list[:3]
            # The bad words occupy the slots between the good words and the
            # clues; previously `bad` was never assigned (NameError).
            bad = full_list[3:9]
            clues = full_list[9:]
            data.append([good, bad, clues])

    return data

In [None]:
# Write 10 examples generated from the 50000-word vocabulary subset.
write_n_examples(
    codenames_words=codenames_words,
    embeddings=embeddings_50000,
    n=10,
    filename='examples_50000_vocab_nobadwords.txt',
)

In [None]:
#write_n_examples(codenames_words, embeddings_5000, 100, 'examples_5000_vocab_nobadwords.txt')
# embeddings_10000 is not assigned by any active line earlier in the notebook
# (its construction is commented out), so build it here; otherwise this cell
# raises NameError on a fresh kernel.
embeddings_10000 = embeddings_size(10000, embeddings_all)
write_n_examples(codenames_words, embeddings_10000, 10, 'examples_10000_vocab_nobadwords.txt')