In [2]:
# Load libraries
import numpy as np
import pandas as pd
from scipy import spatial
from random import sample
import copy
import re
import nltk

# Make sure the NLTK data packages 'punkt' and 'averaged_perceptron_tagger'
# are installed locally. The captured log below shows that a package which is
# already up to date is left as is.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prastutisingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/prastutisingh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
from bert_serving.client import BertClient
# Shared client handle; later cells call client.encode() on it to obtain word vectors.
# NOTE(review): presumably requires a bert-serving server that is already running
# and reachable with the default connection settings — confirm before Run All.
client = BertClient()

In [4]:
# Compute a BERT embedding for every usable word in the vocabulary file.
# The vectors come from the bert-serving client (not from the file itself), so
# this cell is slow: run it once per kernel session.
vocabulary = []
with open("./top_50000.txt", 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no first token, and values[0] would raise IndexError.
            continue
        word = values[0]
        # Keep purely alphabetic words of at least two characters.
        if len(word) > 1 and word.isalpha():
            vocabulary.append(word)

# Encode the whole vocabulary in a single request instead of one round trip per
# word. encode() returns one row per input string, in input order, so zipping
# keeps the same word -> vector mapping and the same dict insertion order.
embeddings_all = {}
if vocabulary:  # guard: do not send an empty request to the server
    vectors = client.encode(vocabulary)
    embeddings_all = dict(zip(vocabulary, vectors))

# Words from which the target ("good") words are sampled, one word per line.
codenames_words = []
with open("./codenames_words.txt", 'r', encoding="utf8") as f:
    for line in f:
        word = line.strip()
        if word:
            # Skip blank lines so the empty string never becomes a game word.
            codenames_words.append(word)

In [5]:
# Spot check: display the embedding vector stored for a single word.
embeddings_all['cat']

array([-5.53800523e-01, -4.61543687e-02, -1.81391183e-02, -5.27625859e-01,
       -1.09318569e-02, -5.89046590e-02,  4.91119176e-02,  3.52730304e-01,
       -3.48756582e-01, -5.00540435e-01, -2.11794600e-01,  2.56730884e-01,
        2.44605705e-01, -6.86356947e-02, -1.47124127e-01,  2.49768957e-01,
        2.37100315e-03,  1.48043737e-01,  1.43015534e-01,  1.85009688e-02,
       -2.03737572e-01, -8.15405771e-02, -4.92950678e-01, -1.27440110e-01,
        3.92178632e-02,  6.26569018e-02, -9.08489302e-02,  3.84163290e-01,
       -1.41799301e-01,  2.52287269e-01,  2.61642784e-01,  4.33803089e-02,
       -2.39104256e-01, -3.39169532e-01, -1.70343950e-01, -4.00132053e-02,
       -7.56622702e-02,  5.71977310e-02, -5.06307960e-01,  2.71703690e-01,
        2.44921967e-01, -2.48858795e-01,  2.48696983e-01, -1.45252109e-01,
        2.83236325e-01,  8.94151703e-02, -1.34371662e+00, -1.63231567e-02,
        6.08708849e-03, -7.90844932e-02, -3.22427064e-01, -4.50175047e-01,
        1.59755275e-01,  

In [None]:
def embeddings_size(embeddings_size, embeddings_all):
    """Build a vocabulary from the leading entries of ``embeddings_all`` plus the game words.

    Args:
        embeddings_size: Number of leading entries of ``embeddings_all`` to keep
            (in the mapping's iteration order).
        embeddings_all: Mapping of word -> embedding vector to draw from.

    Returns:
        A new dict holding the first ``embeddings_size`` entries of
        ``embeddings_all`` and, in addition, every word of the module-level
        ``codenames_words`` list that has an entry in ``embeddings_all``.

    Side effects:
        Words in ``codenames_words`` that have no entry in ``embeddings_all``
        are removed from that list in place.
    """
    embeddings = {
        word: embeddings_all[word]
        for word in list(embeddings_all)[0:embeddings_size]
    }

    # Decide what to keep first and mutate the list only afterwards. Calling
    # list.remove() while iterating over the same list makes the loop skip the
    # element that follows each removal, which left words without an embedding
    # in codenames_words.
    known_words = [word for word in codenames_words if word in embeddings_all]

    # Add game words that did not make it into the leading slice.
    for word in known_words:
        if word not in embeddings:
            embeddings[word] = embeddings_all[word]

    # Slice assignment prunes the shared list object itself, so every holder of
    # a reference to codenames_words sees the cleaned list.
    codenames_words[:] = known_words

    return embeddings

In [None]:
# Two working vocabularies: the first 25,000 and the first 50,000 embedded words.
embeddings_25000 = embeddings_size(embeddings_size=25000, embeddings_all=embeddings_all)
embeddings_50000 = embeddings_size(embeddings_size=50000, embeddings_all=embeddings_all)

In [None]:
# Helper Functions
def distance(embeddings, word, reference):
    """Return the cosine distance between the vectors of ``word`` and ``reference``.

    Both words are looked up in ``embeddings``; a word without an entry raises
    ``KeyError``.
    """
    word_vector = embeddings[word]
    reference_vector = embeddings[reference]
    return spatial.distance.cosine(word_vector, reference_vector)

def closest_words(embeddings, reference):
    """Return every word in ``embeddings`` ordered by increasing distance to ``reference``.

    The ordering key is ``distance(embeddings, word, reference)``; ties keep the
    mapping's iteration order because the sort is stable.
    """
    def _distance_to_reference(candidate):
        return distance(embeddings, candidate, reference)

    ranked = list(embeddings.keys())
    ranked.sort(key=_distance_to_reference)
    return ranked

# Scoring function for generating clues based on sum of cosine distance
def goodness(embeddings, word, answers):
    """Score ``word`` as a clue for ``answers``; a lower score ranks earlier.

    Returns the sentinel ``-999`` when ``word`` is itself one of ``answers``,
    otherwise the sum of ``distance(embeddings, word, answer)`` over all answers
    (``0`` for an empty ``answers``).
    """
    if word in answers:
        return -999

    total = 0
    for answer in answers:
        total += distance(embeddings, word, answer)
    return total

# Chooses top candidates
def candidates(embeddings, answers, size=40):
    """Rank every word in ``embeddings`` as a clue for ``answers``, best first.

    Words are ordered by ascending ``goodness(embeddings, word, answers)``.
    ``size`` is accepted but not used: the complete ranking is returned and
    callers slice it themselves.
    """
    def _score(word):
        return goodness(embeddings, word, answers)

    ranking = list(embeddings.keys())
    ranking.sort(key=_score)
    return ranking

In [None]:
#from itertools import combinations 
    
# Generates a single example (good words, top ten clues)
def generate_single_example(codenames_words, embeddings, n_good=3, n_clues=10):
    """Sample target words and return the best-ranked clues for them.

    Args:
        codenames_words: Sequence of words to sample the targets from. Every
            word must have an entry in ``embeddings``.
        embeddings: Mapping of word -> vector. It supplies both the clue
            vocabulary and every vector used for scoring and reporting.
        n_good: Number of target ("good") words to sample. Defaults to 3.
        n_clues: Number of top-ranked clues to return. Defaults to 10.

    Returns:
        A tuple ``(good, clues)``: the sampled target words and up to
        ``n_clues`` clue words, best first, with the targets excluded.

    Side effects:
        Prints the targets, then each returned clue followed by its distance
        to every target.
    """
    # NOTE: no pairwise-distance threshold is enforced on the sampled targets.
    good = sample(codenames_words, n_good)

    # The ranking covers the whole vocabulary, targets included, so filter the
    # targets out before taking the top n_clues.
    ranked = candidates(embeddings, good)
    clues = [word for word in ranked if word not in good][:n_clues]

    print(good)
    # Report only the clues that are returned; printing the distances of every
    # vocabulary word floods the output with tens of thousands of lines.
    for clue in clues:
        print(clue)
        # Use the embeddings that were passed in, not a notebook-level global.
        print(*(distance(embeddings, clue, target) for target in good))

    return good, clues

# Generates n examples and writes them to filename
def write_n_examples(codenames_words, embeddings, n, filename):
    """Generate ``n`` examples and write them to ``filename``, one per line.

    Each line has the form ``<index>.<word>, <word>, ...``: the example's index,
    a dot, then the target words followed by the clue words, comma-separated.

    Args:
        codenames_words: Words passed through to ``generate_single_example``.
        embeddings: Embeddings passed through to ``generate_single_example``.
        n: Number of examples to generate.
        filename: Path of the output file; an existing file is overwritten.
    """
    # The context manager closes (and flushes) the file even when generating an
    # example raises part-way through, so no handle is leaked.
    with open(filename, 'w') as output_file:
        for i in range(n):
            good, clues = generate_single_example(codenames_words, embeddings)
            word_string = ', '.join(good + clues) + '\n'
            output_file.write(str(i) + '.' + word_string)
    
# write_n_examples(codenames_words, embeddings_1000, 10, 'test_examples.txt')

In [None]:
# Try one example against the 25,000-word vocabulary.
generate_single_example(codenames_words=codenames_words, embeddings=embeddings_25000)