In [51]:
import numpy as np
import pandas as pd
from scipy import spatial
from random import sample
import copy
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

In [5]:
# Import encoding dictionaries. i.e. bert_dict['cat'] = [0.5, -.4, 0.001,...]
bert_dict = np.load('bert.npy', allow_pickle = True); bert_dict = bert_dict[()] #This originally contains 42923 words
glove_dict = np.load('glove.npy', allow_pickle = True); glove_dict = glove_dict[()] #This originally contains 27296 words

In [6]:
# Part of speech tagging for bert_dict, removing everything that is not a noun or proper noun
bert_dict_copy = bert_dict.copy()
for key in bert_dict_copy:  
    tag = nltk.pos_tag([key.strip()])[0][1]
    if tag not in ['NN', 'NNP']: 
        del bert_dict[key]

In [7]:
# Import list of codenames words
codenames_words = []
with open("./codenames_words.txt", 'r') as f:
    for line in f:
        codenames_words.append(line.strip())

In [8]:
# Remove codenames words not in embeddings
def removeInvalidCodenames(embeddings, codenames_words):
    removed_words = []
    for item in codenames_words:
        if item not in embeddings:
            removed_words.append(item)
            codenames_words.remove(item)
    #print('Words removed from codenames are', removed_words)
    return codenames_words

codenames_words_glove = removeInvalidCodenames(glove_dict, codenames_words)
codenames_words_bert = removeInvalidCodenames(bert_dict, codenames_words)

In [24]:
# Turn BERT into indexable embeddings
all_bert_words = list(bert_dict.keys())
word_to_idx_bert = dict(zip(all_bert_words, range(len(all_bert_words))))
idx_to_word_bert = dict(zip(range(len(all_bert_words)), all_bert_words))

# Turn GLOVE into indexable embeddings
all_glove_words = list(glove_dict.keys())
word_to_idx_glove = dict(zip(all_glove_words, range(len(all_glove_words))))
idx_to_word_glove = dict(zip(range(len(all_glove_words)), all_glove_words))

# Get indexes for codenames words
codenames_words_idxs_bert = np.array([word_to_idx_bert[word] for word in codenames_words])
codenames_words_idxs_glove = np.array([word_to_idx_glove[word] for word in codenames_words])

# Get BERT embeddings as giant matrix
bert_embedding = [bert_dict[word] for word in all_bert_words]
bert_embedding = np.vstack(bert_embedding) # shape = (27296, 768)

glove_embedding = [glove_dict[word] for word in all_glove_words]
glove_embedding = np.vstack(glove_embedding) # shape = (27296, 300)

In [173]:
from scipy.special import softmax

# Returns list of random indexes from indexes of codenames words
def generate_single_board(codenames_words_idxs, idx_to_word): 
    board = np.random.choice(codenames_words_idxs, 2)
    while len(board) != len(set(board)): #this makes sure you dont choose the same two words for the board
        board = np.random.choice(codenames_words_idxs, 2)
    print('We are looking at board:', [idx_to_word[index] for index in board])
    return board

# Calculates prob. distribution (using softmax) over entire vocabulary for a single board
def clue_dist_for_one_board(embedding, idx_to_word, board, temperature = 1):
    '''
    embedding: V x [embedding_size = 768]
    board: (3,) [indices of good words in all_bert_words]
    scoring: either cosine or euclidean
    temperature: sampling smoothing over prob. distribution
    '''
    
    V, K = embedding.shape
    board_embedding = embedding[board] # shape = (2 x 768)
    # Calculate score for each word in vocabulary using the specified scoring function
    score_euclidean = -((board_embedding.reshape(1, 2, K) - embedding.reshape(V, 1, K)) ** 2).sum(2).sum(1) # shape V
    score_cosine = np.zeros(V)
    for i in range(V):
        score_cosine[i] = -spatial.distance.cosine(board_embedding[0,:], embedding[i,:]) + spatial.distance.cosine(board_embedding[1,:], embedding[i,:])
    score_cosine = np.asarray(score_cosine)
    
    # Penalize scores for good words

    score_euclidean[board] -= 1e6
    score_cosine[board] -= 1e6
    top_indices_euclidean = score_euclidean.argsort()[-100:][::-1]
    top_indices_cosine = score_cosine.argsort()[-100:][::-1]
    top_clues_euclidean = [idx_to_word[index] for index in top_indices_euclidean]
    top_clues_cosine = [idx_to_word[index] for index in top_indices_cosine]
    
    return top_clues_euclidean, top_clues_cosine

In [121]:
board = generate_single_board(codenames_words_idxs, idx_to_word); #generates the two indices of the board
print(board)
clue_dist_for_one_board(bert_embedding, idx_to_word, board, scoring = 'cosine', temperature = 1) #outputs the softmax scorse for all the potential clues

We are looking at board: ['france', 'dog']
[887 350]
The top cosine clues are: ['dijon', 'suisse', 'mauritania', 'visage', 'guelph', 'guernsey', 'sauvignon', 'vuitton', 'clarkson', 'lille']
The top euclidean clues are: ['martinique', 'bike', 'labrador', 'newfoundland', 'bicycle', 'lizard', 'squirrel', 'quebec', 'québec', 'lion']


In [95]:
# Read data from txt file
def read_data(examples, data): 
    '''
    Takes in dataset from examples and data (make sure they're for the same data!!!) and appends each example to
    a matrix. 
    For example, 
        board = 'cat', 'dog' (from examples)
        gold_word = 'pet' (from data)
    Will be appended as: 
        [[word_to_idx['cat], word_to_idx['dog']], [word_to_idx['dog']]]
    '''
    data_matrix = []
    with open(examples) as examples, open(data) as data:
        for line1, line2 in zip(examples, data):
            input_line_1 = line1.split('.')
            good_words = input_line_1[1].strip('\n').split(',')[:2]
            
            input_line_2 = line2.split('.')
            gold_word = input_line_2[1].strip()
            
            if input_line_1[0] == input_line_2[0]: 
                board = [word_to_idx[word.strip()] for word in good_words]
                if gold_word == 'No good clues': 
                    continue
            
                try: 
                    data_matrix.append([board, [word_to_idx[gold_word]]])
                except: 
                    continue
            else: 
                print(input_line_1[0])
                print(input_line_2[0])
                print('Uh oh, misaligment!')
                break
            
    return data_matrix

In [251]:
from random import randint

def baseline_accuracy(data_list, idx_to_word, embedding):
    ''' takes in a data_list of form [[boardword1, boardword2], truth]
    recovers top 100 clues from both models
    checks if truth is in those top 100 
    '''
    data_length = len(data_list)
    data_list = data_list[:data_length]
    euclidean_score = 0
    cosine_score = 0
    random_score = 0
    iteration = 1
    for example in data_list:
        rand_indices = np.random.randint(0, len(idx_to_word), 10)
        rand_words = [idx_to_word[index] for index in rand_indices]
        board_indices = example[0]
        chosen_clue = example[1]
        print(iteration, 'New data!', [idx_to_word[index] for index in board_indices], idx_to_word[chosen_clue[0]])
        if idx_to_word[chosen_clue[0]] in rand_words:
            random_score += 1
            print('Random got it!', random_score/iteration)
        top_clues_euclidean, top_clues_cosine = clue_dist_for_one_board(embedding, idx_to_word, board_indices, temperature = 1) #outputs the softmax scorse for all the potential clues
        if idx_to_word[chosen_clue[0]] in top_clues_euclidean:
            euclidean_score += 1
            print('Euclidean got it!', euclidean_score/iteration)
        if idx_to_word[chosen_clue[0]] in top_clues_cosine:
            cosine_score += 1
            print('Cosine got it!', cosine_score/iteration)
        iteration += 1
    print('Euclidean accuracy is', euclidean_score/data_length)
    print('Cosine accuracy is', cosine_score/data_length)
    print('Random accuray is!', random_score/data_length)


    


In [253]:
data_list = read_data('examples_all_20200313.txt', 'data_all_20200313.txt')
baseline_accuracy(data_list, idx_to_word_glove, glove_embedding)

1 New data! ['engine', 'park'] car
2 New data! ['millionaire', 'mass'] gold
3 New data! ['lawyer', 'doctor'] profession
Euclidean got it! 0.3333333333333333
4 New data! ['forest', 'queen'] elf
5 New data! ['giant', 'lemon'] peach
Euclidean got it! 0.4
6 New data! ['hotel', 'ghost'] haunt
7 New data! ['europe', 'film'] venice
8 New data! ['glove', 'file'] hand
Euclidean got it! 0.375
9 New data! ['capital', 'lab'] grant
10 New data! ['eagle', 'trip'] fly
Euclidean got it! 0.4
11 New data! ['yard', 'berry'] bush
12 New data! ['kangaroo', 'park'] outback
13 New data! ['time', 'center'] sun
14 New data! ['box', 'egypt'] pyramid
15 New data! ['kid', 'glove'] tear
16 New data! ['time', 'wave'] moon
17 New data! ['berry', 'washington'] apple
18 New data! ['angel', 'spike'] name
19 New data! ['lab', 'hotel'] place
Euclidean got it! 0.2631578947368421
20 New data! ['amazon', 'novel'] bookstore
21 New data! ['fish', 'himalaya'] kerala
22 New data! ['worm', 'cricket'] field
23 New data! ['kid', '