In [1]:
import numpy as np
import pandas as pd
from scipy import spatial
from random import sample
import copy
import re
import nltk
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Import encoding dictionaries. i.e. bert_dict['cat'] = [0.5, -.4, 0.001,...]
bert_dict = np.load('bert.npy', allow_pickle = True); bert_dict = bert_dict[()]
glove_dict = np.load('glove.npy', allow_pickle = True); glove_dict = glove_dict[()]

In [3]:
# Import list of codenames words
codenames_words = []
with open("./codenames_words.txt", 'r') as f:
    for line in f:
        codenames_words.append(line.strip())
        
# Remove codenames words not in embeddings
def removeInvalidCodenames(embeddings, codenames_words):
    removed_words = []
    for item in codenames_words:
        if item not in embeddings:
            removed_words.append(item)
            codenames_words.remove(item)
    #print('Words removed from codenames are', removed_words)
    return codenames_words

codenames_words_glove = removeInvalidCodenames(glove_dict, codenames_words)
codenames_words = removeInvalidCodenames(bert_dict, codenames_words)

In [4]:
# Turn BERT into indexable embeddings
all_bert_words = list(bert_dict.keys())
word_to_idx = dict(zip(all_bert_words, range(len(all_bert_words))))
idx_to_word = dict(zip(range(len(all_bert_words)), all_bert_words))

# Get indexes for codenames words
codenames_words_idxs = np.array([word_to_idx[word] for word in codenames_words])

# Get BERT embeddings as giant matrix
bert_embedding = [bert_dict[word] for word in all_bert_words]
bert_embedding = np.vstack(bert_embedding) # shape = (42000, 768)

In [5]:
from scipy.special import softmax

# Returns list of random indexes from indexes of codenames words
def generate_single_board(codenames_words_idxs): 
    return np.random.choice(codenames_words_idxs, 3)

# Calculates prob. distribution (using softmax) over entire vocabulary for a single board
def clue_dist_for_one_board(embedding, board, scoring = 'cosine', temperature = 1):
    '''
    embedding: V x [embedding_size = 768]
    board: (3,) [indices of good words in all_bert_words]
    scoring: either cosine or euclidean
    temperature: sampling smoothing over prob. distribution
    '''
    
    V, K = embedding.shape
    board_embedding = embedding[board] # shape = (3 x 768)
    
    # Calculate score for each word in vocabulary using the specified scoring function
    if scoring == 'euclidean': 
        score = -((board_embedding.reshape(1, 3, K) - embedding.reshape(V, 1, K)) ** 2).sum(2).sum(1) # shape V
    else: # cosine
        score = (np.matmul(board_embedding, embedding.T) / 
                 (np.matmul(np.linalg.norm(board_embedding, axis=1, keepdims=True), 
                            np.linalg.norm(embedding, axis=1, keepdims=True).T))).sum(0)
    
    # Penalize scores for good words
    score[board] -= 1e6
    
    return softmax(score / temperature)

# Returns the top clues for a certain board based on the probability distribution over the vocabulary
def return_top_clues(embedding_matrix, board, scoring, p_value): 
    # probability distribution for one board
    p = clue_dist_for_one_board(embedding_matrix, board, scoring, 15)
    return [idx_to_word[word] for word in np.arange(p.shape[0])[p > p_value]]

In [15]:
import torch
from torch import nn

In [None]:
def clue_dist_for_one_board_pt(embedding, board, scoring='cosine', temperature=1):
    """
    Assume for now that the inputs are numpy matrices:
    embedding: V x K matrix of floats
    board: (3,) matrix of ints
    """
    embedding_pt = torch.tensor(embedding, requires_grad=False)
    board_pt = torch.tensor(board, requires_grad=False)
    V, K = embedding.shape
    
    # let's try it 
    board_embedding_pt = embedding_pt[board_pt].view(1, 3 * K) # 3 x 768
    board_embedding_big = board_embedding_pt.expand(V, 3 * K)
    all_input = torch.concat([embedding_pt, board_embedding_big], dim=1) # (V x 3072)
    
    # deeplearning ML whoooo
    neural_network_1 = nn.Linear(3072, 100)
    neural_network_2 = nn.Linear(100, 1)
    
    output = neural_network_2(neural_network_1(all_input)) # V x 1
    return torch.softmax(output / temperature)

In [None]:
nn.Linear()

In [None]:
board = generate_single_board(codenames_words_idxs)

In [41]:
class MyNeuralNetwork(nn.Module):
    def __init__(self, hidden_size):
        super(MyNeuralNetwork, self).__init__()
        self.neural_network_1 = nn.Linear(3072, hidden_size)
        self.neural_network_2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        return nn.Tanh(self.neural_network_2(nn.Tanh(self.neural_network_1(x))))5678

In [31]:
x = torch.randn(5000, 3072)

In [40]:
(mnn(x) == mnn.forward(x)).all()

tensor(True)

In [42]:
mnn = MyNeuralNetwork(50)
mnn(x).shape

torch.Size([5000, 1])

In [16]:
mnn = MyNeuralNetwork(50)

def clue_dist_...
    
    output = mnn(all_input)
    
    return torch.softmax()

def loss_function(p, gold_word):
    return torch.nn.functional.nll_loss(p, gold_word)

# this line might fail
optimizer = torch.optim.Adam(mnn.parameters(), lr=0.001)

for epoch in epochs:
    loss = 0
    for example in examples:
        # batch size 1
        board, gold_word = example
        loss += loss_function(clue_dist(embedding, board), gold_word)
    
    # backprop!
    loss.backward()
    
    print(neural_network_1.params)
    
    # gradient descent given the computed gradients
    optimizer.step()

SyntaxError: invalid syntax (<ipython-input-16-5c15806c50c8>, line 4)

In [6]:
# Helper functions for debugging

# Pass in words like ['cat', 'dog', 'fish']
def generate_board_from_words(words):
    return np.array([word_to_idx[word] for word in words])

In [7]:
board = generate_single_board(codenames_words_idxs)
print([idx_to_word[idx] for idx in board])

print('Cosine Similarity Scoring Clues:')
# Cosine Clues
clues = return_top_clues(bert_embedding, board, 'cosine', 0.000024)
print(clues[:20])

print('Euclidean Scoring Clues:')
# Euclidean Clues
clues = return_top_clues(bert_embedding, board, 'euclidean', 0.001)
print(clues[:20])

['eye', 'dinosaur', 'ruler']
Cosine Similarity Scoring Clues:
['expert', 'aware', 'native', 'garage', 'circle', 'liquid', 'monster', 'dealer', 'tracking', 'dean', 'sensor', 'lounge', 'killer', 'basket', 'galaxy', 'talented', 'flavor', 'facial', 'citizen', 'container']
Euclidean Scoring Clues:
['alien', 'wizard', 'banana', 'vegetarian', 'clone', 'observer', 'sphere', 'abnormal', 'flagship', 'squirrel', 'surfer', 'sidekick', 'sorcerer', 'primate', 'settler']


In [8]:
def bert_clue_generator(n): 
    for i in range(n):  
        board = generate_single_board(codenames_words_idxs)
        clues = return_top_clues(bert_embedding, board, 'euclidean')
        if len(clues) > 10:
            print([idx_to_word[idx] for idx in board])
            print(clues[:20])
        i += 1

In [9]:
bert_clue_generator(5)

TypeError: return_top_clues() missing 1 required positional argument: 'p_value'

In [10]:
def generate_single_board(codenames_words):
    #randomly selects three words from the codenames board
    board = sample(codenames_words, 3)
    print('The board we will be finding clues for is', board)
    return board
    
def clue_score(embeddings, board, clue):
    #generates a score for a single clue given the board
    if clue in board: #this makes the score for one of the board words really bad
        return -10**6 
    score = 0
    for board_word in board:
        #score += spatial.distance.cosine(embeddings[clue], embeddings[board_word])#1-cosine scoring function
        score += np.linalg.norm(embeddings[clue]-embeddings[board_word])
    return score
    
def clue_dict(embeddings, board):
    #a dictionary for a given board, where the key is the potential clue and the value is the clue score
    score_dict = dict()
    for word in embeddings:
        score_dict[word] = clue_score(embeddings, board, word)
    return score_dict

def extract_top_clues(score_dict):
    sorted_score_dict = sorted(score_dict.items(), key=lambda x:x[1])
    top_ten_clues = [sorted_score_dict[i][0] for i in range(3,3+10)]
    print('Top ten clues are', top_ten_clues)

board = generate_single_board(codenames_words)
score_dict_bert = clue_dict(bert_dict, board)
score_dict_glove = clue_dict(glove_dict, board)
sorted_score_dict = extract_top_clues(score_dict_bert)
sorted_score_dict = extract_top_clues(score_dict_glove)

The board we will be finding clues for is ['rome', 'fly', 'opera']
Top ten clues are ['operas', 'mermaid', 'ballet', 'underworld', 'metropolis', 'musica', 'música', 'bath', 'kite', 'gloria']
Top ten clues are ['hence', 'latter', 'brought', 'aka', 'meant', 'forget', 'mention', 'instance', 'fro', 'wonder']


In [None]:
print(np.linalg.norm(bert_dict['belt']-bert_dict['kangaroo']))
print(np.linalg.norm(bert_dict['belt']-bert_dict['hotel']))
print(np.linalg.norm(bert_dict['kangaroo']-bert_dict['hotel']))

In [None]:
print(spatial.distance.cosine(glove_dict['belt'],glove_dict['kangaroo']))
print(spatial.distance.cosine(glove_dict['belt'],glove_dict['hotel']))
print(spatial.distance.cosine(glove_dict['kangaroo'],glove_dict['hotel']))