In [1]:
import numpy as np
import pandas as pd
from scipy import spatial
import hdbscan
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.manifold import TSNE
import random
import pickle

In [2]:
emb_glove = pickle.load(open("glove_6B_300d_lite.p", "rb"))

In [3]:
emb_bert = pickle.load(open("bert_uncased_L-12_H-768_A-12_lite.p", "rb"))

In [4]:
# Load Codenames word list
codenames_df = pd.read_csv("word_list/codenames_word_list.csv") 
codenames = pd.melt(codenames_df, id_vars=['ID', 'Version'], value_vars=['SideA', 'SideB'],
        var_name='Side', value_name='Codename')['Codename'].tolist()
codenames = [i.lower() for i in codenames] # convert to lowercase

# Remove two-word nouns
one_word_idx = [' ' not in i for i in codenames]
codenames = [i for (i, v) in zip(codenames, one_word_idx) if v]

In [5]:
class Game:
    
    def __init__(self, word_list, seed):
        # Initialise random number generator
        self.generator = np.random.RandomState(seed=seed)
        # Set board size (use the 5 x 5 setup)
        size = 5
        self.words = np.array(word_list)
        # Shuffle the wordlist
        shuffle = self.generator.choice(
            len(self.words), size * size, replace=False)
        self.board = self.words[shuffle]
        # Specify the layout for this game
        assignments = self.generator.permutation(size * size)
        self.owner = np.empty(size * size, int)
        self.owner[assignments[0]] = 0  # assassin
        self.owner[assignments[1:10]] = 1  # first player: 9 words
        self.owner[assignments[10:18]] = 2  # second player: 8 words
        self.owner[assignments[18:]] = 3  # bystander: 7 words
        self.assassin_word = self.board[self.owner == 0].tolist()
        self.team1_word = self.board[self.owner == 1].tolist()
        self.team2_word = self.board[self.owner == 2].tolist()
        self.bystander_word = self.board[self.owner == 3].tolist()
        
    def print_words(self):
        print("Team 1: ")
        print(self.team1_word)
        print("Team 2: ")
        print(self.team2_word)
        print("Assassin: ")
        print(self.assassin_word)
        print("Bystanders: ")
        print(self.bystander_word)

In [6]:
game1 = Game(codenames, 23124)

In [7]:
game1.print_words()

Team 1: 
['wing', 'yellowstone', 'pupil', 'embassy', 'watch', 'bucket', 'polo', 'bowl', 'nipple']
Team 2: 
['joystick', 'speed', 'dwarf', 'olive', 'cheese', 'point', 'pee', 'capital']
Assassin: 
['tower']
Bystanders: 
['conductor', 'smoke', 'dog', 'wiener', 'sink', 'jeweler', 'liquor']


In [8]:
class SpyMaster:
    
    def __init__(self, embeddings, game):
        self.embeddings = embeddings
        self.game = game
        self.answers = game.team1_word
        self.bad = game.team2_word + game.assassin_word
    
    def distance(self, word, reference):
        return spatial.distance.cosine(self.embeddings[word], self.embeddings[reference])

    def closest_words(self, reference):
        return sorted(self.embeddings.keys(), key=lambda w: self.distance(w, reference))

    def goodness(self, word, answers, bad):
        if word in self.answers + self.bad: return -999
        return sum([self.distance(word, b) for b in bad]) - 4.0 * sum([self.distance(word, a) for a in answers])

    def minimax(self, word, answers, bad):
        if word in answers + bad: return -999
        return min([self.distance(word, b) for b in bad]) - max([self.distance(word, a) for a in answers])

    def candidates(self, size=10):
        best = sorted(self.embeddings.keys(), key=lambda w: -1 * self.goodness(w, self.answers, self.bad))
        res = [(str(i + 1), "{0:.2f}".format(self.minimax(w, self.answers, self.bad)), w) 
               for i, w in enumerate(sorted(best[:250], key=lambda w: -1 * self.minimax(w, self.answers, self.bad))[:size])]
        return [(". ".join([c[0], c[2]]) + " (" + c[1] + ")") for c in res]

In [9]:
spymaster1 = SpyMaster(embeddings = emb_glove, game = game1)

In [10]:
spymaster1.candidates()

['1. vip (-0.08)',
 '2. sleeping (-0.10)',
 '3. mask (-0.11)',
 '4. corps (-0.13)',
 '5. tag (-0.14)',
 '6. volunteer (-0.14)',
 '7. couch (-0.14)',
 '8. toe (-0.14)',
 '9. bathroom (-0.14)',
 '10. ranger (-0.14)']

In [11]:
spymaster1.closest_words("bank")

['bank',
 'banks',
 'banking',
 'central',
 'credit',
 'financial',
 'investment',
 'lending',
 'monetary',
 'loans',
 'lender',
 'securities',
 'funds',
 'finance',
 'deposit',
 'west',
 'institutions',
 'accounts',
 'cash',
 'money',
 'currency',
 'palestinian',
 'fund',
 'savings',
 'fed',
 'deposits',
 'branch',
 'loan',
 'deutsche',
 'interest',
 'reserve',
 'assets',
 'account',
 'asset',
 'rate',
 'israel',
 'branches',
 'rates',
 'exchange',
 'treasury',
 'lenders',
 'government',
 'mortgage',
 'insurance',
 'settlement',
 'investors',
 'trust',
 'israeli',
 'overnight',
 'chase',
 'commercial',
 'dollars',
 'capital',
 'markets',
 'investments',
 'subsidiary',
 'debt',
 'swiss',
 'financing',
 'raised',
 'sector',
 'month',
 'stake',
 'raise',
 'occupied',
 'the',
 'its',
 'offices',
 'institution',
 'earlier',
 'holdings',
 'which',
 'jewish',
 'dollar',
 'meanwhile',
 'thursday',
 'morgan',
 'operations',
 'headquarters',
 'plan',
 'largest',
 'monday',
 'also',
 'firms',
 '

In [12]:
spymaster2 = SpyMaster(embeddings = emb_bert, game = game1)

In [13]:
spymaster2.candidates()

['1. plane (-0.12)',
 '2. observer (-0.12)',
 '3. vaccine (-0.12)',
 '4. legislative (-0.12)',
 '5. recruiting (-0.13)',
 '6. alter (-0.13)',
 '7. congressional (-0.14)',
 '8. idol (-0.14)',
 '9. parrot (-0.14)',
 '10. custody (-0.14)']

In [14]:
spymaster2.closest_words("bank")

['bank',
 'hospital',
 'store',
 'mall',
 'healthcare',
 'banks',
 'bureau',
 'broker',
 'asylum',
 'market',
 'banking',
 'register',
 'shop',
 'zoo',
 'farm',
 'fortune',
 'skyscraper',
 'port',
 'ventures',
 'atm',
 'cafe',
 'forecast',
 'merchant',
 'invest',
 'financial',
 'stores',
 'circulation',
 'grocery',
 'dairy',
 'garage',
 'academy',
 'branch',
 'labs',
 'institution',
 'physicians',
 'mortgage',
 'institute',
 'boutique',
 'container',
 'network',
 'consulting',
 'corporation',
 'trading',
 'clinic',
 'syndicate',
 'gateway',
 'warehouse',
 'investments',
 'cyber',
 'deposits',
 'leasing',
 'account',
 'herald',
 'paste',
 'financing',
 'tower',
 'subsidiary',
 'registry',
 'crane',
 'volvo',
 'guild',
 'factory',
 'dentist',
 'cooperative',
 'hub',
 'servers',
 'college',
 'exchange',
 'stocks',
 'fleet',
 'cardiac',
 'growth',
 'medical',
 'homes',
 'pocket',
 'physician',
 'palace',
 'loans',
 'vault',
 'broadband',
 'airline',
 'octopus',
 'pipeline',
 'licensing',
 

In [26]:
words = ['family', 'school', 'kid', 'embassy', 'conductor', 'tree', 'green', 'zoo']
X = np.array([emb_bert.get(k) for k in words])

In [27]:
clustering = DBSCAN(eps=0.2, min_samples=2, metric='cosine').fit(X)
clustering.labels_

array([ 0,  0,  0, -1, -1,  0, -1, -1], dtype=int64)

In [28]:
clustering = KMeans(n_clusters=3, random_state=0).fit(X)
clustering.labels_

array([0, 0, 0, 2, 2, 0, 1, 2])

In [29]:
clustering = Birch(n_clusters=3).fit(X)
clustering.labels_

array([1, 1, 1, 0, 0, 1, 2, 0], dtype=int64)

In [32]:
X_embedded = TSNE(n_components=2).fit_transform(X)
clustering = KMeans(n_clusters=3, random_state=0).fit(X_embedded)
clustering.labels_

array([0, 1, 1, 1, 1, 2, 2, 0])

In [31]:
clustering = hdbscan.HDBSCAN(min_cluster_size=2, cluster_selection_epsilon=2).fit(X_embedded)
clustering.labels_

array([-1, -1, -1, -1, -1, -1, -1, -1], dtype=int64)