In [486]:
import numpy as np
import pathlib
import attr
from typing import List, Sequence, Optional, Set, Tuple
import typing as tp
import nptyping as npt
import abc

In [366]:
!ls ../../

directional.txt			    LICENSE    README.md  wiki-100k.txt
google-10000-english-no-swears.txt  notebooks  src	  wordlist-eng.txt


In [354]:
def regularize(list_of_tokens: List[str]) -> List[str]:
    return [token.strip().upper() for token in list_of_tokens]

In [385]:
class WordList:
    def __init__(
            self,
            wordlist_path: str,
            illegals_paths: Optional[List[str]] = None,
            allowed_paths: Optional[List[str]] = None
    ):
        path = pathlib.Path(wordlist_path)
        with path.open() as f:
            self.words = regularize(f.read().splitlines())
        self.illegals = self.load_texts(illegals_paths) if illegals_paths else set()
        self.allowed = self.load_texts(allowed_paths) if allowed_paths else set()
        # If it is illegal for the board, it will be detected later on
        self.allowed.update(self.words)

    def load_texts(self, paths: List[str]) -> Set[str]:
        texts = set()
        for pth in paths:
            path = pathlib.Path(pth)
            with path.open() as f:
                texts.update(f.read().splitlines())
        return set(regularize(texts))

In [386]:
pathlib.Path("../../wordlist-eng.txt").exists()

True

In [579]:
wordlist = WordList(
    "../../wordlist-eng.txt",
    ["../../directional.txt"],
    ["../../wiki-100k.txt", "../../google-10000-english-no-swears.txt", "../../custom_whitelist.txt"]
)

In [388]:
wordlist

<__main__.WordList at 0x7fb6e58ce080>

In [389]:
from numpy.random import default_rng
rng = default_rng()

In [390]:
wordlist.illegals

{'BOTTOM',
 'DOWN',
 'EAST',
 'LEFT',
 'NORTH',
 'RIGHT',
 'SOUTH',
 'TOP',
 'UP',
 'WEST'}

In [274]:
labels = ["BLUE"] * 9 + ["RED"] * 8 + ["BYSTANDER"] * 7 + ["ASSASSIN"]
unique_labels = np.unique(labels).tolist()

In [218]:
len(labels)

25

In [615]:
def is_superstring_or_substring(word: str, target: str) -> bool:
    return target in word or word in target


class Board:
    def __init__(self, wordlist: WordList) -> None:
        self.wordlist = wordlist
        self.words = rng.choice(wordlist.words, 25, replace=False)
        self.word2index = {word: i for i, word in enumerate(self.words)}
        self.labels = rng.permutation(labels)
        self.reset_game()

    def is_related_word(self, word: str) -> bool:
        word = word.upper()
        return any(is_superstring_or_substring(word, target) for target in self.words)

    def is_illegal(self, word: str) -> bool:
        word = word.upper()
        return (
            self.is_related_word(word) or
            word in self.wordlist.illegals or
            word not in self.wordlist.allowed
        )

    def batch_is_illegal(self, words: npt.NDArray[str]) -> npt.NDArray[bool]:
        return np.array([self.is_illegal(w) for w in words])

    def reset_game(self) -> None:
        self.chosen = np.array([False]*25)
        self.which_team_guessing = "BLUE"
        # self.hint_history = []
        # self.state_history = None

    def choose_word(self, word: str) -> str:
        if word.upper() not in self.words:
            raise KeyError(f"Word '{word}' is not on the board.")
        index = self.word2index[word]
        if self.chosen[index]:
            raise ValueError(f"Word '{word}' has already been chosen!")
        self.chosen[index] = True
        return self.labels[index]

    def words_that_are_label(self, label):
        return self.words[self.labels == label]

    @property
    def blue_words(self):
        return words_that_are_label("BLUE")

    @property
    def red_words(self):
        return words_that_are_label("RED")

    @property
    def bystander_words(self):
        return words_that_are_label("BYSTANDER")

    @property
    def assassin_words(self):
        """There is only one assassin in a regular game, but for the sake of generality, here we go!"""
        return words_that_are_label("ASSASSIN")

    def indices_for_label(self, label):
        return np.where(self.labels == label)[0]

    @property
    def blue_indices(self):
        return self.indices_for_label("BLUE")

    @property
    def red_indices(self):
        return self.indices_for_label("RED")

    @property
    def bystander_indices(self):
        return self.indices_for_label("BYSTANDER")

    @property
    def assassin_indices(self):
        """There is only one assassin in a regular game, but for the sake of generality, here we go!"""
        return self.indices_for_label("ASSASSIN")

    def jump_to_random_state(self) -> None:
        """Jump to a valid random state before the end of the game.
        
        There must be 1 assassin, 1-9 blue words, 1-8 red words, and 0-7 bystanders.
        Thus, 0-8 blue, 0-7 red and 1-6 bystander words are chosen.
        """
        self.reset_game()
        num_blue = rng.integers(0, 9)
        num_red = rng.integers(0, 8)
        num_bystanders = rng.integers(0, 7)
        chosen_blue = rng.choice(self.blue_indices, num_blue, replace=False)
        chosen_red = rng.choice(self.red_indices, num_red, replace=False)
        chosen_bystanders = rng.choice(self.bystander_indices, num_bystanders, replace=False)
        chosen_indices = np.concatenate([chosen_blue, chosen_red, chosen_bystanders])
        self.chosen[chosen_indices] = True
        self.which_team_guessing = rng.choice(["BLUE", "RED"])

    def bag_state(self) -> tp.Dict["label", tp.Set["words"]]:
        return {label: set(self.words[(self.labels == label) & ~self.chosen]) for label in unique_labels}

    def remaining_words(self) -> npt.NDArray[str]:
        return self.words[~self.chosen]

In [393]:
rng.choice([0, 1])

0

In [394]:
class CliView:
    def __init__(self, board: Board):
        self.board = board

    def spymaster_words_to_display(self):
        words = []
        # Arguably more readable than the equivalent list comprehension
        for w, l, c in zip(self.board.words, self.board.labels, self.board.chosen):
            w += f"_{l[0]}"
            if c:
                w = w.lower()
            words.append(w) 
        return words

    def operative_words_to_display(self):
        words = []
        # Arguably more readable than the equivalent list comprehension
        for w, l, c in zip(self.board.words, self.board.labels, self.board.chosen):
            if c:
                w += f"_{l[0]}"
                w = w.lower()
            words.append(w)
        return words

    def generic_view(self, words_to_display):
        words = words_to_display()
        print(np.array(words).reshape(5,5))
        print(f"It is {self.board.which_team_guessing}'s turn.")

    def spymaster_view(self):
        self.generic_view(self.spymaster_words_to_display)

    def operative_view(self):
        self.generic_view(self.operative_words_to_display)

In [395]:
np.concatenate([np.arange(3), np.arange(2)])

array([0, 1, 2, 0, 1])

In [616]:
board = Board(wordlist)
view = CliView(board)

In [489]:
view.spymaster_view()

[['WASHER_R' 'CRASH_B' 'SOCK_B' 'BERMUDA_R' 'TUBE_B']
 ['CHURCH_B' 'GIANT_B' 'STADIUM_B' 'GAS_R' 'GENIUS_R']
 ['MATCH_B' 'BOTTLE_R' 'KIWI_B' 'BAT_A' 'PART_B']
 ['LINK_B' 'ANGEL_B' 'CAST_B' 'COOK_R' 'OCTOPUS_R']
 ['AMERICA_B' 'MILLIONAIRE_B' 'OLIVE_R' 'TOKYO_B' 'TRUNK_B']]
It is BLUE's turn.


In [490]:
board.bag_state()

{'ASSASSIN': {'BAT'},
 'BLUE': {'AMERICA',
  'ANGEL',
  'CRASH',
  'GIANT',
  'KIWI',
  'MATCH',
  'TOKYO',
  'TRUNK',
  'TUBE'},
 'BYSTANDER': {'CAST',
  'CHURCH',
  'LINK',
  'MILLIONAIRE',
  'PART',
  'SOCK',
  'STADIUM'},
 'RED': {'BERMUDA',
  'BOTTLE',
  'COOK',
  'GAS',
  'GENIUS',
  'OCTOPUS',
  'OLIVE',
  'WASHER'}}

In [399]:
board.blue_indices

array([ 4,  8, 11, 12, 13, 14, 19, 21, 22])

In [400]:
board.jump_to_random_state()

In [401]:
view.operative_view()

[['NURSE' 'JET' 'WHALE' 'VAN' 'CYCLE']
 ['POST' 'draft_b' 'game_b' 'server_b' 'table_b']
 ['staff_b' 'SCHOOL' 'OCTOPUS' 'bermuda_b' 'AMAZON']
 ['cold_b' 'MARBLE' 'flute_r' 'CAT' 'mexico_b']
 ['SCUBA DIVER' 'MINE' 'STATE' 'hook_b' 'soldier_r']]
It is RED's turn.


In [444]:
@attr.s(frozen=True, auto_attribs=True)
class Hint:
    word: str
    count: Optional[int]
    remaining: int = attr.ib(default=0)

In [310]:
!ls ../../../codenames/dataset/glove

ls: cannot access '../../../codenames/dataset/glove': No such file or directory


In [316]:
glove_path = pathlib.Path("../../../codenames/dataset/glove.6B.300d.npy")


In [319]:
with glove_path.open("rb") as f:
    glove_vectors = np.load(f)

In [437]:
class TextVectorEngine(metaclass=abc.ABCMeta):
    # TODO: add self.vectors, self.tokens, self.token2id here too

    @abc.abstractmethod
    def is_valid_token(self, token):
        pass

    @abc.abstractmethod
    def tokenize(self, phrase):
        pass

In [622]:
class Glove(TextVectorEngine):
    def __init__(self, glove_vector_path, glove_tokens_path):
        gv_path = pathlib.Path(glove_vector_path)
        gt_path = pathlib.Path(glove_tokens_path)
        assert gv_path.exists()
        assert gt_path.exists()
        with gv_path.open("rb") as f:
            self.vectors = np.load(gv_path)
        with gt_path.open() as f:
            self.tokens = f.read().splitlines()
            self.tokens = np.array(regularize(self.tokens))
        self.token2id = {t: i for i, t in enumerate(self.tokens)}

    def is_valid_token(self, token):
        return token.strip().upper() in self.token2id
        
    def tokenize(self, phrase):
        """Simple one-word tokenization. Ignores punctuation."""
        if isinstance(phrase, str):
            phrase = phrase.strip().upper().split()
        else:
            phrase = regularize(phrase)
        return [self.token2id[x] if self.is_valid_token(x) else None for x in phrase]

    def vectorize(self, phrase):
        return self.vectors[self.tokenize(phrase)]

In [455]:
glove.vectors / np.linalg.norm(glove.vectors, axis=1)[:, None]

array([0.99999994, 0.99999994, 0.99999994, ..., 0.99999994, 1.        ,
       1.        ], dtype=float32)

In [468]:
def batched_norm(vec: np.ndarray) -> np.ndarray:
    """Normalize a batch of vectors
    
    Args:
        vec: (batch, dim)
    """
    return vec / np.linalg.norm(vec, axis=1)[:, None]

In [469]:
def batched_cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Take the batched cosine similarity."""
    a_norm = batched_norm(a)  # (batch1, dim)
    b_norm = batched_norm(b)  # (batch2, dim)
    return a_norm @ b_norm.T  # (batch1, batch2)

In [481]:
?board

[0;31mType:[0m        Board
[0;31mString form:[0m <__main__.Board object at 0x7fb6e4f66160>
[0;31mDocstring:[0m   <no docstring>


In [520]:
type(glove.tokens)

list

In [636]:
class GloveGuesser:
    
    def __init__(self, glove: Glove, board: Board, limit: int = 10):
        self.glove = glove
        self.board = board
        self.limit = limit
        self.strategy_lookup = {
            "mean": self.generate_word_suggestions_mean,
            "minimax": self.generate_word_suggestions_minimax
        }

    def indices_illegal_words(self, chosen_words: npt.NDArray):
        return self.board.batch_is_illegal(chosen_words)

    def generate_word_suggestions_mean(self, words: List[str], limit: int = 10) -> Tuple[Sequence[str], Sequence[float]]:
        for word in words:
            if not self.glove.is_valid_token(word):
                raise ValueError(f"Hint {word} is not a valid hint word!")
        word_vector = self.glove.vectorize(" ".join(words)).mean(0)[None, :]
        similarity_scores = batched_cosine_similarity(word_vector, glove.vectors)[0]
        indices = np.argpartition(-similarity_scores, limit)
        chosen_words = glove.tokens[indices][:limit]
        similarity_scores = similarity_scores[indices][:limit]
        return chosen_words, similarity_scores

    def generate_word_suggestions_minimax(self, words: List[str], limit: int = 10) -> Tuple[Sequence[str], Sequence[float]]:
        for word in words:
            if not self.glove.is_valid_token(word):
                raise ValueError(f"Hint {word} is not a valid hint word!")
        word_vector = self.glove.vectorize(" ".join(words))
        similarity_scores = batched_cosine_similarity(word_vector, glove.vectors).min(axis=0)
        indices = np.argpartition(-similarity_scores, limit)
        chosen_words = glove.tokens[indices][:limit]
        similarity_scores = similarity_scores[indices][:limit]
        return chosen_words, similarity_scores

    def filter_words(self, chosen_words: npt.NDArray[str], similarity_scores: npt.NDArray[float], similarity_threshold=0.0):
        words_to_filter = self.indices_illegal_words(chosen_words) | (similarity_scores < similarity_threshold)
        return chosen_words[~words_to_filter], similarity_scores[~words_to_filter]

    def re_rank(self, chosen_words: npt.NDArray[str], similarity_scores: npt.NDArray[float], limit: int):
        indices = np.argsort(-similarity_scores)
        chosen_words = chosen_words[indices][:limit]
        similarity_scores = similarity_scores[indices][:limit]
        return chosen_words, similarity_scores

    def give_hint_candidates(self, targets: List[str], similarity_threshold=0.0, strategy: str = "minimax"):
        generate_word_suggestions = self.strategy_lookup[strategy]
        chosen_words, similarity_scores = generate_word_suggestions(targets, self.limit*2)

        chosen_words, similarity_scores = self.filter_words(chosen_words, similarity_scores)

        return self.re_rank(chosen_words, similarity_scores, self.limit)

    def give_hint(self, targets: List[str], similarity_threshold=0.0, strategy: str = "minimax"):
        """Greedily choose the best hint."""
        chosen_words, _ = self.give_hint_candidates(targets, similarity_threshold, strategy)
        return chosen_words[0]

    def choose_hint_parameters(self, hint: Hint) -> tp.Tuple[str, int]:
        """TODO: Add strategy mixins"""
        return hint.word, hint.count-hint.remaining

    def guess(self, hint: Hint, strategy: str = "greedy") -> Sequence[str]:
        word, limit = self.choose_hint_parameters(hint)
        if not self.glove.is_valid_token(word):
            raise ValueError(f"Hint {word} is not a valid hint word!")
        word_vector = self.glove.vectorize(word)
        remaining_words = self.board.remaining_words()
        board_vectors = self.glove.vectorize(remaining_words)
        similarity_scores = batched_cosine_similarity(word_vector, board_vectors)[0]
        indices = np.argpartition(-similarity_scores, limit)
        chosen_words = remaining_words[indices][:limit]
        similarity_scores = similarity_scores[indices][:limit]
        return chosen_words, similarity_scores

In [547]:
np.arange(3).min()

0

In [623]:
glove = Glove("../../../codenames/dataset/glove.6B.300d.npy", "../../../codenames/dataset/words")

In [516]:
glove.vectorize("Test").shape

(1, 300)

In [598]:
board.is_illegal("AMERICA")

False

In [637]:
guesser = GloveGuesser(glove, board)

In [626]:
glove.tokenize(board.remaining_words())

[389,
 2855,
 2805,
 13308,
 603,
 2858,
 1866,
 4249,
 3845,
 9248,
 11738,
 7370,
 2061,
 27940,
 316,
 3990,
 17489,
 9993,
 1307,
 20026,
 13889,
 4614,
 10216,
 17003,
 638]

In [631]:
board.bag_state()

{'ASSASSIN': {'COMIC'},
 'BLUE': {'BRUSH',
  'CHAIR',
  'GRASS',
  'MOLE',
  'PAPER',
  'SATELLITE',
  'SCORPION',
  'SHOP',
  'SLUG'},
 'BYSTANDER': {'CRANE', 'DINOSAUR', 'HAWK', 'LINK', 'PASTE', 'POINT', 'TUBE'},
 'RED': {'AUSTRALIA',
  'CHICK',
  'COLD',
  'COMPOUND',
  'FOREST',
  'JUPITER',
  'KEY',
  'OIL'}}

In [638]:
guesser.give_hint_candidates(["chick", "forest", "cold"], strategy="mean")

(array(['DRY', 'COOL', 'HOT', 'WET', 'WARM', 'RAINY', 'RAIN', 'DESERT',
        'HABITAT', 'WINTERS'], dtype='<U68'),
 array([0.5375424 , 0.5000939 , 0.49452868, 0.47711235, 0.45485127,
        0.43468133, 0.4343574 , 0.43188688, 0.42913797, 0.41956434],
       dtype=float32))

In [639]:
guesser.guess(Hint("dry", 3))

(array(['BRUSH', 'COLD', 'GRASS'], dtype='<U11'),
 array([0.47539973, 0.5543054 , 0.32184032], dtype=float32))

In [614]:
board.is_related_word("moles")

True

In [594]:
guesser.strategy_lookup["minimax"]

<bound method GloveGuesser.generate_word_suggestions_minimax of <__main__.GloveGuesser object at 0x7fb6e481a400>>

In [478]:
batched_cosine_similarity(glove.vectors[:1], glove.vectors[:10]).shape

(1, 10)

In [441]:
glove.tokenize(" ".join(wordlist.words))

[637,
 1967,
 325,
 8334,
 12038,
 8427,
 9214,
 453,
 5239,
 12451,
 3292,
 2647,
 12339,
 603,
 26900,
 137,
 1083,
 775,
 231,
 2069,
 14924,
 4925,
 5747,
 1497,
 3045,
 960,
 3827,
 942,
 2913,
 5077,
 2499,
 11012,
 9307,
 480,
 1963,
 534,
 11869,
 1211,
 1846,
 4707,
 10002,
 6676,
 7331,
 1930,
 1641,
 9248,
 8998,
 4794,
 11035,
 41652,
 6910,
 14105,
 774,
 3539,
 351,
 569,
 1904,
 20786,
 5391,
 1784,
 5450,
 2114,
 46995,
 313,
 3845,
 511,
 1090,
 2375,
 5778,
 17489,
 132,
 6242,
 512,
 4012,
 8525,
 23755,
 449,
 2280,
 1866,
 4249,
 3990,
 3031,
 8475,
 953,
 3387,
 5139,
 5223,
 202,
 1333,
 10216,
 2005,
 2162,
 1007,
 3120,
 4124,
 2312,
 2261,
 1257,
 122,
 336,
 6744,
 1714,
 5188,
 14621,
 13308,
 1289,
 2082,
 2926,
 1737,
 7394,
 4635,
 8416,
 1560,
 7774,
 15389,
 5838,
 1598,
 1847,
 2100,
 563,
 525,
 2090,
 621,
 1791,
 807,
 3267,
 6043,
 307263,
 3510,
 1265,
 2854,
 319,
 484,
 2120,
 16677,
 2361,
 2149,
 352,
 2061,
 11532,
 387,
 186,
 851,
 9780,
 5

In [320]:
glove_vectors

array([[ 0.04656  ,  0.21318  , -0.0074364, ...,  0.0090611, -0.20989  ,
         0.053913 ],
       [-0.25539  , -0.25723  ,  0.13169  , ..., -0.2329   , -0.12226  ,
         0.35499  ],
       [-0.12559  ,  0.01363  ,  0.10306  , ..., -0.34224  , -0.022394 ,
         0.13684  ],
       ...,
       [ 0.075713 , -0.040502 ,  0.18345  , ...,  0.21838  ,  0.30967  ,
         0.43761  ],
       [ 0.81451  , -0.36221  ,  0.31186  , ...,  0.075486 ,  0.28408  ,
        -0.17559  ],
       [ 0.429191 , -0.296897 ,  0.15011  , ...,  0.28975  ,  0.32618  ,
        -0.0590532]], dtype=float32)