<a href="https://colab.research.google.com/github/shikha-aggarwal/nlp_games/blob/main/codenames_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# An implementation of the board game Codenames.
https://en.wikipedia.org/wiki/Codenames_(board_game)

## 1. Imports

In [1]:
import torch
import torchtext
import collections
import random
import numpy as np
from itertools import combinations

from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('words')
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.corpus import words

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# The first time you run this will download a ~823MB file
glove = torchtext.vocab.GloVe(name="6B", dim=100)
porter_stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# `wordlist` is only used for membership tests (once per candidate token), so
# keep it as a set like stop_words instead of a list that is scanned linearly.
wordlist = set(words.words())

In [3]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. Build a pool of game-suitable words to construct the game (just nouns for now)

In [4]:
# Directory (under the mounted Drive path) that holds the word-list data files.
data_dir = '/content/drive/My Drive/Colab Notebooks/Codewords/data/'
# Text file containing the GloVe vocabulary, one token per line.
glove_vocab_file = data_dir + 'vocab.txt'

## One-off helper, kept commented out: writes every token in glove.stoi to
## glove_vocab_file, one per line. Save the vocab file only once.
# def save_glove_vocab(path):
#   with open(path, 'w+') as f:
#     for token, index in glove.stoi.items():
#       f.write(f'{token}\n')
# save_glove_vocab(glove_vocab_file)

def read_glove_vocab(path):
  """
  Reads a vocabulary file that stores one token per line.

  :param path: path of the text file to read
  :returns: List of tokens in file order, each with leading/trailing
            whitespace (including the newline) stripped
  """
  with open(path, 'r') as f:
    return [line.strip() for line in f]

all_glove_words = read_glove_vocab(glove_vocab_file)
# Set view of the GloVe vocabulary. The filter below does one membership test
# per word read from the English lists; against the list each test is a full
# scan of the vocabulary.
glove_vocab_set = set(all_glove_words)

## Get common-use English words. 
## Source: https://github.com/first20hours/google-10000-english
english_word_files = ['english_10k_long.txt',
                      'english_10k_medium.txt']

# Common English words that also have a GloVe vector, in file order.
common_use_words = []

for filename in english_word_files:
  with open(data_dir + filename, "r") as file:
    for line in file:
      word = line.strip()
      if word in glove_vocab_set:
        common_use_words.append(word)

# Get nouns from wordnet: keep the part of each noun synset name that precedes
# the first '.'.
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} 

common_nouns = [w for w in common_use_words if w in nouns]

# shuffle the words
random.shuffle(common_nouns)
print(len(common_nouns))

3714


## 3. Util functions

In [5]:
def get_nearest_words(word_vector, n = 5):
  """
  Returns the nearest words in the Glove vector space.

  Every Glove token is ranked by the norm of its difference to word_vector;
  tokens that are stop words or that are missing from `wordlist` are skipped.

  :param word_vector: Glove word vector of the word of type torch.Tensor
  :param n: maximum number of words to return; n <= 0 yields an empty list
  :returns: List of at most n tuples (word, distance) in ascending order of
            distance
  """
  if n <= 0:
    # Without this guard the length check in the loop never fires, so the
    # whole vocabulary would be scanned and returned.
    return []

  distance_to_all = torch.norm(glove.vectors - word_vector, dim=1).numpy()
  # A stable argsort yields the same order as sorting (index, distance) pairs
  # by distance, without building one Python tuple per vocabulary token.
  nearest_indices = np.argsort(distance_to_all, kind='stable')

  nearest_word_list = []
  for index in nearest_indices:
    word = glove.itos[index]
    if word not in stop_words and word in wordlist:
      nearest_word_list.append((word, distance_to_all[index]))
      if len(nearest_word_list) == n:
        break

  return nearest_word_list

  
# Sanity check: nearest words to the analogy vector doctor - man + woman.
get_nearest_words(glove['doctor'] - glove['man'] + glove['woman'])

[('doctor', 3.3640678),
 ('nurse', 4.2283154),
 ('physician', 4.7054324),
 ('woman', 4.8734255),
 ('dentist', 4.969891)]

In [6]:
def nearest_valid_suggestion(word_group, nearest_words, distance_threshold):
  """
  Returns the first word in nearest_words that is a valid suggestion for
  word_group.

  A candidate is rejected when it has the same Porter stem as any word in
  word_group, or when it is more than distance_threshold away from any word
  in word_group in the Glove vector space.

  :param word_group: List of words - fixed points.
  :param nearest_words: List of (word, distance) candidates sorted by distance
  :param distance_threshold: max distance allowed between a group word and a
                             candidate word
  :returns: (nearest_valid_word, distance), or (None, None) when every
            candidate is rejected
  """
  # The group's stems and vectors do not depend on the candidate, so compute
  # them once rather than once per (candidate, group word) pair.
  group_stems = {porter_stemmer.stem(w) for w in word_group}
  group_vectors = [glove[w] for w in word_group]

  for word, distance in nearest_words:
    if porter_stemmer.stem(word) in group_stems:
      continue

    word_vector = glove[word]
    too_far = any(torch.norm(group_vector - word_vector) > distance_threshold
                  for group_vector in group_vectors)
    if not too_far:
      return word, distance

  return None, None


def get_clue_word_from_mean(word_list, words_covered = 1, distance_threshold = 4,
                            num_candidates = 20):
  """
  Gets the possible clue words for groups of words_covered number of words
  by finding the closest word to the mean of word vectors.

  For every combination of words_covered words from word_list, the mean of
  their Glove vectors is computed, the num_candidates nearest words to that
  mean are fetched, and the first one accepted by nearest_valid_suggestion is
  kept. Combinations without an accepted candidate are left out.

  :param word_list: list of words that we need to find clue for.
  :param words_covered: number of words we want to try finding the clue for;
                        values below 1 yield an empty list.
  :param distance_threshold: max distance possible between clue word and word
  :param num_candidates: how many nearest words to consider per combination
  :returns: List of tuples (word_group, (word, dist)) in ascending order of dist
  """
  if words_covered < 1:
    # An empty group has no mean (division by zero would produce NaN vectors).
    return []

  nearest_word_list = []

  for combination in combinations(word_list, words_covered):
    mean_word_vec = sum(glove[w] for w in combination) / words_covered
    nearest_words = get_nearest_words(mean_word_vec, n = num_candidates)
    nearest_valid_word, dist = nearest_valid_suggestion(
        combination, nearest_words, distance_threshold)
    if nearest_valid_word is not None:
      nearest_word_list.append((combination, (nearest_valid_word, dist)))

  ## sort according to distance
  nearest_word_list.sort(key=lambda x: x[1][1])

  return nearest_word_list


# Demo on a small hand-picked list, using the default single-word groups.
get_clue_word_from_mean(['doctor', 'man', 'woman', 'grandmother', 'mother', 'king'])

[(('grandmother',), ('aunt', 2.2975805)),
 (('mother',), ('daughter', 2.6008523)),
 (('woman',), ('girl', 3.2580621)),
 (('man',), ('woman', 3.3640678)),
 (('doctor',), ('physician', 3.6094282))]

## 4. Start a Codenames game

In [7]:
## Draw the board: grid_len x grid_height distinct positions from common_nouns

grid_len, grid_height = 5, 5
num_words_in_game = grid_len * grid_height
word_set = random.sample(common_nouns, num_words_in_game)

## Split the board into red, blue, and neutral words.
## Red and blue each get a third of the board (rounded down); the rest is neutral.
one_third = num_words_in_game // 3

red_words = random.sample(word_set, one_third)
remaining_words = list(filter(lambda w: w not in red_words, word_set))

blue_words = random.sample(remaining_words, one_third)
neutral_words = list(filter(lambda w: w not in blue_words, remaining_words))

In [8]:
# Clue candidates for the red team, one red word per clue (default group size).
get_clue_word_from_mean(red_words)

[(('difficulty',), ('trouble', 3.360246))]

In [9]:
# Clue candidates for the red team that cover two red words with a single clue.
get_clue_word_from_mean(red_words, words_covered = 2)

[]