<a href="https://colab.research.google.com/github/shikha-aggarwal/nlp_games/blob/main/codewords_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### An implementation of the extremely fun game Codewords.

In [1]:
# standard ML imports
import torch
import torchtext # for glove vectors
import collections
import random
import numpy as np

# # Text processing
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# The first time you run this will download a ~823MB file
# Loads the 50-dimensional vectors of the "6B" GloVe set; `glove` is the
# embedding table that the nearest-word search and clue generation cells use.
glove = torchtext.vocab.GloVe(name="6B", dim=50)
# Stemmer used later to reject clue candidates that share a stem with a target word.
porter_stemmer = PorterStemmer() 

In [3]:
from google.colab import drive
# Mount Google Drive; data_dir in the following cell points inside /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
## Save vocab file

# Folder (on the mounted Drive) holding the vocabulary file and the English
# word lists that later cells open via data_dir + filename.
data_dir = '/content/drive/My Drive/Colab Notebooks/Codewords/data/'
vocab_file = data_dir + 'vocab.txt'

## Run the first time only: writes every token of glove.stoi to `path`,
## one token per line, which is the format read_vocab expects.
# def save_vocab(path):
#   with open(path, 'w+') as f:     
#     for token, index in glove.stoi.items():
#       f.write(f'{token}\n')
#       #f.write(f'{index}\t{token}\n')

# save_vocab(vocab_file)

def read_vocab(path):
    """Read a vocabulary file that stores one token per line.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading and trailing whitespace (including the newline) removed.
        Blank lines are kept as empty strings.
    """
    # Decode explicitly as UTF-8 so non-ASCII tokens are read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

# Tokens loaded from vocab_file (presumably the GloVe vocabulary written by the
# commented-out save_vocab); used to keep only words that have an embedding.
all_glove_words = read_vocab(vocab_file)

In [5]:
## Source for common-use English word files: https://github.com/first20hours/google-10000-english

english_word_files = ['english_10k_long.txt',
                      'english_10k_medium.txt']

# all_glove_words is a list, so testing membership against it rescans the whole
# vocabulary for every candidate word. Build a set once for constant-time lookups.
glove_vocab = set(all_glove_words)

words = []

# Keep only the common English words that also appear in the vocabulary list.
for filename in english_word_files:
  with open(data_dir + filename, "r") as file:
    for line in file:
      word = line.strip()
      if word in glove_vocab:
        words.append(word)

# The part of every WordNet noun synset name that precedes the first '.'.
nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} 

# Candidate game words: common English words that are also WordNet nouns.
noun_words = [w for w in words if w in nouns]

# shuffle the words
random.shuffle(noun_words)
print(len(noun_words))

3714


In [6]:
def get_nearest_words(word_vector, n=5):
  """Return the vocabulary words whose embeddings lie closest to word_vector.

  Distances are Euclidean norms between word_vector and every row of
  glove.vectors. The single closest entry is always skipped (presumably
  because the query is usually a vocabulary word and would match itself at
  distance 0 -- note that it is skipped even when the query is not an exact
  vocabulary vector).

  Args:
    word_vector: Tensor that broadcasts against the rows of glove.vectors.
    n: Maximum number of neighbours to return. Values <= 0 give an empty list.

  Returns:
    A list of up to n (word, distance) tuples ordered by increasing distance,
    where distance is a numpy scalar.
  """
  if n <= 0:
    return []

  # compute distances to all words
  distance_to_all = torch.norm(glove.vectors - word_vector, dim=1)

  # Only the n + 1 smallest distances are needed, so let torch select them
  # instead of sorting every (index, distance) pair in Python.
  # Note: the order among exactly equal distances is not guaranteed by topk.
  k = min(n + 1, distance_to_all.numel())
  nearest_distances, nearest_indices = torch.topk(distance_to_all, k, largest=False)

  # Drop the closest entry, then pair each remaining index with its word.
  return [
      (glove.itos[index], distance)
      for index, distance in zip(nearest_indices[1:].tolist(),
                                 nearest_distances[1:].numpy())
  ]

In [7]:
# Sanity checks for the embedding search.
# Ten nearest neighbours of a single word:
print(get_nearest_words(glove["cat"], n=10))
# Vector-arithmetic analogies: subtract one word, add another, then look up
# the words closest to the resulting vector.
print(get_nearest_words(glove['queen'] - glove['woman'] + glove['man']))
print(get_nearest_words(glove['grandmother'] - glove['mother'] + glove['father']))
print(get_nearest_words(glove['doctor'] - glove['man'] + glove['woman']))

[('dog', 1.8846031), ('rabbit', 2.4572797), ('monkey', 2.8102052), ('cats', 2.8972251), ('rat', 2.9455352), ('beast', 2.9878407), ('monster', 3.0022194), ('pet', 3.0396757), ('snake', 3.0617998), ('puppy', 3.0644655)]
[('king', 2.8391209), ('prince', 3.2508988), ('crown', 3.4485195), ('knight', 3.5587437), ('coronation', 3.6198905)]
[('uncle', 2.0784423), ('father', 2.0912485), ('grandson', 2.2965577), ('nephew', 2.353551), ('elder', 2.4274695)]
[('nurse', 3.1355345), ('pregnant', 3.7805371), ('child', 3.78347), ('woman', 3.8643107), ('mother', 3.922231)]


In [8]:
def get_clue_word(word_list, words_covered = 1, distance_threshold = 4):
  """Propose a clue word for every run of consecutive words in word_list.

  For each window of words_covered adjacent words, the GloVe vectors of the
  window are averaged and the nearest vocabulary words to that mean are
  fetched. The first neighbour that does not share a Porter stem with any
  word of the window becomes the clue for that window.

  Args:
    word_list: Sequence of words; each must be accepted by glove[...].
    words_covered: Number of adjacent words one clue should cover (>= 1).
    distance_threshold: Clues at this distance or farther from the mean
      vector are discarded.

  Returns:
    A list of (word_group, (clue_word, distance)) tuples sorted by increasing
    distance. Windows with no acceptable clue among the ten nearest words,
    or whose clue is not closer than distance_threshold, are omitted.

  Raises:
    ValueError: If words_covered is smaller than 1.
  """
  if words_covered < 1:
    raise ValueError("words_covered must be at least 1")

  candidates = []
  for start in range(len(word_list) - words_covered + 1):
    word_group = list(word_list[start:start + words_covered])

    # Clone before accumulating: glove[...] can hand back a view into the
    # embedding table, and an in-place += on it would overwrite the stored
    # vector of the first word for every later lookup.
    word_vec = glove[word_group[0]].clone()
    for other_word in word_group[1:]:
      word_vec += glove[other_word]
    mean_word_vec = word_vec / words_covered

    nearest_words = get_nearest_words(mean_word_vec, n=10)

    # A clue must not be a trivial variant (same stem) of a word it covers.
    group_stems = {porter_stemmer.stem(w) for w in word_group}
    clue = next(
        ((word, distance) for word, distance in nearest_words
         if porter_stemmer.stem(word) not in group_stems),
        None)

    # No usable neighbour: skip the window instead of storing a None distance,
    # which would break the sort and the threshold comparison below.
    if clue is None:
      continue

    candidates.append((word_group, clue))

  candidates.sort(key=lambda item: item[1][1])

  return [item for item in candidates if item[1][1] < distance_threshold]

In [9]:
# One clue per word (words_covered defaults to 1) for a hand-picked list.
get_clue_word(['doctor', 'man', 'woman', 'grandmother', 'mother', 'king'])

[(['grandmother'], ('aunt', 1.8343304)),
 (['mother'], ('daughter', 1.877375)),
 (['woman'], ('girl', 2.4311144)),
 (['man'], ('woman', 2.6026237)),
 (['doctor'], ('physician', 3.0890384)),
 (['king'], ('prince', 3.1179733))]

### Start a Codewords game

In [10]:
## Pick the words that make up the board

grid_len, grid_height = 5, 5
num_words_in_game = grid_len * grid_height
word_set = random.sample(noun_words, num_words_in_game)

## Split off a red group and a blue group, each a third of the board (rounded down)
one_third = num_words_in_game // 3
red_words = random.sample(word_set, one_third)
# Blue words are drawn only from the words that were not assigned to red.
remaining_words = list(filter(lambda candidate: candidate not in red_words, word_set))
blue_words = random.sample(remaining_words, one_third)

In [11]:
# One clue per individual word in red_words.
get_clue_word(red_words)

[(['morning'], ('afternoon', 1.3307415)),
 (['anime'], ('manga', 2.7139568)),
 (['reflection'], ('evident', 2.8652496)),
 (['activation'], ('signaling', 2.9453557)),
 (['moscow'], ('kiev', 2.998451)),
 (['feedback'], ('input', 3.100595)),
 (['distributor'], ('wholesaler', 3.1208465)),
 (['keyboard'], ('synthesizer', 3.2671976))]

In [12]:
# One clue per pair of adjacent words in red_words (pairs follow list order).
get_clue_word(red_words, words_covered=2)

[(['distributor', 'reflection'], ('creates', 2.793511)),
 (['feedback', 'keyboard'], ('input', 3.0776713)),
 (['morning', 'anime'], ('appearing', 3.0998943)),
 (['reflection', 'feedback'], ('impression', 3.1071374)),
 (['activation', 'distributor'], ('component', 3.1305041)),
 (['anime', 'moscow'], ('simultaneously', 3.286003)),
 (['moscow', 'activation'], ('signaling', 3.3413205))]