In [6]:
import numpy as np
import pandas as pd
import pickle
from scipy import spatial
from bert_serving.client import BertClient

In [2]:
# Read the Codenames cards and flatten both card sides into a single word list
codenames_df = pd.read_csv("word_list/codenames_word_list.csv")
codenames_long = codenames_df.melt(
    id_vars=['ID', 'Version'],
    value_vars=['SideA', 'SideB'],
    var_name='Side',
    value_name='Codename',
)
# Normalise every codename to lowercase
codenames = [name.lower() for name in codenames_long['Codename'].tolist()]

# Drop multi-word entries: build a keep/drop mask, then filter the list with it
one_word_idx = [' ' not in name for name in codenames]
codenames = [name for name, is_single in zip(codenames, one_word_idx) if is_single]

In [3]:
# Read the common-English vocabulary file and split it into one entry per line
with open('word_list/google-10000-english.txt', 'r') as f:
    english_text = f.read()
english_common = english_text.splitlines()

# One-letter words are currently kept; the filter below is left disabled
# english_common = [i for i in english_common if len(i) > 1]

In [5]:
# Merge the Codenames vocabulary with the common English words; passing the
# concatenation through dict keys drops repeats while keeping first-seen order
words_list = list(dict.fromkeys(codenames + english_common))

In [9]:
# Use bert-as-service to encode the word list and generate embeddings.
# BertClient() is constructed with its default arguments, so this cell
# presumably needs a bert-serving server already running and reachable at
# the client's default address -- TODO confirm host/port for your setup.
bc = BertClient()
# Encode the whole vocabulary in a single call. The result is later paired
# position-by-position with words_list, so it is assumed to hold one embedding
# per input word, in input order -- verify against the bert-as-service docs.
vectors = bc.encode(words_list)

In [19]:
# Map each word to its embedding vector (words and vectors are paired by position)
embeddings_lite_dict = dict(zip(words_list, vectors))

In [20]:
# Persist the lite embeddings dict as a pickle file.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage collection.
with open("bert_uncased_L-12_H-768_A-12_lite.p", "wb") as pickle_file:
    pickle.dump(embeddings_lite_dict, pickle_file)

In [21]:
# Spot-check the result by looking up the vector stored for a single word;
# as the last expression in the cell, its value is shown as the cell output.
embeddings_lite_dict['hollywood']

array([-1.98899984e-01, -2.53854722e-01, -5.28132021e-01, -1.16605759e-01,
        4.58844095e-01,  1.88486159e-01, -8.81428942e-02,  3.20778310e-01,
       -5.60905755e-01, -1.86517596e-01, -6.73761666e-02,  1.19121812e-01,
        2.76954234e-01,  1.79751351e-01, -4.10767198e-01,  4.11174335e-02,
       -4.52157250e-03,  4.46556956e-02,  4.23418492e-01, -2.66885459e-02,
       -5.24842799e-01,  1.23574547e-02, -5.20662606e-01, -9.39235762e-02,
       -3.60359959e-02, -1.59078702e-01, -1.96976721e-01,  1.82488635e-01,
       -1.11621320e-01,  5.16569734e-01,  8.21916014e-03, -1.80650298e-02,
       -2.07650557e-01,  1.76358506e-01,  1.91569969e-01, -6.05688393e-01,
       -4.31271605e-02,  8.59159753e-02, -3.24392647e-01, -5.00828167e-03,
       -1.54519036e-01, -3.16297740e-01,  7.13851511e-01, -5.87052882e-01,
        2.84840167e-01, -5.29830158e-02, -1.17656696e+00,  4.13395375e-01,
       -2.39004180e-01, -1.75647601e-01, -6.20101452e-01, -1.59828559e-01,
       -3.00674289e-01,  