In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy import spatial

In [30]:
# Parse the GloVe text file into a {token: float32 vector} mapping.
# Each line is split on whitespace: the first field is the token,
# the remaining fields are the vector components.
embeddings_dict = {}
with open("embeddings/glove/glove.6B.300d.txt", 'r', encoding="utf8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [31]:
# Read the Codenames cards and flatten both card sides (SideA / SideB)
# into a single lowercase list of words.
codenames_df = pd.read_csv("word_list/codenames_word_list.csv")
codenames_long = codenames_df.melt(
    id_vars=['ID', 'Version'],
    value_vars=['SideA', 'SideB'],
    var_name='Side',
    value_name='Codename',
)
codenames = [name.lower() for name in codenames_long['Codename'].tolist()]

# Keep only single-word entries: anything containing a space is dropped.
one_word_idx = [' ' not in name for name in codenames]
codenames = [name for name, keep in zip(codenames, one_word_idx) if keep]

In [32]:
# Load common English words (one word per line).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('word_list/google-10000-english.txt', 'r', encoding="utf8") as f:
    english_common = f.read().splitlines()

# Optional filter, currently disabled: remove one-letter words from the list
# english_common = [i for i in english_common if len(i) > 1]

In [33]:
# Combine the Codenames words with the common English words.
# dict.fromkeys removes duplicates while keeping first-seen order.
words_list = list(dict.fromkeys(codenames + english_common))

# Remove words from the combined list that do not appear in the embeddings.
# Membership is tested against the dict keys view (constant-time lookup);
# scanning the list of missing words for every word would be quadratic.
embeddings_list = embeddings_dict.keys()
not_in_embeddings = [x for x in words_list if x not in embeddings_list]  # kept for inspection
words_list = [x for x in words_list if x in embeddings_list]

In [34]:
# Build the reduced embedding table: one entry per word in words_list,
# in words_list order, with vectors taken from the full embeddings dict.
embeddings_lite_dict = dict(
    (word, embeddings_dict[word]) for word in words_list
)

In [35]:
# Save the lite embeddings as a pickle object.
# The context manager ensures the file is flushed and closed even if
# pickle.dump raises part-way through.
with open("glove_6B_300d_lite.p", "wb") as pickle_file:
    pickle.dump(embeddings_lite_dict, pickle_file)