In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy import spatial

In [2]:
# Parse the GloVe text file into a {token: float32 vector} mapping.
# Every line is split on whitespace: the first field is the token and the
# remaining fields are converted to a float32 NumPy array.
embeddings_dict = {}
with open("embeddings/glove/glove.6B.300d.txt", 'r', encoding="utf8") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [3]:
# Read the Codenames word list (columns used below: ID, Version, SideA, SideB)
codenames_df = pd.read_csv("word_list/codenames_word_list.csv")

# Stack the SideA and SideB columns into one long 'Codename' column
codenames_long = pd.melt(
    codenames_df,
    id_vars=['ID', 'Version'],
    value_vars=['SideA', 'SideB'],
    var_name='Side',
    value_name='Codename',
)

# Lowercase every codename
codenames_lower = [name.lower() for name in codenames_long['Codename'].tolist()]

# Drop multi-word entries: keep only the names that contain no space.
# one_word_idx is the boolean mask (True = single word) aligned with the
# lowercased list.
one_word_idx = [' ' not in name for name in codenames_lower]
codenames = [name for name in codenames_lower if ' ' not in name]

In [4]:
# Load common English words (one word per line).
# The encoding is passed explicitly, as for the GloVe file, so the read does
# not depend on the platform's default locale encoding.
with open('word_list/google-10000-english.txt', 'r', encoding="utf8") as f:
    english_common = f.read().splitlines()

# Optional filter (currently disabled): remove one-letter words from the list
# english_common = [i for i in english_common if len(i) > 1]

In [5]:
# Combine the common English words with the Codename words
words_list = codenames + english_common
words_list = list(dict.fromkeys(words_list))  # remove duplicates, keeping first-seen order

# Remove words from the combined list that do not appear in the embeddings.
# Membership is tested against the dict's key view (a hash lookup) instead of
# scanning a list of missing words, which keeps the filter linear in the
# number of words.
embeddings_list = embeddings_dict.keys()
# Words without an embedding, kept for inspection; computed before filtering.
not_in_embeddings = [x for x in words_list if x not in embeddings_list]
words_list = [x for x in words_list if x in embeddings_list]

In [6]:
# Build the reduced embeddings dictionary: one entry per word in words_list,
# in the same order, each mapped to its vector from the full dictionary.
embeddings_lite_dict = dict(
    (word, embeddings_dict[word]) for word in words_list
)

In [7]:
# Save the lite embeddings as a pickle object.
# The context manager closes the file handle as soon as the dump finishes, so
# the data is flushed to disk deterministically instead of whenever the
# unreferenced file object happens to be garbage-collected.
with open("glove_6B_300d_lite.p", "wb") as f:
    pickle.dump(embeddings_lite_dict, f)

In [8]:
# Spot-check: display the vector stored for 'hollywood' in the reduced
# dictionary (raises KeyError if that word was filtered out).
embeddings_lite_dict['hollywood']

array([-5.0094e-02, -1.0762e-01,  3.2229e-01,  1.6050e-02,  7.7811e-02,
        2.2434e-02,  8.6380e-02, -5.1917e-01, -7.3523e-02, -1.3516e-01,
        5.5044e-01,  1.1859e-01,  2.8920e-01,  4.3418e-01, -1.8719e-01,
       -8.2179e-01,  7.0902e-02,  1.8455e-01, -6.9144e-02,  6.3938e-01,
        2.7528e-01,  4.0871e-01,  1.5194e-01, -2.8897e-02,  1.9034e-01,
        3.6157e-01,  2.7145e-01, -5.7447e-01,  5.1863e-01,  2.4653e-02,
       -3.2429e-01,  1.9410e-01, -2.6307e-01,  2.2296e-01, -9.1318e-01,
       -3.5211e-01, -8.2717e-01, -7.5952e-03,  3.5023e-01, -3.5074e-01,
        1.3972e-01,  1.3799e-02, -2.6180e-01,  5.0069e-01,  4.6282e-01,
        1.1719e-01,  5.0206e-01, -1.1255e-01,  7.7301e-01, -8.5128e-02,
        3.9358e-01, -7.3209e-01, -1.2514e-01,  4.2907e-01,  5.0027e-01,
       -6.2671e-01, -7.1118e-01,  2.9393e-02,  4.7573e-02, -6.5154e-01,
        4.1159e-02, -3.3808e-01,  3.5713e-01,  7.4648e-01,  4.1764e-01,
       -3.0621e-01,  2.1056e-01,  1.8996e-01,  2.8266e-01, -1.63