In [None]:
import h5py

# Read the row labels and the value matrix out of the HDF5 file's 'mat' group.
with h5py.File('./datasets/mini.h5', 'r') as h5_store:
    matrix_group = h5_store['mat']  # type: ignore
    # Labels are stored as bytes; decode each one to a str.
    all_words = [label.decode('utf-8') for label in matrix_group['axis1'][:]]  # type: ignore
    # Slicing with [:] copies the values into memory before the file closes.
    all_embeddings = matrix_group['block0_values'][:]  # type: ignore

# Sanity check: there should be one label per embedding row.
print("all_words dimensions: {}".format(len(all_words)))
print("all_embeddings dimensions: {}".format(all_embeddings.shape))  # type: ignore

# 1337 is just an arbitrary position used to peek at one label.
print("Random example word: {}".format(all_words[1337]))

all_words dimensions: 362891
all_embeddings dimensions: (362891, 300)
Random example word: /c/de/aufmachung


In [12]:

# Labels look like '/c/<language>/<term>' (e.g. '/c/de/aufmachung'); keep the
# positions of the English ones only.
english_word_indices = [
    position
    for position, label in enumerate(all_words)
    if label.startswith('/c/en/')
]
# Drop the 6-character '/c/en/' prefix so only the bare term remains.
english_words = [all_words[position][6:] for position in english_word_indices]
# Select the matching embedding rows, in the same order as english_words.
english_embeddings = all_embeddings[english_word_indices]  # type: ignore

# Map each English term to its row number in english_embeddings.
index = dict(zip(english_words, range(len(english_words))))

print("Number of English words in all_words: {0}".format(len(english_words)))
print("english_embeddings dimensions: {0}".format(english_embeddings.shape))  # type: ignore

# Peek at one arbitrary entry.
print(english_words[1337])

Number of English words in all_words: 150875
english_embeddings dimensions: (150875, 300)
activated_carbon


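Normalize each vector to unit length, so that the dot product of two vectors depends only on their direction (the semantic meaning) and not on their magnitude.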

In [14]:
import numpy as np

# Euclidean length of every embedding row.
norms = np.linalg.norm(english_embeddings, axis=1)  # type: ignore
# Divide each row by its own length, working in float32. The extra trailing
# axis on `norms` makes the division broadcast row by row.
# NOTE(review): a row with zero length would produce NaN/inf here.
normalized_embeddings = np.divide(
    english_embeddings.astype('float32'),  # type: ignore
    norms.astype('float32')[:, np.newaxis],
)

In [16]:
def similarity_score(w1, w2):
    """Return the dot product of the normalized embeddings of two words.

    Both words are looked up in the module-level ``index`` mapping, and the
    matching rows of ``normalized_embeddings`` are multiplied together.

    Args:
        w1: First word; must be a key of ``index``.
        w2: Second word; must be a key of ``index``.

    Returns:
        The scalar dot product of the two rows.

    Raises:
        KeyError: If either word is not present in ``index``.
    """
    first_vector = normalized_embeddings[index[w1], :]
    second_vector = normalized_embeddings[index[w2], :]
    return np.dot(first_vector, second_vector)

A higher score means the two words are more similar.

In [18]:
# A word is as similar with itself as possible:
print('cat\tcat\t', similarity_score('cat', 'cat'))

# Closely related words still get high scores:
print('cat\tfeline\t', similarity_score('cat', 'feline'))
print('cat\tdog\t', similarity_score('cat', 'dog'))

# Unrelated words, not so much
print('cat\tmoo\t', similarity_score('cat', 'moo'))
print('cat\tfreeze\t', similarity_score('cat', 'freeze'))

# Antonyms are still considered related, sometimes more so than synonyms
print('antonym\topposite\t', similarity_score('antonym', 'opposite'))
print('antonym\tsynonym\t', similarity_score('antonym', 'synonym'))

cat	cat	 1.0
cat	feline	 0.81995475
cat	dog	 0.590724
cat	moo	 0.0039538275
cat	freeze	 -0.03022519
antonym	opposite	 0.3941065
antonym	synonym	 0.46883982


In [37]:
def closests(v, n):
    """Return the ``n`` English words whose embeddings score highest against ``v``.

    Every row of the module-level ``normalized_embeddings`` is scored by its
    dot product with ``v``; the words for the best-scoring rows are returned
    from most to least similar.

    Args:
        v: Query vector with the same length as one embedding row.
        n: Maximum number of words to return. If ``n`` exceeds the vocabulary
            size, the whole vocabulary is returned.

    Returns:
        A list of at most ``n`` words from ``english_words``, best match
        first. Empty when ``n`` is zero or negative.
    """
    # Guard: slicing with -0 is the same as slicing from 0, which would hand
    # back the entire vocabulary instead of nothing.
    if n <= 0:
        return []

    # One similarity score per vocabulary row.
    scores = np.dot(normalized_embeddings, v)
    # argsort is ascending, so reverse it and keep the first n row numbers.
    top_rows = np.argsort(scores)[::-1][:n]
    # Look up words only for the selected rows, not the whole vocabulary.
    return [english_words[row] for row in top_rows]

def most_similar(w, n):
    """Return the ``n`` words most similar to the word ``w``.

    Looks up the normalized embedding of ``w`` and passes it to ``closests``.

    Args:
        w: Query word; must be a key of the module-level ``index``.
        n: Number of words requested; passed through to ``closests``.

    Returns:
        Whatever ``closests`` returns for the embedding of ``w``.

    Raises:
        KeyError: If ``w`` is not present in ``index``.
    """
    row = index[w]
    query_vector = normalized_embeddings[row, :]
    return closests(query_vector, n)


In [38]:
# Show the ten nearest neighbours of a few sample words.
for query_word in ('cat', 'dog', 'duke'):
    print(most_similar(query_word, 10))

['cat', 'humane_society', 'kitten', 'feline', 'colocolo', 'cats', 'kitty', 'maine_coon', 'housecat', 'sharp_teeth']
['dog', 'dogs', 'wire_haired_dachshund', 'doggy_paddle', 'lhasa_apso', 'good_friend', 'puppy_dog', 'bichon_frise', 'woof_woof', 'golden_retrievers']
['duke', 'dukes', 'duchess', 'duchesses', 'ducal', 'dukedom', 'duchy', 'voivode', 'princes', 'prince']
