In [1]:
import pickle
import numpy as np

from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load these pickles if they come from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after
# the load instead of being left open for the lifetime of the kernel.
with open("embeddings.p", "rb") as f:
    # .numpy() is called on the unpickled object -- presumably a torch
    # tensor; TODO confirm.
    embeddings = pickle.load(f).numpy()
with open("vocab.p", "rb") as f:
    # Indexed by word later in the notebook (vocab[word] -> row index).
    vocab = pickle.load(f)
with open("vocab_list.p", "rb") as f:
    # Indexed by row number later in the notebook (vocab_list[i] -> word).
    vocab_list = pickle.load(f)

print("Vocabulary size:", len(vocab_list))
print("Embeddings shape:", embeddings.shape)

Vocabulary size: 400000
Embeddings shape: (400000, 50)


**Part 1.** Find the most similar words and their cosine similarity.

In [3]:
def most_similar(keyword):
    """Return the vocabulary word closest to `keyword` and its cosine similarity.

    The keyword's embedding is compared with every row of the module-level
    `embeddings` matrix using cosine similarity.  The keyword's own row is
    excluded explicitly, so the function never returns the keyword itself,
    even when another word has an identical embedding or the keyword's
    vector is all zeros.

    Args:
        keyword: a word present in the module-level `vocab` mapping.

    Returns:
        A tuple ``(word, similarity)`` where `word` comes from `vocab_list`
        and `similarity` is the cosine similarity (higher means closer).

    Raises:
        KeyError: if `keyword` is not in `vocab` (assuming `vocab` is a
            dict-like mapping).
        ValueError: if the vocabulary holds fewer than two entries, so no
            neighbour exists.
    """
    idx = vocab[keyword]
    keyword_embedding = embeddings[idx].reshape(1, -1)

    # One similarity per vocabulary row, flattened to shape (n_words,).
    sims = cosine_similarity(embeddings, keyword_embedding).reshape(-1)
    if sims.size < 2:
        raise ValueError("need at least two vocabulary entries to find a neighbour")

    # Rule out the keyword itself rather than assuming it is the single best
    # match: ties or a zero vector would otherwise let it leak into the result.
    sims[idx] = -np.inf
    best_idx = int(np.argmax(sims))

    return vocab_list[best_idx], sims[best_idx]

In [4]:
words = ["dog", "whale", "before", "however", "fabricate"]

for word in words:
    print("Keyword:", word)
    # most_similar returns (neighbour, cosine similarity); the value is a
    # similarity (higher = closer), not a distance, so label it as such.
    neighbour, similarity = most_similar(word)
    print("Most similar word is '%s', with cosine similarity %f." % (neighbour, similarity))

Keyword: dog
Most similar word is 'cat', with cosine distance 0.921801.
Keyword: whale
Most similar word is 'whales', with cosine distance 0.898683.
Keyword: before
Most similar word is 'after', with cosine distance 0.951184.
Keyword: however
Most similar word is 'although', with cosine distance 0.980139.
Keyword: fabricate
Most similar word is 'fabricating', with cosine distance 0.759454.


**Part 2.** Completing the analogy. Given an analogy of the form "$w_{i_1} : w_{i_2} :: w_{j_1} : \ ?$", the goal is to guess the missing word $w_{j_2}$. We estimate its word embedding as $\tilde{v}_{j_2} = -v_{i_1} + v_{j_1} + v_{i_2}$, where $v_{k}$ is the embedding of $w_k$, and then find the vocabulary words whose embeddings are most similar (by cosine similarity) to this estimate.

In [16]:
def complete_analogy(word1, is_to1, word2, is_to2 = None, top_n = 3):
    """Print the best completions of the analogy "word1 : is_to1 :: word2 : ?".

    The missing word's embedding is estimated as
    ``-v(word1) + v(is_to1) + v(word2)`` and every row of the module-level
    `embeddings` matrix is ranked by cosine similarity to that estimate.
    The three input words are never offered as completions.

    Args:
        word1: first word of the example pair.
        is_to1: second word of the example pair.
        word2: the word whose counterpart is sought.
        is_to2: optional expected answer; when given (and truthy) its
            similarity to the estimated embedding is printed as well.
        top_n: how many completions to print (default 3).  Fewer are printed
            when the vocabulary does not hold that many other words.

    Returns:
        None; the results are printed.

    Raises:
        KeyError: if any supplied word is missing from `vocab` (assuming
            `vocab` is a dict-like mapping).
    """
    i1 = vocab[word1]
    j1 = vocab[is_to1]
    i2 = vocab[word2]

    # Guess for completing word.
    v_j2 = (-embeddings[i1] + embeddings[j1] + embeddings[i2]).reshape(1, -1)

    # One similarity per vocabulary row, flattened to shape (n_words,).
    sims = cosine_similarity(embeddings, v_j2).reshape(-1)

    # Rank on a copy with the input words masked out, so they can never be
    # selected and `sims` keeps the true values for reporting.
    excluded = list({i1, j1, i2})
    ranked = sims.copy()
    ranked[excluded] = -np.inf

    # Never ask for more candidates than exist once the inputs are removed.
    k = max(0, min(top_n, ranked.size - len(excluded)))
    if k:
        # argpartition picks the k best without sorting the whole
        # vocabulary; only those k are then sorted, best first.
        top = np.argpartition(ranked, -k)[-k:]
        in_order = top[np.argsort(ranked[top])[::-1]]
    else:
        in_order = []

    print("'%s' is to '%s' as '%s' is to ____." % (word1, is_to1, word2))
    for i, ind in enumerate(in_order):
        print("%d. Completion: %s, similarity: %.3f." %(i+1, vocab_list[ind], sims[ind]))

    if is_to2:
        # `sims` already holds this value, so no second similarity call is
        # needed, and a scalar (not a 1x1 array) is passed to the format.
        sim = sims[vocab[is_to2]]
        print("Similarity of '%s' and estimated word embedding: %.3f" % (is_to2, sim))

In [17]:
# Analogy speak : speaker :: sing : ?  -- "singer" is passed as the reference answer.
complete_analogy("speak", "speaker", "sing", is_to2="singer")

'speak' is to 'speaker' as 'sing' is to ____.
1. Completion: sang, similarity: 0.623.
2. Completion: nateq, similarity: 0.622.
3. Completion: lyricist, similarity: 0.602.
Similarity of 'singer' and estimated word embedding: 0.508


In [18]:
# Analogy dog : puppy :: cat : ?  -- "kitten" is passed as the reference answer.
complete_analogy("dog", "puppy", "cat", is_to2="kitten")

'dog' is to 'puppy' as 'cat' is to ____.
1. Completion: puppies, similarity: 0.763.
2. Completion: scaredy, similarity: 0.744.
3. Completion: kitten, similarity: 0.741.
Similarity of 'kitten' and estimated word embedding: 0.741


In [19]:
# Analogy france : french :: england : ?  -- "english" is passed as the reference answer.
complete_analogy("france", "french", "england", is_to2="english")

'france' is to 'french' as 'england' is to ____.
1. Completion: scottish, similarity: 0.868.
2. Completion: english, similarity: 0.837.
3. Completion: welsh, similarity: 0.806.
Similarity of 'english' and estimated word embedding: 0.837


In [20]:
# Analogy france : wine :: england : ?  -- "whiskey" is passed as the reference answer.
complete_analogy("france", "wine", "england", is_to2="whiskey")

'france' is to 'wine' as 'england' is to ____.
1. Completion: orchard, similarity: 0.662.
2. Completion: tasting, similarity: 0.633.
3. Completion: tea, similarity: 0.616.
Similarity of 'whiskey' and estimated word embedding: 0.513
