In [1]:
# WHERE TO GET THE VECTORS:
# GloVe: https://nlp.stanford.edu/projects/glove/
# Direct link: http://nlp.stanford.edu/data/glove.6B.zip
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import time

In [2]:
def dist1(a, b):
    return np.linalg.norm(a-b)

def dist2(a, b):
    return 1 - a.dot(b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [3]:
dist2(np.array([1,0,1]), np.array([2,0,2]))
a = np.array([1,0,1])
b = np.array([2,0,2])

In [4]:
def dist2(a, b):
    return 1 - a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [5]:
# pick a distance type
dist, metric = dist2, 'cosine'
# dist, metric = dist1, 'euclidean'

In [6]:
# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
embedding = []
idx2word = []
with open('glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
print('Found %s word vectors.' % len(word2vec))
embedding = np.array(embedding)
V, D = embedding.shape

Loading word vectors...
Found 400000 word vectors.


In [7]:

## more intuitive
def find_analogies(w1, w2, w3):
    start_time = time.time()
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in dictionary" % w)
            return

    king = word2vec[w1]
    man = word2vec[w2]
    woman = word2vec[w3]
    v0 = king - man + woman

    min_dist = float('inf')
    best_word = ''
    for word, v1 in word2vec.items():
        if word not in (w1, w2, w3):
            d = dist(v0, v1)
        if d < min_dist:
            min_dist = d
            best_word = word
    print(w1, "-", w2, "=", best_word, "-", w3)
    print(f"Time taken: {time.time()-start_time:.4f} milliseconds")
    
find_analogies('king', 'man', 'woman')

king - man = queen - woman
Time taken: 2.0605 milliseconds


In [8]:
## faster and more efficient
def find_analogies(w1, w2, w3):
    start_time = time.time()
    for w in [w1, w2, w3]:
        if w not in word2vec:
            print("%s not in dictionary", w)
            return
        
    king = word2vec[w1]
    man = word2vec[w2]
    woman = word2vec[w3]
    v0 = king - man + woman
    
    #print("v0 shape:", v0.shape)
    
    best_word = ''
    distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric).reshape(V)
    #print("distances=", distances)
    idxs = distances.argsort()[:4] #We need to take 4 candidates as in the worst case, nearest 3 words can be the input itself.
    for idx in idxs:
        word = idx2word[idx]
        if word not in [w1, w2, w3]:
            best_word=word
            break
    print(w1, "-", w2, "=", best_word, "-", w3)
    print(f"Time taken: {time.time()-start_time:.4f} milliseconds")

find_analogies('king', 'man', 'woman')

king - man = queen - woman
Time taken: 0.0792 milliseconds


In [9]:
def nearest_neighbors(w, n=5):
    if w not in word2vec:
        print("%s not in dictionary:" % w)
        return

    v = word2vec[w]
    distances = pairwise_distances(v.reshape(1, D), embedding, metric=metric).reshape(V)
    idxs = distances.argsort()[1:n+1]
    print("neighbors of: %s" % w)
    for idx in idxs:
        print("\t%s" % idx2word[idx])
        
    

In [10]:
find_analogies('france', 'paris', 'london')

france - paris = britain - london
Time taken: 0.0810 milliseconds


In [11]:
nearest_neighbors('king')

neighbors of: king
	prince
	queen
	ii
	emperor
	son


In [12]:
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')
find_analogies('france', 'french', 'english')
find_analogies('japan', 'japanese', 'chinese')
find_analogies('japan', 'japanese', 'italian')
find_analogies('japan', 'japanese', 'australian')
find_analogies('december', 'november', 'june')
find_analogies('miami', 'florida', 'texas')
find_analogies('einstein', 'scientist', 'painter')
find_analogies('china', 'rice', 'bread')
find_analogies('man', 'woman', 'she')
find_analogies('man', 'woman', 'aunt')
find_analogies('man', 'woman', 'sister')
find_analogies('man', 'woman', 'wife')
find_analogies('man', 'woman', 'actress')
find_analogies('man', 'woman', 'mother')
find_analogies('heir', 'heiress', 'princess')
find_analogies('nephew', 'niece', 'aunt')
find_analogies('france', 'paris', 'tokyo')
find_analogies('france', 'paris', 'beijing')
find_analogies('february', 'january', 'november')
find_analogies('france', 'paris', 'rome')
find_analogies('paris', 'france', 'italy')

france - paris = italy - rome
Time taken: 0.0821 milliseconds
paris - france = rome - italy
Time taken: 0.0884 milliseconds
france - french = england - english
Time taken: 0.0826 milliseconds
japan - japanese = china - chinese
Time taken: 0.0961 milliseconds
japan - japanese = italy - italian
Time taken: 0.0884 milliseconds
japan - japanese = australia - australian
Time taken: 0.0928 milliseconds
december - november = july - june
Time taken: 0.0943 milliseconds
miami - florida = houston - texas
Time taken: 0.0845 milliseconds
einstein - scientist = matisse - painter
Time taken: 0.1111 milliseconds
china - rice = chinese - bread
Time taken: 0.1034 milliseconds
man - woman = he - she
Time taken: 0.0838 milliseconds
man - woman = uncle - aunt
Time taken: 0.1031 milliseconds
man - woman = brother - sister
Time taken: 0.0812 milliseconds
man - woman = friend - wife
Time taken: 0.0860 milliseconds
man - woman = actor - actress
Time taken: 0.0972 milliseconds
man - woman = father - mother
Tim

In [13]:
nearest_neighbors('king')
nearest_neighbors('france')
nearest_neighbors('japan')
nearest_neighbors('einstein')
nearest_neighbors('woman')
nearest_neighbors('nephew')
nearest_neighbors('february')
nearest_neighbors('rome')

neighbors of: king
	prince
	queen
	ii
	emperor
	son
neighbors of: france
	french
	belgium
	paris
	spain
	netherlands
neighbors of: japan
	japanese
	china
	korea
	tokyo
	taiwan
neighbors of: einstein
	relativity
	bohr
	physics
	heisenberg
	freud
neighbors of: woman
	girl
	man
	mother
	her
	boy
neighbors of: nephew
	cousin
	brother
	grandson
	son
	uncle
neighbors of: february
	october
	december
	january
	august
	september
neighbors of: rome
	naples
	venice
	italy
	turin
	pope
