In [1]:
import numpy as np
import tqdm
import sys

In [2]:
# Path of the pre-trained embedding text file read by vocab_and_embedding_matrices.
embedding_file_path = '../word_embedding_pre_trained/glove_42B_300d_uncased/glove.42B.300d.txt'
# Number of values per word vector; used as the column count of the embedding matrix.
embedding_dimension = 300

In [3]:
def num_lines(fname):
    """Count the lines stored in the file at ``fname``.

    Lines are counted on the underlying byte stream, so nothing is decoded and
    a final line without a trailing newline is still counted.
    """
    with open(fname, 'r') as handle:
        raw_stream = handle.buffer
        # readline() returns b'' only at end of file, which ends the iteration.
        return sum(1 for _ in iter(raw_stream.readline, b''))

In [4]:
def vocab_and_embedding_matrices(file_path):
    """Load an embedding text file into a token list and a float32 matrix.

    Every line is expected to contain a token, a single space, and then
    ``embedding_dimension`` whitespace-separated numbers.

    Args:
        file_path: Path of the embedding file to read.

    Returns:
        A ``(words, embedding)`` pair. ``words`` is the list of tokens in file
        order and ``embedding`` is a float32 array of shape
        ``(number_of_lines, embedding_dimension)`` whose row ``i`` holds the
        vector of ``words[i]``.

    Raises:
        ValueError: If a line has no space separator, contains a value that
            cannot be parsed as a number, or does not carry exactly
            ``embedding_dimension`` values.
    """
    num_vocab = num_lines(file_path)
    words = []
    embedding = np.zeros(shape=(num_vocab, embedding_dimension), dtype=np.float32)
    # Read the file that was passed in (the same one that was just counted),
    # not the notebook-level default path.
    with open(file_path, 'rb') as embedding_file:
        # The context manager closes the progress bar even if parsing fails.
        with tqdm.tqdm(total=num_vocab, file=sys.stdout) as pbar:
            for idx, line in enumerate(embedding_file):
                word_bytes, separator, vector_content = line.partition(b' ')
                if not separator:
                    raise ValueError(
                        'line {0} of {1!r} has no space between token and vector'.format(
                            idx + 1, file_path))
                # split() already discards the trailing newline, so nothing is
                # sliced off; a last line without a newline keeps its final digit.
                vector = np.array([np.float32(ele) for ele in vector_content.split()])
                # Checked explicitly because a one-value vector would otherwise
                # be broadcast silently across the whole row.
                if vector.shape != (embedding_dimension,):
                    raise ValueError(
                        'line {0} of {1!r} has {2} values, expected {3}'.format(
                            idx + 1, file_path, vector.size, embedding_dimension))

                words.append(word_bytes.decode('utf-8'))
                embedding[idx] = vector
                pbar.update()
    return words, embedding

In [5]:
# Parse the embedding file into a token list and the matching vector matrix.
# The recorded run below processed 1,917,494 lines in roughly seven minutes.
words, embedding = vocab_and_embedding_matrices(embedding_file_path)

100%|██████████████████████████████████████████████████████████████████████| 1917494/1917494 [06:54<00:00, 4627.39it/s]


In [6]:
# Lookup tables in both directions between a token and its position in `words`
# (which is also its row in `embedding`). If a token occurs more than once,
# word_to_idx keeps the last position.
word_to_idx = {token: position for position, token in enumerate(words)}
idx_to_word = dict(enumerate(words))

In [7]:
def embedding_for_word(word):
    """Return the row of ``embedding`` stored for ``word``.

    Raises:
        KeyError: If ``word`` is not a key of ``word_to_idx``.
    """
    row = word_to_idx[word]
    return embedding[row]

In [8]:
def cosine_similarity(vec, matrix=None):
    """Cosine similarity between ``vec`` and every row of a matrix.

    Args:
        vec: 1-D query vector whose length equals the number of matrix columns.
        matrix: Optional 2-D array with one candidate vector per row. When
            omitted, the notebook-level ``embedding`` matrix is used.

    Returns:
        1-D array with one similarity per row. Rows with zero norm, and every
        row when ``vec`` itself has zero norm, get a similarity of 0 rather
        than NaN, so a later ``np.argmax`` is not hijacked by NaN entries.
    """
    if matrix is None:
        matrix = embedding
    # A matrix-vector product avoids materialising `matrix * vec`, a temporary
    # as large as the whole embedding matrix.
    dots = matrix @ vec
    denom = np.linalg.norm(matrix, axis=-1, keepdims=False) * np.linalg.norm(vec)
    similarity = np.zeros(dots.shape, dtype=np.result_type(dots, denom))
    # Divide only where the denominator is non-zero; other entries stay 0.
    np.divide(dots, denom, out=similarity, where=denom > 0)
    return similarity

In [9]:
def analogy(word_a_to, word_b, word_c_to):
    """Complete the analogy "``word_a_to`` is to ``word_b`` as ``word_c_to`` is to ?".

    The answer is the vocabulary word ``d`` whose offset ``e_c - e_d`` has the
    highest cosine similarity with the offset ``e_a - e_b``. The three input
    words are never returned.

    Args:
        word_a_to: First word of the reference pair.
        word_b: Second word of the reference pair.
        word_c_to: Word whose counterpart is being searched for.

    Returns:
        The best-scoring word from ``idx_to_word``.

    Raises:
        KeyError: If any input word is missing from ``word_to_idx``.
        ValueError: If the embeddings of ``word_a_to`` and ``word_b`` are
            identical, in which case there is no direction to compare against.
    """
    query_words = (word_a_to, word_b, word_c_to)
    e_a, e_b, e_c = (embedding_for_word(word) for word in query_words)
    a_minus_b = e_a - e_b
    direction_norm = np.linalg.norm(a_minus_b)
    if direction_norm == 0:
        raise ValueError(
            'embeddings of {0!r} and {1!r} are identical; the analogy has no direction'.format(
                word_a_to, word_b))

    c_minus_embedding = e_c - embedding
    # A matrix-vector product avoids a second temporary as large as the
    # embedding matrix.
    dots = c_minus_embedding @ a_minus_b
    denom = np.linalg.norm(c_minus_embedding, axis=-1, keepdims=False) * direction_norm

    # Start from -inf so rows with a zero offset (vectors equal to e_c) can
    # never win, instead of producing NaN that np.argmax would pick first.
    sim_mat = np.full(dots.shape, -np.inf, dtype=np.result_type(dots, denom))
    np.divide(dots, denom, out=sim_mat, where=denom > 0)

    # Exclude the input words by masking their scores. Overwriting their
    # offset rows with a constant would still leave them an ordinary score.
    for word in query_words:
        sim_mat[word_to_idx[word]] = -np.inf

    most_similar_idx = int(np.argmax(sim_mat))
    return idx_to_word[most_similar_idx]

In [53]:
# Query word for the nearest-neighbour lookup done in the following cells.
word = 'sustainable'

In [54]:
# Look up the query word's vector, then score it against every row of `embedding`.
word_embedding = embedding_for_word(word)
sim_mat = cosine_similarity(word_embedding)

In [55]:
# Exclude the query word itself from the ranking. -inf (rather than 0) keeps it
# excluded even if every other similarity happens to be negative.
sim_mat[word_to_idx[word]] = -np.inf

In [56]:
# The highest remaining score identifies the nearest neighbour of `word`.
most_similar_idx = sim_mat.argmax()
print(idx_to_word[most_similar_idx])

sustainability


In [64]:
# Analogy queries as (a, b, c) triples: the loop in the next cell passes them to
# analogy() as word_a_to=a, word_b=b, word_c_to=c and prints "a ---> b: c ---> result".
word_pairs = (
    ('good', 'better', 'fast'),
    ('close', 'closer', 'far'),
    ('fish', 'water', 'bird'),
    ('beijing', 'china', 'paris'),
    ('man', 'doctor', 'woman'),
    ('man', 'brother', 'woman')
)

In [65]:
# Run every analogy query and print the result. One failing query (for example
# a word missing from the vocabulary) must not stop the rest, but it is
# reported instead of being dropped silently.
for pair in word_pairs:
    try:
        a, b, c = pair
        result = analogy(word_a_to=a, word_b=b, word_c_to=c)
    except Exception as exc:
        print('Skipped {0!r}: {1!r}'.format(pair, exc))
    else:
        print('\'{0}\' ---> \'{1}\': \'{2}\' ---> \'{3}\'.'.format(a, b, c, result))

'good' ---> 'better': 'fast' ---> 'faster'.
'close' ---> 'closer': 'far' ---> 'farther'.
'fish' ---> 'water': 'bird' ---> 'air'.
'beijing' ---> 'china': 'paris' ---> 'france'.
'man' ---> 'doctor': 'woman' ---> 'physician'.
'man' ---> 'brother': 'woman' ---> 'sister'.
