In [1]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [2]:
# Build a word -> vector lookup from the GloVe text file. Each line is split on
# whitespace: the first token is used as the word, and the remaining tokens are
# parsed as float32 components of its embedding.
embeddings_dict = {}
with open("glove.6B/glove.6B.100d.txt", 'r', encoding="utf-8") as glove_file:
    for row in glove_file:
        tokens = row.split()
        embeddings_dict[tokens[0]] = np.asarray(tokens[1:], dtype="float32")

In [3]:
def find_closest_embeddings(embedding):
    """Return every word in ``embeddings_dict`` ordered nearest-first.

    Args:
        embedding: Query vector; compared against each stored vector with
            ``scipy.spatial.distance.euclidean``.

    Returns:
        A new list containing all keys of ``embeddings_dict``, sorted by
        ascending Euclidean distance to ``embedding``. The sort is stable, so
        equidistant words keep their dictionary order.
    """
    def distance_to_query(candidate):
        return spatial.distance.euclidean(embeddings_dict[candidate], embedding)

    ranked_words = list(embeddings_dict)
    ranked_words.sort(key=distance_to_query)
    return ranked_words

In [4]:
# Rank the whole vocabulary by distance to "insurance" and show positions 1-5.
# Position 0 is skipped: it is expected to be the query word itself (distance 0).
insurance_ranking = find_closest_embeddings(embeddings_dict["insurance"])
print(insurance_ranking[1:6])

['insurers', 'pension', 'premiums', 'savings', 'care']


In [None]:
# Analogy by vector arithmetic: twig - branch + hand, then print the five
# vocabulary words closest to the resulting vector.
analogy_vector = embeddings_dict["twig"] - embeddings_dict["branch"] + embeddings_dict["hand"]
print(find_closest_embeddings(analogy_vector)[:5])

In [None]:
# t-SNE reducer that projects onto two components; the seed is fixed via
# random_state.
tsne = TSNE(
    n_components=2,
    random_state=0,
)

In [None]:
# Parallel lists in dictionary order: vectors[i] is the embedding of words[i].
words = list(embeddings_dict)
vectors = list(embeddings_dict.values())

In [None]:
# Project only the first 1000 vectors; Y has one row per projected vector.
leading_vectors = vectors[:1000]
Y = tsne.fit_transform(leading_vectors)

In [None]:
# Scatter the first projected coordinate against the second.
plt.scatter(x=Y[:, 0], y=Y[:, 1])

In [None]:
# Draw the points and their labels on the same explicit Axes inside this cell.
# Under Jupyter's default inline backend a figure is shown and closed when its
# cell finishes, so annotations issued in a cell of their own would be placed on
# a new, empty figure whose axes are not scaled to the data.
fig, ax = plt.subplots()
ax.scatter(Y[:, 0], Y[:, 1])
# Y has one row per projected vector, so pair it with the matching leading
# slice of `words` instead of relying on zip() to truncate silently.
for label, x, y in zip(words[:len(Y)], Y[:, 0], Y[:, 1]):
    ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords="offset points")
plt.show()