# Word Embeddings

A brief interactive demo exploring GloVe and Polyglot word embeddings.

In [1]:
import torch
import torchtext
import numpy as np
import torch.nn.functional as F

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

# Load the pretrained embedding tables; cache='.' keeps the vector files
# in the current working directory.
glove = torchtext.vocab.GloVe('6B', cache='.')
poly = torchtext.vocab.Vectors('poly.txt', cache='.')
from sklearn.decomposition import PCA

# Project every embedding to 2-D once, up front, so the interactive plots
# only need to index into these arrays. random_state is pinned because
# scikit-learn's PCA can select a randomized SVD solver for large inputs,
# which would otherwise make the layout vary between runs.
poly_pca = PCA(n_components=2, random_state=0).fit_transform(poly.vectors.numpy())
glove_pca = PCA(n_components=2, random_state=0).fit_transform(glove.vectors.numpy())

100%|██████████| 100004/100004 [00:01<00:00, 55493.29it/s]


### Similarity Function

We can use either cosine similarity (the cosine of the angle between two
vectors) or Euclidean distance.

In [2]:
def nearest(vectors, q, k=10, metric='cosine'):
    """Print and return the indices of the ``k`` vocabulary rows closest to ``q``.

    Args:
        vectors: embedding container exposing ``.vectors`` (a 2-D tensor with
            one row per vocabulary entry) and ``.itos`` (index -> word).
        q: query vector. It is reshaped to a single row, so both ``(dim,)``
            and ``(1, dim)`` inputs are accepted.
        k: number of neighbours to report. Defaults to 10 and is capped at
            the number of rows so that small vocabularies do not raise.
        metric: ``'cosine'`` (default) ranks by cosine similarity;
            ``'euclidean'`` ranks by negated pairwise distance.

    Returns:
        1-D tensor of row indices, best match first.

    Raises:
        ValueError: if ``metric`` is not a supported name.
    """
    matrix = vectors.vectors
    n_vecs = matrix.shape[0]
    # expand() yields a broadcast view of the query instead of the full
    # (n_vecs, dim) copy that repeat() would allocate.
    query = q.reshape(1, -1).expand(n_vecs, -1)

    if metric == 'cosine':
        scores = F.cosine_similarity(matrix, query)
    elif metric == 'euclidean':
        # Negate so that "largest score" still means "closest".
        scores = -1 * F.pairwise_distance(matrix, query)
    else:
        raise ValueError(
            "metric must be 'cosine' or 'euclidean', got {!r}".format(metric)
        )

    topk = scores.topk(min(k, n_vecs), largest=True)[1]
    for i in topk.tolist():
        print(vectors.itos[i])
    return topk

In embedding space, nearby words tend to be similar semantically.

In [3]:
def show_nearest(embs, word):
    """Print the nearest neighbours of ``word`` and scatter them in 2-D PCA space.

    Args:
        embs: ``'Polyglot'`` selects the Polyglot vectors and projection;
            any other value selects the GloVe ones.
        word: query token, looked up in the selected vocabulary.

    Returns:
        None. When ``word`` is empty or not in the vocabulary nothing is
        plotted (a short message is printed for non-empty unknown words).
    """
    if embs == 'Polyglot':
        vectors, dims = poly, poly_pca
    else:
        vectors, dims = glove, glove_pca

    # Skip the lookup for the initially empty text box and for unknown
    # tokens. NOTE(review): the vector lookup presumably substitutes a
    # default vector for unknown tokens rather than raising -- confirm
    # against torchtext; either way the neighbours would be meaningless.
    if word not in vectors.stoi:
        if word:
            print('{!r} is not in the {} vocabulary'.format(word, embs))
        return

    topk = nearest(vectors, vectors[word])
    ixs = [int(i) for i in topk]
    points = dims[ixs]

    fig, ax = plt.subplots(1)
    ax.scatter(points[:, 0], points[:, 1])
    for (x, y), ix in zip(points, ixs):
        ax.annotate(vectors.itos[ix], (x, y))
    ax.set_title('Nearest neighbours of {!r} ({})'.format(word, embs))
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
        
# Controls for the nearest-neighbour demo: a toggle for the embedding set
# and a text box for the query word. continuous_update=False keeps the
# callback from re-running on every keystroke.
embs = widgets.ToggleButtons(
    description='Embeddings',
    options=['Polyglot', 'Glove'],
)
word = widgets.Text(continuous_update=False)
interact(show_nearest, embs=embs, word=word);

interactive(children=(ToggleButtons(description='Embeddings', options=('Polyglot', 'Glove'), value='Polyglot')…

### Analogies

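Word vectors also exhibit some nice algebraic properties, where you can (seemingly) add and subtract meanings. The demo below computes `b - a + c` and lists the words nearest to the result; with the default inputs that is `man - king + queen`.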

In [4]:
def analogy(embs, a, b, c):
    """Print the words nearest to ``b - a + c`` and plot them with the inputs.

    Args:
        embs: ``'Polyglot'`` selects the Polyglot vectors and projection;
            any other value selects the GloVe ones.
        a: word whose vector is subtracted.
        b: word whose vector is the starting point.
        c: word whose vector is added.

    Returns:
        None. If any of the three words is missing from the vocabulary a
        message naming them is printed and nothing is computed or plotted.
    """
    if embs == 'Polyglot':
        vectors, dims = poly, poly_pca
    else:
        vectors, dims = glove, glove_pca

    # Check the vocabulary explicitly instead of wrapping the plot in a
    # bare ``except``; that hid unknown words and any genuine plotting error.
    words = [a, b, c]
    missing = [w for w in words if w not in vectors.stoi]
    if missing:
        print('Not in the {} vocabulary: {}'.format(
            embs, ', '.join(repr(w) for w in missing)))
        return

    q = vectors[b] - vectors[a] + vectors[c]
    topk = nearest(vectors, q)

    # Plot the three input words followed by the three best matches.
    ixs = [vectors.stoi[w] for w in words] + [int(x) for x in topk[:3]]
    points = dims[ixs]

    # Use ``ax`` for the Axes so the ``a`` parameter is not rebound.
    fig, ax = plt.subplots(1)
    ax.scatter(points[:, 0], points[:, 1])
    for (x, y), ix in zip(points, ixs):
        ax.annotate(vectors.itos[ix], (x, y))
    ax.set_title('{} - {} + {} ({})'.format(b, a, c, embs))
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
# Controls for the analogy demo: a toggle for the embedding set and three
# text boxes pre-filled with the default analogy words.
embs = widgets.ToggleButtons(
    description='Embeddings',
    options=['Polyglot', 'Glove'],
)
a = widgets.Text(value='king', continuous_update=False)
b = widgets.Text(value='man', continuous_update=False)
c = widgets.Text(value='queen', continuous_update=False)
interact(analogy, embs=embs, a=a, b=b, c=c);

interactive(children=(ToggleButtons(description='Embeddings', options=('Polyglot', 'Glove'), value='Polyglot')…