# Interactive word vector visualizations in Jupyter notebooks

In [142]:
# Load vectors from file
# The sample file is in the format generated by GloVe.
# First column is the word, followed by its vector, space delimited:
# word <vec>
# word <vec>
# ...
# A sample file with 5000 50-d vectors is included

import numpy as np
import pandas as pd
# Read the whole file as one table: column 0 holds the word, the remaining
# columns hold the components of its vector.
# - `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
# - `dtype={0: str}` and `keep_default_na=False` keep every vocabulary entry as
#   the literal string found in the file, so tokens such as "null", "nan" or
#   "1" are not turned into NaN or numbers.
table = pd.read_csv(
    'data/vectors_5000.txt',
    delimiter=' ',
    header=None,
    dtype={0: str},
    keep_default_na=False,
).to_numpy()
words = table[:, 0]
# The mixed word/number table is an object array; convert the numeric part to
# a real float array so downstream numeric code gets a proper dtype.
vectors = table[:, 1:].astype(float)
print(words.shape)
print(vectors.shape)
print(words[:3])
print(vectors[:3,:])

(5000,)
(5000, 50)
['the' 'of' 'and']
[[0.702965 -0.093212 -0.42150200000000004 -0.430059 -0.464135 0.48163
  1.59297 -1.726543 -0.512899 0.380612 -1.9453529999999999
  -0.33831500000000003 1.430657 -0.375235 -2.058234 -0.022857 1.194399
  -0.10219600000000001 -0.212738 -0.35118499999999997 -0.048301 -1.227926
  -0.059437000000000004 -0.11541300000000002 -1.058568 0.393416 -0.131893
  0.27114299999999997 0.331271 -0.292186 -0.455772 0.20669200000000001
  -0.969767 -0.868569 0.975427 -0.166179 -1.3958760000000001
  -0.8614360000000001 -0.055564999999999996 -1.277792 -0.181928 -1.009301
  -0.434904 0.359189 0.276807 -0.6651 -0.996642 -0.675003 0.325484 1.462228]
 [0.884634 0.203253 0.157106 -0.9560049999999999 -0.7286199999999999
  0.610926 0.986567 -1.909298 -0.8765959999999999 -0.526278 -2.351771
  -0.612168 1.7166439999999998 0.9305329999999999 -1.270684 0.507109
  1.049233 -0.22905 -0.26765300000000003 -0.234254 0.56004
  -1.2946389999999999 -0.14616600000000002 0.5631390000000001 -0

## Define plotting routines

In [124]:
# for interactive plots (optional)
import ipympl
# needs: conda install ipympl -c conda-forge

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# regular notebook
# %matplotlib notebook
# interactive plots
%matplotlib ipympl

def plot_words_2d(vectors, words, plot = True):
    """Draw a 2D scatter plot of word vectors with a text label on every point.

    Parameters
    ----------
    vectors : array indexed as ``vectors[:, 0]`` / ``vectors[i, 1]``
        Columns 0 and 1 are used as the x and y coordinates.
    words : iterable
        Label for each row; ``words`` item ``i`` is drawn at row ``i`` of ``vectors``.
    plot : bool
        When true, ``plt.show()`` is called before returning.

    Returns
    -------
    The ``plt`` (pyplot) module object, as used by the callers in this notebook.
    """
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    x_coords = vectors[:,0]
    y_coords = vectors[:,1]
    axes.scatter(x_coords, y_coords, marker='.')
    # Annotate each point with its word.
    for row, label in enumerate(words):
        axes.text(vectors[row,0], vectors[row,1], label)
    if plot:
        plt.show()
    return plt

def plot_words_3d(vectors, words, plot = True):
    """Draw a 3D scatter plot of word vectors with a text label on every point.

    Parameters
    ----------
    vectors : array indexed as ``vectors[:, 0]`` / ``vectors[i, 2]``
        Columns 0, 1 and 2 are used as the x, y and z coordinates.
    words : iterable
        Label for each row; ``words`` item ``i`` is drawn at row ``i`` of ``vectors``.
    plot : bool
        When true, ``plt.show()`` is called before returning.

    Returns
    -------
    The ``plt`` (pyplot) module object, as used by the callers in this notebook.
    """
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1, projection='3d')
    x_coords = vectors[:,0]
    y_coords = vectors[:,1]
    z_coords = vectors[:,2]
    axes.scatter(x_coords, y_coords, z_coords, marker='.')
    # Annotate each point with its word.
    for row, label in enumerate(words):
        axes.text(vectors[row,0], vectors[row,1], vectors[row,2], label)
    if plot:
        plt.show()
    return plt



In [125]:
from sklearn.manifold import TSNE

# Project the words in 2D space
model = TSNE(n_components=2, random_state=0, perplexity=15)

# Set to False to reload the projection saved by a previous run instead of
# recomputing it (the saved file may be stale with respect to `vectors`).
RECOMPUTE_TSNE_2D = True
if RECOMPUTE_TSNE_2D:
    X_tsne_2d = model.fit_transform(vectors)
    # save for later
    np.savetxt('tsne_2d.txt', X_tsne_2d)
else:
    print("WARNING: loading stale vectors from tsne_2d.txt")
    X_tsne_2d = np.loadtxt('tsne_2d.txt')

# show only the first `npts` points so the labels stay readable
npts = 50
# trailing ';' keeps the notebook from echoing the returned pyplot module
plot_words_2d(X_tsne_2d[:npts, :], words[:npts]);

In [126]:
# Project the words in 3D space
model = TSNE(n_components=3, random_state=0, perplexity=15)

# Set to False to reload the projection saved by a previous run instead of
# recomputing it (the saved file may be stale with respect to `vectors`).
RECOMPUTE_TSNE_3D = True
if RECOMPUTE_TSNE_3D:
    X_tsne_3d = model.fit_transform(vectors)
    # save for later
    np.savetxt('tsne_3d.txt', X_tsne_3d)
else:
    print("WARNING: loading stale vectors from tsne_3d.txt")
    X_tsne_3d = np.loadtxt('tsne_3d.txt')

# show only the first `npts` points so the labels stay readable
npts = 20
# trailing ';' keeps the notebook from echoing the returned pyplot module
plot_words_3d(X_tsne_3d[:npts, :3], words[:npts]);

In [140]:
from sklearn.neighbors import NearestNeighbors

# show interactive 2D plot like in Tensorboard
def plot_words_2d_interactive(vectors, words, plot = True, n_neighbors = 15, vectors_3d = None):
    """Scatter-plot 2D word vectors and show the neighbourhood of each click.

    The figure has two stacked axes. The top one shows every row of
    ``vectors``. Clicking inside it looks up the ``n_neighbors`` rows closest
    to the click position, marks them in green on the top axes and redraws
    them, labelled with their words, in the bottom ("zoom") axes.

    Parameters
    ----------
    vectors : array indexed as ``vectors[:, 0]`` / ``vectors[:, 1]``
        2D coordinates; also the data the nearest-neighbour model is fitted on.
    words : sequence
        ``words[i]`` is the label of row ``i`` of ``vectors``.
    plot : bool
        When true, ``plt.show()`` is called once the click handler is connected.
    n_neighbors : int
        Number of neighbours shown per click; capped at the number of rows.
    vectors_3d : array or None
        If given, the zoom axes are 3D and use ``vectors_3d[i, 0..2]`` for the
        neighbours found in 2D, so it must share row order with ``vectors``.

    Returns
    -------
    The ``plt`` (pyplot) module object.
    """
    fig = plt.figure(figsize=(8,10))
    ax = fig.add_subplot(211)
    if vectors_3d is None:
        ax_zoom = fig.add_subplot(212)
    else:
        ax_zoom = fig.add_subplot(212, projection='3d')
    ax.scatter(vectors[:,0], vectors[:,1], marker='.', c='b')

    # kneighbors() refuses to return more neighbours than there are fitted points.
    n_neighbors = min(n_neighbors, len(vectors))
    NN_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(vectors)
    # Artist marking the current selection on the main axes (None until the first click).
    highlight = None

    def onclick_2d_interactive(event):
        nonlocal highlight

        # Only clicks inside the main axes select a neighbourhood. Clicks on
        # the zoom axes or outside any axes are ignored; the latter carry
        # xdata/ydata of None, which cannot be used as a query point.
        if event.inaxes is not ax or event.xdata is None or event.ydata is None:
            return

        distances, indices = NN_model.kneighbors([[event.xdata, event.ydata]], n_neighbors=n_neighbors)
        indices = indices[0]
        if indices.size == 0:
            return

        # Redraw the zoom axes from scratch with the labelled neighbours.
        ax_zoom.clear()
        if vectors_3d is None:
            ax_zoom.scatter(vectors[indices,0], vectors[indices,1], marker='.')
            for i in indices:
                ax_zoom.text(x=vectors[i,0], y=vectors[i,1], s=words[i])
        else:
            ax_zoom.scatter(vectors_3d[indices,0], vectors_3d[indices,1], vectors_3d[indices,2], marker='.')
            for i in indices:
                ax_zoom.text(x=vectors_3d[i,0], y=vectors_3d[i,1], z=vectors_3d[i,2], s=words[i])

        # Highlight on the main plot. Replace the previous highlight artist
        # instead of stacking new scatter collections on every click.
        if highlight is not None:
            highlight.remove()
        highlight = ax.scatter(vectors[indices,0], vectors[indices,1], marker='.', c='g')
        # Ask the backend to repaint so the change is visible right away.
        fig.canvas.draw_idle()

    # Connect before showing so the handler is also active on backends where
    # plt.show() blocks until the window is closed.
    fig.canvas.mpl_connect('button_press_event', onclick_2d_interactive)
    if plot:
        plt.show()
    return plt

# Start from a clean slate, then open the interactive 2D view over the first
# `npts` words; the zoom panel is drawn from the 3D projection.
plt.close("all")
npts = 5000
plot_words_2d_interactive(X_tsne_2d[:npts, :2], words[:npts], vectors_3d=X_tsne_3d);

In [143]:
from sklearn.neighbors import NearestNeighbors

# NOTE(review): `ev` is not referenced anywhere in the visible notebook;
# presumably a leftover debugging hook. Kept in case another cell reads it.
ev = None

def filter_close_neighbors(distances, indices, n_ref=5, factor=2.0):
    """Drop neighbours that lie much farther away than the closest ones.

    Parameters
    ----------
    distances : 1-D numpy array
        Neighbour distances in ascending order; entry 0 is expected to be the
        query point itself when the query is one of the fitted points.
    indices : 1-D numpy array
        Row indices matching ``distances`` element by element.
    n_ref : int
        Number of nearest neighbours (after entry 0) whose mean distance
        defines the reference scale.
    factor : float
        Neighbours farther than ``factor`` times the reference scale are dropped.

    Returns
    -------
    (distances, indices) restricted to the neighbours that were kept. Both are
    returned unchanged when there is no neighbour to build the reference from.
    """
    reference = distances[1:1 + n_ref]
    if reference.size == 0:
        return distances, indices
    keep = distances <= factor * reference.mean()
    return distances[keep], indices[keep]

# show interactive 3D plot like in Tensorboard
def plot_words_3d_interactive(vectors, words, plot = True, n_neighbors = 15):
    """Scatter-plot 3D word vectors and show the neighbourhood of a picked point.

    The figure has two stacked 3D axes. The top one shows every row of
    ``vectors``. Picking a point in it looks up that point's nearest
    neighbours, discards the ones that are much farther away than the closest
    few, and redraws the rest, labelled with their words, in the bottom
    ("zoom") axes.

    Parameters
    ----------
    vectors : array indexed as ``vectors[:, 0]`` .. ``vectors[:, 2]``
        3D coordinates; also the data the nearest-neighbour model is fitted on.
    words : sequence
        ``words[i]`` is the label of row ``i`` of ``vectors``.
    plot : bool
        When true, ``plt.show()`` is called once the pick handler is connected.
    n_neighbors : int
        Number of neighbours queried per pick; capped at the number of rows.

    Returns
    -------
    The ``plt`` (pyplot) module object.
    """
    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(211, projection='3d')
    ax_zoom = fig.add_subplot(212, projection='3d')
    ax.scatter(vectors[:,0], vectors[:,1], vectors[:,2], marker='.', c='b', picker=3)

    # kneighbors() refuses to return more neighbours than there are fitted points.
    n_neighbors = min(n_neighbors, len(vectors))
    NN_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(vectors)

    def onclick_3d_interactive(event):
        # Recover the 3D coordinates of the first picked point from the artist.
        _x, _y, _z = event.artist._offsets3d
        ind = event.ind[0]
        x = _x[ind]
        y = _y[ind]
        z = _z[ind]

        distances, indices = NN_model.kneighbors([[x, y, z]], n_neighbors=n_neighbors)
        distances = distances[0]
        indices = indices[0]
        if indices.size == 0:
            return

        # Keep the picked point and its close neighbours, and drop far outliers.
        # The original test was inverted (`>`), which kept only the outliers.
        distances, indices = filter_close_neighbors(distances, indices)

        ax_zoom.clear()
        ax_zoom.scatter(vectors[indices,0], vectors[indices,1], vectors[indices,2], marker='.')
        for i in indices:
            ax_zoom.text(x=vectors[i,0], y=vectors[i,1], z=vectors[i,2], s=words[i])
        # TODO: highlighting the selection on the main 3D axes is not implemented.
        # Ask the backend to repaint so the change is visible right away.
        fig.canvas.draw_idle()

    # Connect before showing so the handler is also active on backends where
    # plt.show() blocks until the window is closed.
    fig.canvas.mpl_connect('pick_event', onclick_3d_interactive)
    if plot:
        plt.show()
    return plt

# Start from a clean slate, then open the interactive 3D view over the first
# `npts` words.
plt.close("all")
npts = 1000
plot_words_3d_interactive(X_tsne_3d[:npts, :3], words[:npts]);

In [139]:
# Search by word: print the rows of `vectors` closest to the row of `word`
word = 'interface'
words = list(words)  # list.index() is used for the lookup below
idx = words.index(word)

n_neighbors = 10
NN_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree')
NN_model.fit(vectors)
distances, indices = NN_model.kneighbors([vectors[idx]], n_neighbors=n_neighbors)
# kneighbors returns one result row per query; there is a single query here
distances, indices = distances[0], indices[0]
print(indices)
for i in indices:
    print(words[i])

[2063  951 1318 1792 4614 1498 1272 2792 2934 2804]
interface
user
application
hardware
capabilities
device
allows
integrated
tool
platform
