In [2]:

import gensim
# Need the interactive Tools for Matplotlib
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
 
from sklearn.manifold import TSNE

In [6]:
# Load pre-trained word2vec embeddings from a local file in word2vec binary
# format (binary=True).
# NOTE(review): the download command below fetches the GoogleNews vectors, but
# the path actually loaded is "w2v/sbw_vectors.bin" -- a different file name.
# Confirm which embeddings are intended and document where sbw_vectors.bin
# comes from (presumably a Spanish corpus, given the 'buenos' query later on
# -- verify).
# wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
model = gensim.models.KeyedVectors.load_word2vec_format("w2v/sbw_vectors.bin", binary=True)

In [8]:
# Look up the embedding stored for 'computer' and print the whole vector.
print(model['computer'])

# We will also need the words closest to a given word: similar_by_word
# returns a list of (word, similarity) pairs, as the cell output shows.
model.similar_by_word('computer')

[ 1.56460539e-01  7.19106942e-02 -1.71413168e-01  4.57964092e-01
  3.23210955e-01 -3.10159534e-01 -2.18714312e-01  4.38964784e-01
 -1.86610654e-01  1.04246929e-01 -2.82200724e-01  9.78404209e-02
  9.19221044e-02 -5.29603422e-01  7.22988567e-04 -4.29081678e-01
  2.75472432e-01  6.89951889e-03 -2.03240007e-01  7.51436427e-02
 -1.98192403e-01  1.21730141e-01  1.08382702e-01  7.18784779e-02
 -7.55023630e-03  9.81419832e-02  1.40906572e-01 -5.04251122e-02
 -1.96120873e-01  2.38828391e-01  2.54535288e-01 -3.91327254e-02
  4.55435753e-01  1.05976023e-01  1.06496856e-01 -2.44677767e-01
 -1.28739491e-01  2.08880126e-01 -2.08917744e-02 -2.04999447e-01
  2.31588304e-01  4.68767196e-01  3.37878704e-01 -1.48182720e-01
  2.71438867e-01 -1.72918722e-01  2.51783162e-01 -4.41745296e-02
  2.00184085e-03 -1.32530913e-01  6.13360107e-03 -1.88555494e-02
  6.24141276e-01  2.96406627e-01 -4.49493378e-02  4.42946732e-01
 -3.17706108e-01 -4.19449687e-01 -9.54157785e-02  3.24930042e-01
 -5.06090760e-01 -5.05943

[(u'aided', 0.7693202495574951),
 (u'computers', 0.755649209022522),
 (u'enabled', 0.7268332839012146),
 (u'connected', 0.716233491897583),
 (u'telephony', 0.7156180739402771),
 (u'helps', 0.7152702808380127),
 (u'useful', 0.7150285243988037),
 (u'reliable', 0.7133311033248901),
 (u'machinery', 0.7068817615509033),
 (u'dedicated', 0.7063027024269104)]

In [10]:
def display_closestwords_tsnescatterplot(model, word):
    """Plot ``word`` and its nearest neighbours on a 2-D t-SNE scatter plot.

    Parameters
    ----------
    model
        Embedding model that supports ``model[w]`` (vector lookup) and
        ``model.similar_by_word(w)`` returning ``(word, similarity)`` pairs.
        Any vector width is accepted; it is taken from the vectors themselves.
    word : str
        Query word. It is plotted first, followed by its neighbours.

    Returns
    -------
    None
        The figure is drawn through ``matplotlib.pyplot`` and shown.

    Notes
    -----
    Errors raised by the model for an unknown ``word`` are propagated
    unchanged (presumably ``KeyError`` for gensim KeyedVectors -- confirm).
    """
    # Each entry is a (neighbour, similarity) pair; only the word is needed.
    close_words = model.similar_by_word(word)
    word_labels = [word] + [pair[0] for pair in close_words]

    # Build the matrix in one step instead of growing it with np.append in a
    # loop; this also removes the hard-coded 300-dimension assumption.
    arr = np.array([model[label] for label in word_labels])

    # scikit-learn requires perplexity < n_samples. With only a handful of
    # neighbours the default of 30 is too large, so cap it.
    perplexity = float(min(30, arr.shape[0] - 1))

    # find tsne coords for 2 dimensions (fixed seed for a repeatable layout)
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    # Kept from the original: switches NumPy printing to non-scientific
    # notation globally. It has no effect on the plot itself.
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    # display scatter plot
    plt.scatter(x_coords, y_coords)
    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Pad the limits on BOTH sides. The original added the margin to the lower
    # bound as well, which placed the smallest point just outside the axes.
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.show()

In [15]:
# Plot 'buenos' together with its nearest neighbours from the loaded model,
# projected to 2-D with t-SNE.
display_closestwords_tsnescatterplot(model, 'buenos')

<IPython.core.display.Javascript object>