In [2]:
import gensim.models

from sklearn.manifold import TSNE
import matplotlib
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file



In [4]:
# Build the sentence source for training from the local corpus file.
sentences = gensim.models.word2vec.LineSentence('nytimes.txt')
# Echo the object as the cell output to confirm it was created.
# (The previous cell evaluated the undefined name `sentence`, which raises
# NameError on a fresh kernel.)
sentences

<gensim.models.word2vec.LineSentence at 0x1d306b2ef28>

In [5]:
# Train the Word2Vec model on the corpus. Hyper-parameters are listed one per
# line so they are easy to tweak and diff; the keyword names (`size`, `iter`)
# follow the gensim 3.x API that the rest of this notebook relies on.
model = gensim.models.word2vec.Word2Vec(
    sentences,
    size=50,
    window=5,
    min_count=2,
    sg=1,
    workers=4,
    iter=10,
)

In [6]:
# Number of entries in the trained model's vocabulary.
len(model.wv.vocab)

1395

In [7]:
# Nearest neighbours of 'people' in the embedding space.
# Query through `model.wv` (as the other cells already do for the vocabulary)
# instead of the deprecated model-level `most_similar` shim.
model.wv.most_similar('people')

[('Brain’s', 0.9986531734466553),
 ('Even', 0.9985146522521973),
 ('why', 0.9984638094902039),
 ('people.', 0.9984580874443054),
 ('All', 0.998456597328186),
 ('and,', 0.998381495475769),
 ('route', 0.9983707666397095),
 ('Bell', 0.9983270764350891),
 ('once', 0.9983250498771667),
 ('help', 0.998315691947937)]

In [45]:
# Collect one word per case-insensitive spelling, in vocabulary iteration order.
#
# The lowercase spelling is used when it is itself a vocabulary entry (same
# result as before). When only a capitalised form exists, fall back to the
# token as stored in the vocabulary so that the later vector lookup cannot
# fail with a KeyError on a spelling the model never saw.
vocabulary = model.wv.vocab
seen_lowercase = set()  # set membership keeps the dedupe linear instead of O(n^2)
words = []
for word in vocabulary:
    lowered = word.lower()
    if lowered in seen_lowercase:
        continue
    seen_lowercase.add(lowered)
    words.append(lowered if lowered in vocabulary else word)

In [47]:
# Look up the embedding vectors for the selected words, in the same order as
# `words`. Index through `model.wv` (consistent with the `model.wv.vocab`
# usage elsewhere) rather than the deprecated model-level `model[...]`.
word_vectors = model.wv[words]

In [49]:
# Project the word vectors with t-SNE for plotting. `random_state` is pinned
# so the stochastic embedding does not change between runs.
tsne = TSNE(random_state=41)
# One row per entry of `words`; columns 0 and 1 are used as the plot
# coordinates in the plotting cell.
words_tsne = tsne.fit_transform(word_vectors)

In [51]:
# Scatter plot of the t-SNE coordinates with every point labelled by its word.

# Shared data source: both the glyphs and the labels read their positions
# (x1, x2) and text (names) from the same columns.
source = ColumnDataSource(data={
    "x1": words_tsne[:, 0],
    "x2": words_tsne[:, 1],
    "names": words,
})

p = figure(
    title="word2vec T-SNE for most common words",
    toolbar_location="above",
    tools="pan,wheel_zoom,reset,save",
)
p.scatter(x="x1", y="x2", size=8, source=source)

# Text labels drawn slightly above each marker (y_offset) and centred on it.
labels = LabelSet(
    source=source,
    x="x1",
    y="x2",
    text="names",
    y_offset=6,
    text_align="center",
    text_color="#555555",
    text_font_size="8pt",
)
p.add_layout(labels)

show(p)