In [6]:
import spacy
import os
import gensim.models

In [2]:
nlp = spacy.load('en')

In [18]:
sentences = gensim.models.word2vec.LineSentence('nytimes.txt')

In [34]:
model = gensim.models.word2vec.Word2Vec(sentences, size=50, window=5, min_count=2, sg=1, workers=4, iter=10)

In [30]:
len(model.wv.vocab)

1395

In [41]:
model.most_similar('people')

[('one,', 0.998715877532959),
 ('wouldn’t', 0.9986835718154907),
 ('digital', 0.9986792206764221),
 ('also,', 0.9986624121665955),
 ('nothing', 0.9986375570297241),
 ('hours', 0.9985848665237427),
 ('side,', 0.9985794425010681),
 ('“voters”', 0.9985772967338562),
 ('shown', 0.9985491037368774),
 ('said,', 0.998532772064209)]

In [45]:
words = []
for word in model.wv.vocab:
    words.append(word)

In [47]:
word_vectors = model[words]

In [50]:
from sklearn.manifold import TSNE
import matplotlib
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file

In [49]:
tsne = TSNE(random_state=41)
words_tsne = tsne.fit_transform(word_vectors)

In [51]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_tsne[:,0],
                                    x2=words_tsne[:,1],
                                    names=words))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)