In [40]:
import gensim.models
import spacy

from sklearn.manifold import TSNE
import matplotlib
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file

In [41]:
sentences = gensim.models.word2vec.LineSentence('nytimes.txt')

In [42]:
model = gensim.models.word2vec.Word2Vec(sentences, size=50, window=5, min_count=2, sg=1, workers=4, iter=10)

In [43]:
len(model.wv.vocab)

1395

In [44]:
model.most_similar('people')

[('Brain’s', 0.9986151456832886),
 ('Even', 0.9985101222991943),
 ('why', 0.9984885454177856),
 ('people.', 0.998461902141571),
 ('All', 0.9984439611434937),
 ('and,', 0.9983448386192322),
 ('once', 0.9983404874801636),
 ('Bell', 0.998337984085083),
 ('route', 0.99833744764328),
 ('help', 0.9983367919921875)]

In [45]:
words = []
for word in model.wv.vocab:
    words.append(word)
len(words)

1395

In [46]:
nlp = spacy.load('en')

In [47]:
words_str = ' '.join(words)
tokens = nlp(words_str)

words_no_stop = []
for token in tokens:
    if not token.is_stop and token.text in model.wv.vocab:
        words_no_stop.append(token.text)

len(words_no_stop)

1006

In [48]:
word_vectors = model[words_no_stop]

In [49]:
tsne = TSNE(random_state=41)
words_tsne = tsne.fit_transform(word_vectors)

In [50]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_tsne[:,0],
                                    x2=words_tsne[:,1],
                                    names=words_no_stop))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

