In [20]:
import gensim.models
import spacy

from sklearn.manifold import TSNE
import matplotlib
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file

In [9]:
sentences = gensim.models.word2vec.LineSentence('nytimes.txt')

<gensim.models.word2vec.LineSentence at 0x1d306b86390>

In [10]:
model = gensim.models.word2vec.Word2Vec(sentences, size=50, window=5, min_count=2, sg=1, workers=4, iter=10)

In [11]:
len(model.wv.vocab)

1395

In [12]:
model.most_similar('people')

[('Brainâ€™s', 0.9986917972564697),
 ('Even', 0.9985790252685547),
 ('All', 0.9985591173171997),
 ('why', 0.9985516667366028),
 ('people.', 0.9985247254371643),
 ('route', 0.9984791278839111),
 ('and,', 0.9984517097473145),
 ('once', 0.998439610004425),
 ('Bell', 0.9984375238418579),
 ('Valley', 0.9984054565429688)]

In [16]:
words = []
for word in model.wv.vocab:
    words.append(word)
len(words)

1395

In [21]:
nlp = spacy.load('en')

In [34]:
words_str = ' '.join(words)
tokens = nlp(words_str)

words_no_stop = []
for token in tokens:
    if not token.is_stop and token.text in model.:
        words_no_stop.append(token.text)

len(words_no_stop)

1037

In [35]:
word_vectors = model[words_no_stop]

KeyError: "word 'Tokyo' not in vocabulary"

In [18]:
tsne = TSNE(random_state=41)
words_tsne = tsne.fit_transform(word_vectors)

In [19]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_tsne[:,0],
                                    x2=words_tsne[:,1],
                                    names=words))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)