In [154]:
import gensim.models
import spacy
from collections import Counter

from sklearn.manifold import TSNE
import matplotlib
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file

In [155]:
# Stream the corpus from disk instead of loading it all at once; LineSentence
# yields one tokenised sentence per line of the file.
sentences = gensim.models.word2vec.LineSentence(source='nytimes.txt')

In [156]:
# Train word2vec embeddings on the streamed corpus.
model = gensim.models.word2vec.Word2Vec(
    sentences,
    size=75,       # dimensionality of each word vector
    window=10,     # max distance between the current and the predicted word
    min_count=2,   # ignore words whose total frequency is below this
    sg=1,          # 1 = skip-gram (0 would be CBOW)
    workers=4,     # training threads
    iter=20,       # passes over the corpus
)

In [157]:
# Size of the trained vocabulary (number of entries in model.wv.vocab).
len(model.wv.vocab)

1395

In [158]:
# Nearest neighbours of 'machine' in the embedding space. The query goes
# through the KeyedVectors object (model.wv): Word2Vec.most_similar is only a
# deprecated forwarding method and is absent from newer gensim releases.
model.wv.most_similar('machine')

[('learning', 0.8768691420555115),
 ('statistical', 0.873593807220459),
 ('public', 0.8718515634536743),
 ('between', 0.8606248497962952),
 ('thus', 0.8444238305091858),
 ('tool', 0.8433125019073486),
 ('set', 0.8381274938583374),
 ('based', 0.8243482708930969),
 ('corporate', 0.8232636451721191),
 ('image', 0.8224751353263855)]

In [159]:
# Read the whole corpus into memory. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('nytimes.txt', encoding='UTF-8') as corpus_file:
    nytimes_raw = corpus_file.read()

In [160]:
# Load the spaCy pipeline registered under the 'en' shortcut name.
# NOTE(review): the bare 'en' shortcut presumably relies on a spaCy 2.x-style
# model link; newer spaCy releases expect a full package name — confirm the
# installed version before upgrading.
nlp = spacy.load('en')

In [161]:
# Run the loaded pipeline over the raw corpus text; the result is iterated
# token by token when the word frequencies are counted.
nytimes = nlp(nytimes_raw)

In [162]:
# Frequency table of alphabetic, non-stop-word tokens (surface forms, so
# 'Google' and 'google' are counted separately), most frequent first.
content_tokens = (
    token.text
    for token in nytimes
    if token.is_alpha and not token.is_stop
)
common_words = Counter(content_tokens).most_common()
common_words[:10]

[('Google', 83),
 ('machine', 57),
 ('like', 55),
 ('neural', 53),
 ('network', 42),
 ('Brain', 41),
 ('cat', 39),
 ('story', 38),
 ('Translate', 37),
 ('data', 37)]

In [163]:
# Keep the most frequent corpus words that also have a trained embedding,
# capped at MAX_WORDS entries and preserving the frequency order.
#
# common_words comes from Counter.most_common(), so every word appears exactly
# once; no extra duplicate check (a linear list scan per word) is needed.
MAX_WORDS = 1000
words = [word for word, _count in common_words if word in model.wv.vocab][:MAX_WORDS]

len(words)

803

In [164]:
# Look up the embedding of every selected word (one row per word). Indexing
# goes through model.wv because indexing the Word2Vec model directly is
# deprecated in gensim.
word_vectors = model.wv[words]

In [165]:
# Project the word vectors down with t-SNE; the fixed seed keeps the layout
# the same from one run to the next.
TSNE_SEED = 41
tsne = TSNE(random_state=TSNE_SEED)
words_tsne = tsne.fit_transform(word_vectors)

In [169]:
# Interactive scatter of the first two t-SNE coordinates, one labelled point
# per word. Markers and labels share a single data source.
tsne_columns = {
    "x1": words_tsne[:, 0],
    "x2": words_tsne[:, 1],
    "names": words,
}
source = ColumnDataSource(data=tsne_columns)

p = figure(
    title="word2vec T-SNE for most common words",
    toolbar_location="above",
    tools="pan,wheel_zoom,reset,save",
)
p.scatter(x="x1", y="x2", source=source, size=8)

# Each word is drawn slightly above its marker, centred horizontally.
labels = LabelSet(
    source=source,
    x="x1", y="x2", text="names",
    text_align='center', y_offset=6,
    text_font_size="8pt", text_color="#555555",
)
p.add_layout(labels)

show(p)