In [70]:
import gensim.models
import spacy
from collections import Counter

from sklearn.manifold import TSNE
import matplotlib
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file

In [41]:
# Wrap the corpus file in gensim's LineSentence so the Word2Vec constructor
# can iterate over it. Per the gensim docs, each line of the file is treated
# as one sentence and split on whitespace only, so punctuation stays attached
# to tokens.
sentences = gensim.models.word2vec.LineSentence('nytimes.txt')

In [150]:
# Train word2vec embeddings on the streamed corpus. Keyword meanings below
# follow the gensim 3.x Word2Vec documentation.
model = gensim.models.word2vec.Word2Vec(
    sentences,
    sg=1,          # skip-gram rather than CBOW
    size=75,       # dimensionality of each word vector
    window=10,     # maximum distance between target and context word
    min_count=2,   # ignore tokens that occur fewer than 2 times
    iter=20,       # number of passes over the corpus
    workers=4,     # training threads
)

In [151]:
# Size of the trained vocabulary, i.e. how many distinct tokens the model
# holds a vector for.
len(model.wv.vocab)

1395

In [137]:
# Nearest neighbours of 'machine' in the embedding space.
# Query through model.wv (the KeyedVectors), consistent with the
# model.wv.vocab lookups elsewhere in this notebook; calling most_similar
# directly on the Word2Vec model is deprecated and removed in gensim 4.0.
model.wv.most_similar('machine')

[('learning', 0.9653977155685425),
 ('information', 0.9627763628959656),
 ('intelligence', 0.9584156274795532),
 ('public', 0.9580063223838806),
 ('however,', 0.9559808969497681),
 ('concept', 0.9542772173881531),
 ('only', 0.9541152715682983),
 ('way', 0.9535739421844482),
 ('similar', 0.9532870650291443),
 ('statistical', 0.9530330896377563)]

In [82]:
# Read the whole article into memory as one string for spaCy.
# The context manager guarantees the file handle is closed even if the read
# fails; the bare open(...).read() left it open until garbage collection.
with open('nytimes.txt', encoding='UTF-8') as corpus_file:
    nytimes_raw = corpus_file.read()

In [46]:
# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): shortcut names are not supported by spaCy 3+; a newer install
# would need the full package name (e.g. 'en_core_web_sm') -- confirm which
# spaCy version this notebook targets before changing it.
nlp = spacy.load('en')

In [83]:
# Run the spaCy pipeline over the full article text. The result is iterated
# token by token in the frequency count that follows (using .text, .is_stop
# and .is_alpha on each token).
nytimes = nlp(nytimes_raw)

In [144]:
# Frequency table of the article's tokens, most frequent first.
# Only purely alphabetic tokens that are not stop words are counted; the
# surface form is used as-is, so differently cased spellings are tallied
# separately.
common_words = Counter(
    token.text
    for token in nytimes
    if token.is_alpha and not token.is_stop
).most_common()

# Peek at the ten most frequent (word, count) pairs.
common_words[:10]

[('Google', 83),
 ('machine', 57),
 ('like', 55),
 ('neural', 53),
 ('network', 42),
 ('Brain', 41),
 ('cat', 39),
 ('story', 38),
 ('Translate', 37),
 ('data', 37)]

In [145]:
# Select, in descending frequency order, up to 1000 of the common words that
# the word2vec model actually has a vector for.
#
# common_words comes from Counter.most_common(), which yields each word
# exactly once, so no extra de-duplication pass is needed (the previous
# per-iteration `not in words` list scan was redundant and quadratic).
words = [word for word, _count in common_words if word in model.wv.vocab][:1000]

len(words)

803

In [146]:
# Look up the embedding of every selected word in one call; the rows of the
# result follow the order of `words`.
# Index through model.wv (the KeyedVectors), consistent with the
# model.wv.vocab lookups elsewhere in this notebook; indexing the Word2Vec
# model directly is deprecated and removed in gensim 4.0.
word_vectors = model.wv[words]

In [148]:
# Project the word vectors down with t-SNE for plotting. random_state is
# pinned so that re-running the cell gives the same layout.
tsne = TSNE(random_state=41)
# One row per entry of `words`; the plotting cell reads columns 0 and 1 as
# the x and y coordinates.
words_tsne = tsne.fit_transform(word_vectors)

In [152]:
# Column data shared by the scatter glyphs and their text labels: the two
# t-SNE coordinates plus the word each point stands for.
source = ColumnDataSource(
    data={
        "x1": words_tsne[:, 0],
        "x2": words_tsne[:, 1],
        "names": words,
    }
)

# Interactive canvas with pan / zoom / reset / save tools above the plot.
p = figure(
    title="word2vec T-SNE for most common words",
    toolbar_location="above",
    tools="pan,wheel_zoom,reset,save",
)

# One marker per word.
p.scatter(x="x1", y="x2", size=8, source=source)

# One centred text label per marker, nudged upwards by y_offset so it does
# not sit on top of the marker itself.
labels = LabelSet(
    source=source,
    x="x1",
    y="x2",
    text="names",
    y_offset=6,
    text_align='center',
    text_color="#555555",
    text_font_size="8pt",
)
p.add_layout(labels)

show(p)