In [70]:
import gensim.models
import spacy
from collections import Counter

from sklearn.manifold import TSNE
import matplotlib
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file

In [41]:
sentences = gensim.models.word2vec.LineSentence('nytimes.txt')

In [71]:
model = gensim.models.word2vec.Word2Vec(sentences, size=100, window=5, min_count=1, sg=1, workers=4, iter=10)

In [72]:
len(model.wv.vocab)

4562

In [73]:
model.most_similar('people')

[('neurons', 0.9995913505554199),
 ('definitions', 0.99957275390625),
 ('learning', 0.9995380640029907),
 ('however,', 0.9995267391204834),
 ('All', 0.9995251893997192),
 ('point', 0.9995114207267761),
 ('small', 0.9995081424713135),
 ('language', 0.99949711561203),
 ('much', 0.9994962215423584),
 ('called', 0.9994936585426331)]

In [82]:
nytimes_raw = open('nytimes.txt', encoding = 'UTF-8').read()

In [46]:
nlp = spacy.load('en')

In [83]:
nytimes = nlp(nytimes_raw)

In [106]:
common_words = Counter([w.text for w in nytimes if not w.is_stop and w.is_alpha]).most_common(2000)

In [108]:
words = []
for word in common_words:
    if word[0] not in words and word[0] in model.wv.vocab:
        words.append(word[0])
    if len(words) == 1000:
        break
        
len(words)
words

['Google',
 'machine',
 'like',
 'neural',
 'network',
 'Brain',
 'cat',
 'story',
 'Translate',
 'data',
 'translation',
 'people',
 'artificial',
 'Dean',
 'time',
 'Le',
 'human',
 'intelligence',
 'Schuster',
 'new',
 'said',
 'company',
 'team',
 'Hughes',
 'English',
 'learning',
 'work',
 'language',
 'system',
 'way',
 'paper',
 'years',
 'computer',
 'reading',
 'networks',
 'long',
 'Hinton',
 'main',
 'told',
 'old',
 'called',
 'Continue',
 'know',
 'different',
 'Advertisement',
 'took',
 'things',
 'ball',
 'point',
 'image',
 'words',
 'word',
 'wanted',
 'year',
 'rules',
 'patterns',
 'began',
 'Pichai',
 'good',
 'day',
 'Chinese',
 'neurons',
 'service',
 'Japanese',
 'going',
 'recognition',
 'based',
 'machines',
 'important',
 'went',
 'want',
 'Corrado',
 'thought',
 'big',
 'millions',
 'early',
 'sentences',
 'problems',
 'users',
 'right',
 'showed',
 'place',
 'computers',
 'ask',
 'research',
 'identify',
 'information',
 'training',
 'pattern',
 'green',
 '

In [91]:
word_vectors = model[common_words]

TypeError: not all arguments converted during string formatting

In [68]:
tsne = TSNE(random_state=41)
words_tsne = tsne.fit_transform(word_vectors)

In [69]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE for most common words")

source = ColumnDataSource(data=dict(x1=words_tsne[:,0],
                                    x2=words_tsne[:,1],
                                    names=words_no_stop))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)