In [1]:
import os
import nltk
import pickle
import numpy as np
import pandas as pd

# from pretty import pprint
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Configure Bokeh so that show() renders plots inline in the notebook's
# output cells.
output_notebook()

from gensim.models import Word2Vec

In [3]:
# Restore the previously trained word2vec model from disk.
w2v_model_path = "w2v/w2v_eng_uk_na_corpora"
model = Word2Vec.load(w2v_model_path)

# Bare last expression: the notebook displays the vocabulary size.
len(model.wv.vocab)

16087

In [16]:
# Dump the model vocabulary to a plain-text file, one word per line
# (words are newline-separated; nothing is written after the last word).
with open("eng_na_uk.vocab", "w") as vocab_file:
    # Iterating the vocab mapping yields its keys, i.e. the words themselves.
    vocab_text = "\n".join(model.wv.vocab)
    vocab_file.write(vocab_text)

In [4]:
from collections import Counter

# Rank the vocabulary by corpus frequency and keep the 2000 most frequent words.
# The values of model.wv.vocab are gensim Vocab records rather than numbers, so
# the integer .count is extracted explicitly instead of relying on the records'
# own ordering when Counter sorts them.
word_counts = Counter({word: entry.count
                       for word, entry in model.wv.vocab.items()})
top_2000_words = [word for word, _ in word_counts.most_common(2000)]

# Look the vectors up through model.wv, matching every other access in this
# notebook; indexing the model object directly is the deprecated spelling.
# Passing a list of words returns one vector per word, in the same order.
top_2000_vecs = model.wv[top_2000_words]

In [5]:
from sklearn.manifold import TSNE

# Reduce the selected word vectors to two dimensions for plotting.
# The random_state is pinned so repeated runs use the same seed.
tsne_params = dict(n_components=2, random_state=0)
tsne = TSNE(**tsne_params)
top_2000_tsne = tsne.fit_transform(top_2000_vecs)

In [6]:
# Interactive scatter plot of the 2-D t-SNE projection, one labelled point
# per word.

# Column 0 / column 1 of the projection become the plot's x / y coordinates.
tsne_x = top_2000_tsne[:, 0]
tsne_y = top_2000_tsne[:, 1]
source = ColumnDataSource(data={"x1": tsne_x,
                                "x2": tsne_y,
                                "names": top_2000_words})

p = figure(title="word2vec T-SNE for most common words",
           toolbar_location="above",
           tools="pan,wheel_zoom,reset,save")
p.scatter(x="x1", y="x2", size=8, source=source)

# Draw each word as a small grey caption, centred and nudged 6 units above
# its point; the labels read from the same data source as the markers.
labels = LabelSet(source=source,
                  x="x1", y="x2", text="names",
                  y_offset=6,
                  text_align='center',
                  text_font_size="8pt",
                  text_color="#555555")
p.add_layout(labels)

show(p)

In [7]:
# Similarity query: words close to `x` and `y` (positive terms) and far from
# `xx` (negative term), scored first with most_similar and then with
# most_similar_cosmul.
x = "say"
y = "sayin"
xx = "ask"

# Single-argument print(...) calls produce the same output under Python 2
# (where the parentheses are just grouping) and also run on Python 3.
print(model.wv.most_similar(positive=[x, y], negative=[xx]))
print("\n")
print(model.wv.most_similar_cosmul(positive=[x, y], negative=[xx]))

[('saying', 0.720932126045227), ('doing', 0.5543146133422852), ('smashing', 0.5484839677810669), ('doin', 0.5476279854774475), ('whistling', 0.5422560572624207), ('singin', 0.5416629314422607), ('barmy', 0.539007842540741), ('meaning', 0.5353029370307922), ('cheating', 0.5318008661270142), ('singing', 0.5277507305145264)]


[('saying', 1.0818170309066772), ('smashing', 0.9988341331481934), ('whistling', 0.9760260581970215), ('barmy', 0.9696086645126343), ('interfering', 0.956294596195221), ('doing', 0.943742573261261), ('cheating', 0.939643383026123), ('singin', 0.9315813779830933), ('doin', 0.9268364310264587), ('wobbling', 0.9244562387466431)]


In [9]:
# Similarity query: words close to `x` and `y` (positive terms) and far from
# `xx` (negative term), scored first with most_similar and then with
# most_similar_cosmul.
x = "peach"
y = "peaches"
xx = "apple"

# Single-argument print(...) calls produce the same output under Python 2
# (where the parentheses are just grouping) and also run on Python 3.
print(model.wv.most_similar(positive=[x, y], negative=[xx]))
print("\n")
print(model.wv.most_similar_cosmul(positive=[x, y], negative=[xx]))

[('lettuce', 0.6647210121154785), ('salmon', 0.6597726345062256), ('crackers', 0.6532133221626282), ('rice', 0.6488854885101318), ('raisins', 0.6331318616867065), ('cheerios', 0.63209468126297), ('tomatoes', 0.630882203578949), ('cherries', 0.6294560432434082), ('steak', 0.627549409866333), ('mushrooms', 0.6264097690582275)]


[('salmon', 0.9554268717765808), ('lettuce', 0.9443752765655518), ('crackers', 0.9424830079078674), ('rice', 0.9385457038879395), ('tomatoes', 0.9235421419143677), ('burgers', 0.9235407710075378), ('steak', 0.9233205914497375), ('mustard', 0.923240065574646), ('syrup', 0.9218165874481201), ('cheerios', 0.9213539361953735)]
