In [1]:
import gensim
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.externals import joblib
import bokeh
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
import bokeh.plotting as bplt
import bokeh.models as bmdl
bplt.output_notebook()
%matplotlib inline

In [2]:
# Location of the saved gensim Word2Vec model, relative to this notebook.
MODEL_PATH = "../out/models/preorder/embs-i10-w20-l10000-v200-j.gensim"

# Load the full model, then keep a short alias for its keyed vectors,
# which the rest of the notebook queries.
model = gensim.models.Word2Vec.load(MODEL_PATH)
wv = model.wv

In [3]:
# # Load the ol' source{d} embeddings instead
# with open("/Users/skaufman/Desktop/id2vec_500k.pickle", "rb") as fin:
#     words, _, embeddings = pickle.load(fin)
# wv = gensim.models.keyedvectors.WordEmbeddingsKeyedVectors(embeddings[0].shape)
# wv.index2word = words
# wv.index2entity = words
# wv.vocab = {w: gensim.models.keyedvectors.Vocab(index=i)
#             for i, w in enumerate(words)}
# wv.vectors = np.vstack(embeddings)

In [4]:
# Reuse a previously fitted t-SNE model from disk when one is available,
# otherwise fit a fresh one and cache it for the next run.
try:
    tsne = joblib.load("idemba-tsne.pkl")
except Exception as err:
    # Best-effort cache: any load failure falls back to a refit, but report
    # the reason so a missing or unreadable cache file is not silently masked.
    print("Could not load cached t-SNE ({!r}); refitting.".format(err))
    # `vectors_norm` is only populated once init_sims() has run; on a freshly
    # loaded model it is still None, which would make the fit below fail.
    wv.init_sims()
    # Fixed seed so that a refit produces the same layout every time.
    tsne = TSNE(verbose=1, random_state=0)
    tsne.fit(wv.vectors_norm)
    joblib.dump(tsne, "idemba-tsne.pkl")

In [5]:
# Some common data prep.
# Highlight colour -> the subtokens drawn in that colour. A plain string is
# iterated character by character, so 'ijxyab' contributes six one-letter tokens.
colorgroups = {
    'purple': 'ijxyab',
    'red': ['bid', 'ask', 'buy', 'sell'],
    'blue': ['get', 'set', 'update', 'put'],
    'green': ['lock', 'unlock', 'release', 'acquire'],
}
# Inverted lookup: subtoken -> highlight colour.
colormap = {token: color for color, tokens in colorgroups.items() for token in tokens}
def embedded_bokeh_src(x, y):
    """Build the bokeh data source used by the embedding scatter plots.

    Args:
        x: Horizontal coordinates, one per entry of ``wv.index2word``
            (assumed to be in the same order -- callers pass columns of a
            projection of the full vector matrix).
        y: Vertical coordinates, same convention as ``x``.

    Returns:
        A ``ColumnDataSource`` with columns ``x``, ``y``, ``subtoken``,
        ``color``, ``size`` and ``alpha``. Tokens listed in ``colormap`` get
        their group colour, size 5 and alpha 1.0; all others are gray,
        size 1 and alpha 0.1.
    """
    tokens = wv.index2word
    # One flag per token: is it part of a highlighted colour group?
    highlighted = [token in colormap for token in tokens]
    columns = dict(
        x=x,
        y=y,
        subtoken=tokens,
        color=[colormap.get(token, 'gray') for token in tokens],
        size=[5 if hit else 1 for hit in highlighted],
        alpha=[1.0 if hit else 0.1 for hit in highlighted],
    )
    return ColumnDataSource(columns)

In [6]:
# Interactive scatter of the 2-D t-SNE embedding, one glyph per vocabulary entry.
plot_lda = figure(
    title='t-SNE',
    plot_width=950,
    plot_height=600,
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)
source = embedded_bokeh_src(tsne.embedding_[:, 0], tsne.embedding_[:, 1])
plot_lda.scatter(x='x', y='y', source=source, color='color', size='size', alpha='alpha')

# Hovering over a point shows the subtoken it represents.
plot_lda.select({'type': HoverTool}).tooltips = {"content": "@subtoken"}
show(plot_lda)

## Analogical Evals.

In [7]:
# Analogy query with positive terms 'i' and 'x' and negative term 'j';
# the [0] keeps only the top-ranked (word, similarity) pair.
wv.most_similar(positive=['i', 'x'], negative=['j'])[0]

('y', 0.6219664812088013)

In [8]:
# Same query as the previous cell with the roles of 'i' and 'j' swapped.
wv.most_similar(positive=['j', 'x'], negative=['i'])[0]

('y', 0.7206199169158936)

In [9]:
# Analogy query with positive terms 'ask' and 'sell' and negative term 'bid';
# only the top-ranked result is kept.
wv.most_similar(positive=['ask', 'sell'], negative=['bid'])[0]

('refactor', 0.415157288312912)

Unsurprisingly, this works less well on general English analogies.

In [10]:
# Analogy query with positive terms 'acquire' and 'lock' and negative term
# 'release'; the whole default-length result list is displayed.
wv.most_similar(positive=["acquire", "lock"], negative=["release"])

[('reentrant', 0.5942990183830261),
 ('locks', 0.5739679336547852),
 ('exclusive', 0.5459644198417664),
 ('unlock', 0.5430675148963928),
 ('wake', 0.539844810962677),
 ('try', 0.5262168645858765),
 ('shared', 0.5209817290306091),
 ('semaphore', 0.5038613677024841),
 ('deprecated', 0.4934277832508087),
 ('filler', 0.4822271466255188)]

In [11]:
# Positive terms 'map' and 'append', negative term 'text'; topn=3 limits the
# result to the three best matches.
wv.most_similar(positive=['map', 'append'], negative=['text'], topn=3)

[('hash', 0.6272672414779663),
 ('emoji', 0.6175218224525452),
 ('emojis', 0.5954106450080872)]

In [12]:
# Plain nearest-neighbour lookup for 'bar' (no negative terms).
wv.most_similar(positive=['bar'])

[('foo', 0.7728173732757568),
 ('seek', 0.7302132844924927),
 ('toolbar', 0.7166825532913208),
 ('nav', 0.7046784162521362),
 ('action', 0.6918985247612),
 ('set', 0.689969539642334),
 ('status', 0.6797653436660767),
 ('get', 0.6793663501739502),
 ('progress', 0.6745686531066895),
 ('navigation', 0.6675430536270142)]

### Negative Example(s) ❗

In [13]:
# Positive terms 'open' and 'lock', negative term 'close'; the slice keeps the
# first five entries of the returned list.
wv.most_similar(positive=["open", "lock"], negative=["close"])[:5]

[('acquire', 0.6510829925537109),
 ('reentrant', 0.6115305423736572),
 ('wake', 0.5958467125892639),
 ('unlock', 0.5732922554016113),
 ('locks', 0.5726479291915894)]

Huh. That had `unlock` in fourth place, but didn't contain `release`. What's near `release`?

In [14]:
# Nearest neighbours of 'release' on its own.
wv.most_similar(['release'])

[('lock', 0.6815363764762878),
 ('acquire', 0.6417340040206909),
 ('connection', 0.6209431886672974),
 ('try', 0.610514223575592),
 ('get', 0.6071192026138306),
 ('filler', 0.5992910861968994),
 ('set', 0.5982125997543335),
 ('exclusive', 0.5967963933944702),
 ('semaphore', 0.5962668657302856),
 ('override', 0.5956928133964539)]

Lock is near-_ish_.

## PCA

In [15]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from sklearn import decomposition
# Fit a 2-component PCA on the embedding matrix and keep the projected points.
# This uses `vectors`, not the `vectors_norm` matrix the t-SNE cell fits on.
pca = decomposition.PCA(n_components=2)
PCA_X = pca.fit_transform(model.wv.vectors)

In [16]:
# Scatter of the two PCA components, styled through the shared data-source helper.
pca_source = embedded_bokeh_src(PCA_X[:, 0], PCA_X[:, 1])
p = figure(
    title="PCA of Embeddings",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
)
p.circle(x='x', y='y', source=pca_source, color='color', size='size', alpha='alpha')

# Hover tooltips show the subtoken behind each point.
pca_hover = p.select({'type': HoverTool})
pca_hover.tooltips = {"content": "@subtoken"}
show(p)

### PCA Over Analogy Sandbox

In [17]:
def group_pca(words):
    """Fit a 2-component PCA on the vectors of ``words`` and project every vector with it.

    Args:
        words: Iterable of vocabulary entries; only their vectors are used to
            fit the PCA.

    Returns:
        A ``(pca, transformed)`` tuple: the fitted PCA object and the result of
        applying it to a copy of the whole ``model.wv.vectors`` matrix.

    Raises:
        KeyError: If a word is missing from ``model.wv.vocab``.
    """
    vocab = model.wv.vocab
    rows = [vocab[word].index for word in words]
    all_vectors = model.wv.vectors.copy()
    # Select the rows belonging to the requested words as one 2-D array.
    group_vectors = all_vectors[rows]
    pca = decomposition.PCA(n_components=2)
    pca.fit(group_vectors)
    return pca, pca.transform(all_vectors)

def group_figure(words):
    """Show the vocabulary projected onto a PCA fitted on ``words`` only.

    Args:
        words: Iterable of vocabulary entries used to fit the PCA; also joined
            with commas to form the figure title.
    """
    # group_pca returns (fitted_pca, projected_vectors). Indexing the tuple
    # itself with [:, 0] raises TypeError, so unpack and plot the projection.
    _, gpca = group_pca(words)
    p = figure(title = ','.join(words), tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave")
    pca_source = embedded_bokeh_src(gpca[:, 0], gpca[:, 1])
    p.circle(x='x', y='y', color='color', source=pca_source, alpha='alpha', size='size')
    # Hovering over a point shows the subtoken it represents.
    p.select({'type': HoverTool}).tooltips = {"content": "@subtoken"}
    show(p)

def analogy_figure(pos, neg, tgt):
    """Plot an analogy: the vocabulary under a PCA fitted on the analogy's words, plus the analogy vector.

    Args:
        pos: List of words whose normalised vectors are added.
        neg: List of words whose normalised vectors are subtracted.
        tgt: Expected answer word. It is not used by the plot; the parameter is
            kept so existing calls keep working.
    """
    keyed = model.wv
    pca_model, gpca = group_pca(pos + neg)

    title = ' : '.join(pos) + " :: " + ' : '.join(neg)
    p = figure(title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave")
    pca_source = embedded_bokeh_src(gpca[:, 0], gpca[:, 1])
    p.circle(x='x', y='y', color='color', source=pca_source, alpha='alpha', size='size')
    p.select({'type': HoverTool}).tooltips = {"content": "@subtoken"}

    # use_norm=True reads the normalised vectors, which only exist once
    # init_sims() has run; calling it again is harmless if they already exist.
    keyed.init_sims()

    def unit(word):
        return keyed.word_vec(word, use_norm=True)

    # print(model.wv.most_similar(positive=pos, negative=neg))
    # Sum of all positive vectors minus all negative ones. For two positives
    # and one negative this is exactly pos[0] + pos[1] - neg[0].
    manual_vec = np.sum([unit(word) for word in pos], axis=0)
    if neg:
        manual_vec = manual_vec - np.sum([unit(word) for word in neg], axis=0)
    manual_vec = manual_vec.reshape(1, -1)
    # print([(w, s) for w, s in model.wv.most_similar([(manual_vec, 1.0)]) if w not in pos + neg])

    # Project the analogy vector with the same PCA and mark it in black.
    pca_manual_vec = pca_model.transform(manual_vec)
    p.circle(x=pca_manual_vec[:, 0], y=pca_manual_vec[:, 1], color='black')

    show(p)


# Disabled sandbox call, kept for reference.
#group_figure(colorgroups['purple'])
# Plot the analogy with positive terms 'i' and 'x', negative term 'j' and expected answer 'y'.
analogy_figure(['i', 'x'], ['j'], 'y')