In [None]:
import time
from utils import persistence as ps
from megaman.embedding import SpectralEmbedding
from megaman.geometry.geometry import Geometry
radius = 20
adjacency_method = 'cyflann'
cyflann_kwds = {'index_type':'kmeans', 'branching':64, 'iterations':20, 'cb_index':0.4}
adjacency_kwds = {'radius':radius, 'cyflann_kwds':cyflann_kwds}
affinity_method = 'gaussian'
affinity_kwds = {'radius':radius}
laplacian_method = 'geometric'
laplacian_kwds = {'scaling_epps':radius}

In [15]:
ps.get_file(bucket='dq-data', filename='wor2vec_300_train.mtx', filepath='wor2vec_300_train.mtx')

<minio.definitions.Object at 0x7f9794071358>

In [52]:
from scipy.io import mmread
X = mmread('wor2vec_300_train.mtx')

In [53]:
X.shape

(541148, 300)

In [54]:
X = X[:100000]

In [55]:
X.shape

(100000, 300)

In [56]:
del geom

In [57]:
geom = Geometry(adjacency_method=adjacency_method, adjacency_kwds=adjacency_kwds,
                affinity_method=affinity_method, affinity_kwds=affinity_kwds,
                laplacian_method=laplacian_method, laplacian_kwds=laplacian_kwds)

In [58]:
geom.set_data_matrix(X)

In [59]:
geom.delete_laplacian_matrix()

In [60]:
t0 = time.time()
lapl = geom.compute_laplacian_matrix(copy=False)
t1 = time.time() - t0
print(t1)

In [66]:
import sys
# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']
# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) 
        for x in dir() if not x.startswith('_') 
        and x not in sys.modules and x not in ipython_vars], 
       key=lambda x: x[1], reverse=True)

[('Geometry', 1464),
 ('SpectralEmbedding', 1464),
 ('adjacency_kwds', 240),
 ('affinity_kwds', 240),
 ('cyflann_kwds', 240),
 ('laplacian_kwds', 240),
 ('mmread', 136),
 ('mmwrite', 136),
 ('X', 112),
 ('X_spec', 112),
 ('np', 80),
 ('ps', 80),
 ('laplacian_method', 58),
 ('affinity_method', 57),
 ('adjacency_method', 56),
 ('geom', 56),
 ('lapl', 56),
 ('spec', 56),
 ('radius', 28),
 ('t0', 24),
 ('t1', 24)]

In [62]:
geom.delete_data_matrix()

In [67]:
import numpy as np
t0 = time.time()
spec = SpectralEmbedding(n_components=3, eigen_solver='amg',geom=geom)
X_spec = spec.fit_transform(X=X.astype(np.float))
#adjacency_matrix = geom.compute_adjacency_matrix()
t1 = time.time() - t0
print(t1)



290.84182572364807


In [68]:
X_spec.shape

(100000, 3)

In [65]:
from scipy.io import mmwrite
mmwrite('embed_train.mtx', X_spec)

In [37]:
import matplotlib
matplotlib.use('Agg')
import pylab as plt

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
print('making pairwise plot...')

fig, axes = plt.subplots(nrows=2, ncols = 3, figsize=(8,8))
fig.subplots_adjust(hspace=0.05,wspace =0.05)

axes[0, 0].scatter(embed[:, 0], embed[:, 1], s = 1, c = 'k')
axes[0, 0].set_title('1 vs 2')
axes[0, 0].xaxis.set_visible(False)
axes[0, 0].yaxis.set_visible(False)

axes[0, 1].scatter(embed[:, 0], embed[:, 2], s = 1, c = 'k')
axes[0, 1].set_title('1 vs 3')
axes[0, 1].xaxis.set_visible(False)
axes[0, 1].yaxis.set_visible(False)

axes[0, 2].scatter(embed[:, 0], embed[:, 3], s = 1, c = 'k')
axes[0, 2].set_title('1 vs 4')
axes[0, 2].xaxis.set_visible(False)
axes[0, 2].yaxis.set_visible(False)

axes[1, 0].scatter(embed[:, 1], embed[:, 2], s = 1, c = 'k')
axes[1, 0].set_title('2 vs 3')
axes[1, 0].xaxis.set_visible(False)
axes[1, 0].yaxis.set_visible(False)

axes[1, 1].scatter(embed[:, 1], embed[:, 3], s = 1, c = 'k')
axes[1, 1].set_title('2 vs 4')
axes[1, 1].xaxis.set_visible(False)
axes[1, 1].yaxis.set_visible(False)

axes[1, 2].scatter(embed[:, 2], embed[:, 3], s = 1, c = 'k')
axes[1, 2].set_title('3 vs 4')
axes[1, 2].xaxis.set_visible(False)
axes[1, 2].yaxis.set_visible(False)
plt.suptitle("pairwise components from spectral embedding into 4 dimensions")

print('saving figure...')
plt.savefig('word2vec_pairwise_embedding'+'.png', format='png')
print('done!')