In [1]:
import time
from utils import persistence as ps
from megaman.embedding import SpectralEmbedding
from megaman.geometry.geometry import Geometry
radius = 10
adjacency_method = 'cyflann'
cyflann_kwds = {'index_type':'kmeans', 'branching':64, 'iterations':20, 'cb_index':0.4}
adjacency_kwds = {'radius':radius, 'cyflann_kwds':cyflann_kwds}
affinity_method = 'gaussian'
affinity_kwds = {'radius':radius}
laplacian_method = 'geometric'
laplacian_kwds = {'scaling_epps':radius}

In [2]:
ps.get_file(bucket='dq-data', filename='wor2vec_300_train.mtx', filepath='wor2vec_300_train.mtx')

<minio.definitions.Object at 0x7f95a94a3b00>

In [3]:
from scipy.io import mmread
X = mmread('wor2vec_300_train.mtx')

In [4]:
X.shape

(541148, 300)

In [5]:
X = X[:100000]

In [6]:
X.shape

(100000, 300)

In [7]:
del geom

NameError: name 'geom' is not defined

In [8]:
geom = Geometry(adjacency_method=adjacency_method, adjacency_kwds=adjacency_kwds,
                affinity_method=affinity_method, affinity_kwds=affinity_kwds,
                laplacian_method=laplacian_method, laplacian_kwds=laplacian_kwds)

In [9]:
geom.set_data_matrix(X)

In [10]:
geom.delete_laplacian_matrix()

In [11]:
t0 = time.time()
lapl = geom.compute_laplacian_matrix(copy=False)
t1 = time.time() - t0
print(t1)

98.51511859893799


In [12]:
geom.delete_data_matrix()

In [13]:
import numpy as np
t0 = time.time()
spec = SpectralEmbedding(n_components=3, eigen_solver='amg',geom=geom)
X_spec = spec.fit_transform(X=X.astype(np.float))
#adjacency_matrix = geom.compute_adjacency_matrix()
t1 = time.time() - t0
print(t1)



386.0650975704193


In [14]:
X_spec.shape

(100000, 3)

In [15]:
from scipy.io import mmwrite
mmwrite('embed_train.mtx', X_spec)

In [16]:
ps.copy_file(dest_bucket='dq-data', file='embed_train.mtx', source='embed_train.mtx')

pushed file embed_train.mtx from embed_train.mtx to minio bucket dq-data


True

In [37]:
import matplotlib
matplotlib.use('Agg')
import pylab as plt

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
print('making pairwise plot...')

fig, axes = plt.subplots(nrows=2, ncols = 3, figsize=(8,8))
fig.subplots_adjust(hspace=0.05,wspace =0.05)

axes[0, 0].scatter(embed[:, 0], embed[:, 1], s = 1, c = 'k')
axes[0, 0].set_title('1 vs 2')
axes[0, 0].xaxis.set_visible(False)
axes[0, 0].yaxis.set_visible(False)

axes[0, 1].scatter(embed[:, 0], embed[:, 2], s = 1, c = 'k')
axes[0, 1].set_title('1 vs 3')
axes[0, 1].xaxis.set_visible(False)
axes[0, 1].yaxis.set_visible(False)

axes[0, 2].scatter(embed[:, 0], embed[:, 3], s = 1, c = 'k')
axes[0, 2].set_title('1 vs 4')
axes[0, 2].xaxis.set_visible(False)
axes[0, 2].yaxis.set_visible(False)

axes[1, 0].scatter(embed[:, 1], embed[:, 2], s = 1, c = 'k')
axes[1, 0].set_title('2 vs 3')
axes[1, 0].xaxis.set_visible(False)
axes[1, 0].yaxis.set_visible(False)

axes[1, 1].scatter(embed[:, 1], embed[:, 3], s = 1, c = 'k')
axes[1, 1].set_title('2 vs 4')
axes[1, 1].xaxis.set_visible(False)
axes[1, 1].yaxis.set_visible(False)

axes[1, 2].scatter(embed[:, 2], embed[:, 3], s = 1, c = 'k')
axes[1, 2].set_title('3 vs 4')
axes[1, 2].xaxis.set_visible(False)
axes[1, 2].yaxis.set_visible(False)
plt.suptitle("pairwise components from spectral embedding into 4 dimensions")

print('saving figure...')
plt.savefig('word2vec_pairwise_embedding'+'.png', format='png')
print('done!')