In [7]:
import json
import glob
eqs = []
files = glob.glob('pipeline_out/*.jsonlist') 
for f in files:
    with open(f) as processed:
        for i, line in enumerate(processed):
            eqs.append(json.loads(line)['aligned'])
            

In [8]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity

In [9]:
eqs[0]

[['\\gamma', '^', '5', '\\Upsilon', '_', '-', '(', 'p', '^', '\\mu', ')'],
 ['\\gamma',
  '^',
  '5',
  'S',
  '^',
  'c',
  '_',
  '{',
  '[',
  '1',
  '/',
  '2',
  ']',
  '}',
  '\\,',
  '{',
  '\\calB',
  '}',
  '_',
  '+',
  '(',
  'p',
  '^',
  '\\mu',
  ')',
  '.']]

In [10]:
docs = [doc for sublist in eqs for doc in sublist]
dct = Dictionary(docs)
corpus = [dct.doc2bow(line) for line in docs]
model = TfidfModel(corpus)

In [11]:
index = MatrixSimilarity(model[corpus], num_features=len(dct))

In [12]:
line = docs[2]
sims = index[model[dct.doc2bow(line)]]
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [13]:
from gensim import interfaces, utils, matutils
import numpy as np
# query = matutils.sparse2full(query, self.num_features)
# query = numpy.asarray(query, dtype=self.index.dtype)

# do a little transposition dance to stop numpy from making a copy of
# self.index internally in numpy.dot (very slow).
# result = numpy.dot(self.index, query.T).T  

In [14]:
num_features = len(dct)
all_dists = []
for aligned_eqs in eqs:
    vecs = []
    for e in aligned_eqs: 
        sparse_vec = model[dct.doc2bow(e)]
        query = matutils.sparse2full(sparse_vec, num_features)
        query = np.asarray(query, dtype=np.float32)
        vecs.append(query)
    
    dists = []
    for i, v in enumerate(vecs):
        for j, v2 in enumerate(vecs):
            if i != j:
                dists.append(v.dot(v2))
    all_dists.extend(list(set(dists)))


In [15]:
# now do a random pass
N = len(eqs)
num_features = len(dct)
rand_dists = []
for aligned_eqs in eqs:
    vecs = []
    for e in aligned_eqs: 
        sparse_vec = model[dct.doc2bow(e)]
        query = matutils.sparse2full(sparse_vec, num_features)
        query = np.asarray(query, dtype=np.float32)
        vecs.append(query)
    
    dists = []
    for i, v in enumerate(vecs):
        # pick a random vector
        rand_i = np.random.randint(N)
        # Note: had to pull in a long range dependency on the docs list here. Could cause problems later.
        sparse_vec = model[dct.doc2bow(docs[rand_i])]
        rand_vec = matutils.sparse2full(sparse_vec, num_features)
        rand_vec = np.asarray(rand_vec, dtype=np.float32)
        dists.append(v.dot(rand_vec))
    rand_dists.extend(list(set(dists)))



In [16]:
len(rand_dists)

6938

In [17]:
len(all_dists)

4286

In [None]:
%matplotlib inline
import pandas as pd
import seaborn as sns

# test = pd.DataFrame([all_dists, rand_dists])

sns.distplot(all_dists, hist=True, rug=False)
sns.distplot(rand_dists, hist=True, rug=False)

sns.plt.show()


KeyboardInterrupt: 

ImportError: No module named matplotlib.pyplot