In [1]:
import pandas as pd
import numpy as np
from utils import read_file, tokenize_latex

In [2]:
import cPickle as pickle
# Load the two pickled artefacts this notebook works from. Each file is opened
# in a `with` block so the handle is closed deterministically instead of being
# left to garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load these from a trusted source.
with open('data/numbered_embeddings/eq-idx.pkl', 'rb') as eq_idx_file:
    eq_idx = pickle.load(eq_idx_file)
with open('data/numbered_embeddings/eq-svd-embed.pkl', 'rb') as eq_svd_embed_file:
    eq_svd_embed = pickle.load(eq_svd_embed_file)

In [3]:
import fnmatch
import os
def read_file_names_recursive(folder):
    """Return the paths of all ``*.tex`` files found under ``folder``.

    The directory tree is traversed with ``os.walk``; every file name that
    matches the ``*.tex`` pattern is joined with the directory it was found
    in. The result is a list, in traversal order, and is empty when nothing
    matches.
    """
    return [
        os.path.join(dirpath, tex_name)
        for dirpath, _subdirs, entries in os.walk(folder)
        for tex_name in fnmatch.filter(entries, '*.tex')
    ]

In [4]:
# Collect the path of every *.tex file anywhere under the raw data directory.
fnames = read_file_names_recursive('data/numbered_embeddings/raw/')

In [5]:
def read_files(fnames):
    """Read each file in ``fnames`` and return its text keyed by base name.

    Parameters
    ----------
    fnames : iterable of str
        Paths of the files to read.

    Returns
    -------
    dict
        Maps the file's base name, with a trailing ``.tex`` removed when
        present, to the full contents of the file. Empty input gives ``{}``.
    """
    suffix = ".tex"
    tex = {}
    for fname in fnames:
        # `with` closes the handle as soon as the contents are read.
        with open(fname) as handle:
            contents = handle.read()
        name = os.path.basename(fname)
        # Remove the extension as a suffix. str.strip(".tex") would instead
        # strip any of the characters '.', 't', 'e', 'x' from BOTH ends and
        # mangle names such as "text.tex" (-> "") or "exit.tex" (-> "i").
        if name.endswith(suffix):
            name = name[:-len(suffix)]
        tex[name] = contents
    return tex

In [6]:
# Load the contents of every collected file into a dict keyed by a name derived
# from the file name (see read_files).
tex = read_files(fnames)

In [8]:
# Two-column frame built from the `tex` dict: its keys become "eq_id" and its
# values become "tex".
tex_df = pd.DataFrame.from_dict(tex, orient="index").reset_index()
tex_df.columns = ["eq_id", "tex"]

# Frame built from `eq_idx`; the positional index is exposed as "matrix_index"
# next to the stored value, named "eq_id".
# NOTE(review): presumably eq_idx lists equation ids in embedding-row order; confirm.
eq_idx_df = pd.DataFrame(eq_idx)
eq_idx_df = eq_idx_df.reset_index()
eq_idx_df.columns = ["matrix_index", "eq_id"]

# Default (inner) join: only equation ids present in both frames survive.
all_data = tex_df.merge(eq_idx_df, on="eq_id")

In [9]:
class Query:
    """Nearest-neighbour lookup over the rows of an embedding matrix.

    Parameters
    ----------
    term_matrix : numpy array
        Matrix whose rows are compared with each other by cosine distance.
    docs : sequence
        Stored on the instance; not used by the methods of this class.
    docs_dict : mapping
        Maps a row index of ``term_matrix`` to a record (a dict) that has at
        least the keys ``"eq_id"`` and ``"tex"``.
    dictionary : mapping
        Maps a query key to a record with a ``"matrix_index"`` entry.
    cols : list
        Stored as ``self.columns``; not used by the methods of this class.
    k : int
        Number of neighbours to return per query.
    """

    def __init__(self, term_matrix, docs, docs_dict, dictionary, cols, k):
        self.term_matrix = term_matrix
        self.docs = docs
        self.docs_dict = docs_dict
        self.dictionary = dictionary
        self.columns = cols
        self.k = k

    def _get_terms(self, vals):
        """Turn ``(row_index, distance)`` pairs into annotated records.

        Each pair is traced to stdout (index, then distance). Indices that
        have no entry in ``docs_dict`` are reported with "couldn't find" and
        skipped. Returned records are copies carrying the distance under
        ``"sim"``, so the records held in ``docs_dict`` are never modified.
        """
        terms = []
        for i, v in vals:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(i)
            print(v)
            try:
                record = self.docs_dict[i]
            except LookupError:
                # Only a missing index is a skippable miss; anything else
                # should surface rather than be silently swallowed.
                print("couldn't find")
                continue
            term = dict(record)  # copy: do not write "sim" into shared data
            term["sim"] = v
            terms.append(term)
        return terms

    def query(self, query):
        """Return the nearest neighbours of the row registered under ``query``.

        Returns a dict ``{"neighbors": [...]}`` with one entry per neighbour
        that could be resolved through ``docs_dict``, ordered by increasing
        cosine distance. The value reported under "Similarity" is that
        distance (1 - cosine similarity), so smaller means closer.

        Raises ``KeyError`` when ``query`` is not a key of ``dictionary``.
        """
        idx = self.dictionary[query]["matrix_index"]
        vec = self.term_matrix[idx]

        idc, vals = self._vectorized_query(self.term_matrix, vec, self.k)

        # argpartition only selects the k smallest; order them explicitly.
        ranked = sorted(zip(idc, vals), key=lambda pair: pair[1])
        terms = self._get_terms(ranked[:self.k])

        return {"neighbors": [{"Equation Number": {"data": t["eq_id"]},
                               "Equation": {"data": t["tex"], "fmt": "math"},
                               "Similarity": {"data": t["sim"]}} for t in terms]}

    def _vectorized_query(self, term_matrix, word_vector, k):
        """Return the indices and cosine distances of the ``k`` closest rows.

        The two returned lists are parallel and NOT sorted. When ``k`` is at
        least the number of rows, every row is returned.
        """
        dots = np.dot(term_matrix, word_vector)
        l2norms = np.sqrt(((term_matrix**2).sum(1)[:, None]) * ((word_vector**2).sum(0)))
        cosine_dists = 1 - (dots[:, None] / l2norms)
        cosine_dists = cosine_dists.reshape(dots.shape)

        n_rows = cosine_dists.shape[0]
        if k < n_rows:
            nearest = np.argpartition(cosine_dists, k)[:k]
        else:
            # np.argpartition raises ValueError when kth is out of bounds, so
            # with k >= n_rows simply hand back every row.
            nearest = np.arange(n_rows)
        vals = cosine_dists[nearest]
        return list(nearest.flatten()), list(vals.flatten())

In [11]:
# Two lookup tables over the merged frame, one record (dict) per row:
#   dictionary: eq_id        -> remaining columns (including "matrix_index")
#   docs_dict : matrix_index -> remaining columns (including "eq_id" and "tex")
by_eq_id = all_data.set_index("eq_id")
dictionary = by_eq_id.to_dict(orient="index")
by_matrix_index = all_data.set_index("matrix_index")
docs_dict = by_matrix_index.to_dict(orient="index")
docs = all_data["eq_id"].values

# Query engine over the loaded embedding matrix, returning 20 neighbours per lookup.
result_columns = ["Equation Number", "Equation", "Similarity"]
q = Query(term_matrix=eq_svd_embed,
          docs=docs,
          docs_dict=docs_dict,
          dictionary=dictionary,
          cols=result_columns,
          k=20)

In [13]:
# example to test the query class
# q.query("34.4.E1")

1903
-2.22044604925e-16
1964
0.0210971188021
1948
0.268002445877
1963
0.420839909388
1951
0.433733005577
1960
0.448092056264
1906
0.493639021766
1914
0.509739864248
1962
0.523163178191
1956
0.531895432868
1911
0.562738061451
1908
0.563731774396
1927
0.599017938176
1905
0.619401755963
1912
0.620847878529
1942
0.642720042712
9387
0.653925401092
couldn't find
1945
0.660732612001
1957
0.661001793891
9385
0.663568084146
couldn't find


{'neighbors': [{'Equation': {'data': '\\[\\begin{Bmatrix}j_{1}&j_{2}&j_{3}\\\\\nl_{1}&l_{2}&l_{3}\\end{Bmatrix}=\\sum_{m_{r}m^{\\prime}_{s}}(-1)^{l_{1}+m^{\\prime%\n}_{1}+l_{2}+m^{\\prime}_{2}+l_{3}+m^{\\prime}_{3}}\\*\\begin{pmatrix}j_{1}&j_{2}&j%\n_{3}\\\\\nm_{1}&m_{2}&m_{3}\\end{pmatrix}\\begin{pmatrix}j_{1}&l_{2}&l_{3}\\\\\nm_{1}&m^{\\prime}_{2}&-m^{\\prime}_{3}\\end{pmatrix}\\begin{pmatrix}l_{1}&j_{2}&l_%\n{3}\\\\\n-m^{\\prime}_{1}&m_{2}&m^{\\prime}_{3}\\end{pmatrix}\\begin{pmatrix}l_{1}&l_{2}&j_%\n{3}\\\\\nm^{\\prime}_{1}&-m^{\\prime}_{2}&m_{3}\\end{pmatrix},\\]',
    'fmt': 'math'},
   'Equation Number': {'data': '34.4.E1'},
   'Similarity': {'data': -2.2204460492503131e-16}},
  {'Equation': {'data': '\\[\\begin{pmatrix}j_{1}&j_{2}&j_{3}\\\\\nm_{1}&m_{2}&m_{3}\\end{pmatrix}\\begin{Bmatrix}j_{1}&j_{2}&j_{3}\\\\\nl_{1}&l_{2}&l_{3}\\end{Bmatrix}=\\sum_{m^{\\prime}_{1}m^{\\prime}_{2}m^{\\prime}_{3%\n}}(-1)^{l_{1}+l_{2}+l_{3}+m^{\\prime}_{1}+m^{\\prime}_{2}+m^{\\prime}_{3}}\\begin{%\

In [14]:
from mathviz_hopper.src.table import Table
# Wrap the Query object in a mathviz_hopper Table.
# NOTE(review): 8082 is presumably the port of the table's local server (the
# recorded output shows "Listening on http://localhost:8082/") — confirm
# against the Table API.
t = Table(q, 8082)
# Show the table in the notebook; judging by the recorded output this call also
# starts the local server and blocks until interrupted — confirm.
t.print_ipython()

Bottle v0.13-dev server starting up (using MyWSGIRefServer())...
Listening on http://localhost:8082/
Hit Ctrl-C to quit.

