In [1]:
import pandas as pd
import numpy as np
from utils import read_file, tokenize_latex
from MulticoreTSNE import MulticoreTSNE as TSNE


In [2]:
import cPickle as pickle
eq_idx = pickle.load(open('data/numbered_embeddings/eq-idx.pkl', 'rb'))
eq_svd_embed = pickle.load(open('data/numbered_embeddings/eq-svd-embed.pkl', 'rb'))

In [7]:
tsne = TSNE(n_jobs=4)
Y = tsne.fit_transform(eq_svd_embed)

In [10]:
type(Y)

numpy.ndarray

In [36]:
df_embeddings = pd.DataFrame(Y)
df_embeddings = df_embeddings.reset_index()
df_embeddings.columns = ["matrix_index", "x", "y"]

In [13]:
import fnmatch
import os
def read_file_names_recursive(folder):
    matches = []
    for root, dirnames, filenames in os.walk(folder):
        for filename in fnmatch.filter(filenames, '*.tex'):
            matches.append(os.path.join(root, filename))
    return matches

In [14]:
fnames = read_file_names_recursive('data/numbered_embeddings/raw/')

In [15]:
def read_files(fnames):
    tex = {}
    for fname in fnames:
        f = open(fname).read()
        name = fname.split("/")[-1]
        name = name.strip(".tex")
        tex[name] = f
    return tex

In [16]:
tex = read_files(fnames)

In [18]:
tex_df = pd.DataFrame.from_dict(tex, orient="index")
tex_df = tex_df.reset_index()
tex_df.columns = ["eq_id", "tex"]

eq_idx_df = pd.DataFrame(eq_idx).reset_index()
eq_idx_df.columns = ["matrix_index", "eq_id"]

all_data = pd.merge(tex_df, eq_idx_df, on = "eq_id")

In [45]:
all_data_tsne = pd.merge(all_data, df_embeddings, on = "matrix_index")

In [49]:
all_data_tsne.set_index("matrix_index").to_json("tnse_embeddings_dev.json", orient = "index")

In [23]:
class Query:
    def __init__(self, term_matrix, docs, docs_dict, dictionary, cols, k):
        self.term_matrix = term_matrix
        self.docs = docs
        self.docs_dict = docs_dict
        self.dictionary = dictionary
        self.columns = cols
        self.k = k
        
    def _get_terms(self, vals):
        terms = []
        for i, v in vals:
            print i
            print v
            try:
                term = self.docs_dict[i]
                term["sim"] = v
                terms.append(term)
            except:
                print "couldn't find"
        return terms
    def query(self, query):
        idx = self.dictionary[query]["matrix_index"]
        vec = self.term_matrix[idx]
        
        idc, vals = self._vectorized_query(self.term_matrix, vec, self.k)
        
        zipped_vals= zip(idc, vals)
        vals = sorted(zipped_vals, key = lambda x: x[1])
        terms = self._get_terms(vals[:self.k])
        
        neighbors = {"neighbors":[{"Equation Number": {"data": t["eq_id"]}, 
                                   "Equation": {"data": t["tex"], "fmt": "math"}, 
                                   "Similarity": {"data": t["sim"]}} for t in terms]} if terms else {"neighbors": []}
        return neighbors
    
    def _vectorized_query(self, term_matrix, word_vector, k):
        dots = np.dot(term_matrix, word_vector)
        l2norms = np.sqrt(((term_matrix**2).sum(1)[:,None])*((word_vector**2).sum(0)))
        cosine_dists = 1 - (dots[:,None]/l2norms)
        cosine_dists = cosine_dists.reshape(dots.shape)
        idx = np.argpartition(cosine_dists, k)
        vals = cosine_dists[idx[:k]]
        return list(idx[:k].flatten()), list(vals.flatten())

In [9]:
dictionary = all_data.set_index("eq_id").to_dict(orient="index")
docs_dict = all_data.set_index("matrix_index").to_dict(orient="index")
docs = all_data["eq_id"].values
q = Query(eq_svd_embed, docs, docs_dict, dictionary, ["Equation Number", "Equation", "Similarity"], 20)

In [10]:
# example to test the query class
# q.query("34.4.E1")

In [11]:
from mathviz_hopper.src.table import Table
t = Table(q, 8082)
t.print_ipython()

Bottle v0.13-dev server starting up (using MyWSGIRefServer())...
Listening on http://localhost:8082/
Hit Ctrl-C to quit.



127.0.0.1 - - [30/Nov/2017 14:35:37] "GET /settings HTTP/1.1" 200 66552


1940
2.22044604925e-16
9757
0.242520940229
couldn't find
6014
0.293176570681
9607
0.323091544838
couldn't find
9751
0.326740846326
couldn't find
9628
0.328974937774
couldn't find
6044
0.358224020396
160
0.372227817294
couldn't find
2933
0.375386893327
couldn't find
3050
0.386785411577
couldn't find
2957
0.390718633748
couldn't find
9638
0.391783308004
couldn't find
9619
0.414696348145
couldn't find
9643
0.417240847909
couldn't find
9630
0.422801293442
couldn't find
1953
0.439818573342
1376
0.441832376911
couldn't find
2367
0.44388785208
9608
0.44582946706
couldn't find
6109
0.453595860428


127.0.0.1 - - [30/Nov/2017 14:36:53] "POST /query HTTP/1.1" 200 1344


1935
-2.22044604925e-16
1909
0.238906299084
1939
0.249774933412
1921
0.407510533003
1916
0.480905458775
1938
0.499298593425
1925
0.506173849835
1922
0.506494255391
1907
0.559306015947
1905
0.616536069529
9286
0.621120431949
couldn't find
1926
0.635947554754
1915
0.640381115758
8477
0.643421881434
couldn't find
1547
0.650526518799
couldn't find
8145
0.657087954521
couldn't find
7414
0.666379917974
couldn't find
1910
0.670100640646
1952
0.671222635974
1533
0.671498512588
couldn't find


127.0.0.1 - - [30/Nov/2017 14:37:23] "POST /query HTTP/1.1" 200 6796
