In [1]:
import pandas as pd
import numpy as np
from notebooks.utils import read_file, tokenize_latex
df = read_file("notebooks/data/1601/1601.00400.json")
df["processed"] = df["text"].apply(lambda x: tokenize_latex(x))


In [2]:
from gensim import corpora
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
equations = df["processed"].tolist()
dictionary = corpora.Dictionary(equations)

corpus = [dictionary.doc2bow(eq) for eq in equations]
corpora.MmCorpus.serialize('/tmp/equations.mm', corpus)
# our vector space model
print(corpus[1:5])

2017-11-28 14:12:20,766 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-11-28 14:12:20,768 : INFO : built Dictionary(50 unique tokens: [u'\\frac', u'0', u'Ls', u'1-Y', u'&+']...) from 4 documents (total 443 corpus positions)
2017-11-28 14:12:20,770 : INFO : storing corpus in Matrix Market format to /tmp/equations.mm
2017-11-28 14:12:20,772 : INFO : saving sparse matrix to /tmp/equations.mm
2017-11-28 14:12:20,773 : INFO : PROGRESS: saving document #0
2017-11-28 14:12:20,776 : INFO : saved 4x50 matrix, density=69.000% (138/200)
2017-11-28 14:12:20,778 : INFO : saving MmCorpus index to /tmp/equations.mm.index


[[(0, 1), (2, 2), (3, 5), (4, 30), (5, 11), (6, 1), (8, 2), (9, 30), (10, 2), (11, 4), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2), (19, 2), (20, 2), (21, 1), (22, 2), (23, 6), (24, 1), (25, 4), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 12), (40, 2), (41, 1), (42, 3), (43, 4), (44, 1), (45, 1), (46, 2)], [(0, 1), (2, 2), (3, 5), (4, 27), (5, 10), (6, 1), (8, 2), (9, 27), (10, 2), (11, 4), (12, 2), (13, 1), (14, 1), (15, 1), (17, 1), (18, 2), (19, 2), (21, 1), (22, 2), (23, 5), (24, 1), (25, 3), (26, 2), (27, 1), (28, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 10), (40, 2), (42, 3), (43, 4), (44, 1), (46, 2), (47, 1)], [(0, 1), (2, 2), (3, 5), (4, 23), (5, 8), (6, 1), (8, 2), (9, 23), (10, 2), (11, 2), (12, 2), (13, 1), (15, 1), (17, 1), (18, 2), (19, 2), (20, 1), (21, 1), (22, 2), (23, 4), (24, 1), (25, 3), (26, 2), (27, 1), (29, 1), (32, 1), 

In [3]:
from gensim import corpora, models, similarities, matutils
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
from gensim import similarities
index = similarities.Similarity("notebooks/index/", corpus_tfidf, len(dictionary.keys()), num_best = 100)

2017-11-28 14:12:20,800 : INFO : collecting document frequencies
2017-11-28 14:12:20,807 : INFO : PROGRESS: processing document #0
2017-11-28 14:12:20,813 : INFO : calculating IDF weights for 4 documents and 49 features (138 matrix non-zeros)
2017-11-28 14:12:20,816 : INFO : starting similarity index under notebooks/index/


In [4]:
docs = ["".join(eq) for eq in df["processed"].tolist()]

In [5]:
import re
def clean_exp(exp):
    exp = re.sub("\\\\begin{equation[^}]*}","" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{equation[^}]*}","" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\begin{split[^}]*}","\\\\begin{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{split[^}]*}","\\\\end{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\begin{gather[^}]*}","\\\\begin{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{gather[^}]*}","\\\\end{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\begin{align[^}]*}","\\\\begin{aligned}" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\end{align[^}]*}","\\\\end{aligned}" ,exp, flags=re.IGNORECASE)
    exp = re.sub("\\\\label{[^}]*}","" ,exp , flags=re.IGNORECASE)
    exp = re.sub("\\\\n", "",exp, flags=re.IGNORECASE)
    exp = re.sub("\\$", "",exp , flags=re.IGNORECASE)
    return exp

clean_docs = map(clean_exp, docs)

In [6]:
clean_docs

[u'\\begin{aligned}w^{m}=Ls^{m}\\end{aligned}',
 u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}',
 u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}',
 u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}']

In [7]:
class Query:
    '''
    The important thing here is that the labels on the data coincide with the labels on the columns.
    '''
    def __init__(self, index, docs, dictionary, tokenize_latex, cols):
        self.index = index
        self.docs = docs
        self.dictionary = dictionary
        self.tokenize_latex = tokenize_latex
        self.columns = cols
    def _convert_query(self, query):
        query = self.dictionary.doc2bow(self.tokenize_latex(query))
        sims = self.index[query]
        neighbors = sorted(sims, key=lambda item: -item[1])
        neighbors = {"neighbors":[{self.columns[0]: {"data": self.docs[n[0]], "fmt": "math"}, self.columns[1]: {"data": float(n[1])}} for n in neighbors]} if neighbors else {"neighbors": []}
        return neighbors
    def query(self, q):
        return self._convert_query(q)
        



In [8]:
q = Query(index, clean_docs, dictionary, tokenize_latex, ["neighbor", "similarity_score"])
q.query("\\frac")

2017-11-28 14:12:20,985 : INFO : creating matrix with 4 documents and 50 features
2017-11-28 14:12:20,988 : INFO : creating dense shard #0
2017-11-28 14:12:20,989 : INFO : saving index shard to notebooks/index/.0
2017-11-28 14:12:20,992 : INFO : saving MatrixSimilarity object under notebooks/index/.0, separately None
2017-11-28 14:12:20,998 : INFO : saved notebooks/index/.0
2017-11-28 14:12:21,000 : INFO : loading MatrixSimilarity object from notebooks/index/.0
2017-11-28 14:12:21,002 : INFO : loaded notebooks/index/.0


{'neighbors': [{'neighbor': {'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}',
    'fmt': 'math'},
   'similarity_score': {'data': 0.07059364020824432}},
  {'neighbor': {'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}',
    'fmt': 'math'},
   'similarity_score': {'data': 0.05906001478433609}},
  {'neighbor': {'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}',
    'fmt': 'math'},
   'similarity_score': {'data': 0.05068068951368332}}]}

In [9]:
from mathviz_hopper.src.table import Table
t = Table(q)
t.print_ipython()

# TODO: lambda function to convert each column
# Schema:
# also, enable non-autocomplete selections (the models should be able to handle this)
# also, add a warning if the settings are still loading, and other notifications.
# Data: [{Column A: {name: A, index: [], filter_func: lambda x : f(x)}} , {Column B: {name: B, index: [], filter_func: lambda x : f(x)}} 

Bottle v0.13-dev server starting up (using MyWSGIRefServer())...
Listening on http://localhost:8081/
Hit Ctrl-C to quit.



listening


In [10]:
import requests
r = requests.post('http://localhost:8081/query', json={"query": "\\begin{equation}\n\\Phi_{z}(L) = \\sum_{i=1}^{N} \\frac{1}{C_{i} \\times V_{\\rm max, i}} ,\n\\label{EQ1}\n\\end{equation}\n"})

{'neighbors': [{'similarity_score': {'data': 0.2762615382671356}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.2758598029613495}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.2687106728553772}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}]}


127.0.0.1 - - [28/Nov/2017 14:12:21] "POST /query HTTP/1.1" 200 914


In [11]:
r.json()

{u'neighbors': [{u'neighbor': {u'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}',
    u'fmt': u'math'},
   u'similarity_score': {u'data': 0.2762615382671356}},
  {u'neighbor': {u'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}',
    u'fmt': u'math'},
   u'similarity_score': {u'data': 0.2758598029613495}},
  {u'neighbor': {u'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}',
    u'fmt': u'math'},
   u'similarity_score': {u'data': 0.2687106728553772}}]}

In [12]:
import requests
r = requests.get('http://localhost:8081/settings')

127.0.0.1 - - [28/Nov/2017 14:12:21] "GET /settings HTTP/1.1" 200 1007


In [13]:
r.json()

{u'columns': [{u'Header': u'neighbor', u'accessor': u'neighbor'},
  {u'Header': u'similarity_score', u'accessor': u'similarity_score'}],
 u'docs': {u'\\': {u'b': {u'e': {u'g': {u'i': {u'n': {u'{': {u'a': {u'l': {u'i': {u'g': {u'n': {u'e': {u'd': {u'}': {u'\\': {u'm': {u'i': {u'n': {u'_': {u'{': {},
                      u'{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}': {u'full_word': 1},
                      u'{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}': {u'full_word': 1},
                      u'{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}': {u'full_word': 1}}}}}},
   

In [14]:

def insert(st, trie):
    i = 0
    for s in st:
        if s not in trie.keys(): 
            trie[s] = {}
        if i == 20:
            break
        trie = trie[s]
        i+=1
        
    if i == 20: 
        trie[st[i:]] = {}
        trie = trie[st[i:]]
    trie["full_word"] = 1
        
    
def construct_trie(list_of_str):
    trie = {}
    for st in list_of_str:
        insert(st, trie)
    return trie
    

In [15]:
docs_trie = construct_trie(clean_docs)


In [16]:

opts = {
    "columns": [
            {
                "Header": "Word",
                "accessor": "word"
            },
            {
                "Header": "Similarity",
                "accessor": "sim"
            }
   ],
    "port": "8081",
    "docs": docs_trie
}

In [17]:
import json
with open("mathviz_hopper/webpage/mathviz-js-components/public/settings.json", "w+") as f:
    json.dump(opts, f)

127.0.0.1 - - [28/Nov/2017 14:12:21] "GET /settings HTTP/1.1" 200 1007


IOError: [Errno 2] No such file or directory: 'mathviz_hopper/webpage/mathviz-js-components/public/settings.json'

{'neighbors': [{'similarity_score': {'data': 0.32032012939453125}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.31747668981552124}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\mu\\sum_{k=1}^{K}\\sum_{g=1}^{G}\\|s_{k}^{g}\\|_{2}+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}, {'similarity_score': {'data': 0.2810024917125702}, 'neighbor': {'fmt': 'math', 'data': u'\\begin{aligned}\\min_{L,S}\\sum_{m=1}^{M}\\sum_{i=1}^{N_{m}}&\\frac{1}{2}[max(0,1-Y_{m}^{i}(Ls^{m})^{T}X_{m}^{i})]^{2}\\\\&+\\gamma\\|L\\|_{1}+\\lambda\\|L\\|^{2}_{F}\\\\\\end{aligned}'}}]}


127.0.0.1 - - [28/Nov/2017 14:12:44] "POST /query HTTP/1.1" 200 916


In [None]:
def word_exists(trie, word):
    
    for i, s in enumerate(word):        
        if s in trie.keys(): 
            trie = trie[s]
        else: return False
            
        if i == 19: 
            break
            
    if i == 19:
        s = word[i+1:]
        if s not in trie.keys(): return False
        trie = trie[s]

    return True if 'full_word' in trie.keys() else False
        
        

In [None]:
word_exists(trie, "fdakfkajshdfjkahsdfkjhadfdfdfdffdfdsdjkfhaskj")

In [None]:
docs = ["".join(eq) for eq in df["processed"].tolist()]
docs_trie = construct_trie(docs)