In [1]:
import pandas as pd
from common.clean import replace
from common.katex import katex, load_display_scripts
from common.heuristics import test_for_suitable, split_high_level_eqs, split_exprs
from common.tokenize import tokenize
import sys
import csv
import json

def process_row(r, writer=None):
    d = {}
    d['eq_id'] = r[0]
    d['eq'] = r[1]
    d['clean'] = replace(r[1])
    d['clean_split'] = split_high_level_eqs(d['clean'])
    if d['clean_split'] is None or len(d['clean_split']) < 2:
        return None
    suitable = test_for_suitable(d['clean_split'])
    if suitable is None:
        return None
    d['clean_split_filtered'] = suitable
    d['clean_split_filtered_tokenized'] = [tokenize(e) for e in d['clean_split_filtered']]
    if writer:
        writer.write(json.dumps(d)+'\n')
        
    return d

load_display_scripts()

In [2]:
tokenize('\\frac{1}{T} f = x \, \n \\tab \macro y')
# ret = []
# ret_s = ''
# for c in 'f = x \, \n \\tab \macro y':
#     print(c)
#     ret.append(c)
#     ret_s += c
# ret

['\\frac',
 '{',
 '1',
 '}',
 '{',
 'T',
 '}',
 'f',
 '=',
 'x',
 '\\,',
 '\\tab',
 '\\macro',
 'y']

In [3]:
def process_row(r, writer=None):
    d = {}
    d['eq_id'] = r[0]
    d['eq'] = r[1]
    d['clean'] = replace(d['eq'])
    d['clean_expressions'] = split_exprs(d['clean'])
    if not d['clean_expressions']:
        return None
    d['clean_expressions_split'] = []
    for exp in d['clean_expressions']:
        split_eqs = split_high_level_eqs(exp)
        if split_eqs:
            d['clean_expressions_split'].append(split_eqs)
    
    if len(d['clean_expressions_split']) < 2:
        return None
    
    d['clean_split_filtered'] = []
    for exp in d['clean_expressions_split']:
        suitable = test_for_suitable(exp)
        if not suitable:
            return None
        d['clean_split_filtered'].append(suitable)
        
    return d
#     d['clean_expressions_split'] = split_high_level_eqs(d['clean_expressions'])


In [4]:
found = 0
with open('eqs_100k.tsv', 'rb') as reader:
    for i, row in enumerate(reader):
        # The equations were saved in the json-encoded format, which double the number of \\ 
        # escapes that we want.
        row = row.decode("unicode_escape")
        row = row.split('\t', 1)[1]
        rowcp = row

        row = replace(row)

        tokenized = tokenize(row)

        ses = split_exprs(tokenized)
        if ses:
            
            for expr in ses:
                se = split_high_level_eqs(expr)

                suit = test_for_suitable(se)
                if suit:
                    if found > 15:
                        print("\nnew\n")
                        print(row)
                        katex(row)._ipython_display_()
                    found += 1
                    if found > 21:
                        raise GetOutOfLoop
                    for s in suit:
                        if found > 15:
                            print(s)
                            katex(" ".join(s))._ipython_display_()       

['\\frac', '{', 'L', '^', '{', '3', '/', '2', '}', '}', '{', '1', '2', '\\pi', '}', '+', '\\frac', '{', '\\sqrt', '{', 'L', '}', '}', '{', '\\pi', '}']


['s', '\\,', '\\sqrt', '{', 'b', '}', '\\;', '\\;', '\\;', '\\textrm', '{', 'a', 'n', 'd', '}', '\\;', '\\;', '\\;', 'R']


['\\frac', '{', '2', '\\pi', '}', '{', '\\sqrt', '{', 'L', '}', '}', '-', '\\frac', '{', '\\pi', '\\sqrt', '{', 'L', '}', '}', '{', '2', '}']



new

\bigcup_{i \in I} \Omega_i : {\mathcal{X}} {\dashrightarrow} {\mathcal{Y}},  \left( \bigcup_{i \in I} \Omega_i \right)(x):=
\bigcup_{i \in I} \Omega_i(x) \text{ for all } x \in {\mathcal{X}}.



['\\left', '(', '\\bigcup', '_', '{', 'i', '\\in', 'I', '}', '\\Omega', '_', 'i', '\\right', ')', '(', 'x', ')', ':']


['\\bigcup', '_', '{', 'i', '\\in', 'I', '}', '\\Omega', '_', 'i', '(', 'x', ')', '\\text', '{', 'f', 'o', 'r', 'a', 'l', 'l', '}', 'x', '\\in', '{', '\\mathcal', '{', 'X', '}', '}', '.']



new

\textstyle p_0-1=\frac{2\delta}{\delta+1} (p-1),  \frac{1}{p}= \frac{1}{2p_0}+\frac{1}{2p_1}.



['\\textstyle', 'p', '_', '0', '-', '1']


['\\frac', '{', '2', '\\delta', '}', '{', '\\delta', '+', '1', '}', '(', 'p', '-', '1', ')']



new

\textstyle p_0-1=\frac{2\delta}{\delta+1} (p-1),  \frac{1}{p}= \frac{1}{2p_0}+\frac{1}{2p_1}.



['\\frac', '{', '1', '}', '{', 'p', '}']


['\\frac', '{', '1', '}', '{', '2', 'p', '_', '0', '}', '+', '\\frac', '{', '1', '}', '{', '2', 'p', '_', '1', '}', '.']



new


\label{}
{\mathfrak J}=\frac{\vec{\eta} \cdot \vec{\xi} + \Omega\vec{\xi} \cdot \vec{\xi}}{\vec{\eta}\cdot \vec{\eta} + \Omega\vec{\eta} \cdot \vec{\xi}}
=
\frac{{\mathfrak R}^2\Omega - \omega}{\Omega\omega -1}.




['\\frac', '{', '\\vec', '{', '\\eta', '}', '\\cdot', '\\vec', '{', '\\xi', '}', '+', '\\Omega', '\\', 'v', 'e', 'c', '{', '\\xi', '}', '\\cdot', '\\vec', '{', '\\xi', '}', '}', '{', '\\vec', '{', '\\eta', '}', '\\cdot', '\\vec', '{', '\\eta', '}', '+', '\\Omega', '\\', 'v', 'e', 'c', '{', '\\eta', '}', '\\cdot', '\\vec', '{', '\\xi', '}', '}']


['\\frac', '{', '{', '\\mathfrak', 'R', '}', '^', '2', '\\Omega', '-', '\\omega', '}', '{', '\\Omega', '\\', 'o', 'm', 'e', 'g', 'a', '-', '1', '}', '.']



new

2-\frac{\nu}{z_1}-\frac{\nu}{z_2}=1\mbox{with}\frac{\nu}{z_3}\geq1



['2', '-', '\\frac', '{', '\\nu', '}', '{', 'z', '_', '1', '}', '-', '\\frac', '{', '\\nu', '}', '{', 'z', '_', '2', '}']


['1', '\\mbox', '{', 'w', 'i', 't', 'h', '}', '\\frac', '{', '\\nu', '}', '{', 'z', '_', '3', '}', '\\geq', '1']



new

 = (s+1)^2+ s(s+1)=dim_{\mathbb{R}} \  \mathbf {so}(s+1,s+1)



NameError: name 'GetOutOfLoop' is not defined

In [None]:
# test: "../data/eqs_100k.tsv" 'aligned_ex.csv'
if len(sys.argv) < 3:
    print("usage: python identify_and_tokenize.py /path/to/tsv outpath/filename.csv")
else:
    filename = sys.argv[1]
    outpath = sys.argv[2]

with open(filename, 'r') as csvfile:
    with open(outpath, "w+") as outfile:
        reader = csvfile #csv.reader(csvfile, quoting=csv.QUOTE_MINIMAL, delimiter='\t') 
        for i, row in enumerate(reader):
            row = row.split('\t', 1)
            if i == 0:
                continue
            process_row(row, outfile)

            if i % 100000 == 0:
                print("{i} rows processed".format(i=i))


