In [1]:
%load_ext autoreload
%autoreload 2

import csv
import json
import os
import pandas as pd
import papermill as pm
import re
import scrapbook as sb
import uuid

from functions import gpt

from tqdm.notebook import tqdm_notebook
# `pandas()` is a classmethod: calling it on the class registers the
# `progress_apply` family on pandas objects without first instantiating an
# empty progress bar (which is what rendered a stray "0it" widget/output).
tqdm_notebook.pandas()

# Show full cell contents; the LaTeX strings are long and would otherwise be truncated.
pd.set_option('display.max_colwidth', None)

from pylatexenc.latexwalker import LatexWalker, LatexMathNode, LatexMacroNode, LatexGroupNode, LatexCharsNode, LatexEnvironmentNode

0it [00:00, ?it/s]

In [2]:
# Folder hierarchy below the data root, outermost first; `file_path` joins
# them in exactly this order.
project_folder = "diygenomics-projects"
sub_category = "math"
work_bucket = "AdS-CFT"

# Folder of the paper being processed, located inside the work bucket.
base_name = "2021_Hashimoto_Neural_ODE_and_holographic_QCD_PUB"

# Stem of the `<external_id>.lines.json` input file.
external_id = "2023_05_22_92dc0613b4493d7b5847g"

In [3]:
# Root of the data tree, read from the DATA_PATH environment variable.
# `os.getenv` returns None when the variable is not set.
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Build a path inside this paper's ``mathpix`` folder.

    The result is
    ``data_path/project_folder/sub_category/work_bucket/base_name/mathpix``
    followed by any extra path components given in ``args``.

    Raises:
        RuntimeError: if the DATA_PATH environment variable was not set when
            this cell ran (otherwise ``os.path.join`` would fail with an
            unhelpful ``TypeError`` about ``NoneType``).
    """
    if data_path is None:
        raise RuntimeError(
            "The DATA_PATH environment variable is not set; "
            "export it and re-run this cell."
        )
    return os.path.join(data_path, project_folder, sub_category, work_bucket,
                        base_name, 'mathpix', *args)


# Model name; alternatives tried: 'gpt-3.5-turbo', 'gpt-4'.
# NOTE(review): not referenced by the cells visible in this notebook.
model = 'gpt-4'
# NOTE(review): presumably the index column name for the output table; not
# referenced by the cells visible in this notebook.
index_col = 'uuid'

# Name of the JSON file to load, derived from the external document id.
input_file = f'{external_id}.lines.json'

In [4]:
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode non-ASCII characters in the text.
with open(file_path(input_file), 'r', encoding='utf-8') as f:
    # Later cells read data['pages'][...]['lines'][...]['text'].
    data = json.load(f)

In [5]:
# A line that consists solely of an equation label: an opening parenthesis,
# an optional single letter, digits, an optional ".digits" part and a closing
# parenthesis -- e.g. "(1)", "(12)", "(A3)", "(2.1)".
pattern = r'^\([A-Za-z]?\d+(\.\d+)?\)$'

# Stack of the lines seen so far. Lines are popped off again while searching
# backwards from a label, so this does not end up holding every line.
text_nodes = []
# (text of the line found before the label run, [labels, newest first])
captured_math = []

for page in data['pages']:
    for line in page['lines']:
        text = line['text']

        if re.match(pattern, text):
            current_annotations = [text]
            # Walk backwards over the lines already seen: directly preceding
            # labels are collected, and the first non-label line is recorded
            # as the math these labels refer to. If the stack runs dry first,
            # nothing is recorded for this label.
            while text_nodes:
                previous_line = text_nodes.pop()
                if not re.match(pattern, previous_line['text']):
                    captured_math.append((previous_line['text'], current_annotations))
                    break
                current_annotations.append(previous_line['text'])

        # Every line, label or not, goes onto the stack for later look-back.
        text_nodes.append(line)

In [69]:
# Regexes for the display-math wrappers that have to be removed once a
# multi-equation block has been split into individual equations.
# They are raw strings (no invalid "\[" escape in a normal string literal) and
# each one matches a complete LaTeX token, backslash included, together with
# the newline next to it -- so no dangling backslash is left behind and no
# backslash of the following command is swallowed.
begin_gathered_pattern = r'\\begin\{gathered\}\n'
end_gathered_pattern = r'\n\\end\{gathered\}'
begin_gathered_no_newline_pattern = r'\\begin\{gathered\}'
end_gathered_no_newline_pattern = r'\\end\{gathered\}'
begin_gathered_bracket_pattern = r'\\\[\n?'
end_gathered_bracket_pattern = r'\n?\\\]'

# Applied in this order: the newline-aware variants first, then the bare ones.
wrapper_patterns = (
    begin_gathered_pattern,
    end_gathered_pattern,
    begin_gathered_no_newline_pattern,
    end_gathered_no_newline_pattern,
    begin_gathered_bracket_pattern,
    end_gathered_bracket_pattern,
)


def strip_math_wrappers(fragment):
    """Return `fragment` with display-math brackets and gathered begin/end markers removed.

    Only the wrapper tokens (and the newline adjacent to them) are removed;
    all other whitespace in the fragment is kept as is.
    """
    for wrapper_pattern in wrapper_patterns:
        fragment = re.sub(wrapper_pattern, '', fragment)
    return fragment


tuples = captured_math

# Pass 1 -- collapse label runs.
# A block followed by k consecutive labels shows up in `captured_math` as k
# consecutive entries carrying 1, 2, ..., k labels (newest label first); only
# the first entry holds the block's own text. Keep that text and let each
# later entry of the run overwrite the label list with its more complete one.
equation_groups = []  # [math_text, labels in document order]
for math_text, annotations in tuples:
    labels = annotations[::-1]
    if len(labels) > 1 and equation_groups:
        equation_groups[-1][1] = labels
    else:
        equation_groups.append([math_text, labels])

# Pass 2 -- emit one (math, label) row per labelled equation.
modified_tuples = []
for math_text, labels in equation_groups:
    if len(labels) <= 1:
        # Single-label blocks are passed through untouched (wrappers included).
        modified_tuples.append((math_text, labels[0]))
        continue
    # Split on the LaTeX line break "\\" at most len(labels) - 1 times so that
    # there are never more fragments than labels (this used to raise
    # IndexError); any surplus line breaks stay inside the last fragment.
    # If there are fewer fragments than labels, the surplus labels are
    # dropped, as before.
    fragments = math_text.split("\\\\", len(labels) - 1)
    for fragment, label in zip(fragments, labels):
        modified_tuples.append((strip_math_wrappers(fragment), label))

In [71]:
# One freshly generated random UUID per row is used as the row key.
uuids = [uuid.uuid4() for _ in modified_tuples]

# Two columns (math text, label as printed in the paper), indexed by the UUIDs
# above; the index axis is named 'uuid'.
df = (
    pd.DataFrame(modified_tuples, columns=['math', 'paper_annotation'], index=uuids)
    .rename_axis('uuid', axis='index')
)

In [72]:
# Preview only the first rows instead of rendering the whole frame; the full
# table is written to CSV in the next cell.
df.head(10)

Unnamed: 0_level_0,math,paper_annotation
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
6998d986-fca2-4650-a4bd-4656e657fa72,\[\n\mathrm{d} s^{2}=-f(\eta) \mathrm{d} t^{2}+\mathrm{d} \eta^{2}+g(\eta)\left(\mathrm{d} x_{1}^{2}+\cdots+\mathrm{d} x_{d-1}^{2}\right)\n\],(1)
340ffe88-4db9-43d5-b5e1-fbf4eb392df0,\[\n\sqrt{|g|}=\sqrt{-\operatorname{det} g}=\sqrt{f(\eta) g(\eta)^{d-1}}\n\],(2)
52828cb3-4ebb-4c28-9e71-fcbafa108f52,\[\nS[\phi]=\frac{1}{2} \int \sqrt{|g|}\left(g^{\mu \nu} \partial_{\mu} \phi \partial_{\nu} \phi+m^{2} \phi^{2}+\frac{\lambda}{2} \phi^{4}\right) .\n\],(3)
6ab19ac6-9607-4aa4-8ffc-c37867a8799c,\[\n-\frac{1}{\sqrt{|g|}} \partial_{\mu}\left(\sqrt{|g|} g^{\mu \nu} \partial_{\nu} \phi\right)+m^{2} \phi+\lambda \phi^{3}=0\n\],(4)
13619afb-215e-4b83-ab99-0c57d8a6df11,\[\n-\partial_{\eta}^{2} \phi-\left(\partial_{\eta} \ln \sqrt{|g|}\right) \partial_{\eta} \phi+m^{2} \phi+\lambda \phi^{3}=0\n\],(5)
b557096d-7cbc-4f93-b7a7-6a1618e13023,"\[\n\begin{aligned}\n& \pi=\partial_{\eta} \phi, \\\n& \partial_{\eta} \pi+h(\eta) \pi-m^{2} \phi-\lambda \phi^{3}=0\n\end{aligned}\n\]",(6)
e48cc84e-d9ea-47bc-9a0f-fa3feddbe015,\[\nh(\eta) \equiv \partial_{\eta} \ln \sqrt{f(\eta) g(\eta)^{d-1}}\n\],(7)
7d595253-663a-41c1-bad2-49a585f3284f,\[\n\left[\frac{2}{\eta} \pi-m^{2} \phi-\lambda \phi^{3}\right]_{\eta \sim 0}=0\n\],(8)
74d665ac-0ad8-474c-95ac-9d20e1e022f3,\[\n\pi(\eta \sim 0)=0\n\],(9)
932593b2-bc00-46c0-a9a8-4ef78354a742,\[\nL^{3 / 2} \phi \sim \alpha e^{-\eta / L}+\beta e^{-3 \eta / L}-\frac{\lambda \alpha^{3}}{2 L^{2}} \eta e^{-3 \eta / L}\n\],(10)


In [73]:
# Write the table (index included) next to the input file; fields are quoted
# only where the csv module deems it necessary (QUOTE_MINIMAL).
output_csv = file_path('extracted_annotated_math.csv')
df.to_csv(output_csv, quoting=csv.QUOTE_MINIMAL)

In [None]:
# text_nodes = []

# for page in data['pages']:
#     for line in page['lines']:
#         text = line['text']
        
#         if re.match(pattern, text):
#             if previous_line is not None:
#                 if re.match(pattern, previous_line['text']):
#                     print(previous_line['text'])
        
#         previous_line = line