In [None]:
import base64
import json
import os
import pandas as pd
import requests
import uuid
import zipfile

# Never truncate long cell values when a DataFrame is displayed
# (None removes the column-width limit).
pd.options.display.max_colwidth = None

In [None]:
# Source document whose Mathpix export is processed by this notebook.
file_name = '2021_Wightman-Posthuma_A_genomewide_association_study_with_112_563_individuals_identifies_new_risk_loci_for_Alzheimers_disease.pdf'
# Used by later cells as the stem of the `<external_id>.tex.zip` archive and
# as the name of the folder that holds the extracted `images` directory.
external_id = '2023_05_02_27142069922ab9506d3dg'

# Root data directory, taken from the environment (None when DATA_PATH is unset).
data_path = os.getenv('DATA_PATH')


def file_path(*args):
    """Return a path below ``<DATA_PATH>/diygenomics-projects/experiment-a``.

    Parameters
    ----------
    *args : str
        Path components appended, in order, to the experiment directory.

    Returns
    -------
    str
        The joined path.

    Raises
    ------
    RuntimeError
        If ``data_path`` is None, i.e. the DATA_PATH environment variable was
        not set when this cell ran. Without this check ``os.path.join`` fails
        with a ``TypeError`` that does not mention the missing variable.
    """
    if data_path is None:
        raise RuntimeError(
            'DATA_PATH environment variable is not set; cannot build data paths'
        )
    return os.path.join(data_path, 'diygenomics-projects', 'experiment-a', *args)


# Base URL of the Mathpix API; the resource name is appended per request.
endpoint = 'https://api.mathpix.com/v3/'

# Credentials come from the environment so they never appear in the notebook.
app_id = os.getenv('MATHPIX_APP_ID')
app_key = os.getenv('MATHPIX_APP_KEY')

# Sent as HTTP headers with every request.
headers = {
    'app_id': app_id,
    'app_key': app_key,
}

# Serialized to JSON and sent as the `options_json` form field of each request.
conversion_options = {
    'formats': ['text', 'html', 'data', 'latex_styled'],
    'data_options': {
        'include_asciimath': True,
        'include_tsv': True,
        'include_svg': True,
        'include_table_html': True,
        'include_latex': True,
        'include_mathml': True,
    },
    'math_inline_delimiters': ['$', '$'],
    'rm_spaces': True,
    'enable_tables_fallback': True,
}

# `file_name` without directory or extension, spaces replaced by underscores;
# used as the per-document folder name under the experiment directory.
base_name = os.path.splitext(os.path.basename(file_name))[0].replace(' ', '_')

In [None]:
def post_image_file(image_file_path, timeout=None):
    """Upload one image to the Mathpix ``text`` endpoint and return its JSON reply.

    Parameters
    ----------
    image_file_path : str or path-like
        Path of the image to upload; it is opened in binary mode.
    timeout : float or tuple, optional
        Forwarded to ``requests.post``. The default ``None`` keeps the
        previous behaviour of waiting indefinitely for the server.

    Returns
    -------
    object
        Whatever ``Response.json()`` decodes from the reply body. The HTTP
        status code is not checked here, so an error reply from the service is
        returned to the caller like any other body.

    Raises
    ------
    OSError
        If the image file cannot be opened.
    """
    # `endpoint` is a URL, not a filesystem path, so join it as a plain string.
    url = endpoint.rstrip('/') + '/text'

    # The context manager closes the file handle even if the request fails;
    # previously one handle was leaked per uploaded image.
    with open(image_file_path, 'rb') as image_file:
        r = requests.post(
            url,
            files={'file': image_file},
            data={
                'options_json': json.dumps(conversion_options)
            },
            headers=headers,
            timeout=timeout,
        )

    return r.json()

In [None]:
# Unpack the Mathpix `.tex.zip` export into the document's `mathpix` folder.
mathpix_archive = file_path(base_name, 'mathpix', f'{external_id}.tex.zip')
mathpix_dir = file_path(base_name, 'mathpix')

with zipfile.ZipFile(mathpix_archive, 'r') as archive:
    archive.extractall(mathpix_dir)

In [None]:
# Send every extracted image to Mathpix, collecting one response per file.
# `image_data[i]` is the response for `files[i]`.
image_data = []

# os.listdir returns names in arbitrary order; sorting makes the processing
# order (and therefore the row order of the results) reproducible across runs.
files = sorted(os.listdir(file_path(base_name, 'mathpix', external_id, 'images')))

# A plain loop (rather than a comprehension) keeps the responses gathered so
# far in `image_data` if a request fails part-way through.
for file in files:
    image_file_path = file_path(base_name, 'mathpix', external_id, 'images', file)
    image_data.append(post_image_file(image_file_path))

In [None]:
# One row per image: the file name and the response obtained for it.
data = {'file_name': files, 'mathpix_response': image_data}

# Build the table, give every row a freshly generated random UUID and use that
# UUID as the index (chained calls instead of in-place mutation).
df = (
    pd.DataFrame(data)
    .assign(uuid=lambda frame: [uuid.uuid4() for _ in range(len(frame))])
    .set_index('uuid')
)

In [None]:
# Save the results table inside the extracted Mathpix folder; the uuid index is
# written as the first CSV column.
# NOTE(review): `mathpix_response` holds the objects returned by `r.json()`;
# to_csv presumably writes their str() form rather than JSON -- confirm how
# downstream readers parse this column.
results_csv_path = file_path(base_name, 'mathpix', external_id, 'image_results.csv')
df.to_csv(results_csv_path)