In [1]:
%load_ext autoreload
%autoreload 2

import json
import os
import pandas as pd
import tiktoken
import uuid

from pylatexenc.latex2text import LatexNodes2Text

# Show full cell contents when displaying frames: no column-width truncation.
pd.options.display.max_colwidth = None

In [2]:
# Identifiers used to build the on-disk location of this document.
product_name = 'mel_swan'
project_name = 'diygenomics'

original_file = '2021_Wightman-Posthuma_A_genomewide_association_study_with_112_563_individuals_identifies_new_risk_loci_for_Alzheimers_disease'
external_id = '2023_05_02_27142069922ab9506d3dg'

# Input (line-level JSON) and output (CSV) file names, all derived from external_id.
input_file = f'{external_id}.lines.json'
output_file_lines = f'truth_{external_id}.lines.csv'
output_file_chunks = f'truth_{external_id}.chunks.csv'

# Root of the data tree. Fail here with a clear message rather than later with
# an opaque TypeError from os.path.join when the variable is missing.
data_path = os.getenv('DATA_PATH')
if not data_path:
    raise RuntimeError("The DATA_PATH environment variable must be set to the root data directory.")


def file_path(*args):
    """Build a path inside this document's 'mathpix' directory.

    Args:
        *args: Path components appended after the 'mathpix' directory
            (typically a single file name).

    Returns:
        str: The components joined onto
        <data_path>/eric-client-projects/<product_name>/<project_name>/experiment-a/<original_file>/mathpix.
    """
    return os.path.join(data_path, 'eric-client-projects', product_name, project_name, 'experiment-a',
                        original_file, 'mathpix', *args)


# GPT-2 tokenizer. NOTE(review): not referenced by any cell visible here.
enc = tiktoken.get_encoding("gpt2")

# The chunking cell compares this against len() of the chunk string, so it is
# a size in characters; a chunk is closed once it exceeds this.
max_chunk_size = 6000
# Number of trailing whitespace-separated words repeated at the start of the next chunk.
overlap_size = 10

In [3]:
# Read the line-level JSON for the article. The encoding is given explicitly so
# decoding does not depend on the platform's locale default.
with open(file_path(input_file), 'r', encoding='utf-8') as file:
    raw_article = json.load(file)

In [4]:
# Flatten raw_article['pages'][*]['lines'][*] into one record per line, with
# the LaTeX markup in each line's text converted to plain text.

# A single converter is reused for every line instead of building one per line.
latex_converter = LatexNodes2Text()

line_records = [
    {
        'page_number': page['page'],
        'line_number': line['line'],
        'column_number': line['column'],
        'text': latex_converter.latex_to_text(line['text']),
    }
    for page in raw_article['pages']
    for line in page['lines']
]

# Explicit columns keep the schema stable even when the article has no lines.
df = pd.DataFrame(line_records, columns=['page_number', 'line_number', 'column_number', 'text'])

In [5]:
# Give every line a fresh random UUID and use it as the frame's index.
df = df.assign(uuid=[uuid.uuid4() for _ in df.index]).set_index('uuid')

In [6]:
# Persist the per-line table (UUID index included) next to the input file.
lines_csv_path = file_path(output_file_lines)
df.to_csv(lines_csv_path)

In [7]:
def build_chunks(lines, max_chunk_size, overlap_size):
    """Group consecutive lines into overlapping text chunks.

    Lines are joined with single spaces. As soon as the accumulated text is
    longer than ``max_chunk_size`` characters the chunk is closed, so a chunk
    may exceed the limit by up to the length of its last line. The next chunk
    starts with the last ``overlap_size`` whitespace-separated words of the
    chunk just closed.

    Args:
        lines: Iterable of ``(line_uuid, text)`` pairs in reading order.
        max_chunk_size: Character count above which a chunk is closed.
        overlap_size: Number of trailing words carried into the next chunk;
            0 (or less) disables the overlap.

    Returns:
        list[dict]: One dict per chunk with keys ``'start_line_uuid'``,
        ``'stop_line_uuid'`` and ``'text'``.
    """
    chunks = []
    current_chunk = ""
    start_line_uuid = None
    last_line_uuid = None
    # True when current_chunk holds text that is not yet part of an emitted chunk.
    has_new_text = False

    for line_uuid, text in lines:
        last_line_uuid = line_uuid

        if current_chunk == "":
            start_line_uuid = line_uuid
            current_chunk = text
        else:
            current_chunk += " " + text
        has_new_text = True

        if len(current_chunk) > max_chunk_size:
            chunks.append({'start_line_uuid': start_line_uuid, 'stop_line_uuid': line_uuid, 'text': current_chunk})
            # Guard overlap_size == 0: words[-0:] would return *all* words and
            # carry the whole chunk forward instead of nothing.
            overlap_words = current_chunk.split()[-overlap_size:] if overlap_size > 0 else []
            # The next chunk is recorded as starting at the line that closed
            # this one, although the overlap words may reach back into earlier
            # lines when that line is shorter than overlap_size words.
            start_line_uuid = line_uuid
            current_chunk = " ".join(overlap_words)
            has_new_text = False

    # Emit the remainder only if it holds text beyond the carried-over overlap;
    # otherwise the final chunk would just repeat the tail of the previous one.
    if has_new_text and current_chunk != "":
        chunks.append({'start_line_uuid': start_line_uuid, 'stop_line_uuid': last_line_uuid, 'text': current_chunk})

    return chunks


# (uuid, text) pairs in frame order; an empty frame may lack the 'text' column.
line_texts = df['text'].items() if 'text' in df.columns else ()

chunk_records = build_chunks(line_texts, max_chunk_size, overlap_size)

# Explicit columns keep the schema stable even when no chunk was produced.
df_chunks = pd.DataFrame(chunk_records, columns=['start_line_uuid', 'stop_line_uuid', 'text'])

In [8]:
# Give every chunk a fresh random UUID and use it as the frame's index.
df_chunks = df_chunks.assign(uuid=[uuid.uuid4() for _ in df_chunks.index]).set_index('uuid')

In [9]:
# Persist the chunk table (UUID index included) next to the input file.
chunks_csv_path = file_path(output_file_chunks)
df_chunks.to_csv(chunks_csv_path)