In [15]:
import re
import os
from typing import List, Tuple

from tqdm import tqdm

In [19]:
# LaTeX math environments to extract, listed without the trailing '*'.
environments = ['equation', 'align', 'gather', 'multline', 'flalign']

# Pattern matching each environment in its numbered or starred (unnumbered) form.
# Group 1 captures the full environment name *including* an optional '*', and the
# backreference requires \end{...} to close with exactly that name. This keeps the
# star on unnumbered environments and stops \begin{x*} from pairing with \end{x}.
# Group 2 captures the body lazily; use re.DOTALL so bodies may span several lines.
env_regex = (
    r'\\begin\{((?:' + '|'.join(re.escape(name) for name in environments) + r')\*?)\}'
    r'(.*?)'
    r'\\end\{\1\}'
)

def extract_latex_envs(text: str, regex: str) -> List[Tuple[str, str]]:
    """
    Finds every occurrence of the given environment pattern in a LaTeX source string.

    The pattern is compiled with re.DOTALL, so '.' also matches newlines and an
    environment body may span several lines.

    Args:
    text (str): The LaTeX source to search.
    regex (str): The pattern describing the environments; with two capture groups
        each match is reported as a pair of the captured strings.

    Returns:
    List[Tuple[str, str]]: One entry per non-overlapping match, in order of appearance.
    """
    pattern = re.compile(regex, flags=re.DOTALL)
    return pattern.findall(text)

def remove_labels(text: str) -> str:
    r"""
    Removes \label{} tags from the LaTeX text.

    Each tag is matched from '\label{' up to the first following '}' on the same
    line, so label names containing nested braces or line breaks are not supported.
    Only the tag itself is removed; surrounding whitespace is left as it was.

    (The docstring is a raw string because '\l' is not a valid escape sequence.)

    Args:
    text (str): The LaTeX text from which to remove \label{} tags.

    Returns:
    str: The LaTeX text with \label{} tags removed.
    """
    return re.sub(r'\\label\{.*?\}', '', text)

def read_tex_file(file_path: str) -> str:
    """
    Reads the content of a .tex file.

    The file is decoded as UTF-8 first. If it is not valid UTF-8, it is read again
    as Latin-1, which accepts any byte sequence, so a single legacy-encoded source
    does not abort a batch run with UnicodeDecodeError. Non-ASCII characters in such
    a file may be decoded incorrectly if its real encoding is neither of the two.

    Args:
    file_path (str): The path to the .tex file.

    Returns:
    str: The content of the .tex file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except UnicodeDecodeError:
        # Fallback for legacy-encoded sources; Latin-1 decoding cannot fail.
        with open(file_path, 'r', encoding='latin-1') as file:
            return file.read()

def get_next_file_number(output_dir: str) -> int:
    """
    Computes the number to give the next 'equation_<n>.txt' file in a directory.

    Only entries whose name starts with 'equation_' and ends with '.txt' are
    considered; the text between the first underscore and the following dot is
    parsed as an integer, and entries where that fails are ignored.

    Args:
    output_dir (str): The directory holding previously saved equation files.

    Returns:
    int: One more than the largest number found, or 1 when none is found.
    """
    largest = 0
    for entry in os.listdir(output_dir):
        if not (entry.startswith('equation_') and entry.endswith('.txt')):
            continue
        # 'equation_12.txt' -> '12'
        stem = entry.split('_')[1].split('.')[0]
        try:
            value = int(stem)
        except ValueError:
            # Not a numbered equation file (e.g. 'equation_x.txt'); skip it.
            continue
        if value > largest:
            largest = value
    return largest + 1

def save_extracted_content(content: List[Tuple[str, str]], output_dir: str) -> None:
    """
    Saves each extracted LaTeX environment content to individual text files.

    Files are named 'equation_<n>.txt', continuing from the highest number already
    present in the output directory, which is created if it does not exist yet.
    Each file holds one environment: the \\begin line, the body with \\label{} tags
    and blank lines removed, and the \\end line.

    Args:
    content (List[Tuple[str, str]]): The extracted LaTeX content, as pairs of
        environment name and environment body.
    output_dir (str): The directory where the output files will be saved.
    """
    # Create the target directory up front so a first run does not fail on a missing folder.
    os.makedirs(output_dir, exist_ok=True)
    file_number = get_next_file_number(output_dir)
    for env, math in content:
        cleaned_math = remove_labels(math)
        # A blank line is a paragraph break, which is not allowed inside a math
        # environment. The captured body usually starts with a newline, and a removed
        # label can leave an empty line behind, so whitespace-only lines are dropped.
        body_lines = [line for line in cleaned_math.splitlines() if line.strip()]
        output_lines = [f'\\begin{{{env}}}', *body_lines, f'\\end{{{env}}}']
        file_name = f'equation_{file_number}.txt'
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(output_lines) + '\n')
        file_number += 1

def process_tex_files(directory: str, output_directory: str) -> None:
    """
    Processes all .tex files in the specified directory, extracting and saving LaTeX environment content.

    Only regular files directly inside the directory are considered (no recursion),
    and they are handled in sorted name order so that the numbering of the saved
    equation files is the same on every run.

    Args:
    directory (str): The directory containing the .tex files.
    output_directory (str): The directory where the output files will be saved.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible numbering.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.tex'):
            continue
        file_path = os.path.join(directory, filename)
        # Skip directories (or other non-files) that merely carry a .tex suffix.
        if not os.path.isfile(file_path):
            continue
        tex_content = read_tex_file(file_path)
        extracted_content = extract_latex_envs(tex_content, env_regex)
        if extracted_content:
            save_extracted_content(extracted_content, output_directory)




In [18]:
# Where the extracted equations are written, and the entries found under 'papers'.
equations_path = 'equations/'
paper_titles = os.listdir('papers')

# Each sub-directory of 'papers' is treated as one paper; a progress bar tracks the run.
for title in tqdm(paper_titles):
    paper_path = os.path.join('papers', title)
    if not os.path.isdir(paper_path):
        # Not a paper folder, e.g. a .DS_Store or another stray hidden file.
        continue
    process_tex_files(directory=paper_path, output_directory=equations_path)

100%|██████████| 10/10 [00:00<00:00, 296.67it/s]

Extracted content from lower.tex saved in equations/
Extracted content from deep_learning_nonclassicality.tex saved in equations/
Extracted content from Manuscript.tex saved in equations/
Extracted content from main1.tex saved in equations/
Extracted content from main_arxiv.tex saved in equations/
Extracted content from main.tex saved in equations/
Extracted content from main_ArXiV.tex saved in equations/
Extracted content from main.tex saved in equations/
Extracted content from main.tex saved in equations/
Extracted content from appendix.tex saved in equations/



