In [7]:
import re
import os
from typing import List, Tuple

from tqdm import tqdm

In [8]:
# LaTeX display-math environments to extract.
environments = ['equation', 'align', 'gather', 'multline', 'flalign']

# Matches \begin{env}...\end{env} for every environment listed above, in its
# numbered or starred (unnumbered) form.
#   group 1: the environment name *including* the optional star, so that
#            'align*' is reported as 'align*' rather than 'align';
#   group 2: the environment body (use re.DOTALL so it can span lines).
# The closing tag must repeat group 1 exactly: \begin{align*} is only closed by
# \end{align*}, never by \end{align}.
env_regex = r'\\begin\{((?:' + '|'.join(environments) + r')\*?)\}(.*?)\\end\{\1\}'

def extract_inline_math_expressions(latex_content: str) -> List[str]:
    r"""
    Extracts dollar-delimited math expressions from LaTeX source.

    Both inline math ($...$) and display math ($$...$$) are recognised, and each
    expression is returned with the delimiters it had in the source. Inline math
    must sit on a single line; display math may span several lines. Escaped
    characters such as \$ are never treated as delimiters, and the expression
    body is returned verbatim (a \\ row separator is left intact).

    Args:
    latex_content (str): A string containing the content of a LaTeX file.

    Returns:
    List[str]: The math expressions in document order, delimiters included.
    Expressions whose body is shorter than 2 characters are skipped.
    """
    # The alternatives are ordered on purpose:
    #   1. a backslash escape (\$, \\, ...) is consumed first so that it can
    #      never open or close a math span;
    #   2. $$...$$ is tried before $...$, otherwise the leading "$$" would be
    #      taken as an empty inline expression and display math would be lost;
    #   3. $...$ restricted to a single line, which keeps a stray dollar sign
    #      from swallowing the rest of the document.
    math_pattern = re.compile(
        r'\\.'
        r'|\$\$((?:\\[\s\S]|[^$\\])+)\$\$'
        r'|\$((?:\\.|[^$\\\n])+)\$'
    )

    formatted_math_expressions = []
    for match in math_pattern.finditer(latex_content):
        display, inline = match.groups()
        if display is not None:
            delimiter, expr = '$$', display
        elif inline is not None:
            delimiter, expr = '$', inline
        else:
            # The match was only an escaped character outside of math.
            continue

        # Single-character bodies (e.g. "$x$") are considered too short to keep.
        if len(expr) >= 2:
            formatted_math_expressions.append(delimiter + expr + delimiter)

    return formatted_math_expressions

def extract_latex_envs(text: str, regex: str) -> List[Tuple[str, str]]:
    """
    Finds every occurrence of a LaTeX environment pattern in a text.

    The pattern is applied with re.DOTALL, so an environment body may span
    several lines.

    Args:
    text (str): The text to search.
    regex (str): The regular expression describing the environments; for a
                 pattern with two capture groups each result is a 2-tuple of
                 those groups (environment type and content).

    Returns:
    List[Tuple[str, str]]: One tuple per non-overlapping match, in the order
    the matches appear in the text.
    """
    pattern = re.compile(regex, flags=re.DOTALL)
    return pattern.findall(text)

def remove_labels(text: str) -> str:
    r"""
    Removes \label{...} tags from the LaTeX text.

    Each tag is matched from "\label{" up to the first closing brace on the
    same line and is deleted; the surrounding text is left untouched.

    Args:
    text (str): The LaTeX text from which to remove \label{...} tags.

    Returns:
    str: The LaTeX text with \label{...} tags removed.
    """
    # The docstring above is a raw string: "\l" is not a valid escape sequence
    # and would otherwise trigger an invalid-escape warning at compile time.
    return re.sub(r'\\label\{.*?\}', '', text)


def remove_percent_signs(text: str) -> str:
    """
    Strips every percent character (%) out of a string.

    Only the '%' characters themselves are dropped; whatever precedes or
    follows them is kept as is.

    Args:
    text (str): The text to clean.

    Returns:
    str: The input text without any '%' characters.
    """
    return text.replace('%', '')


def read_tex_file(file_path: str) -> str:
    """
    Reads the content of a .tex file, trying different encodings if necessary.

    The encodings are tried from the strictest to the most permissive:
    UTF-8 (a leading byte-order mark, if any, is dropped), then windows-1252,
    then ISO-8859-1.

    Args:
    file_path (str): The path to the .tex file.

    Returns:
    str: The content of the .tex file.

    Raises:
    UnicodeDecodeError: If the file cannot be decoded with any of the encodings.
    """
    # 'utf-8-sig' reads plain UTF-8 as well and strips a BOM instead of leaving
    # '\ufeff' in the text. ISO-8859-1 assigns a character to every byte, so it
    # has to come last: anything listed after it would never be tried.
    encodings = ['utf-8-sig', 'windows-1252', 'ISO-8859-1']
    last_error = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError as error:
            last_error = error

    # UnicodeDecodeError takes (encoding, object, start, end, reason); reuse the
    # details of the last failure and replace only the reason.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Could not decode file {file_path} with any of the specified encodings.",
    )

def get_next_file_number(output_dir: str) -> int:
    """
    Computes the number the next 'equation_<n>.txt' file should receive.

    Every entry of the output directory named 'equation_*.txt' is inspected;
    the text between the first and second underscore, cut at its first dot,
    is parsed as an integer. Entries for which that fails are ignored.

    Args:
    output_dir (str): The directory where the output files are saved.

    Returns:
    int: One more than the highest number found, or 1 when no numbered file
    exists (numbers below zero never lower the result).
    """
    def _number_of(name: str):
        # 'equation_12.txt' -> '12'; returns None when it is not an integer.
        stem = name.split('_', 2)[1].partition('.')[0]
        try:
            return int(stem)
        except ValueError:
            return None

    # Seeding with 0 makes an empty directory yield 1.
    numbers = [0]
    for entry in os.listdir(output_dir):
        if not (entry.startswith('equation_') and entry.endswith('.txt')):
            continue
        parsed = _number_of(entry)
        if parsed is not None:
            numbers.append(parsed)
    return max(numbers) + 1

def preprocess_equation(equation: str) -> str:
    """
    Flattens a LaTeX equation onto a single line.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into one
    space and whitespace at either end is dropped, giving a compact one-line
    form of the equation that is convenient for machine learning pipelines
    treating equations as character sequences.

    Args:
    equation (str): The LaTeX equation to preprocess.

    Returns:
    str: The preprocessed, single-line LaTeX equation.
    """
    # split() without arguments cuts on whitespace runs and discards the empty
    # pieces at both ends, so re-joining with one space normalises everything.
    tokens = equation.split()
    return ' '.join(tokens)

def save_extracted_content(content: List[Tuple[str, str]], output_dir: str) -> None:
    """
    Saves each extracted LaTeX environment content to individual text files in a single-line format,
    including the environment tags.

    For every (environment, body) pair the body has its labels and percent
    signs removed and is flattened onto a single line; it is then wrapped in
    the matching begin/end tags again and written, UTF-8 encoded, to its own
    'equation_<n>.txt' file. Numbering continues after the highest number
    already present in the output directory. This format is particularly
    useful for machine learning applications where equations are treated as
    sequences of characters.

    Args:
    content (List[Tuple[str, str]]): The extracted LaTeX content, where each tuple contains
                                     the environment type and its content.
    output_dir (str): The directory where the output files will be saved.
                      It is created if it does not exist yet.
    """
    # Without this, listing the directory for the next free number (and every
    # later open()) fails with FileNotFoundError on a fresh output location.
    os.makedirs(output_dir, exist_ok=True)

    file_number = get_next_file_number(output_dir)
    for env, math in content:
        # Remove LaTeX labels & percent signs and preprocess the equation
        cleaned_math = remove_percent_signs(remove_labels(math))
        single_line_math = preprocess_equation(cleaned_math)

        # Construct the full LaTeX environment string
        full_env_string = f'\\begin{{{env}}}{single_line_math}\\end{{{env}}}'

        # Construct the filename and save the preprocessed equation with environment tags
        file_name = f'equation_{file_number}.txt'
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(full_env_string)
        file_number += 1
        
def save_extracted_inline_content(content: List[str], output_dir: str) -> None:
    """
    Saves each extracted inline math expression to individual text files.

    Every expression (already enclosed in its dollar delimiters) has its labels
    and percent signs removed and is flattened onto a single line; it is then
    written, UTF-8 encoded, to its own text file. Files are named in sequence
    (equation_1.txt, equation_2.txt, etc.), continuing after the highest number
    already present in the output directory, so earlier files are not
    overwritten.

    Args:
    content (List[str]): The list of extracted inline math expressions.
    output_dir (str): The directory where the output files will be saved.
                      It is created if it does not exist yet.
    """
    # Without this, listing the directory for the next free number (and every
    # later open()) fails with FileNotFoundError on a fresh output location.
    os.makedirs(output_dir, exist_ok=True)

    file_number = get_next_file_number(output_dir)
    for math in content:
        # Remove LaTeX labels & percent signs and preprocess the equation
        cleaned_math = remove_percent_signs(remove_labels(math))
        single_line_math = preprocess_equation(cleaned_math)

        # Construct the filename and save the preprocessed expression, delimiters included
        file_name = f'equation_{file_number}.txt'
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(single_line_math)
        file_number += 1



def process_tex_files(directory: str, output_directory: str) -> None:
    """
    Processes all .tex files in the specified directory, extracting and saving LaTeX environment content.
    Also extracts and saves dollar-delimited (inline and displayed) math expressions.

    Only the directory itself is scanned, not its sub-directories. Files are
    handled in sorted name order so that repeated runs number the output files
    identically. For each file the environment matches are saved first, then
    the dollar-delimited expressions.

    Args:
    directory (str): The directory containing the .tex files.
    output_directory (str): The directory where the output files are saved.
    """
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)

        # Skip other file types, and directories that merely end in '.tex'
        # (opening those would raise instead of being ignored).
        if not filename.endswith('.tex') or not os.path.isfile(file_path):
            continue

        tex_content = read_tex_file(file_path)

        # Extracting specialized LaTeX environments equations, aligns and so on..
        extracted_content = extract_latex_envs(tex_content, env_regex)
        if extracted_content:
            save_extracted_content(extracted_content, output_directory)

        # Extracting and formatting $...$ and $$...$$ expressions
        extracted_inline_content = extract_inline_math_expressions(tex_content)
        if extracted_inline_content:
            save_extracted_inline_content(extracted_inline_content, output_directory)





In [None]:
# Run the extraction over every paper folder found under 'papers/'; all
# results are written to the shared 'equations/' directory.
equations_path = 'equations/'
paper_titles = os.listdir('papers')

for title in tqdm(paper_titles):
    paper_path = os.path.join('papers', title)
    # Plain files sitting next to the paper folders (.DS_Store or other
    # hidden files) are not papers: skip them.
    if not os.path.isdir(paper_path):
        continue
    process_tex_files(directory=paper_path, output_directory=equations_path)

 11%|█         | 416/3920 [02:02<29:41,  1.97it/s]  

In [6]:
# Quick manual check: process the .tex files of the current directory and
# write the results to 'test/'.
equations_path = 'test/'
process_tex_files('./', equations_path)