In [162]:
import re
import os
from typing import List, Tuple
import subprocess

from tqdm import tqdm

In [165]:
# LaTeX display-math environments that should be harvested
environments = [
    'equation',
    'align',
    'gather',
    'multline',
    'flalign',
]

# Pattern for any of the environments above, starred (unnumbered) or not.
# Group 1 captures the environment name without the star, group 2 the body.
env_regex = r'\\begin\{(%s)\*?\}(.*?)\\end\{\1\*?\}' % '|'.join(environments)

def extract_inline_math_expressions(latex_content: str) -> List[str]:
    """
    Extracts math expressions delimited by $...$ or $$...$$ and formats them.

    Expressions shorter than 3 characters, or starting with '^', are discarded.
    Every kept expression is returned wrapped in single dollar signs, regardless
    of whether it was written with $...$ or $$...$$ in the source.

    Args:
    latex_content (str): A string containing the content of a LaTeX file.

    Returns:
    List[str]: The formatted math expressions, in order of appearance.
    """
    # The $$...$$ alternative has to be tried first. With the single-dollar
    # alternative first, the opening "$$" is matched as an empty $...$ expression
    # and the body of the display math is never captured.
    # "." does not match newlines here, so only single-line expressions are found.
    math_pattern = r'\$\$(.*?)\$\$|\$(.*?)\$'

    formatted_math_expressions = []
    for match in re.finditer(math_pattern, latex_content):
        # Exactly one of the two groups participates in a match
        expr = match.group(1) or match.group(2) or ''

        # Drop trivial matches and stray superscripts
        if len(expr) < 3 or expr.startswith('^'):
            continue

        # Collapse every doubled backslash into a single one.
        # NOTE(review): this also rewrites the LaTeX line break "\\" - confirm intended.
        formatted_math_expressions.append("$" + expr.replace("\\\\", "\\") + "$")

    return formatted_math_expressions

def extract_latex_envs(text: str, regex: str) -> List[Tuple[str, str]]:
    """
    Finds every occurrence of the given environment pattern in a LaTeX text.

    Args:
    text (str): The LaTeX source to search.
    regex (str): The regular expression describing the environments to find.

    Returns:
    List[Tuple[str, str]]: One entry per match, holding the captured groups of the
                           pattern (environment type and content for a two-group pattern).
    """
    # DOTALL lets "." cross line breaks, so multi-line environment bodies are captured
    compiled_pattern = re.compile(regex, flags=re.DOTALL)
    return compiled_pattern.findall(text)

def remove_labels(text: str) -> str:
    r"""
    Strips every \label{...} command from a LaTeX string.

    The label argument is matched lazily up to the first closing brace on the
    same line.

    Args:
    text (str): The LaTeX text to clean.

    Returns:
    str: The text without its \label{...} commands.
    """
    label_pattern = re.compile(r'\\label\{.*?\}')
    return label_pattern.sub('', text)


def remove_percent_signs(text: str) -> str:
    """
    Deletes every '%' character from the text.

    Only the character itself is dropped; whatever follows it stays in place.

    Args:
    text (str): The text to clean.

    Returns:
    str: The text without any '%' characters.
    """
    return text.replace('%', '')


def read_tex_file(file_path: str) -> str:
    """
    Reads the content of a .tex file, trying different encodings if necessary.

    The encodings are tried from strictest to most permissive:
    UTF-8 (a leading byte order mark is dropped), then windows-1252, then
    ISO-8859-1. ISO-8859-1 maps every byte to a character, so it has to be last;
    any encoding listed after it would never be tried.

    Args:
    file_path (str): The path to the .tex file.

    Returns:
    str: The content of the .tex file.

    Raises:
    UnicodeDecodeError: If none of the encodings can decode the file.
    """
    # 'utf-8-sig' also decodes plain UTF-8, it merely strips a BOM when present
    encodings = ['utf-8-sig', 'windows-1252', 'ISO-8859-1']
    last_error = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError as error:
            last_error = error
    # UnicodeDecodeError needs its five positional arguments; building it from a
    # bare message would raise TypeError instead of the intended error.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Could not decode file {file_path} with any of the specified encodings.",
    )

def get_next_file_number(output_dir: str) -> int:
    """
    Computes the number to use for the next "equation_<n>.txt" file.

    Args:
    output_dir (str): The directory holding the already written output files.

    Returns:
    int: One more than the highest number found among the existing
         "equation_<n>.txt" files, and never less than 1.
    """
    next_number = 1
    for entry in os.listdir(output_dir):
        if not (entry.startswith('equation_') and entry.endswith('.txt')):
            continue
        # The number sits between the first underscore and the following dot
        number_text = entry.split('_')[1].split('.')[0]
        try:
            candidate = int(number_text) + 1
        except ValueError:
            # Not a numbered equation file; ignore it
            continue
        if candidate > next_number:
            next_number = candidate
    return next_number

def preprocess_equation(equation: str) -> str:
    """
    Flattens a LaTeX equation into a single, whitespace-normalised line.

    Every run of whitespace (newlines included) becomes one space, and leading
    and trailing whitespace is dropped.

    Args:
    equation (str): The LaTeX equation to preprocess.

    Returns:
    str: The preprocessed, single-line LaTeX equation.
    """
    # split() without arguments cuts on whitespace runs and discards empty edges
    tokens = equation.split()
    return ' '.join(tokens)

def save_extracted_content(content: List[Tuple[str, str]], output_dir: str) -> None:
    """
    Appends the extracted LaTeX environments to "equations.txt", one per line.

    For each entry, labels and percent signs are removed from the content and the
    content is flattened to a single line. The line written to the file is the
    content wrapped in \\begin{...} / \\end{...} tags rebuilt from the environment
    type of the entry.

    Args:
    content (List[Tuple[str, str]]): The extracted LaTeX content, where each tuple contains
                                     the environment type and its content.
    output_dir (str): The existing directory that holds (or will hold) "equations.txt".
    """
    file_path = os.path.join(output_dir, 'equations.txt')
    # Append mode creates the file when it does not exist yet, so there is no
    # need to scan the directory to choose between writing and appending.
    with open(file_path, 'a', encoding='utf-8') as file:
        for env, math in content:
            # Remove LaTeX labels & percent signs and preprocess the equation
            cleaned_math = remove_percent_signs(remove_labels(math))
            single_line_math = preprocess_equation(cleaned_math)

            # One equation per line, including its environment tags
            file.write(f'\\begin{{{env}}}{single_line_math}\\end{{{env}}}\n')
        
        
def save_extracted_inline_content(content: List[str], output_dir: str) -> None:
    """
    Appends the valid inline math expressions to "inline_equations.txt", one per line.

    For each expression, labels and percent signs are removed and the expression
    is flattened to a single line. Only expressions accepted by
    is_valid_latex_math are written.

    NOTE(review): is_valid_latex_math is not defined in the visible part of this
    file; presumably it validates the expression by trying to render it - confirm.

    Args:
    content (List[str]): The list of extracted inline math expressions.
    output_dir (str): The existing directory that holds (or will hold) "inline_equations.txt".
    """
    file_path = os.path.join(output_dir, 'inline_equations.txt')
    # Append mode creates the file when it does not exist yet, so there is no
    # need to scan the directory to choose between writing and appending.
    with open(file_path, 'a', encoding='utf-8') as file:
        for math in content:
            # Remove LaTeX labels & percent signs and preprocess the equation
            cleaned_math = remove_percent_signs(remove_labels(math))
            single_line_math = preprocess_equation(cleaned_math)

            # Keep only expressions that pass the validity check
            if is_valid_latex_math(expr=single_line_math):
                file.write(single_line_math + '\n')


def process_tex_files(directory: str, output_directory: str) -> None:
    """
    Extracts and saves the math found in every .tex file of a directory.

    For each .tex file, the LaTeX environments matched by env_regex and the
    $...$ / $$...$$ expressions are extracted; each non-empty result is handed
    to its save function.

    Args:
    directory (str): The directory whose .tex files are processed (not recursive).
    output_directory (str): The directory passed on to the save functions.
    """
    tex_names = (name for name in os.listdir(directory) if name.endswith('.tex'))
    for tex_name in tex_names:
        source = read_tex_file(os.path.join(directory, tex_name))

        # Environments such as equation, align and so on
        found_environments = extract_latex_envs(source, env_regex)
        if found_environments:
            save_extracted_content(found_environments, output_directory)

        # $...$ and $$...$$ expressions
        found_inline = extract_inline_math_expressions(source)
        if found_inline:
            save_extracted_inline_content(found_inline, output_directory)
                
def save_description(equations_path: str) -> None:
    """
    Writes a README.txt stating how many equations each output file contains.

    The lines of "equations.txt" and "inline_equations.txt" are counted (one
    equation per line) and the two counts are written to "README.txt" in the
    same directory. A file that does not exist counts as 0 equations.

    Args:
    equations_path (str): The path to the directory containing the equations files.
                          A trailing slash is optional; an empty string means the
                          current directory.
    """

    def count_lines(filename: str) -> int:
        """
        Counts the number of lines in a given file.

        Args:
        filename (str): The path to the file whose lines are to be counted.

        Returns:
        int: The number of lines in the file, or 0 if the file does not exist.
        """
        try:
            # The equation files are written as UTF-8, so read them the same way
            with open(filename, 'r', encoding='utf-8') as file:
                return sum(1 for _ in file)
        except FileNotFoundError:
            return 0

    # Count first, so that a failure cannot leave a half-written README behind
    equations_count = count_lines(filename=os.path.join(equations_path, 'equations.txt'))
    inline_count = count_lines(filename=os.path.join(equations_path, 'inline_equations.txt'))

    # Constructing the full path for the README file
    readme_path = os.path.join(equations_path, 'README.txt')

    # Writing the line counts to the README file
    with open(readme_path, 'w', encoding='utf-8') as file:
        file.write('## ---- Each line has a single equation ---- ## \n \n')
        file.write(f'The "equations.txt" file contains: {equations_count} equations.\n')
        file.write(f'The "inline_equations.txt" file contains: {inline_count} equations.')

def clear_output_path(output_directory: str) -> None:
    """
    Deletes every file located directly inside the output directory.

    Subdirectories and their contents are left untouched.

    Args:
    output_directory (str): The directory to empty. A trailing slash is optional.
    """
    for filename in os.listdir(output_directory):
        # os.path.join works with or without a trailing slash on the directory
        file_path = os.path.join(output_directory, filename)
        # os.remove cannot delete directories, so only files and links are removed
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.remove(file_path)


In [None]:
# Start from an empty output folder, then harvest the equations of every paper
equations_path = 'equations/'
paper_titles = os.listdir('papers')

clear_output_path(equations_path)
for title in tqdm(paper_titles):
    paper_path = os.path.join('papers', title)
    # Only paper folders are processed; stray files such as .DS_Store are skipped
    if not os.path.isdir(paper_path):
        continue
    process_tex_files(directory=paper_path, output_directory=equations_path)
save_description(equations_path)

  0%|          | 2/6587 [04:19<208:47:51, 114.15s/it]

In [155]:
def find_all_packages(directory: str):
    r"""
    Collects the LaTeX package names used by the .tex files of a directory.

    Every \usepackage{...} argument (with or without an [options] part) is split
    on commas after removing spaces. A name is kept only if it is non-empty,
    contains no '/', tab, newline, backslash, '%', '{' or '}', and contains no
    numeric character.

    Args:
    directory (str): The path to the directory containing .tex files.

    Returns:
    set: A set of unique package names used across all .tex files in the directory.
    """
    usepackage_pattern = re.compile(r'\\usepackage(?:\[[^\]]*\])?{([^}]*)}')
    forbidden_characters = ('/', '\t', '\n', '\\', '%', '{', '}')

    def is_plain_package_name(name: str) -> bool:
        """Tells whether a candidate string looks like a plain package name."""
        if name == '':
            return False
        if any(character in name for character in forbidden_characters):
            return False
        return not any(character.isnumeric() for character in name)

    unique_packages = set()
    for entry in os.listdir(directory):
        # Only .tex files are inspected
        if not entry.endswith('.tex'):
            continue
        source = read_tex_file(os.path.join(directory, entry))
        for argument in usepackage_pattern.findall(source):
            # One \usepackage may list several comma-separated packages
            candidates = argument.replace(' ', '').split(',')
            unique_packages.update(name for name in candidates if is_plain_package_name(name))

    return unique_packages


In [159]:
paper_titles = os.listdir('papers')

# Gather the package names used by all papers
all_packages = set()
for title in tqdm(paper_titles):
    paper_path = os.path.join('papers', title)
    if os.path.isdir(paper_path): # in case there is a .DS_Store (or other hidden file)
        all_packages.update(find_all_packages(directory=paper_path))

# Explicit UTF-8 so that non-ASCII names cannot fail on platforms with another
# default encoding; sorted so that the file content does not depend on the
# iteration order of the set.
with open('equations/packages.txt', 'w', encoding='utf-8') as file:
    file.writelines(f'{package}\n' for package in sorted(all_packages))

100%|██████████| 6587/6587 [00:03<00:00, 2165.36it/s]


In [158]:
# Notebook cell: evaluates the collected package names as a list for display.
# NOTE(review): the recorded output below is a single number, which looks like
# it came from a different expression (e.g. a length) - confirm.
list(all_packages)

1334