In [None]:
import re
import os

In [None]:

# LaTeX math environments to extract. Starred variants (e.g. 'align*') are
# not matched unless they are listed here explicitly.
environments = ['equation', 'align', 'gather', 'multline', 'flalign']

# Pattern for one complete environment: group 1 is the environment name and
# group 2 is its body (non-greedy). The \1 backreference forces \end{...} to
# name the same environment as \begin{...}. Use it with re.DOTALL so the body
# can span several lines.
# Names are escaped so that regex metacharacters (such as the '*' in
# 'align*') are matched literally instead of acting as quantifiers.
env_regex = (
    r'\\begin\{('
    + '|'.join(re.escape(name) for name in environments)
    + r')\}(.*?)\\end\{\1\}'
)

# Find every match of a LaTeX environment pattern in a text
def extract_latex_envs(text, regex):
    """Return all non-overlapping matches of `regex` in `text`.

    The pattern is applied with re.DOTALL, so '.' also matches newlines.
    The result has the shape produced by `findall`: a list of tuples when
    the pattern has several groups, otherwise a list of strings.
    """
    pattern = re.compile(regex, re.DOTALL)
    return pattern.findall(text)

# Strip \label{...} commands from a piece of LaTeX
def remove_labels(text):
    """Return `text` with every \\label{...} occurrence deleted.

    The match is non-greedy and ends at the first closing brace; it does
    not cross line breaks.
    """
    label_pattern = re.compile(r'\\label\{.*?\}')
    without_labels = label_pattern.sub('', text)
    return without_labels

# Load a .tex file into memory
def read_tex_file(file_path):
    """Return the full contents of `file_path`, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as tex_file:
        contents = tex_file.read()
    return contents

# Save each extracted environment to its own file
def save_extracted_content(content, output_dir, base_filename):
    """Write every extracted environment to '<base_filename>_<n>.txt'.

    Args:
        content: sequence of (environment name, body) pairs.
        output_dir: directory that receives the files; it is created if it
            does not exist yet.
        base_filename: prefix of the output file names; files are numbered
            from 1 in the order of `content`.

    Each file holds the environment re-wrapped in \\begin{...}/\\end{...}
    with \\label{...} commands removed from the body.
    """
    # Create the target directory only when there is something to write.
    # An empty output_dir means the current directory and needs no creation.
    if content and output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for index, (env, math) in enumerate(content, start=1):
        # The captured body usually starts and ends with a newline, and a
        # removed label can leave a whitespace-only line behind. Strip the
        # edges so the output has no blank line after \begin{...} and always
        # has \end{...} on a line of its own.
        cleaned_math = remove_labels(math).strip()
        body = f'{cleaned_math}\n' if cleaned_math else ''
        file_path = os.path.join(output_dir, f'{base_filename}_{index}.txt')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(f'\\begin{{{env}}}\n{body}\\end{{{env}}}\n')

# Entry point: extract environments from every .tex file in a directory
def process_tex_files(directory, output_directory):
    """Extract the environments of each .tex file and save them.

    Entries of `directory` whose name does not end in '.tex' are skipped,
    as are files in which the environment pattern finds nothing. For every
    other file the matches are saved under `output_directory`, using the
    file name without its extension as prefix, and a message is printed.
    """
    for entry in os.listdir(directory):
        if not entry.endswith('.tex'):
            continue

        source_text = read_tex_file(os.path.join(directory, entry))
        matches = extract_latex_envs(source_text, env_regex)
        if not matches:
            continue

        stem = os.path.splitext(entry)[0]
        save_extracted_content(matches, output_directory, stem)
        print(f'Extracted content from {entry} saved in {output_directory}')

# Run the extraction on the .tex files in 'papers/' and write the results
# back into the same directory as .txt files.
# Change the first argument to read from another directory and the second
# argument to save the output files somewhere else.
process_tex_files('papers/', 'papers/')
