In [4]:
import os
import shutil
from tqdm import tqdm

import tarfile 
import arxiv

from src.Categories import ArxivCategories

In [5]:
# Read https://info.arxiv.org/help/api/user-manual.html#query_details on how to construct queries
category_tags = ArxivCategories().available_category_tags()
category_tags

['astro-ph',
 'cond-mat',
 'qr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'math-ph',
 'nlin',
 'nucl-ex',
 'nucl-th',
 'physics',
 'quant-ph',
 'math',
 'CoRR',
 'q-bio',
 'q-fin',
 'stat']

In [7]:
import logging
logging.basicConfig(level=logging.DEBUG)
N_papers = 100

client = arxiv.Client()
results = client.results(arxiv.Search(query="cat:quant-ph",
                                      max_results=N_papers,
                                      sort_by = arxiv.SortCriterion.SubmittedDate))
for paper in tqdm(results):
    folder_path = 'papers/'
    paper_title_path = os.path.join(folder_path, paper.title)
    tar_path = paper_title_path + '.tar.gz'

    # Download the source
    paper.download_source(dirpath=folder_path, filename=os.path.basename(tar_path))

    # Extract the tar.gz file
    with tarfile.open(tar_path) as file:
        file.extractall(paper_title_path)

    # Delete the tar.gz file
    os.remove(tar_path)

    # Delete everything except .tex files
    for filename in os.listdir(paper_title_path):
        file_path = os.path.join(paper_title_path, filename)
        if os.path.isfile(file_path) and not filename.endswith('.tex'):
            os.remove(file_path)
        elif os.path.isdir(file_path):  # If it's a directory
            shutil.rmtree(file_path)  # Use shutil.rmtree to remove directories

    # If the folder is empty, delete it
    if not os.listdir(paper_title_path):
        os.rmdir(paper_title_path)
      
    


0it [00:00, ?it/s]INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat:quant-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=cat:quant-ph&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100 HTTP/1.1" 200 54584
INFO:arxiv:Got first page: 100 of 142475 total results
4it [01:32, 23.14s/it]


KeyboardInterrupt: 

In [1]:
import re
import os

# Define the LaTeX environments you want to extract
environments = ['equation', 'align', 'gather', 'multline', 'flalign']

# Regular expression for matching LaTeX environments
env_regex = r'\\begin\{(' + '|'.join(environments) + r')\}(.*?)\\end\{\1\}'

# Function to extract all LaTeX environments from a text
def extract_latex_envs(text, regex):
    return re.findall(regex, text, re.DOTALL)

# Read the .tex file
def read_tex_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Save the extracted content to a new file
def save_extracted_content(content, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        for env, math in content:
            file.write(f'\\begin{{{env}}}\n{math}\\end{{{env}}}\n\n')

# Main function to process all .tex files in a directory
def process_tex_files(directory):
    for filename in os.listdir(directory):
        if filename.endswith('.tex'):
            file_path = os.path.join(directory, filename)
            tex_content = read_tex_file(file_path)
            extracted_content = extract_latex_envs(tex_content, env_regex)
            if extracted_content:
                output_file = f'extracted_{filename}'
                save_extracted_content(extracted_content, output_file)
                print(f'Extracted content saved to {output_file}')

# Replace 'your_directory_path' with the path to your .tex files
process_tex_files('papers/')


Extracted content saved to extracted_main.tex
