In [15]:
import os
import shutil
import tarfile 
import re
import logging

from tqdm import tqdm
import arxiv

from src.Categories import ArxivCategories

In [16]:
# Category tags known to the project's ArxivCategories helper. See
# https://info.arxiv.org/help/api/user-manual.html#query_details for how to
# use a tag inside an arXiv search query.
# NOTE(review): the list displayed below contains 'qr-qc'; arXiv's archive id
# is presumably 'gr-qc' -- verify the tag table in src.Categories.
category_tags = ArxivCategories().available_category_tags()
category_tags

['astro-ph',
 'cond-mat',
 'qr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'math-ph',
 'nlin',
 'nucl-ex',
 'nucl-th',
 'physics',
 'quant-ph',
 'math',
 'CoRR',
 'q-bio',
 'q-fin',
 'stat']

In [17]:
# Set up logging
# The root logger is configured at DEBUG, so records emitted by imported
# libraries (e.g. the arxiv and urllib3 lines in the cell output) are shown
# alongside this notebook's own messages.
logging.basicConfig(level=logging.DEBUG)

# Function to sanitize file paths
def sanitize_filename(name: str) -> str:
    """
    Turn an arbitrary title into a string that is safe to use as a file name.

    Every whitespace character (space, tab, newline, ...) is replaced by an
    underscore, then every character that is not a word character or a hyphen
    is dropped. Titles may contain line breaks, so it is not enough to replace
    literal spaces only: any other whitespace would otherwise end up verbatim
    in the file name.

    Parameters:
    name (str): The raw text, e.g. a paper title.

    Returns:
    str: The sanitized name. It can be empty if `name` holds no word
    characters, hyphens or whitespace.
    """
    # One underscore per whitespace character keeps the result identical to
    # the previous behaviour for titles that only contain plain spaces.
    underscored = re.sub(r'\s', '_', name)
    return re.sub(r'[^\w-]', '', underscored)

def clean_directory(folder: str) -> None:
    """
    Recursively clean a directory, keeping only .tex files.

    Every entry directly inside `folder` is inspected. Real sub-directories
    are cleaned recursively and removed if they end up empty. Entries whose
    name ends in '.tex' and that resolve to a regular file are kept. Everything
    else (other files, symbolic links to directories, dangling links, special
    files) is unlinked.

    Symbolic links are never descended into: the link itself is removed, so
    nothing outside `folder` is touched even if the tree contains links that
    point elsewhere.

    Parameters:
    folder (str): The path to the directory to be cleaned.

    Raises:
    OSError: If `folder` cannot be listed or an entry cannot be removed.
    """
    # Snapshot the listing first so entries are not deleted while the
    # directory iterator is still open.
    with os.scandir(folder) as scan:
        entries = list(scan)

    for entry in entries:
        if entry.is_dir(follow_symlinks=False):
            # A real directory (not a link to one): clean it, then drop it if
            # nothing survived.
            clean_directory(entry.path)
            if not os.listdir(entry.path):
                os.rmdir(entry.path)
        elif not (entry.name.endswith('.tex') and entry.is_file()):
            # Not a .tex file: remove it. For a symlink this unlinks the link
            # only, never its target.
            os.remove(entry.path)

# Number of papers to download
N_papers = 100

# Root folder that receives one sub-folder of .tex sources per paper.
folder_path = 'papers/'
# download_source() writes straight into this folder, so it has to exist
# before the first download.
os.makedirs(folder_path, exist_ok=True)

# Initialize arXiv client
client = arxiv.Client()

# Query arXiv
# NOTE(review): the recorded run of this cell got an empty first page for
# "cat:physics". Confirm the category expression against the arXiv API manual
# before relying on the result.
results = client.results(arxiv.Search(query="cat:physics",
                                      max_results=N_papers,
                                      sort_by=arxiv.SortCriterion.SubmittedDate))

# Download and process papers
papers_seen = 0
for paper in tqdm(results, total=N_papers):
    papers_seen += 1

    # A title without any word characters sanitizes to '', which would make
    # the per-paper folder collapse onto folder_path itself (and cleaning it
    # would wipe other papers' results). Fall back to the arXiv id.
    sanitized_title = (sanitize_filename(paper.title)
                       or sanitize_filename(paper.get_short_id()))
    paper_title_path = os.path.join(folder_path, sanitized_title)
    tar_path = paper_title_path + '.tar.gz'

    try:
        # Download the source
        paper.download_source(dirpath=folder_path, filename=os.path.basename(tar_path))

        # Extract the tar.gz file. The archive is untrusted input, so use the
        # 'data' extraction filter where this Python version provides it.
        with tarfile.open(tar_path) as archive:
            if hasattr(tarfile, 'data_filter'):
                archive.extractall(paper_title_path, filter='data')
            else:
                archive.extractall(paper_title_path)

        # Keep only the .tex files
        clean_directory(paper_title_path)

        # Remove the folder if it's empty
        if not os.listdir(paper_title_path):
            os.rmdir(paper_title_path)

    except Exception as e:
        logging.error(f"An error occurred with paper '{paper.title}': {e}")

        # Drop whatever was partially extracted. rmtree is used because the
        # folder may still hold .tex files, which a plain rmdir cannot remove.
        shutil.rmtree(paper_title_path, ignore_errors=True)

    finally:
        # The archive is only an intermediate artifact; remove it on success
        # and on failure alike.
        if os.path.exists(tar_path):
            os.remove(tar_path)

if papers_seen == 0:
    logging.warning("The arXiv query returned no papers; nothing was downloaded.")


0it [00:00, ?it/s]INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=cat:physics&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=cat:physics&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100 HTTP/1.1" 200 409
INFO:arxiv:Got empty first page; stopping generation
0it [00:00, ?it/s]
