In [4]:
import os
from typing import List
import tarfile 
import re
import logging

from tqdm import tqdm
import arxiv

from src.Categories import ArxivCategories

In [5]:
# Read https://info.arxiv.org/help/api/user-manual.html#query_details on how to construct queries
category_tags = ArxivCategories().available_category_tags()
category_tags

['astro-ph',
 'cond-mat',
 'qr-qc',
 'hep-ex',
 'hep-lat',
 'hep-ph',
 'hep-th',
 'math-ph',
 'nlin',
 'nucl-ex',
 'nucl-th',
 'physics',
 'quant-ph',
 'math',
 'CoRR',
 'q-bio',
 'q-fin',
 'stat']

In [6]:
def get_arxiv_paper_ids(start_number: int, end_number: int, representation_length: int, year: int, month: int) -> List[str]:
    """
    Generate a list of formatted arXiv paper IDs within a specified range.

    This function creates arXiv paper IDs using a combination of the specified year, month, and a range of numbers. 
    The IDs are formatted to match common arXiv ID conventions.

    Parameters:
    start_number (int): The starting number in the range of paper IDs.
    end_number (int): The ending number in the range of paper IDs (inclusive).
    representation_length (int): The total length of the numerical part of the paper ID.
    year (int): The year of publication (last two digits, e.g., 23 for 2023).
    month (int): The month of publication (e.g., 12 for December, 2 for February).

    Returns:
    List[str]: A list of formatted arXiv paper IDs.
    """

    # Generate a list of numbers in the specified range, converting each to a string
    numbers = [str(num) for num in range(start_number, end_number + 1)]

    # Format each number into the arXiv paper ID format
    return [
        f"{year:02d}{month:02d}." +  # Format year and month with leading zeros
        '0' * (representation_length - len(num)) +  # Add leading zeros to the number part
        num  # Append the number
        for num in numbers
    ]

# Set up logging
#logging.basicConfig(level=logging.DEBUG)

# Function to sanitize file paths
def sanitize_filename(name: str) -> str:
    return re.sub(r'[^\w\s-]', '', name.replace(' ', '_'))

def clean_directory(folder: str) -> None:
    """
    Recursively clean a directory, keeping only .tex files.

    This function traverses through all files and subdirectories within the given folder.
    It retains files with a .tex extension and removes all other files. If a subdirectory
    is found, the function is called recursively on that subdirectory. Empty directories
    are removed.

    Parameters:
    folder (str): The path to the directory to be cleaned.
    """
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)

        # Check if the current path is a file or a directory
        if os.path.isfile(file_path):
            # Remove the file if it does not have a .tex extension
            if not filename.endswith('.tex'):
                os.remove(file_path)
        elif os.path.isdir(file_path):
            # Recursively clean the subdirectory
            clean_directory(file_path)

            # Remove the subdirectory if it is empty after cleaning
            if not os.listdir(file_path):
                os.rmdir(file_path)


In [7]:
# Number of papers to download
N_papers = 100
start_number = 210
# Initialize arXiv client
client = arxiv.Client()

for month in [3,4,5,6,7,8,9,10,11,12]:
    print("#"*10+" MONTH: ", month, "#"*10)
    # Query arXiv
    id_list = get_arxiv_paper_ids(start_number=start_number,
                                  end_number=start_number+N_papers,
                                  representation_length=5, # Check arxiv to find this for specific year,month
                                  year=23,
                                  month=month)
    results = client.results(arxiv.Search(id_list=id_list))
    
    # Download and process papers
    for paper in tqdm(results):
        folder_path = 'papers/'
        sanitized_title = sanitize_filename(paper.title)
        paper_title_path = os.path.join(folder_path, sanitized_title)
        tar_path = paper_title_path + '.tar.gz'
    
        try:
            # Download the source
            paper.download_source(dirpath=folder_path, filename=os.path.basename(tar_path))
    
            # Extract the tar.gz file
            with tarfile.open(tar_path) as file:
                file.extractall(paper_title_path)
    
            # Clean the directory
            clean_directory(paper_title_path)
    
            # Remove the tar.gz file
            os.remove(tar_path)
    
            # Remove the folder if it's empty
            if not os.listdir(paper_title_path):
                os.rmdir(paper_title_path)
    
        except Exception as e:
            logging.error(f"An error occurred with paper '{paper.title}': {e}")
    
            # Clean up any partially downloaded or extracted files
            if os.path.exists(tar_path):
                os.remove(tar_path)
            if os.path.exists(paper_title_path):
                clean_directory(paper_title_path)
                os.rmdir(paper_title_path)


##########MONTH:  3 ##########


101it [04:48,  2.86s/it]


##########MONTH:  4 ##########


101it [03:53,  2.31s/it]


##########MONTH:  5 ##########


101it [05:02,  3.00s/it]


##########MONTH:  6 ##########


101it [04:45,  2.82s/it]


##########MONTH:  7 ##########


101it [03:53,  2.31s/it]


##########MONTH:  8 ##########


101it [03:56,  2.34s/it]


##########MONTH:  9 ##########


101it [03:53,  2.31s/it]


##########MONTH:  10 ##########


101it [05:01,  2.98s/it]


##########MONTH:  11 ##########


101it [05:28,  3.25s/it]


##########MONTH:  12 ##########


101it [05:10,  3.08s/it]
