In [3]:
import re
import os
def math_scraper(input_name: str, output_name: str) -> None:
    """
    Reads the input LaTeX file and extracts all equations, align, and $$ environments
    into the output file, with each math environment on a separate line.

    Args:
    - input_name (str): The name of the input LaTeX file.
    - output_name (str): The name of the output text file.

    Returns:
    - None
    """
    # Open the input file for reading and the output file for writing
    with open(input_name, 'r') as input_file, open(output_name, 'w') as output_file:

        # Define regular expressions for equation, align, and $$ environments
        equation_re = re.compile(r'\\begin{equation}(.*?)\\end{equation}', re.DOTALL)
        align_re = re.compile(r'\\begin{align}(.*?)\\end{align}', re.DOTALL)
        dollar_re = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)

        # Find matches within the input file and write them to the output file
        input_contents = input_file.read()
        for match in equation_re.findall(input_contents):
            output_file.write(match.strip() + '\n\n')

        for match in align_re.findall(input_contents):
            output_file.write(match.strip() + '\n\n')

        for match in dollar_re.findall(input_contents):
            output_file.write(match.strip() + '\n\n')

import os
import time
import requests
from bs4 import BeautifulSoup

def download_paper(url, output_dir):
    """
    Given a URL for a paper on arXiv, downloads the paper and the .tex source file
    and saves them to the specified output directory.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    download_links = soup.find_all('a', title='Download PDF')

    if not download_links:
        print(f"No PDF download link found for {url}")
        return

    pdf_url = download_links[0]['href']
    arxiv_id = pdf_url.split('/')[-1].split('.pdf')[0]
    source_url = f"https://arxiv.org/e-print/{arxiv_id}"

    response = requests.get(source_url)
    with open(os.path.join(output_dir, f"{arxiv_id}.pdf"), 'wb') as f:
        f.write(response.content)
    time.sleep(1)

    response = requests.get(source_url, headers={'User-Agent': 'Mozilla/5.0'})
    with open(os.path.join(output_dir, f"{arxiv_id}.tex"), 'w') as f:
        f.write(response.content.decode('utf-8'))
    time.sleep(1)

def scrape_arxiv_stem_categories(categories, n, output_dir):
    """
    Given a list of arXiv STEM categories and a number 'n', scrapes the arXiv website
    for the latest papers in those categories and downloads their PDFs and .tex files
    to the specified output directory.
    """
    for category in categories:
        print(f"Scraping papers for category '{category}'...")
        url = f"https://arxiv.org/list/{category}/recent"
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        paper_links = soup.find_all('a', title='Abstract')

        for i, link in enumerate(paper_links):
            if i >= n:
                break
            paper_url = "https://arxiv.org" + link['href']
            print(f"Downloading paper {i+1} of {n} from {paper_url}...")
            download_paper(paper_url, output_dir)


In [4]:
categories = ["math", "physics", "cs", "q-bio"]
n = 10
output_dir = "arxiv_papers/"
os.makedirs(output_dir, exist_ok=True)
scrape_arxiv_stem_categories(categories, n, output_dir)

Scraping papers for category 'math'...
Downloading paper 1 of 10 from https://arxiv.org/abs/2303.13492...
No PDF download link found for https://arxiv.org/abs/2303.13492
Downloading paper 2 of 10 from https://arxiv.org/abs/2303.13490...
No PDF download link found for https://arxiv.org/abs/2303.13490
Downloading paper 3 of 10 from https://arxiv.org/abs/2303.13487...
No PDF download link found for https://arxiv.org/abs/2303.13487
Downloading paper 4 of 10 from https://arxiv.org/abs/2303.13486...
No PDF download link found for https://arxiv.org/abs/2303.13486
Downloading paper 5 of 10 from https://arxiv.org/abs/2303.13485...
No PDF download link found for https://arxiv.org/abs/2303.13485
Downloading paper 6 of 10 from https://arxiv.org/abs/2303.13481...
No PDF download link found for https://arxiv.org/abs/2303.13481
Downloading paper 7 of 10 from https://arxiv.org/abs/2303.13480...
No PDF download link found for https://arxiv.org/abs/2303.13480
Downloading paper 8 of 10 from https://arxiv