In [185]:
import os
import re
import subprocess
import shutil
from datetime import datetime
from typing import List

import requests
import tarfile
from bs4 import BeautifulSoup

def math_scraper(input_names: List[str], output_name: str) -> None:
    """
    Reads the input LaTeX file and extracts all equations, align, and $$ environments
    into the output file, with each math environment on a separate line.

    Args:
        input_names (List[str]): A list of the names of the input LaTeX files.
        output_name (str): The name of the output text file.

    Returns:
        None

    Example Usage:
        >>> math_scraper(['input1.tex', 'input2.tex'], 'output.txt')
    """
    # Open the output file for writing
    with open(output_name, 'w') as output_file:

        # Define regular expressions for equation, align, and $$ environments
        equation_regex = re.compile(r'\\begin\{equation\}(.*?)\\end\{equation\}', re.DOTALL)
        equation_star_regex = re.compile(r'\\begin\{equation\*\}(.*?)\\end\{equation\*\}', re.DOTALL)

        align_regex = re.compile(r'\\begin\{align\}(.*?)\\end\{align\}', re.DOTALL)
        align_star_regex = re.compile(r'\\begin\{align\*\}(.*?)\\end\{align\*\}', re.DOTALL)

        multiline_regex = re.compile(r'\\begin\{multiline\}(.*?)\\end\{multiline\}', re.DOTALL)
        multiline_star_regex = re.compile(r'\\begin\{multiline\*\}(.*?)\\end\{multiline\*\}', re.DOTALL)

        gather_regex = re.compile(r'\\begin\{gather\}(.*?)\\end\{gather\}', re.DOTALL)
        gather_star_regex = re.compile(r'\\begin\{gather\*\}(.*?)\\end\{gather\*\}', re.DOTALL)

        aligned_regex = re.compile(r'\\begin\{aligned\}(.*?)\\end\{aligned\}', re.DOTALL)

        # Find matches within each input file and write them to the output file
        for idx, input_name in enumerate(input_names):
            print(f"Extracting latex equations from file nr: {idx+1}, named: {input_name}...")
            with open(input_name, 'r', encoding='utf-8') as input_file:
                try:
                    input_contents = input_file.read()
                    for match in equation_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')
                    for match in equation_star_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')

                    for match in align_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')
                    for match in align_star_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')

                    for match in multiline_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')
                    for match in multiline_star_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')

                    for match in gather_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')
                    for match in gather_star_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')

                    for match in aligned_regex.findall(input_contents):
                        output_file.write(match.strip() + '\n')
                except UnicodeDecodeError:
                    print(f"Error: File {input_name} cannot be decoded with UTF-8. Trying latin-1 encoding instead...")
                    try:
                        input_file.seek(0)  # Reset the file pointer
                        input_contents = input_file.read().encode('latin-1').decode('utf-8', 'ignore')
                        for match in equation_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')
                        for match in equation_star_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')

                        for match in align_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')
                        for match in align_star_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')

                        for match in multiline_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')
                        for match in multiline_star_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')

                        for match in gather_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')
                        for match in gather_star_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')

                        for match in aligned_regex.findall(input_contents):
                            output_file.write(match.strip() + '\n')
                    except UnicodeDecodeError:
                        print(f"Error: File {input_name} cannot be decoded with Latin-1 - skipping file..")
                        continue

            input_file.close()
    output_file.close()



In [176]:
def move_files_to_parent(download_directory: str) -> None:
    """
    Move all files in subdirectories of `download_directory` to the parent directory.

    Args:
        download_directory: The path to the directory containing the files.

    Raises:
        FileNotFoundError: If the `download_directory` does not exist.
    """
    if not os.path.exists(download_directory):
        raise FileNotFoundError(f"{download_directory} does not exist.")

    for item in os.listdir(download_directory):
        item_path = os.path.join(download_directory, item)
        if os.path.isdir(item_path):
            for file_name in os.listdir(item_path):
                file_path = os.path.join(item_path, file_name)
                new_file_path = os.path.join(download_directory, file_name)
                i = 1
                while os.path.exists(new_file_path):
                    file_name_split = os.path.splitext(file_name)
                    new_file_name = f"{file_name_split[0]}_{i}{file_name_split[1]}"
                    new_file_path = os.path.join(download_directory, new_file_name)
                    i += 1
                try:
                    shutil.move(file_path, new_file_path)
                except shutil.Error as e:
                    os.remove(file_path)
                    print(f"{e} - file deleted and skipped..")
            os.rmdir(item_path)


In [177]:
def unpack_tex_files(download_directory: str) -> None:
    """
    Deletes all files in the specified directory that are not '.tex' files or compressed '.pdf' files containing '.tex' files.
    Extracts all '.tex' files from compressed '.tar.gz' or '.tar' files to the specified directory.

    Args:
        download_directory: The path of the directory containing the downloaded files.

    Raises:
        subprocess.CalledProcessError: If the 'file' command fails to execute properly.

    """
    file_names = os.listdir(path=download_directory)

    for idx, filename in enumerate(file_names):
        if filename[-4:] != ".tex":
            print(f" ===== Handling file nr: {idx+1} with id: {filename} =====")
            full_filename = os.path.join(download_directory, filename)
            file_type_info_list = subprocess.check_output(["file", full_filename]).decode().strip().split()
            file_removed = False

            # If file_type_info_list contains 'PDF', file is either a '.pdf' or a compressed '.pdf' file.
            # We want only '.tex' files, so we delete it.
            if "PDF" in file_type_info_list:
                print(f" Deleting file nr: {idx+1} with id: {filename} - no '.tex' file found...")
                file_removed = True
                os.remove(full_filename)

            # If file_type_info_list contains 'gzip' and 'compressed', file is a compressed '.tar.gz' file.
            # We extract all '.tex' files to the specified directory and delete the original file.
            elif "gzip" in file_type_info_list and "compressed" in file_type_info_list:
                try:
                    tar = tarfile.open(full_filename, 'r:gz')
                    file_types = [os.path.splitext(os.path.basename(member.name))[1] for member in tar.getmembers()]
                    tar.close()
                    if ".tex" not in file_types:
                        print(f" Deleting file nr: {idx+1} with id: {filename} - no '.tex' file found...")
                        file_removed = True
                        os.remove(full_filename)
                    else:
                        tar = tarfile.open(full_filename, 'r:gz')
                        tar.extractall(path=download_directory, members=[m for m in tar.getmembers() if os.path.splitext(m.name)[1] == ".tex"])
                        tar.close()
                except tarfile.TarError as e:
                    print(f"{e} - file nr: {idx+1} with id: {filename} skipped an deleted...")
                    os.remove(full_filename)

            # If file_type_info_list contains 'tar', 'POSIX' and 'archive', file is a '.tar' file.
            # We extract all '.tex' files to the specified directory and delete the original file.
            elif "tar" in file_type_info_list and "POSIX" in file_type_info_list and "archive" in file_type_info_list:
                try:
                    tar = tarfile.open(full_filename, 'r:')
                    file_types = [os.path.splitext(os.path.basename(member.name))[1] for member in tar.getmembers()]
                    tar.close()
                    if ".tex" not in file_types:
                        print(f" Deleting file nr: {idx+1} with id: {filename} - no '.tex' file found...")
                        file_removed = True
                        os.remove(full_filename)
                    else:
                        tar = tarfile.open(full_filename, 'r:')
                        tar.extractall(path=download_directory, members=[m for m in tar.getmembers() if os.path.splitext(m.name)[1] == ".tex"])
                        tar.close()
                except tarfile.TarError as e:
                    print(f"{e} - file nr: {idx+1} with id: {filename} skipped an deleted...")
                    os.remove(full_filename)
            # Always remove original after.
            if not file_removed:
                os.remove(full_filename)

    # Move files from created sub-folders to parent folder
    move_files_to_parent(download_directory=download_directory)

In [178]:
def download_arxiv_papers(category: str, num_papers: int, download_dir: str) -> None:
    """
    Downloads STEM (Science, Technology, Engineering and Math) category arXiv papers in the specified category and saves them to the specified directory.

    Parameters:
        category (str): The arXiv category to download papers from.
        num_papers (int): The number of papers to download.
        download_dir (str): The directory to save the downloaded papers to.

    Returns:
        None.
    """
    # Physics, Mathematics, Computer Science, Quantitative biology, Electrical Engineering and Systems Science
    # Quantitative finance, Economics, Statistics
    STEM = ['physics','math', 'cs', 'eess', 'q-bio', 'q-fin', 'econ', 'stat']
    if category not in STEM:
        raise Exception(f'Category: {category} is not in the defined STEM categories: {STEM}')

    # Create the download directory if it doesn't exist
    if not os.path.exists(download_dir):
        # Notify the user that the download directory was created
        print(f"N.B.: Path '{download_dir}' didn't already exist, so it was created... ")
        os.makedirs(download_dir)

    # Request the arXiv category page and parse it with BeautifulSoup
    # Get the current date and time
    now = datetime.now()
    # Format the date as YY/MM
    current_formatted_date = now.strftime("%y%m")

    # Construct the URL of the arXiv category page for the current date and specified category
    url = f"https://arxiv.org/list/{category}/{current_formatted_date}"
    # Send an HTTP request to the URL and get the response
    response_1 = requests.get(url)
    # Parse the HTML content of the response with BeautifulSoup
    soup = BeautifulSoup(response_1.content, 'html.parser')
    # Construct a regular expression that matches links to the arXiv category page for the current date and specified category
    regex = re.compile(re.escape(f'/list/{category}/{current_formatted_date}'))
    # Find all links on the page that match the regular expression
    links = soup.find_all('a', href=regex)
    link = None
    # Loop over the links and find the link with the text ">all</a>"
    for l in links:
        if str(l)[-8:] == ">all</a>":
            link = l
    # Send an HTTP request to the URL of the page that contains links to individual papers in the category
    response_2 = requests.get(url+link['href'])
    # Parse the HTML content of the response with BeautifulSoup
    soup = BeautifulSoup(response_2.content, 'html.parser')

    # Extract the links to the individual papers from the page
    paper_links = soup.find_all('a', title='Abstract')

    # Raise an exception if the number of papers requested is greater than the number of papers available
    if num_papers > len(paper_links):
        raise Exception(f"{num_papers} papers requested, but only: {len(paper_links)} available in category: '{category}' in YY/MM: {current_formatted_date} ...")
    else:
        # Notify the user of the number of papers being downloaded
        print(f"======= Downloading: {num_papers} (out of {len(paper_links)}) papers in category: '{category}' in YY/MM: {current_formatted_date} =======")

    # Loop over the paper links and download the source files for each paper
    for i, link in enumerate(paper_links):
        if i >= num_papers:
            break


        # Get the URL of the paper page
        paper_url = "https://arxiv.org" + link['href']

        # Extract the paper ID from the URL
        paper_id = paper_url.split('/')[-1]

        # Construct the URL of the source files
        source_url = f"https://arxiv.org/e-print/{paper_id}"

        print("Downloading paper: ", i+1, " with id: ", paper_id, " ...")
        # Download the source files and save them to the download directory
        response_3 = requests.get(source_url)

        with open(download_dir+str(paper_id), "wb") as f:
            f.write(response_3.content)
        f.close()


In [166]:
nr_papers = 50
STEM_category = 'stat'
download_directory = "arxiv_papers/"
download_arxiv_papers(STEM_category, nr_papers, download_dir=download_directory)

[<a href="/list/stat/2303?skip=25&amp;show=25">26-50</a>, <a href="/list/stat/2303?skip=50&amp;show=25">51-75</a>, <a href="/list/stat/2303?skip=75&amp;show=25">76-100</a>, <a href="/list/stat/2303?skip=675&amp;show=25">676-698</a>, <a href="/list/stat/2303?skip=0&amp;show=10">fewer</a>, <a href="/list/stat/2303?skip=0&amp;show=50">more</a>, <a href="/list/stat/2303?show=698">all</a>, <a href="/list/stat/2303?skip=25&amp;show=25">26-50</a>, <a href="/list/stat/2303?skip=50&amp;show=25">51-75</a>, <a href="/list/stat/2303?skip=75&amp;show=25">76-100</a>, <a href="/list/stat/2303?skip=675&amp;show=25">676-698</a>, <a href="/list/stat/2303?skip=0&amp;show=10">fewer</a>, <a href="/list/stat/2303?skip=0&amp;show=50">more</a>, <a href="/list/stat/2303?show=698">all</a>, <a href="/list/stat/2303">2303</a>]
Downloading paper:  1  with id:  2303.00074  ...
Downloading paper:  2  with id:  2303.00102  ...
Downloading paper:  3  with id:  2303.00160  ...
Downloading paper:  4  with id:  2303.0017

In [179]:
move_files_to_parent(download_directory=download_directory)
unpack_tex_files(download_directory=download_directory)

 ===== Handling file nr: 112 with id: 2303.01186 =====
 ===== Handling file nr: 113 with id: 2303.01144 =====
 ===== Handling file nr: 118 with id: 2303.00203 =====
 ===== Handling file nr: 119 with id: 2303.01117 =====
 ===== Handling file nr: 121 with id: 2303.00835 =====
 ===== Handling file nr: 133 with id: 2303.01572 =====
 ===== Handling file nr: 134 with id: 2303.01129 =====
 ===== Handling file nr: 137 with id: 2303.01256 =====
 ===== Handling file nr: 141 with id: 2303.00187 =====
 Deleting file nr: 141 with id: 2303.00187 - no '.tex' file found...
 ===== Handling file nr: 145 with id: 2303.00586 =====
 ===== Handling file nr: 148 with id: 2303.01602 =====
 ===== Handling file nr: 149 with id: 2303.01406 =====
 ===== Handling file nr: 151 with id: 2303.00715 =====
 ===== Handling file nr: 152 with id: 2303.01031 =====
 ===== Handling file nr: 154 with id: 2303.00573 =====


In [186]:
file_names = [download_directory+file_name for file_name in os.listdir(download_directory) if file_name != ".DS_Store"]
output_file_name = "equations.txt"
math_scraper(input_names=file_names,output_name=output_file_name)

Extracting latex equations from file nr: 1, named: arxiv_papers/base_4.tex...
Extracting latex equations from file nr: 2, named: arxiv_papers/content_4.tex...
Extracting latex equations from file nr: 3, named: arxiv_papers/section33.tex...
Extracting latex equations from file nr: 4, named: arxiv_papers/content.tex...
Extracting latex equations from file nr: 5, named: arxiv_papers/monitoring.tex...
Extracting latex equations from file nr: 6, named: arxiv_papers/proof_factor_ti.tex...
Extracting latex equations from file nr: 7, named: arxiv_papers/proof_gmm_bn.tex...
Extracting latex equations from file nr: 8, named: arxiv_papers/supplementary.tex...
Extracting latex equations from file nr: 9, named: arxiv_papers/position_comp_3-5-2.tex...
Extracting latex equations from file nr: 10, named: arxiv_papers/new_samplers.tex...
Extracting latex equations from file nr: 11, named: arxiv_papers/proof_proposition.tex...
Extracting latex equations from file nr: 12, named: arxiv_papers/feng_tang_wa