In [12]:
import arxiv
import re
import os
import tarfile
import shutil
from tqdm import tqdm

# Set the data path
data_path = "data"
os.makedirs(data_path, exist_ok=True)

# Define categories
categories = ["astro-ph", "cond-mat", "gr-qc", "hep-th", "math-ph", "nlin", "nucl-th", "quant-ph"]


# Function to count the number of equations in a LaTeX source
def count_equations(latex_source):
    equation_patterns = [
        r"\\begin\{equation\}",
        r"\\end\{equation\}",
        r"\$\$.*?\$\$",
        r"\\\[.*?\\\]",
        r"\$.*?\$",
        r"\\begin\{align\}",
        r"\\end\{align\}",
    ]
    equation_regex = "|".join(equation_patterns)
    return len(re.findall(equation_regex, latex_source, re.DOTALL))


# Function to clean the LaTeX source by removing headers and other details
def clean_latex_source(latex_source):
    cleaned_source = re.sub(r"\\documentclass.*?\\begin\{document\}", "", latex_source, flags=re.DOTALL)
    cleaned_source = re.sub(r"\\end\{document\}", "", cleaned_source, flags=re.DOTALL)
    return cleaned_source.strip()


# Extract the main LaTeX file from the tar.gz archive
def extract_main_latex_file(tar_path, extract_dir):
    with tarfile.open(tar_path, "r:gz") as tar:
        tar.extractall(path=extract_dir)
        for member in tar.getmembers():
            if member.isfile() and member.name.endswith(".tex"):
                return os.path.join(extract_dir, member.name)
    return None


# Fetch and filter papers
def fetch_and_filter_papers():
    for category in categories:
        search = arxiv.Search(
            query=category, max_results=100, sort_by=arxiv.SortCriterion.SubmittedDate  # Adjust this number as needed
        )
        for result in tqdm(search.results()):
            try:
                print(f"Downloading {result.title}")
                paper_id = result.entry_id.split("/")[-1]
                download_path = os.path.join(data_path, f"{paper_id}.tar.gz")
                result.download_source(dirpath=data_path, filename=f"{paper_id}.tar.gz")
                extract_dir = os.path.join(data_path, paper_id)
                os.makedirs(extract_dir, exist_ok=True)

                filtered_path_folder = os.path.join(data_path, "latex_files")
                
                # Extract the LaTeX source file from the tar.gz
                latex_file_path = extract_main_latex_file(download_path, extract_dir)
                if latex_file_path:
                    with open(latex_file_path, "r", encoding="utf-8") as file:
                        latex_content = file.read()
                        equation_count = count_equations(latex_content)
                        if equation_count > 2:  # Adjust threshold as needed
                            cleaned_content = clean_latex_source(latex_content)
                            filtered_file_path = os.path.join(filtered_path_folder, f"{paper_id}.tex")
                            with open(filtered_file_path, "w", encoding="utf-8") as clean_file:
                                clean_file.write(cleaned_content)
                            print(f"Saved {filtered_file_path} with {equation_count} equations")
                    # Clean up the extracted files
                    shutil.rmtree(extract_dir)
                os.remove(download_path)  # Remove the tar.gz file
            except Exception as e:
                print(f"Error processing {result.title}: {e}")
                if os.path.exists(download_path):
                    os.remove(download_path)  # Clean up if an error occurs
                if os.path.exists(extract_dir):
                    shutil.rmtree(extract_dir)  # Clean up if an error occurs


fetch_and_filter_papers()

  for result in tqdm(search.results()):
0it [00:00, ?it/s]

Downloading Jellyfish galaxies with the IllustrisTNG simulations -- Citizen-science results towards large distances, low-mass hosts, and high redshifts


1it [00:02,  2.01s/it]

Error processing Jellyfish galaxies with the IllustrisTNG simulations -- Citizen-science results towards large distances, low-mass hosts, and high redshifts: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Jellyfish galaxies with the IllustrisTNG simulations -- No enhanced population-wide star formation according to TNG50


2it [00:02,  1.18s/it]

Error processing Jellyfish galaxies with the IllustrisTNG simulations -- No enhanced population-wide star formation according to TNG50: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Jellyfish galaxies with the IllustrisTNG simulations -- When, where, and for how long does ram pressure stripping of cold gas occur?


5it [00:03,  2.60it/s]

Error processing Jellyfish galaxies with the IllustrisTNG simulations -- When, where, and for how long does ram pressure stripping of cold gas occur?: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Bremsstrahlung emission from nuclear reactions in compact stars
Error processing Bremsstrahlung emission from nuclear reactions in compact stars: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Spontaneous current-layer fragmentation and cascading reconnection in solar flares: II. Relation to observations
Error processing Spontaneous current-layer fragmentation and cascading reconnection in solar flares: II. Relation to observations: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Proper motion and Photometric Variability of the Candidate Propotoplanet TMR-1C
Error processing Proper motion and Photometric Variability of the Candida

10it [00:03,  6.42it/s]

Error processing Ultraviolet Spectra of Local Galaxies and their Link with the High-z Population: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Correlated variability in the blazar 3C 454.3
Error processing Correlated variability in the blazar 3C 454.3: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Biases and Uncertainties in Physical Parameter Estimates of Lyman Break Galaxies from Broad-band Photometry
Error processing Biases and Uncertainties in Physical Parameter Estimates of Lyman Break Galaxies from Broad-band Photometry: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Dynamics of a Spherical Accretion Shock with Neutrino Heating and Alpha-Particle Recombination


14it [00:03,  9.76it/s]

Error processing Dynamics of a Spherical Accretion Shock with Neutrino Heating and Alpha-Particle Recombination: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Asymptotically FRW black holes
Error processing Asymptotically FRW black holes: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Quantum Black Holes As Elementary Particles
Error processing Quantum Black Holes As Elementary Particles: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Reaction of Accretion Disks to Abrupt Mass Loss During Binary Black Hole Merger
Error processing Reaction of Accretion Disks to Abrupt Mass Loss During Binary Black Hole Merger: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading A Gamma-Ray Burst/Pulsar for Cosmic-Ray Positrons with a Dark Matter-like Spectrum
Error processing A Gamma-Ray Bur

19it [00:03, 13.20it/s]

Error processing Catastrophic Photo-z Errors and the Dark Energy Parameter Estimates with Cosmic Shear: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Stellar Ages from Stellar Rotation
Error processing Stellar Ages from Stellar Rotation: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading An Evolutionary Considerations for V228 from 47 Tuc
Error processing An Evolutionary Considerations for V228 from 47 Tuc: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Source region of the 2003 November 18 CME that led to the strongest magnetic storm of cycle 23
Error processing Source region of the 2003 November 18 CME that led to the strongest magnetic storm of cycle 23: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Further progress on solar age calibration
Error processing Further pro

26it [00:04, 16.89it/s]

Error processing Origin of Europa and the Galilean Satellites: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading High Accuracy Near-infrared Imaging Polarimetry with NICMOS
Error processing High Accuracy Near-infrared Imaging Polarimetry with NICMOS: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Active Galactic Nuclei, Radio Jets and Acceleration of UHECRs
Error processing Active Galactic Nuclei, Radio Jets and Acceleration of UHECRs: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Compressed sensing imaging techniques for radio interferometry
Error processing Compressed sensing imaging techniques for radio interferometry: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading The CHilean Automatic Supernova sEarch (CHASE)
Error processing The CHilean Automatic Supernova sEarch (C

31it [00:04, 17.94it/s]

Error processing Measuring interstellar magnetic fields by radio synchrotron emission: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Intermediate inflation on the brane
Error processing Intermediate inflation on the brane: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Friedmann cosmology with bulk viscosity: a concrete model for dark energy
Error processing Friedmann cosmology with bulk viscosity: a concrete model for dark energy: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Detecting Solar Neutrino Flare in Megaton and km^3 detectors
Error processing Detecting Solar Neutrino Flare in Megaton and km^3 detectors: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Anisotropic distribution functions for spherical galaxies
Error processing Anisotropic distribution functions

34it [00:04, 20.50it/s]

Error processing Consistency of Equations in the Second-order Gauge-invariant Cosmological Perturbation Theory: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Quantum vacuum and accelerated expansion
Error processing Quantum vacuum and accelerated expansion: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Near-IR spectroscopic ages of massive star clusters in M82
Error processing Near-IR spectroscopic ages of massive star clusters in M82: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Infall and rotation motions in the HH 111 protostellar system: A flattened envelope in transition to a disk?
Error processing Infall and rotation motions in the HH 111 protostellar system: A flattened envelope in transition to a disk?: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Untwisti

41it [00:04, 21.54it/s]

Error processing Untwisting magnetospheres of neutron stars: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Dynamic masses for the close PG1159 binary SDSSJ212531.92-010745.9
Error processing Dynamic masses for the close PG1159 binary SDSSJ212531.92-010745.9: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Long-term photometric monitoring of the hybrid subdwarf B pulsator HS0702+6043
Error processing Long-term photometric monitoring of the hybrid subdwarf B pulsator HS0702+6043: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Spectroscopy of the sdB pulsator HS2201+2610
Error processing Spectroscopy of the sdB pulsator HS2201+2610: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Multi-wavelength photometric variation of PG1605+072
Error processing Multi-wavelength photomet

47it [00:05, 15.09it/s]

Error processing Orbital resonances in discs around braneworld Kerr black holes: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading The variation of the electromagnetic coupling and quintessence
Error processing The variation of the electromagnetic coupling and quintessence: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading 3D Spectroscopic Study of the Line Emitting Regions of Mrk 493
Error processing 3D Spectroscopic Study of the Line Emitting Regions of Mrk 493: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Angular Energy Distribution of Collapsar-Jets
Error processing Angular Energy Distribution of Collapsar-Jets: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Explosions inside Ejecta and Most Luminous Supernovae
Error processing Explosions inside Ejecta and Most Luminous

50it [00:05, 17.14it/s]

Error processing Quasi-viscous accretion flow -- I: Equilibrium conditions and asymptotic behaviour: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Observations of the pulsation of the Cepheid l Car with the Sydney University Stellar Interferometer
Error processing Observations of the pulsation of the Cepheid l Car with the Sydney University Stellar Interferometer: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Jet breaks and Energetics of Swift GRB X-ray Afterglows
Error processing Jet breaks and Energetics of Swift GRB X-ray Afterglows: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Constraints on Dark Energy from the Observed Expansion of our Cosmic Horizon
Error processing Constraints on Dark Energy from the Observed Expansion of our Cosmic Horizon: cannot access local variable 'filtered_file_path' where it is not assoc

55it [00:05, 15.34it/s]

Error processing Reversal of the amplitude difference of kHz QPOs in six atoll sources: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading The origin of 'Great Walls'
Error processing The origin of 'Great Walls': cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Thermal axion constraints in non-standard thermal histories
Error processing Thermal axion constraints in non-standard thermal histories: cannot access local variable 'filtered_file_path' where it is not associated with a value


57it [00:06, 15.35it/s]

Downloading Realistic analytic model for the prompt and high latitude emission in GRBs
Error processing Realistic analytic model for the prompt and high latitude emission in GRBs: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Rapid pulsations in sub-THz solar bursts
Error processing Rapid pulsations in sub-THz solar bursts: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Photometric Properties of the Near-contact Binary GW Geminorum
Error processing Photometric Properties of the Near-contact Binary GW Geminorum: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Preheating in the Standard Model with the Higgs-Inflaton coupled to gravity


62it [00:06, 17.38it/s]

Error processing Preheating in the Standard Model with the Higgs-Inflaton coupled to gravity: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Adiabatic expansion and magnetic fields in AGN jets
Error processing Adiabatic expansion and magnetic fields in AGN jets: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Probing parsec scale jets in AGN with geodetic VLBI
Error processing Probing parsec scale jets in AGN with geodetic VLBI: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Search for the magnetic field of the O7.5 III star xi Persei
Error processing Search for the magnetic field of the O7.5 III star xi Persei: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading A New Model For Vela Jr. Supernova Remnant


67it [00:06, 20.18it/s]

Error processing A New Model For Vela Jr. Supernova Remnant: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading The magnetic field of the B3V star 16 Pegasi
Error processing The magnetic field of the B3V star 16 Pegasi: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Scenarios for GCRT J1745-3009
Error processing Scenarios for GCRT J1745-3009: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Band-power reconstruction of the primordial fluctuation spectrum by the maximum likelihood reconstruction method
Error processing Band-power reconstruction of the primordial fluctuation spectrum by the maximum likelihood reconstruction method: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Model of Reconnection of Weakly Stochastic Magnetic Field and its Testing
Error processing Model of R

73it [00:06, 21.28it/s]

Error processing Grain alignment induced by radiative torques: effects of internal relaxation of energy and complex radiation fields: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Null geodesics and observational cosmology
Error processing Null geodesics and observational cosmology: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading On Dark Energy and Dark Matter (Part I)
Error processing On Dark Energy and Dark Matter (Part I): cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Near-infrared bulge-disc correlations of lenticular galaxies
Error processing Near-infrared bulge-disc correlations of lenticular galaxies: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Is the PAMELA Positron Excess Winos?
Error processing Is the PAMELA Positron Excess Winos?: cannot access local vari

75it [00:07, 10.68it/s]


Downloading Explaining the Orbits of the Galactic Center S-Stars
Error processing Explaining the Orbits of the Galactic Center S-Stars: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading The ACS Survey of Galactic Globular Clusters. VII. Relative Ages
Error processing The ACS Survey of Galactic Globular Clusters. VII. Relative Ages: cannot access local variable 'filtered_file_path' where it is not associated with a value
Downloading Nonlinear Density Fluctuation Field Theory for Large Scale Structure


KeyboardInterrupt: 

In [None]:
os.path.join("", "aa")

'aa'