In [1]:
from feedparser import parse
from time import sleep
from pandas import DataFrame, read_csv
from pathlib import Path
from requests import get
from requests.exceptions import HTTPError
from re import escape, compile, search
from zipfile import ZipFile
from huggingface_hub import hf_hub_download
from json import dump, load

In [2]:
output_dir = Path("../data/pdf_extraction_data")
output_dir.mkdir(parents=True, exist_ok=True)

max_results = 50
sort_by = "relevance"
sort_order = "descending"

base_url = "http://export.arxiv.org/api/query"

years = ("2014", "2015", "2016", "2017", "2018")

categories = ("cs.LG","cs.CV","cs.CL",                  # Computer Science
              "astro‑ph","cond‑mat.mtrl‑sci", "hep‑ph", # Physics
              "math.AG", "math.PR", "math.NT",          # Mathematics
              "stat.ML", "stat.AP", "stat.ME",          # Statistics
              "eess.SP", "eess.SY", "eess.IV",          # EE & Systems Science
              "q‑bio.NC", "q‑bio.GN", "q‑bio.PE",       # Quantitative Biology
              "q‑fin.PR", "q‑fin.TR", "q‑fin.RM",       # Quantitative Finance
              "econ.EM", "econ.GN", "econ.TH")          # Economics

In [3]:
pdf_url = "https://arxiv.org/pdf/{arxiv_id}.pdf"
src_url = "https://arxiv.org/e-print/{arxiv_id}"

pdf_dir = Path(output_dir) / Path("pdf")
pdf_dir.mkdir(parents=True, exist_ok=True)

annotation_dir = Path(output_dir) / Path("annotation")
annotation_dir.mkdir(parents=True, exist_ok=True)

arxiv_api_response_path = Path(output_dir) / Path("arxiv_api_response.json")
metadata_path = Path(output_dir) / Path("metadata.csv")

In [4]:
def query_arxiv(category, year, max_results, sort_by, sort_order):
    """Query arXiv API for given category/year"""
    q = f"cat:{category}+AND+submittedDate:[{year}0101+TO+{year}1231]"
    url = (f"{base_url}?search_query={q}"
           f"&max_results={max_results}"
           f"&sortBy={sort_by}&sortOrder={sort_order}")
    sleep(3)
    return parse(url)

def extract_arxiv_ids(feed):
    """Extract arXiv IDs from API response"""
    entries = feed.entries
    arxiv_ids = []
    for entry in entries:
        arxiv_id = entry.id.split('/')[-1].split('v')[0]
        arxiv_ids.append(arxiv_id)
    return arxiv_ids

def download_file(url, output_path, chunk=8192):
    """Download file with progress tracking"""
    if not output_path.is_file():
        print(f"Downloading: {url}")
        try:
            r = get(url, stream=True)
            r.raise_for_status()
            with output_path.open("wb") as fh:
                for chunk_ in r.iter_content(chunk):
                    fh.write(chunk_)
            return True
        except HTTPError as e:
            print(e)
            return False
    else:
        return True

In [5]:
if not arxiv_api_response_path.is_file():
    doc_metadata = []

    for year in years:
        for category in categories:
            print(f"Querying {category} ({year})...")
            feed = query_arxiv(category, year, max_results, sort_by, sort_order)
            arxiv_ids = extract_arxiv_ids(feed)
            for arxiv_id in arxiv_ids:
                doc_metadata.append({
                    "arxiv_id": arxiv_id,
                    "year": year,
                    "category": category,
                    "annotation": None
                })
    
    with open(arxiv_api_response_path, 'w') as f:
        dump(doc_metadata, f)
else:
    with open(arxiv_api_response_path, 'r') as f:
        doc_metadata = load(f)

Querying cs.LG (2014)...
Querying cs.CV (2014)...
Querying cs.CL (2014)...
Querying astro‑ph (2014)...
Querying cond‑mat.mtrl‑sci (2014)...
Querying hep‑ph (2014)...
Querying math.AG (2014)...
Querying math.PR (2014)...
Querying math.NT (2014)...
Querying stat.ML (2014)...
Querying stat.AP (2014)...
Querying stat.ME (2014)...
Querying eess.SP (2014)...
Querying eess.SY (2014)...
Querying eess.IV (2014)...
Querying q‑bio.NC (2014)...
Querying q‑bio.GN (2014)...
Querying q‑bio.PE (2014)...
Querying q‑fin.PR (2014)...
Querying q‑fin.TR (2014)...
Querying q‑fin.RM (2014)...
Querying econ.EM (2014)...
Querying econ.GN (2014)...
Querying econ.TH (2014)...
Querying cs.LG (2015)...
Querying cs.CV (2015)...
Querying cs.CL (2015)...
Querying astro‑ph (2015)...
Querying cond‑mat.mtrl‑sci (2015)...
Querying hep‑ph (2015)...
Querying math.AG (2015)...
Querying math.PR (2015)...
Querying math.NT (2015)...
Querying stat.ML (2015)...
Querying stat.AP (2015)...
Querying stat.ME (2015)...
Querying eess.

In [6]:
if not metadata_path.is_file():
    metadata_df = DataFrame(doc_metadata)
    arxiv_ids = metadata_df["arxiv_id"].astype(str)
    
    txt_zip = hf_hub_download(
        repo_id="liminghao1630/DocBank",       
        filename="DocBank_500K_txt.zip",       
        cache_dir="../.hf_cache",
        repo_type="dataset")                 

    pattern = compile("|".join(escape(x) for x in arxiv_ids))

    with ZipFile(txt_zip) as z:
        for name in z.namelist():
            if name.endswith("_0.txt") and pattern.search(name):
                search_arxiv_id = search(r'tar_(.*?)\.gz', name)
                if search_arxiv_id:
                    arxiv_id = search_arxiv_id.group(1)
                    metadata_df.loc[metadata_df["arxiv_id"]==arxiv_id, "annotation"] = name
                    z.extract(name, path=annotation_dir)

    metadata_df = metadata_df.loc[metadata_df["annotation"].notnull()].reset_index(drop=True)

    for i, row in metadata_df.iterrows():
        arxiv_id = row["arxiv_id"]
        pdf_path = pdf_dir / Path(f"{arxiv_id}.pdf")
        download_successful = download_file(url = pdf_url.format(arxiv_id=arxiv_id), 
                                                output_path = pdf_path)
        
        if download_successful:
            metadata_df.loc[metadata_df["arxiv_id"] == arxiv_id, "pdf_path"] = str(pdf_path)
        else:
            metadata_df.loc[metadata_df["arxiv_id"] == arxiv_id, "pdf_path"] = None
            metadata_df.loc[metadata_df["arxiv_id"] == arxiv_id, "annotation"] = None

    metadata_df.dropna(ignore_index=True, inplace=True)
    metadata_df.to_csv(metadata_path, index=False)

else:
    metadata_df = read_csv(metadata_path)

Downloading: https://arxiv.org/pdf/1401.1880.pdf
Downloading: https://arxiv.org/pdf/1403.7100.pdf
Downloading: https://arxiv.org/pdf/1401.0733.pdf
Downloading: https://arxiv.org/pdf/1401.2416.pdf
Downloading: https://arxiv.org/pdf/1401.2804.pdf
Downloading: https://arxiv.org/pdf/1401.2871.pdf
Downloading: https://arxiv.org/pdf/1401.5311.pdf
Downloading: https://arxiv.org/pdf/1401.7713.pdf
Downloading: https://arxiv.org/pdf/1403.1687.pdf
Downloading: https://arxiv.org/pdf/1401.0660.pdf
Downloading: https://arxiv.org/pdf/1403.0801.pdf
Downloading: https://arxiv.org/pdf/1403.6636.pdf
Downloading: https://arxiv.org/pdf/1401.2611.pdf
Downloading: https://arxiv.org/pdf/1401.3057.pdf
Downloading: https://arxiv.org/pdf/1401.3713.pdf
Downloading: https://arxiv.org/pdf/1401.0364.pdf
Downloading: https://arxiv.org/pdf/1401.0429.pdf
Downloading: https://arxiv.org/pdf/1401.1167.pdf
Downloading: https://arxiv.org/pdf/1401.1555.pdf
Downloading: https://arxiv.org/pdf/1401.1692.pdf
Downloading: https:/

In [7]:
print(f"Total {len(metadata_df)} files")

Total 145 files


In [8]:
metadata_df["category_"] = metadata_df["category"].apply(lambda cat:cat.split(".")[0])
metadata_df["category_"].value_counts()

category_
cs      50
stat    33
math    29
eess    19
econ    14
Name: count, dtype: int64

In [9]:
metadata_df["year"].value_counts()

year
2017    34
2014    32
2016    28
2018    26
2015    25
Name: count, dtype: int64