# Testing ground for paper scrapers

## Pubmed

In [46]:
import pandas as pd

from paperscraper.arxiv import get_arxiv_papers_api
from paperscraper.arxiv.utils import get_query_from_keywords
from paperscraper.pubmed import get_pubmed_papers
from paperscraper.pubmed.utils import get_query_from_keywords_and_date

from src.utils import split_camel_case

In [5]:
# Prepare query
query = get_query_from_keywords_and_date(
    keywords=["active inference"], 
    start_date="2025/01/01", 
    end_date="2025/02/01")

# Scrape PubMed
papers = get_pubmed_papers(
    query=query, 
    fields=["title", "authors", "date", "journal", "doi"])

# Change dates to year
papers["date"] = pd.to_datetime(papers["date"])
papers["date"] = papers["date"].dt.year

# Format author column
papers['authors'] = papers['authors'].apply(lambda name_list: [split_camel_case(name) for name in name_list])

papers["authors"] = papers["authors"].apply(lambda name_list: ", ".join(name_list))

# Reorder and rename columns
papers = papers[["title", "authors", "journal", "date", "doi"]]
papers.columns = ["title", "authors", "where_published", "year", "doi"]

## ArXiv

In [65]:
# Prepare query
query = get_query_from_keywords(
    keywords=["active inference"], 
    start_date="2025-01-01", 
    end_date="2025-02-01")

# Scrape ArXiv
papers = get_arxiv_papers_api(
    query=query, 
    fields=["title", "authors", "date", "journal", "doi"])

# Change dates to year
papers["date"] = pd.to_datetime(papers["date"])
papers["date"] = papers["date"].dt.year

# Fill journal
papers["journal"] = ["arXiv"] * len(papers)

# Reorder and rename columns
papers = papers[["title", "authors", "journal", "date", "doi"]]
papers.columns = ["title", "authors", "where_published", "year", "doi"]

Processing all:active inference AND submittedDate:[202501010000 TO 202502010000]: 1117it [00:47, 23.33it/s]


In [70]:
papers

Unnamed: 0,title,authors,where_published,year,doi
0,Efficient LLM Inference with Activation Checkp...,"Sanghyeon Lee, Hongbeen Kim, Soojin Hwang, Gus...",arXiv,2025,10.48550/arXiv.2501.01792
1,Aging-aware CPU Core Management for Embodied C...,"Tharindu B. Hewage, Shashikant Ilager, Maria R...",arXiv,2025,10.48550/arXiv.2501.15829
2,Active and transfer learning with partially Ba...,"Sarah I. Allec, Maxim Ziatdinov",arXiv,2025,10.48550/arXiv.2501.00952
3,Dedicated Inference Engine and Binary-Weight N...,"Tse-Wei Chen, Wei Tao, Dongyue Zhao, Kazuhiro ...",arXiv,2025,10.48550/arXiv.2501.01841
4,Coded Deep Learning: Framework and Algorithm,"En-hui Yang, Shayan Mohajer Hamidi",arXiv,2025,10.48550/arXiv.2501.09849
...,...,...,...,...,...
1112,The ENUBET monitored neutrino beam and its imp...,"ENUBET collaboration, L. Halić, F. Acerbi, I. ...",arXiv,2025,10.48550/arXiv.2501.04531
1113,Measurements of the Temperature and E-mode Pol...,"T. -L. Chou, P. A. R. Ade, A. J. Anderson, J. ...",arXiv,2025,10.48550/arXiv.2501.06890
1114,Discovery of Ancient Globular Cluster Candidat...,"Katherine E. Whitaker, Sam E. Cutler, Rupali C...",arXiv,2025,10.48550/arXiv.2501.07627
1115,FAUST XX. The chemical structure and temperatu...,"J. Frediani, M. De Simone, L. Testi, L. Podio,...",arXiv,2025,10.48550/arXiv.2501.19188


In [None]:
# get_and_dump_pubmed_papers(
#     keywords=["active inference"],
#     output="data/tables/2025_04_04",
#     fields=["title", "authors", "date", "journal", "doi"],
#     start_date="2025/01/01", end_date="2025/02/01")  
    

In [104]:
from enum import Enum

class Scrapers(Enum):
    pubmed   = True
    arxiv    = True
    psyarxiv = False
    bioarxiv = False
    osf      = False
    zenodo   = False
    
archives = ["pubmed", "arxiv", "dfdf", "bioarxiv", "osf", "zenodo"]
archive_options = list(Scrapers.__members__)

supported = []
unsupported = []

for archive in archives:
    
    if archive not in archive_options:
        raise Exception(f"'{archive}' unrecgonized. Archive must be one of {archive_options}.")

    if Scrapers[archive].value is False:
        unsupported.append(archive)
    else:
        supported.append(archive)
        
assert len(unsupported) == 0, f"Archives {unsupported} are not currently supported. Currently supported archives: {supported}."


Exception: 'dfdf' unrecgonized. Archive must be one of ['pubmed', 'arxiv', 'psyarxiv', 'bioarxiv', 'osf', 'zenodo'].

In [97]:
list(Scrapers.__members__)

['pubmed', 'arxiv', 'psyarxiv', 'bioarxiv', 'osf', 'zenodo']