In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
import requests
import xml.etree.ElementTree as ET
from typing import *
import time

In [41]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time

def get_all_pmids(query: str, api_key: None | str, email: str = "your_email@example.com") -> list[str]:
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    retmax = 5000
    retstart = 0
    pmids = []

    # Get total count
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "xml",
        "retmax": 0,
        "email": email
    }

    if api_key:
        params["api_key"] = api_key

    try:
        r = requests.get(base_url, params=params, timeout=10)
        r.raise_for_status()

        root = ET.fromstring(r.text)
        total = int(root.findtext(".//Count"))
        print(f"Found {total} results.")

    except Exception as e:
        print(f"Failed to get count: {e}")
        return []
    
    # Fetch in pages with retry logic
    max_fails=0
    while retstart < total and max_fails<2:
        print(f"Fetching {retstart} to {retstart + retmax}...")
        retries = 2
        fetched = False

        while retries > 0 and not fetched:
            page_params = params.copy()
            page_params.update({
                "retstart": retstart,
                "retmax": retmax
            })

            try:
                r = requests.get(base_url, params=page_params, timeout=(5,10))
                r.raise_for_status()
                root = ET.fromstring(r.text)
                ids = [elem.text for elem in root.findall(".//Id")]

                if ids:
                    pmids.extend(ids)
                    retstart += retmax
                    fetched = True
                else:
                    retries -= 1
                    if retries == 0:
                        max_fails+=1
                        retstart+=retmax
                        fetched = True
                    #time.sleep(0.5)
                    print(f"No IDs returned at {retstart}, retrying...")

            except Exception as e:
                print(f"Error at retstart={retstart}: {e}")
                retries-=1
                if retries == 0:
                    max_fails+=1
                    retstart+=retmax
                    fetched = True
                
                #time.sleep(0.5)

        if max_fails==2:
            print(f"Giving up at retstart={retstart}. Stopping early.")
            break
    
    print(f"Fetched {len(pmids)}/{total} ids.")
    #time.sleep(0.5)

    return pmids

In [None]:
# Credentials are read from the environment (populated by load_dotenv() above).
# Each name now matches the variable it is loaded from; previously the two were
# crossed and the call only worked because the arguments were swapped again.
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
NCBI_EMAIL = os.getenv("email")

query = "covid-19 AND 2024[dp] AND humans[MeSH Terms]"

# Signature order is (query, api_key, email).
pmids = get_all_pmids(query, NCBI_API_KEY, NCBI_EMAIL)

Found 40505 results.
Fetching 0 to 5000...
Fetching 5000 to 10000...
Fetching 10000 to 15000...
No IDs returned at 10000, retrying...
No IDs returned at 15000, retrying...
Fetching 15000 to 20000...
No IDs returned at 15000, retrying...
No IDs returned at 20000, retrying...
Giving up at retstart=20000. Stopping early.
Fetched 9999/40505 ids.


In [32]:
# Number of PMIDs returned by get_all_pmids for the query above.
len(pmids)

9999