PubMed Journal Article Fetcher for Google Colab
This notebook fetches articles from the specified journals, for either a rolling look-back window or a given calendar month, using the PubMed (NCBI E-utilities) API.

In [1]:
# Install required packages

!pip install biopython pandas requests

import os, pandas as pd, requests, time, warnings, json
from Bio import Entrez
from datetime import datetime, timedelta
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
# warnings.filterwarnings('ignore')


Collecting biopython


  Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)


Downloading biopython-1.86-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m70.0 MB/s[0m  [33m0:00:00[0m
[?25h

Installing collected packages: biopython


Successfully installed biopython-1.86


# Instructions for use

📋 INSTRUCTIONS:

1. Set the EMAIL variable to your email address (required by NCBI); by default it is read from the UNPAYWALL_EMAIL environment variable
2. Adjust JOURNALS and LOOKBACK_DAYS to shape the search window
3. Run the cells to execute main() and harvest only new PMIDs within the rolling window or the requested month

⚠️  IMPORTANT NOTES:

- Use your real email address - it’s required by NCBI’s usage policy
- Journal names should match PubMed’s format exactly
- Large queries may take several minutes to complete
- Be respectful of API rate limits
- Previously harvested PMIDs are stored in data/ent_search/seen_pmids.json to avoid duplicates
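- When running via GitHub Actions, you can supply TARGET_MONTH (YYYY-MM) or YEAR + MONTH inputs to rerun a specific calendar month instead of the rolling LOOKBACK_DAYS window. The notebook itself reads these from the TARGET_MONTH, TARGET_YEAR and TARGET_MONTH_NUMBER environment variables.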

🚀 To start, run: main()


In [2]:
# Parameters
# Contact address handed to NCBI; taken from the UNPAYWALL_EMAIL environment
# variable and left empty when that variable is unset.
EMAIL = os.environ.get("UNPAYWALL_EMAIL", "")

# Journal titles used to build the `"<title>"[Journal]` search clauses.
# NOTE(review): several titles contain an en dash ("–"); confirm they match the
# titles PubMed indexes, otherwise those clauses may match nothing.
JOURNALS = [
    "International Forum of Allergy & Rhinology",
    "Rhinology",
    "JAMA Otolaryngology–Head & Neck Surgery",
    "Otolaryngology–Head and Neck Surgery",
    "European Annals of Otorhinolaryngology–Head and Neck Diseases",
    "Journal of Voice",
    "American Journal of Rhinology & Allergy",
    "JARO – Journal of the Association for Research in Otolaryngology",
    "Journal of Otolaryngology–Head & Neck Surgery",
    "Laryngoscope",
    "Auris Nasus Larynx",
    "new england journal of medicine",
    "JAMA",
]

# Size of the rolling search window in days (environment override, default 30).
LOOKBACK_DAYS = int(os.environ.get("LOOKBACK_DAYS", "30"))

# Optional fixed-month selectors; each is an empty string when not provided.
TARGET_MONTH = os.environ.get("TARGET_MONTH", "").strip()
TARGET_YEAR = os.environ.get("TARGET_YEAR", "").strip()
TARGET_MONTH_NUMBER = os.environ.get("TARGET_MONTH_NUMBER", "").strip()

# Directory that receives the CSV/JSON results and the seen-PMID ledger.
OUTPUT_DIR = os.path.join("data", "ent_search")

# True only when ALLOW_EMPTY_HARVEST is set to "true" (case-insensitive).
ALLOW_EMPTY_HARVEST = os.environ.get("ALLOW_EMPTY_HARVEST", "false").lower() == "true"


In [3]:
# `datetime` is re-imported here so this cell can run on its own.
from datetime import datetime

# Wall-clock time (local, naive) captured when this cell executes.
# NOTE(review): no other cell visible in this notebook reads RUN_STARTED_AT —
# confirm whether a downstream consumer still needs it.
RUN_STARTED_AT = datetime.now()


In [4]:
#!/usr/bin/env python3
def load_seen_pmids(path: str) -> set:
    """Read the ledger of already-harvested PMIDs.

    Args:
        path: Location of a JSON file expected to hold a list of PMIDs.

    Returns:
        The PMIDs as a set of strings. An empty set is returned when the file
        does not exist, cannot be read or parsed (a warning is printed), or
        does not contain a JSON list.
    """
    if not os.path.exists(path):
        return set()

    try:
        with open(path) as ledger_file:
            payload = json.load(ledger_file)
    except Exception as exc:
        # Best effort: a broken ledger must not abort the harvest.
        print(f"⚠️  Could not load seen PMIDs: {exc}")
        return set()

    if not isinstance(payload, list):
        return set()
    return {str(entry) for entry in payload}

def save_seen_pmids(path: str, pmids: set) -> None:
    """Persist a set of PMIDs to disk as a sorted JSON list.

    Args:
        path: Destination file. Missing parent directories are created.
        pmids: PMIDs to store; they are written in sorted order.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w") as f:
        json.dump(sorted(pmids), f, indent=2)

class PubMedFetcher:
    """Search PubMed and fetch article metadata through Biopython's Entrez module.

    The class sets the module-level ``Entrez.email`` on construction and sleeps
    ``request_delay`` seconds after each request to pace calls to NCBI.
    """

    # ESearch documents 10,000 as the largest number of UIDs one request can return.
    MAX_SEARCH_RESULTS = 10000

    def __init__(self, email: str):
        """Initialize with email for API requests"""
        self.email = email
        Entrez.email = email
        # Be respectful to NCBI servers
        self.request_delay = 0.34  # ~3 requests per second max

    def search_articles(
        self,
        journals: List[str],
        lookback_days: int,
        start_date_override: Optional[datetime] = None,
        end_date_override: Optional[datetime] = None,
    ):
        """
        Search for articles in specified journals within a rolling or fixed window.

        Args:
            journals: List of journal names; an empty list searches all journals
            lookback_days: Number of days to include in the rolling window
            start_date_override: Explicit start date to use instead of the rolling window
            end_date_override: Explicit end date to use instead of the rolling window

        Returns:
            Tuple of (List of PubMed IDs, start_date_str, end_date_str). The ID
            list is empty when the search fails; the error is printed, not raised.
        """
        end_date = end_date_override or datetime.now()
        start_date = start_date_override or end_date - timedelta(days=lookback_days)

        start_date_str = start_date.strftime("%Y/%m/%d")
        end_date_str = end_date.strftime("%Y/%m/%d")

        journal_query = " OR ".join([f'"{journal}"[Journal]' for journal in journals]) if journals else ""

        query_parts = []
        if journal_query:
            query_parts.append(f"({journal_query})")
        query_parts.append(f"({start_date_str}[PDAT] : {end_date_str}[PDAT])")

        final_query = " AND ".join(query_parts)

        try:
            print(f"Querying PubMed with: {final_query}")
            handle = Entrez.esearch(
                db="pubmed",
                term=final_query,
                datetype="pdat",
                retmax=self.MAX_SEARCH_RESULTS,
            )
            try:
                record = Entrez.read(handle)
            finally:
                # Release the connection even when the response cannot be parsed.
                handle.close()
            time.sleep(self.request_delay)

            id_list = record["IdList"]
            print(f"Found {len(id_list)} articles")

            # Surface truncation instead of silently dropping matches.
            reported_total = str(record.get("Count", ""))
            if reported_total.isdigit() and int(reported_total) > len(id_list):
                print(
                    f"⚠️  PubMed reports {reported_total} matches but only {len(id_list)} IDs were returned; "
                    "narrow the search window to retrieve the rest."
                )
            return id_list, start_date_str, end_date_str

        except Exception as e:
            print(f"Error searching PubMed: {e}")
            return [], start_date_str, end_date_str

    def fetch_article_details(self, pmid_list: List[str]) -> List[Dict]:
        """
        Fetch detailed information for articles

        Args:
            pmid_list: List of PubMed IDs

        Returns:
            List of article dictionaries. Batches that fail to download or parse
            are reported and skipped, so the result may cover fewer PMIDs than
            were requested.
        """
        articles = []
        batch_size = 200  # Process in batches to avoid overwhelming API
        total_batches = (len(pmid_list) + batch_size - 1) // batch_size

        for batch_number, offset in enumerate(range(0, len(pmid_list), batch_size), start=1):
            batch = pmid_list[offset:offset + batch_size]
            print(f"Processing batch {batch_number}/{total_batches}")

            try:
                # Fetch article details
                handle = Entrez.efetch(
                    db="pubmed",
                    id=",".join(map(str, batch)),
                    rettype="xml",
                    retmode="xml"
                )
                try:
                    records = handle.read()
                finally:
                    handle.close()

                # Parse XML
                root = ET.fromstring(records)

                for article_elem in root.findall(".//PubmedArticle"):
                    article_info = self._parse_article_xml(article_elem)
                    if article_info:
                        articles.append(article_info)

            except Exception as e:
                print(f"Error fetching batch: {e}")

            finally:
                # Pace requests after failures too, so an error streak cannot
                # turn into a burst of back-to-back calls.
                time.sleep(self.request_delay)

        return articles

    @staticmethod
    def _element_text(elem) -> str:
        """Return all text inside ``elem``, including text nested in inline markup.

        ``elem.text`` alone stops at the first child element, which truncates
        titles and abstracts containing tags such as ``<i>`` or ``<sup>``.
        Returns an empty string when ``elem`` is None.
        """
        if elem is None:
            return ""
        return "".join(elem.itertext())

    def _parse_article_xml(self, article_elem) -> Optional[Dict]:
        """Parse a ``PubmedArticle`` XML element into a flat dictionary.

        Args:
            article_elem: ElementTree element for one ``PubmedArticle``.

        Returns:
            Dict with keys PMID, Title, Authors, Journal, Publication_Date,
            Volume, Issue, Pages, DOI and Abstract (abstract truncated to 500
            characters plus "..."), or None when the element cannot be parsed
            (the error is printed).
        """
        try:
            # Extract basic article info
            medline_citation = article_elem.find(".//MedlineCitation")
            article = medline_citation.find(".//Article")

            # PMID
            pmid = medline_citation.find(".//PMID").text

            # Title (may contain inline markup)
            title = self._element_text(article.find(".//ArticleTitle"))

            # Authors: personal names, or the collective name for group authors
            authors = []
            for author in article.findall(".//Author"):
                last_name = author.findtext("LastName", default="")
                fore_name = author.findtext("ForeName", default="")
                if last_name or fore_name:
                    authors.append(f"{fore_name} {last_name}".strip())
                else:
                    collective_name = author.findtext("CollectiveName", default="")
                    if collective_name:
                        authors.append(collective_name)
            authors_str = ", ".join(authors) if authors else ""

            # Journal
            journal = article.findtext(".//Journal/Title", default="")

            # Publication Date
            pub_date = article.find(".//JournalIssue/PubDate")
            year = pub_date.findtext("Year") if pub_date is not None else None
            month = pub_date.findtext("Month") if pub_date is not None else None
            day = pub_date.findtext("Day") if pub_date is not None else None
            # Some records carry a free-form MedlineDate instead of Year/Month/Day.
            medline_date = pub_date.findtext("MedlineDate") if pub_date is not None else None

            if year and month and day:
                pub_date_str = f"{year}-{month}-{day}"
            elif year and month:
                pub_date_str = f"{year}-{month}"
            elif year:
                pub_date_str = year
            elif medline_date:
                pub_date_str = medline_date
            else:
                pub_date_str = ""

            # Volume, Issue, Pages
            volume = article.findtext(".//JournalIssue/Volume", default="")
            issue = article.findtext(".//JournalIssue/Issue", default="")
            pages = article.findtext(".//Pagination/MedlinePgn", default="")

            # DOI: the article's own ID list sits under PubmedData, outside
            # <Article>. The explicit path avoids picking up DOIs of cited
            # references. Fall back to ELocationID, then to the legacy lookup.
            doi_elem = article_elem.find("./PubmedData/ArticleIdList/ArticleId[@IdType='doi']")
            if doi_elem is None:
                doi_elem = article.find("./ELocationID[@EIdType='doi']")
            if doi_elem is None:
                doi_elem = article.find(".//ArticleId[@IdType='doi']")
            doi = (doi_elem.text or "") if doi_elem is not None else ""

            # Abstract: join every section, prefixing labelled ones
            abstract_parts = []
            for abs_part in article.findall(".//Abstract/AbstractText"):
                label = abs_part.get("Label", "")
                text = self._element_text(abs_part)
                abstract_parts.append(f"{label}: {text}" if label else text)
            abstract = " ".join(abstract_parts)

            return {
                "PMID": pmid,
                "Title": title,
                "Authors": authors_str,
                "Journal": journal,
                "Publication_Date": pub_date_str,
                "Volume": volume,
                "Issue": issue,
                "Pages": pages,
                "DOI": doi,
                "Abstract": abstract[:500] + "..." if len(abstract) > 500 else abstract  # Truncate long abstracts
            }

        except Exception as e:
            print(f"Error parsing article: {e}")
            return None

def main():
    """Run one harvest: search PubMed, fetch new articles, and write the results.

    Reads the notebook-level configuration (EMAIL, JOURNALS, LOOKBACK_DAYS,
    TARGET_MONTH, TARGET_YEAR, TARGET_MONTH_NUMBER, OUTPUT_DIR,
    ALLOW_EMPTY_HARVEST). PMIDs already listed in ``seen_pmids.json`` are
    skipped; the remaining articles are written to a per-window CSV and to
    ``ent_all_results.json`` inside OUTPUT_DIR, and the ledger is updated.

    Returns:
        A DataFrame of the newly fetched articles, or None when the email is
        not configured, an empty harvest is allowed, or no details could be
        fetched.

    Raises:
        RuntimeError: If nothing (or nothing new) is found and
            ALLOW_EMPTY_HARVEST is false.
    """

    print("=== PubMed Journal Article Fetcher ===")

    print(f"Email: {EMAIL}")
    if JOURNALS:
        print(f"Journals: {', '.join(JOURNALS)}")
    else:
        print("No journal filter configured; searching across all journals.")
    print("-" * 50)

    # NOTE(review): compute_requested_window is not defined in the cells shown
    # here; it is assumed to return a (start, end) datetime pair or a falsy value.
    requested_window = compute_requested_window(TARGET_MONTH, TARGET_YEAR, TARGET_MONTH_NUMBER)
    if requested_window:
        start_dt, end_dt = requested_window
        print(
            f"Requested month window: {start_dt.strftime('%Y-%m-%d')} to {end_dt.strftime('%Y-%m-%d')} (from TARGET_MONTH/TARGET_YEAR+TARGET_MONTH_NUMBER)"
        )
    else:
        end_dt = datetime.now()
        start_dt = end_dt - timedelta(days=LOOKBACK_DAYS)
        print(f"Rolling window: last {LOOKBACK_DAYS} days ({start_dt.strftime('%Y-%m-%d')} to {end_dt.strftime('%Y-%m-%d')})")
    print("-" * 50)

    # Validate email: the configured default is an empty string, so reject
    # that as well as the documentation placeholder.
    if not EMAIL.strip() or EMAIL == "your.email@example.com":
        print("⚠️  Please update the EMAIL variable with your actual email address!")
        print("This is required by NCBI's API usage policy.")
        return

    # Initialize fetcher
    fetcher = PubMedFetcher(EMAIL)

    # Search for articles within the requested window
    print("🔍 Searching for articles...")
    pmid_list, start_date, end_date = fetcher.search_articles(
        JOURNALS,
        LOOKBACK_DAYS,
        start_dt,
        end_dt,
    )

    if not pmid_list:
        print("❌ No articles found matching the criteria.")
        print(f"Summary: discovered {len(pmid_list)} PMIDs between {start_date} and {end_date}.")
        if ALLOW_EMPTY_HARVEST:
            print("⚠️  Empty harvest allowed via ALLOW_EMPTY_HARVEST flag; exiting without failure.")
            return
        raise RuntimeError("No articles found for the configured search window.")

    seen_pmids_path = os.path.join(OUTPUT_DIR, "seen_pmids.json")
    seen_pmids = load_seen_pmids(seen_pmids_path)
    if seen_pmids:
        print(f"Loaded {len(seen_pmids)} previously harvested PMIDs.")

    new_pmids = [pmid for pmid in pmid_list if pmid not in seen_pmids]
    print(f"PMIDs to fetch after filtering seen set: {len(new_pmids)}")

    if not new_pmids:
        print("⚠️  No new PMIDs to process within this window.")
        print(f"Summary: discovered {len(pmid_list)} PMIDs, seen {len(seen_pmids)}, new {len(new_pmids)} between {start_date} and {end_date}.")
        if ALLOW_EMPTY_HARVEST:
            print("⚠️  Empty harvest allowed via ALLOW_EMPTY_HARVEST flag; exiting without failure.")
            return
        raise RuntimeError("No new PMIDs to process after filtering seen set.")

    # Fetch article details
    print(f"📖 Fetching details for {len(new_pmids)} new articles...")
    articles = fetcher.fetch_article_details(new_pmids)

    if not articles:
        print("❌ Failed to fetch article details.")
        return

    # Create DataFrame
    df = pd.DataFrame(articles)

    # Display results
    print(f"✅ Successfully retrieved {len(articles)} articles!")
    print(f"Columns: {', '.join(df.columns.tolist())}")

    # Show first few rows
    print("First 5 articles:")
    print(df.head().to_string(max_colwidth=50))

    # Save outputs
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    window_label = f"{start_date.replace('/', '-')}_to_{end_date.replace('/', '-')}"
    csv_path = os.path.join(OUTPUT_DIR, f"ent_raw_results_{window_label}.csv")
    df.to_csv(csv_path, index=False)

    # Record only the PMIDs whose details were actually retrieved. Batches that
    # failed are skipped by the fetcher; marking them as seen would hide those
    # articles from every later run. The trade-off is that a PMID which never
    # parses will be retried on each run.
    fetched_pmids = {str(article["PMID"]) for article in articles}
    missing_pmids = [pmid for pmid in new_pmids if str(pmid) not in fetched_pmids]
    if missing_pmids:
        print(f"⚠️  {len(missing_pmids)} PMIDs could not be fetched and will be retried on the next run.")

    updated_seen_pmids = seen_pmids | fetched_pmids
    save_seen_pmids(seen_pmids_path, updated_seen_pmids)
    print(f"Updated seen PMIDs saved to: {seen_pmids_path}")

    json_path = os.path.join(OUTPUT_DIR, "ent_all_results.json")
    df.to_json(json_path, orient="records", force_ascii=False, indent=2)

    # Display summary statistics
    print("📊 Summary:")
    print(f"Total new articles: {len(articles)}")
    print(f"Unique journals: {df['Journal'].nunique()}")
    print("Articles per journal:")
    journal_counts = df['Journal'].value_counts()
    for journal, count in journal_counts.head(10).items():
        print(f"  • {journal}: {count}")

    print(f"💾 Raw CSV saved to: {csv_path}")
    print(f"💾 ENT results JSON saved to: {json_path}")

    return df


In [5]:
# Optional dry-run preview: runs a search over a fixed 30-day window and prints
# a sample of the PMIDs it finds, without fetching details or writing files.
# Enable it by setting DRY_RUN_PREVIEW=true in the environment; no code edits needed.
DRY_RUN_PREVIEW = os.getenv("DRY_RUN_PREVIEW", "false").lower() == "true"

if not DRY_RUN_PREVIEW:
    print("Dry run disabled. Set DRY_RUN_PREVIEW=true to log a quick query.")
elif not EMAIL:
    raise ValueError("EMAIL must be configured to run the dry-run search.")
else:
    dry_run_end = datetime.now()
    dry_run_start = dry_run_end - timedelta(days=30)
    print(f"Dry-run window: {dry_run_start.strftime('%Y-%m-%d')} to {dry_run_end.strftime('%Y-%m-%d')}")
    print("Running dry-run search...")
    fetcher = PubMedFetcher(EMAIL)
    # Both overrides are supplied, so LOOKBACK_DAYS does not shape this window.
    pmid_list, start_date, end_date = fetcher.search_articles(
        journals=JOURNALS,
        lookback_days=LOOKBACK_DAYS,
        start_date_override=dry_run_start,
        end_date_override=dry_run_end,
    )
    print(f"Dry-run returned {len(pmid_list)} PMIDs.")
    print("Sample PMIDs:", pmid_list[:10])


=== PubMed Journal Article Fetcher ===
Email: shvecht@gmail.com
Journals: International Forum of Allergy & Rhinology, Rhinology, JAMA Otolaryngology–Head & Neck Surgery, Otolaryngology–Head and Neck Surgery, European Annals of Otorhinolaryngology–Head and Neck Diseases, Journal of Voice, American Journal of Rhinology & Allergy, JARO – Journal of the Association for Research in Otolaryngology, Journal of Otolaryngology–Head & Neck Surgery, Laryngoscope, Auris Nasus Larynx, new england journal of medicine, JAMA
Keyword filters: otolaryngology, ENT, sinus, nasal, larynx, otology, rhinology, head and neck surgery
Excluding terms: veterinary, rodent, mouse, rat, bovine, porcine, canine, feline
Rolling window: last 30 days
--------------------------------------------------
🔍 Searching for articles...
Search query: ("International Forum of Allergy & Rhinology"[Journal] OR "Rhinology"[Journal] OR "JAMA Otolaryngology–Head & Neck Surgery"[Journal] OR "Otolaryngology–Head and Neck Surgery"[Journ

Found 0 articles
❌ No articles found matching the criteria.
