In [1]:
import requests
import json
from xml.etree import ElementTree

In [2]:
# Search configuration consumed by the E-utilities request cell below.

# Root of the NCBI E-utilities endpoints (esearch / efetch are appended to it).
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

# Entrez database identifier passed as the `db` parameter.
DB = "pmc"

# Upper bound on the number of IDs requested from esearch (`retmax`).
NUM_RESULTS = 10000

# Boolean search expression passed as the `term` parameter. The leading and
# trailing newline are part of the value and are kept deliberately.
QUERY = (
    "\n"
    "(pdac OR pancreatic cancer OR pancreatic adenocarcinoma"
    " OR pancreatic ductal adenocarcinoma)"
    " AND (alpha-smooth muscle actin OR alpha smooth muscle actin"
    " OR alpha-SMA OR alpha SMA)"
    ' AND "neural invasion"'
    ' AND "clinical trial"'
    "\n"
)

In [5]:
# Search the configured Entrez database, fetch every matching record and export
# title / abstract / year / journal / authors to a JSONL file plus a
# human-readable text report.

# Seconds to wait on each HTTP request before giving up instead of hanging the kernel.
REQUEST_TIMEOUT = 60


def _element_text(elem, default=""):
    """Return the whitespace-normalised text of ``elem``, or ``default``.

    ``elem.text`` only holds the text *before* the first child element, so a
    title such as ``<article-title>Role of <italic>KRAS</italic> in ...`` would
    be truncated (or be ``None``). ``itertext()`` walks nested inline markup too.

    Args:
        elem: An ``xml.etree.ElementTree.Element`` or ``None``.
        default: Value returned when ``elem`` is ``None`` or contains no text.

    Returns:
        The element's full text with runs of whitespace collapsed to one space.
    """
    if elem is None:
        return default
    text = " ".join("".join(elem.itertext()).split())
    return text or default


# Step 1: Search for articles
# Hand the query to requests via `params` so that spaces, quotes and the
# newlines of the multi-line QUERY literal are URL-encoded correctly.
search_url = f"{BASE_URL}/esearch.fcgi"
search_params = {
    "db": DB,
    "term": " ".join(QUERY.split()),  # collapse the newlines/indentation of the literal
    "retmax": NUM_RESULTS,
    "retmode": "xml",
}
response = requests.get(search_url, params=search_params, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail loudly rather than parsing an HTTP error page as XML
root = ElementTree.fromstring(response.content)
article_ids = [id_elem.text for id_elem in root.findall(".//Id") if id_elem.text]

# Step 2: Fetch article details
if article_ids:
    id_str = ",".join(article_ids)
    fetch_url = f"{BASE_URL}/efetch.fcgi"
    # POST instead of GET: with up to NUM_RESULTS ids the query string would
    # exceed URL length limits, and NCBI asks for POST above ~200 UIDs.
    fetch_response = requests.post(
        fetch_url,
        data={"db": DB, "id": id_str, "retmode": "xml"},
        timeout=REQUEST_TIMEOUT,
    )
    fetch_response.raise_for_status()
    fetch_root = ElementTree.fromstring(fetch_response.content)

    jsonl_filename = "pubmed_articles.jsonl"
    text_filename = "pubmed_articles.txt"

    # Both outputs are opened once and truncated, so re-running the cell
    # regenerates them instead of appending duplicates to the text report.
    with open(jsonl_filename, "w", encoding="utf-8") as jsonl_file, \
            open(text_filename, "w", encoding="utf-8") as text_file:
        for article in fetch_root.findall(".//article"):
            # Extract title
            title = _element_text(article.find(".//article-title"), "N/A")

            # Extract abstract (handling multiple paragraphs if present)
            abstract_text = "N/A"
            abstract_elem = article.find(".//abstract")
            if abstract_elem is not None:
                paragraphs = (_element_text(p) for p in abstract_elem.findall(".//p"))
                # Fall back to the same "N/A" sentinel when no paragraph has text.
                abstract_text = " ".join(p for p in paragraphs if p) or "N/A"

            # Extract publication year
            pub_year = _element_text(article.find(".//pub-date/year"), "N/A")

            # Extract journal name
            journal = _element_text(article.find(".//journal-title"), "N/A")

            # Extract authors (handling missing or empty given-names / surname)
            authors = []
            for contrib in article.findall(".//contrib[@contrib-type='author']"):
                surname = _element_text(contrib.find(".//surname"))
                given_name = _element_text(contrib.find(".//given-names"))
                full_name = ", ".join(part for part in (surname, given_name) if part)
                if full_name:
                    authors.append(full_name)

            # Store extracted data
            article_data = {
                "title": title,
                "abstract": abstract_text,
                "publication_year": pub_year,
                "publication_name": journal,
                "authors": authors,
                "database": "PMC"
            }

            # Write to JSONL file
            jsonl_file.write(json.dumps(article_data, ensure_ascii=False) + "\n")

            # Also write to a text file for human readability
            text_file.write(f"Title: {title}\n")
            text_file.write(f"Abstract: {abstract_text}\n")
            text_file.write(f"Publication Year: {pub_year}\n")
            text_file.write(f"Publication Name: {journal}\n")
            text_file.write(f"Authors: {', '.join(authors)}\n")
            text_file.write("=" * 80 + "\n")

    print(f"Results saved to {jsonl_filename}")
else:
    print("No articles found.")

Results saved to pubmed_articles.jsonl
