Scraping PubMed Data Using Beautiful Soup


This Python script fetches information from PubMed related to a user-defined topic. It retrieves citation details, author information, abstracts, and the PMCID (PubMed Central ID) for each publication on the topic, and then saves this information as a JSON file. To comply with the rate limits of PubMed's API, it rate-limits its requests and retries after HTTP errors, particularly error code 429 (rate limit exceeded). When run, it prompts the user to enter a topic, scrapes PubMed for relevant publications, and saves the fetched data to a JSON file named after the topic.


Importing the dependencies

In [15]:
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import urllib.parse
import urllib.request
from urllib.error import HTTPError
from ratelimit import limits, sleep_and_retry
import time
import json

In [16]:
# Define rate limits (requests per second)
# Each value is used as the `calls=` argument of `@limits(calls=..., period=1)`
# on the fetch functions in this notebook.
# NOTE(review): NCBI's E-utilities guidelines mention 3 requests/second for
# clients without an API key -- confirm that a retrieval limit of 5 is acceptable.
SEARCH_RATE_LIMIT = 2
RETRIEVAL_RATE_LIMIT = 5

@sleep_and_retry
@limits(calls=SEARCH_RATE_LIMIT, period=1)
def search_pubmed(topic, page):
    """Return the PMIDs found on one page of PubMed search results.

    Args:
        topic: Free-text search term, exactly as typed by the user.
        page: Zero-based page index; each page requests up to 20 PMIDs.

    Returns:
        A list with the text of every ``<Id>`` element in the ESearch
        response (empty when the page holds no results).

    Raises:
        urllib.error.HTTPError: Propagated from ``urlopen`` when the server
            answers with an HTTP error status (the caller handles 429).
    """
    page_size = 20

    # Construct the search URL.  urlencode() escapes spaces and reserved
    # characters itself; replacing spaces with '+' beforehand would make it
    # emit '%2B', i.e. send a literal plus sign inside the search term.
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=xml'
    query_params = {
        'term': topic,
        'retstart': page * page_size,
        'retmax': page_size,
    }
    search_url = base_url + '&' + urllib.parse.urlencode(query_params)

    # Fetch the search results
    with urllib.request.urlopen(search_url) as response:
        xml_response = response.read().decode('utf-8')

    root = ET.fromstring(xml_response)

    # PubMed IDs (PMIDs) extraction
    return [id_elem.text for id_elem in root.findall('.//Id')]


In [17]:
# Function to retrieve citation information for corresponding PMID
@sleep_and_retry
@limits(calls=RETRIEVAL_RATE_LIMIT, period=1)
def retrieve_citation(pmid):
    """Fetch the ESummary record of *pmid* and return its citation fields.

    Args:
        pmid: PubMed ID of the publication.

    Returns:
        A dict with the keys ``'title'``, ``'journal'``, ``'pub_date'`` and
        ``'pmid'``.  A field is ``None`` when the matching ``<Item>`` is
        missing from the response or has no text.

    Raises:
        urllib.error.HTTPError: Propagated from ``urlopen`` when the server
            answers with an HTTP error status (the caller handles 429).
    """
    summary_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=xml'

    # Summary information
    with urllib.request.urlopen(summary_url) as response:
        xml_response = response.read().decode('utf-8')

    root = ET.fromstring(xml_response)

    def item_text(name):
        # find() returns None for a missing item; guard it so one incomplete
        # record does not abort the whole run with an AttributeError.
        item = root.find(f'.//Item[@Name="{name}"]')
        return item.text if item is not None else None

    # Extraction of the citation information
    return {
        'title': item_text('Title'),
        'journal': item_text('FullJournalName'),
        'pub_date': item_text('PubDate'),
        'pmid': pmid,
    }


In [18]:
# retrieving authors' information for given PMID
@sleep_and_retry
@limits(calls=RETRIEVAL_RATE_LIMIT, period=1)
def retrieve_authors_info(pmid):
    """Return the author names listed in the ESummary record of *pmid*.

    Args:
        pmid: PubMed ID of the publication.

    Returns:
        A list with the text of every ``Author`` item found under the
        ``AuthorList`` item; an empty list when there is no ``AuthorList``.
    """
    esummary_url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=xml'

    with urllib.request.urlopen(esummary_url) as resp:
        payload = resp.read().decode('utf-8')

    author_list_item = ET.fromstring(payload).find('.//Item[@Name="AuthorList"]')
    if author_list_item is None:
        return []

    # Collect the names in document order
    names = []
    for author_item in author_list_item.findall('.//Item[@Name="Author"]'):
        names.append(author_item.text)
    return names

# retrieving PMCID for a given publication URL
@sleep_and_retry
@limits(calls=RETRIEVAL_RATE_LIMIT, period=1)
def retrieve_pmcid(publication_url):
    """Scrape the PMCID from a PubMed publication page.

    Args:
        publication_url: URL of the publication page to download.

    Returns:
        The first identifier link text that starts with ``"PMC"``, or
        ``None`` when the page cannot be fetched (non-200 status) or holds
        no such link.
    """
    # Fetching publication page
    response = requests.get(publication_url)
    if response.status_code != 200:
        return None

    # Parse the HTML response
    soup = BeautifulSoup(response.content, 'html.parser')

    # Non-PMC identifiers can carry the same 'id-link' class (hence the
    # prefix check), so scan every candidate instead of only the first one;
    # otherwise a PMCID listed after another identifier would be missed.
    for link in soup.find_all('a', class_='id-link'):
        candidate = link.text.strip()
        if candidate.startswith("PMC"):
            return candidate

    return None


In [19]:
# retrieving abstract information for given PMID
def retrieve_abstract(pmid):
    """Return the abstract text shown on the PubMed page of *pmid*.

    Args:
        pmid: PubMed ID of the publication.

    Returns:
        The text of the ``div`` with class ``abstract-content selected``,
        with text fragments joined by newlines and surrounding whitespace
        removed; ``None`` when the page request does not return status 200
        or the page has no such element.
    """
    page = requests.get(f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")
    if page.status_code != 200:
        return None

    # Locate the element containing the abstract
    abstract_div = BeautifulSoup(page.content, 'html.parser').find(
        'div', class_='abstract-content selected'
    )
    if abstract_div is None:
        return None

    return abstract_div.get_text(separator='\n').strip()

# save citation information to a JSON file
def save_citations_to_json(citations, filename):
    """Write *citations* to *filename* as indented, UTF-8 encoded JSON.

    Args:
        citations: JSON-serializable object (here: a list of citation dicts).
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        # The file is UTF-8, so keep non-ASCII characters (author names,
        # abstracts) readable instead of escaping them as \uXXXX sequences.
        json.dump(citations, file, indent=4, ensure_ascii=False)

# user query
# Strip surrounding whitespace so stray spaces do not end up in the search
# term or in the output file name derived from the topic.
topic_input = input("Enter the topic to search PubMed for: ").strip()



In [20]:
# Scraping 5 pages each with 20 publications
MAX_PAGES = 5
MAX_ATTEMPTS = 3         # total tries per call when the server answers HTTP 429
RETRY_WAIT_SECONDS = 10  # pause before trying again after a 429


def _build_citation_record(pmid):
    """Collect citation, authors, abstract and (if found) PMCID for one PMID."""
    citation_info = retrieve_citation(pmid)
    citation_info['authors'] = retrieve_authors_info(pmid)
    citation_info['abstract'] = retrieve_abstract(pmid)

    # Retrieve PMCID; the key is only added when a PMCID was found
    pmcid = retrieve_pmcid(f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")
    if pmcid is not None:
        citation_info['pmcid'] = pmcid

    return citation_info


def _call_with_retry(func, *args):
    """Call ``func(*args)``, retrying a bounded number of times on HTTP 429.

    Any other HTTPError, or a 429 on the last attempt, is re-raised.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return func(*args)
        except HTTPError as err:
            if err.code != 429 or attempt == MAX_ATTEMPTS:
                raise
            print("Rate limit exceeded. Waiting and retrying...")
            time.sleep(RETRY_WAIT_SECONDS)


citations = []
for page in range(MAX_PAGES):
    pmids = _call_with_retry(search_pubmed, topic_input, page)
    if not pmids:
        # An empty page means there are no further results to request
        break
    for pmid in pmids:
        citations.append(_call_with_retry(_build_citation_record, pmid))


In [21]:
# Saving the citations to a JSON file
# The topic is free-form user input: replace path separators and characters
# that are invalid in file names (a topic such as "HIV/AIDS" would otherwise
# make open() fail after the whole scrape has already run).
safe_topic = "".join(
    "_" if ch in '<>:"/\\|?*' or ord(ch) < 32 else ch
    for ch in topic_input
)
filename = f"{safe_topic}_citations.json"
save_citations_to_json(citations, filename)
print("Citations saved to", filename)

Citations saved to cancer_citations.json


Scraping PubMed Data Using BioPython

Compared to other libraries like Beautiful Soup, BioPython stands out for its tailored approach to biological data. While Beautiful Soup is adept at parsing HTML and XML content, BioPython’s specialized functionalities cater specifically to the nuances of biomedical data analysis. By choosing BioPython, researchers can harness these specialized tools to extract PubMed data with precision and reliability, making it the preferred choice for quick and efficient data extraction in scientific research.

In [22]:
# Installing the library
!pip install Bio




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
# Importing the dependencies
from Bio import Entrez
from Bio import Medline
import json


In [24]:
from Bio import Entrez, Medline
import json

def search_pubmed(query, max_results=10):
    """Search PubMed via Bio.Entrez and fetch the matching MEDLINE records.

    Args:
        query: PubMed search term.
        max_results: Maximum number of IDs requested from ESearch.

    Returns:
        A ``(papers, all_records)`` tuple.  ``papers`` is a list of dicts
        with the keys ``"title"``, ``"authors"``, ``"abstract"``, ``"PMID"``
        and ``"DOP"`` (missing text fields default to ``""``, missing
        authors to ``[]``); ``all_records`` is the list of complete parsed
        MEDLINE records.  Both lists are empty when nothing matched.
    """
    # Set the email address to be used with Entrez
    Entrez.email = "varsagupta07@gmail.com"

    # Search PubMed for the given query and retrieve a maximum of max_results IDs
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        search_result = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails
        handle.close()

    # Get the list of IDs from the search results
    id_list = search_result["IdList"]
    if not id_list:
        # No hits: there is nothing to fetch, so skip the EFetch request
        return [], []

    # Initialize lists to hold the processed papers and all records
    papers = []
    all_records = []

    # Fetch detailed records for each ID in the list
    handle = Entrez.efetch(db="pubmed", id=id_list, rettype="medline", retmode="text")
    try:
        # Iterate inside the try block so the handle stays open while the
        # parser reads from it and is closed afterwards in every case.
        for medline_record in Medline.parse(handle):
            all_records.append(medline_record)
            papers.append({
                "title": medline_record.get("TI", ""),  # Title of the paper
                "authors": medline_record.get("AU", []),  # List of authors
                "abstract": medline_record.get("AB", ""),  # Abstract
                "PMID": medline_record.get("PMID", ""),  # PubMed ID
                "DOP": medline_record.get("DP", ""),  # Date of publication
            })
    finally:
        handle.close()

    # Return the list of processed papers and all records
    return papers, all_records

# Example usage
query = "breast cancer treatment"
max_results = 200

# Run the search: `papers` holds the trimmed-down dicts, `all_records` the
# complete records returned by search_pubmed
papers, all_records = search_pubmed(query, max_results=max_results)

# Dump the complete records to disk as indented JSON
with open("pubmed_papers_by_biopython.json", "w") as out_file:
    json.dump(all_records, out_file, indent=4)

# Print the extracted fields of every paper, each entry followed by "---"
for paper in papers:
    print(f"Title: {paper['title']}")
    print(f"Authors: {', '.join(paper['authors'])}")
    print(f"Abstract: {paper['abstract']}")
    print(f"PMID: {paper['PMID']}")
    print(f"DOP: {paper['DOP']}")
    print("---")


Title: Diffuse Dermal Angiomatosis of the Breast Clinically Mimicking Cellulitis and Inflammatory Breast Cancer.
Authors: Moore LR, Skrine RA
Abstract: A 40-year-old woman admitted for hyponatremia and anasarca due to decompensated cirrhosis after a recent steroid taper developed extremely painful cutaneous breast lesions clinically mimicking cellulitis and inflammatory breast cancer and was biopsy-diagnosed instead with diffuse dermal angiomatosis (DDA) of the breasts, a rare and painful disease that can be a diagnostic chameleon. This case highlights the importance of early surgical consultation and tissue biopsy to correctly diagnose the etiology of severely painful mastitis and prevent prolonged symptomology and repeated administrations of ineffective treatments. Diffuse dermal angiomatosis should be considered when suspected breast cellulitis is refractory to treatment or there is concern for inflammatory breast cancer, especially in pendulous-breasted women with comorbidities tha

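Thus, we see that the above method of scraping PubMed data using Beautiful Soup takes approximately 10 minutes to extract the data, while the Python script that uses BioPython takes only about 10 seconds. The gap comes mainly from the number of HTTP requests: the first script issues several rate-limited requests per publication (summary, authors, abstract page, and PMCID page), whereas the BioPython script fetches all records with a single batched EFetch call. Hence, BioPython is more efficient than Beautiful Soup for extracting large volumes of PubMed data.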