In [9]:
# 📦 Install Biopython (if not already installed)
!pip install biopython



In [10]:
# 📚 Import required libraries
import os

from Bio import Entrez

# 📧 Set email for NCBI Entrez access
# The contact address is taken from the ENTREZ_EMAIL environment variable when
# it is set (and non-empty), so the notebook can be shared and re-run by others
# without editing the code. Otherwise the original address is used.
Entrez.email = os.environ.get("ENTREZ_EMAIL") or "sheetal.reddy@g.austincc.edu"

In [11]:
# 🔎 Step 1: List all available Entrez databases
# The `with` block closes the HTTP response as soon as it has been parsed,
# instead of leaving the connection open for the lifetime of the kernel.
with Entrez.einfo() as handle:
    record = Entrez.read(handle)

# Last expression of the cell: the notebook displays the list of database names.
record["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

In [12]:
# 🔍 Step 2: Explore metadata about the PubMed database
# The `with` block closes the HTTP response once the reply has been parsed.
with Entrez.einfo(db="pubmed") as handle:
    record = Entrez.read(handle)

# All database metadata sits under the single "DbInfo" key; look it up once.
db_info = record["DbInfo"]

# View top-level keys
print("Top-level keys:", list(record.keys()))
print("\n")
# View database information keys
print("DbInfo keys:", list(db_info.keys()))
print("\n")
# Print database description and number of records
print("Description:", db_info["Description"])
print("Total Records:", db_info["Count"])


Top-level keys: ['DbInfo']


DbInfo keys: ['DbName', 'MenuName', 'Description', 'DbBuild', 'Count', 'LastUpdate', 'FieldList', 'LinkList']


Description: PubMed bibliographic record
Total Records: 38766138


In [13]:
# 🧬 Step 3: Search PubMed for the TP53 gene
# The `with` block closes the HTTP response once the reply has been parsed.
with Entrez.esearch(db="pubmed", term="TP53") as handle:
    record = Entrez.read(handle)

# View structure and result IDs
# "IdList" holds the PMIDs of the matching articles returned by this request.
print("Search record keys:", list(record.keys()))
print("\n")
print("Found IDs:", record["IdList"])

Search record keys: ['Count', 'RetMax', 'RetStart', 'IdList', 'TranslationSet', 'QueryTranslation']


Found IDs: ['40320401', '40320171', '40319323', '40319042', '40319028', '40319021', '40318009', '40316697', '40315632', '40315418', '40314902', '40314231', '40313151', '40313121', '40312921', '40312368', '40311763', '40311718', '40311467', '40311321']


In [17]:
# 📄 Step 4: Get article summaries for two sample PMIDs

# 🖋️ Optional: Format with bold/underline styling (works in terminal and notebooks that support ANSI)
bold = "\033[1m"
underline = "\033[4m"
reset = "\033[0m"

with Entrez.esummary(db="pubmed", id="40320401,40320171") as handle:
    # Entrez.parse yields records lazily while reading from the handle, so the
    # loop has to run inside the `with` block, before the response is closed.
    records = Entrez.parse(handle)

    for i, record in enumerate(records, 1):
        print(f"{bold}Record {i}:{reset}\n")
        print(f"{bold}{underline}Authors:{reset}", record["AuthorList"])
        print(f"{bold}{underline}Title:{reset}", record["Title"])
        print(f"{bold}{underline}Publication Date:{reset}", record["PubDate"])
        print(f"{bold}{underline}Journal:{reset}", record["FullJournalName"])
        print("\n")


[1mRecord 1:[0m

[1m[4mAuthors:[0m ['Li Y', 'Song C', 'Wang H', 'Di W', 'Chen Y', 'Hu Y', 'Li P', 'Chen J', 'Ren Y', 'Gong J', 'Wang Q']
[1m[4mTitle:[0m Novel prognostic biomarkers in small cell lung cancer reveal mutational signatures, genomic mutations, and immune implications.
[1m[4mPublication Date:[0m 2025 May 4
[1m[4mJournal:[0m Scientific reports


[1mRecord 2:[0m

[1m[4mAuthors:[0m ['Sabari JK', 'Yu HA', 'Mahadevia PJ', 'Liu Y', 'Demirdjian L', 'Chen YH', 'Wang X', 'Passaro A']
[1m[4mTitle:[0m Overall Survival in EGFR-mutant Advanced Non-Small Cell Lung Cancer Treated with First-line Osimertinib: A Cohort Study Integrating Clinical and Biomarker Data in the United States.
[1m[4mPublication Date:[0m 2025 May 2
[1m[4mJournal:[0m Journal of thoracic oncology : official publication of the International Association for the Study of Lung Cancer




In [18]:
# 🧾 Step 5: Fetch raw XML for a specific article and pretty-print it
import xml.dom.minidom

# Read the whole response inside the `with` block so the HTTP connection is
# closed as soon as the payload is in memory.
with Entrez.efetch(db="pubmed", id="40319323", retmode="xml") as handle:
    raw_xml = handle.read()

# Re-serialise the document with two-space indentation for readability.
parsed_xml = xml.dom.minidom.parseString(raw_xml)
pretty_xml = parsed_xml.toprettyxml(indent="  ")

print(pretty_xml)

<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet
  PUBLIC '-//NLM//DTD PubMedArticle, 1st January 2025//EN'
  'https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd'>
<PubmedArticleSet>
  

  <PubmedArticle>
    <MedlineCitation Status="PubMed-not-MEDLINE" Owner="NLM">
      <PMID Version="1">40319323</PMID>
      <DateRevised>
        <Year>2025</Year>
        <Month>05</Month>
        <Day>03</Day>
      </DateRevised>
      <Article PubModel="Electronic">
        <Journal>
          <ISSN IssnType="Print">1475-2867</ISSN>
          <JournalIssue CitedMedium="Print">
            <Volume>25</Volume>
            <Issue>1</Issue>
            <PubDate>
              <Year>2025</Year>
              <Month>May</Month>
              <Day>03</Day>
            </PubDate>
          </JournalIssue>
          <Title>Cancer cell international</Title>
          <ISOAbbreviation>Cancer Cell Int</ISOAbbreviation>
        </Journal>
        <ArticleTitle>Genomic signatures in plasma circulating 