In [1]:
# 📦 Install Biopython (if not already installed)
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [2]:
# 📚 Import required library
from Bio import Entrez

# 📧 Set email for NCBI Entrez access
# Assigned once, up front, so every Entrez.* call in the cells below runs
# after the address has been configured.
# NOTE(review): a personal address is hard-coded here — consider reading it
# from an environment variable before sharing or committing this notebook.
Entrez.email = "sheetal.reddy@g.austincc.edu"

In [3]:
# 🔎 Step 1: List all available Entrez databases
# einfo() is called without arguments; the `with` block guarantees the network
# handle is closed as soon as the response has been parsed (the original left
# it open).
with Entrez.einfo() as handle:
    record = Entrez.read(handle)

# Last expression of the cell: the database names stored under "DbList".
record["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'medgen', 'mesh', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'proteinclusters', 'pcassay', 'protfam', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

In [5]:
# 🔍 Step 2: Explore metadata about the PubMed database
# Request einfo for the "pubmed" database only; the handle is closed
# automatically once the response has been parsed.
with Entrez.einfo(db="pubmed") as handle:
    record = Entrez.read(handle)

# Everything printed below lives under the single top-level "DbInfo" entry,
# so look it up once.
db_info = record["DbInfo"]

# View top-level keys
print("Top-level keys:", list(record.keys()))
print("\n")
# View database information keys
print("DbInfo keys:", list(db_info.keys()))
print("\n")
# Print database description and number of records
print("Description:", db_info["Description"])
print("Total Records:", db_info["Count"])


Top-level keys: ['DbInfo']


DbInfo keys: ['DbName', 'MenuName', 'Description', 'DbBuild', 'Count', 'LastUpdate', 'FieldList', 'LinkList']


Description: PubMed bibliographic record
Total Records: 38681452


In [7]:
# 🧬 Step 3: Search PubMed for the TP53 gene
# The `with` block closes the network handle once the result is parsed.
# The parsed result is deliberately kept in the global `record`: the following
# cell displays record["IdList"] again.
with Entrez.esearch(db="pubmed", term="TP53") as handle:
    record = Entrez.read(handle)

# View structure and result IDs
print("Search record keys:", list(record.keys()))
print("\n")
print("Found IDs:", record["IdList"])

Search record keys: ['Count', 'RetMax', 'RetStart', 'IdList', 'TranslationSet', 'QueryTranslation']


Found IDs: ['40234641', '40234451', '40234394', '40233742', '40233741', '40233044', '40232888', '40232565', '40231352', '40230093', '40229848', '40229358', '40229109', '40228962', '40228946', '40228216', '40227826', '40227792', '40227763', '40227619']


In [10]:
# Display the PMIDs held in `record`.
# NOTE(review): this relies on `record` still being the esearch result —
# presumably from the Step 3 cell. Re-run that cell first if `record` has been
# reassigned, otherwise the "IdList" key may be missing or stale.
record["IdList"]

['40229848', '40229358', '40229109', '40228962', '40228946', '40228216', '40227826', '40227792', '40227763', '40227619', '40227587', '40226969', '40226311', '40225855', '40224680', '40224173', '40223932', '40223808', '40223170', '40223084']

In [9]:
# 📄 Step 4: Get article summaries for two sample PMIDs

# 🖋️ Optional: Format with bold/underline styling (works in terminal and notebooks that support ANSI)
bold = "\033[1m"
underline = "\033[4m"
reset = "\033[0m"

# Entrez.parse yields records incrementally from the handle, so collect them
# all into a list before the `with` block closes the connection.
with Entrez.esummary(db="pubmed", id="40229848,40229358") as handle:
    records = list(Entrez.parse(handle))

# The loop variable is named `summary` rather than `record` so that the search
# result stored in the global `record` by the earlier search cell is not
# overwritten when this cell runs.
for i, summary in enumerate(records, 1):
    print(f"{bold}Record {i}:{reset}\n")
    print(f"{bold}{underline}Authors:{reset}", summary["AuthorList"])
    print(f"{bold}{underline}Title:{reset}", summary["Title"])
    print(f"{bold}{underline}Publication Date:{reset}", summary["PubDate"])
    print(f"{bold}{underline}Journal:{reset}", summary["FullJournalName"])
    print("\n")


[1mRecord 1:[0m

[1m[4mAuthors:[0m ['Nørgaard M', 'Rusan M', 'Kondrup K', 'Sørensen EMG', 'Weiss S', 'Bjerre MT', 'Fredsøe J', 'Vang S', 'Jensen JB', 'De Laere B', 'Grönberg H', 'Borre M', 'Lindberg J', 'Sørensen KD']
[1m[4mTitle:[0m Deep targeted sequencing of circulating tumor DNA to inform treatment in patients with metastatic castration-resistant prostate cancer.
[1m[4mPublication Date:[0m 2025 Apr 14
[1m[4mJournal:[0m Journal of experimental & clinical cancer research : CR


[1mRecord 2:[0m

[1m[4mAuthors:[0m ['Lee JM', 'Lee CY', 'Seol B', 'Jung CK', 'Kim Y', 'Kang D', 'Yu H', 'Hong Y', 'Song CL', 'Cho YS', 'Kim M']
[1m[4mTitle:[0m Tracing genomic instability in induced mesenchymal stromal cell manufacture: an integration-free transfection approach.
[1m[4mPublication Date:[0m 2025 Apr 14
[1m[4mJournal:[0m Experimental & molecular medicine




In [10]:
# 🧾 Step 5: Fetch raw XML for a specific article and pretty-print it
import xml.dom.minidom

# Read the whole response inside the `with` block so the network handle is
# closed immediately afterwards (the original never closed it).
with Entrez.efetch(db="pubmed", id="40229109", retmode="xml") as handle:
    raw_xml = handle.read()

# Re-serialise the document through minidom with two-space indentation.
parsed_xml = xml.dom.minidom.parseString(raw_xml)
pretty_xml = parsed_xml.toprettyxml(indent="  ")

print(pretty_xml)

<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet
  PUBLIC '-//NLM//DTD PubMedArticle, 1st January 2025//EN'
  'https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_250101.dtd'>
<PubmedArticleSet>
  

  <PubmedArticle>
    <MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Automated">
      <PMID Version="1">40229109</PMID>
      <DateCompleted>
        <Year>2025</Year>
        <Month>04</Month>
        <Day>14</Day>
      </DateCompleted>
      <DateRevised>
        <Year>2025</Year>
        <Month>04</Month>
        <Day>14</Day>
      </DateRevised>
      <Article PubModel="Print-Electronic">
        <Journal>
          <ISSN IssnType="Electronic">1525-1438</ISSN>
          <JournalIssue CitedMedium="Internet">
            <Volume>34</Volume>
            <Issue>10</Issue>
            <PubDate>
              <Year>2024</Year>
              <Month>Oct</Month>
            </PubDate>
          </JournalIssue>
          <Title>International journal of gynecological cancer : officia