In [2]:
import requests
import xml.etree.ElementTree as ET
import re
import requests
import json
import pandas as pd

In [3]:
# Searching the Database
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "pubmed",
    "term": "science[journal] AND biomarker OR protein marker OR gene expression",  # Query terms
    "retmax": 100,  # Number of results to fetch
    "usehistory": "y",  # Save search results for later retrieval
    "retmode": "json",  # Get results in JSON format
}
response = requests.get(url, params=params)
search_results = response.json()

# Extract WebEnv, QueryKey, and count of results
webenv = search_results['esearchresult']['webenv']
query_key = search_results['esearchresult']['querykey']
result_count = int(search_results['esearchresult']['count'])

print(f"Search Results Found: {result_count}")
print(f"WebEnv: {webenv}, QueryKey: {query_key}")

Search Results Found: 2290577
WebEnv: MCID_686593bde41e383b9f0848ea, QueryKey: 1


In [4]:
# Fetching detailed records 
fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
records = []
batch_size = 100
xml_extracted = []

# for retstart in range(0, result_count, batch_size):  # Fetch in batches of 100
for retstart in range(0, 80, batch_size):  # Fetch in batches of 100
    params_efetch = {
        "db": "pubmed",
        "WebEnv": webenv,
        "query_key": query_key,
        # "retstart": retstart,
        "retmax": batch_size,  # Maximum records to fetch at a time
        "retmode": "xml",  # XML format
        "rettype": "abstract",
    }
    response_efetch = requests.get(fetch_url, params=params_efetch)
    
    metadata = response_efetch.text
    root = ET.fromstring(metadata)

    # Extract details from each article in the XML response
    for article in root.findall(".//PubmedArticle"): # finds only elements with a tag which are direct children of the current element
        # Extract the title
        title = article.find(".//ArticleTitle") # finds the first child with a particular tag
        title = title.text if title is not None else "No Title Found"
        
        # Extract the abstract
        abstracts = article.findall(".//Article/Abstract/AbstractText")
        for abstract in abstracts:
            if abstract.text:
                xml_extracted.append(abstract.text)

        # Extract the authors (if any)
        authors = article.findall(".//Article/AuthorList/Author")
        author_names = []
        for author in authors:
            last_name = author.find("LastName")
            first_name = author.find("ForeName")
            # if last_name is not None and first_name is not None:
            if last_name is not None and last_name.text and first_name is not None and first_name.text:
                author_names.append(f"{first_name.text} {last_name.text}")
            else:
                print(f"Missing or empty names in author: {author}")
        authors = ", ".join(author_names) if author_names else "No Authors Found"

        # Print or process the extracted data
        print(f"")
        print(f"Title: {title}")
        print(f"Abstract: {abstract.text}")
        print(f"Authors: {authors}")
        print("-" * 80)


Title: Causal Associations Between Neuroinflammation-Related Genes and Intracerebral Hemorrhage: An Integrated Study of Mendelian Randomization and Gene Functional Analysis.
Abstract: The findings reveal a significant genetic influence of CHUK and CTLA4 on ICH risk, provide potential targets for future therapeutic interventions, which could lead to the development of more effective treatment strategies for ICH.
Authors: Quanming Zhou, Shejuan Wu, Yuanbao Kang
--------------------------------------------------------------------------------

Title: Progress in research on DDOST dysregulation in related diseases.
Abstract: DDOST is an important subunit of N-glycosylated oligosaccharyltransferase and is closely related to protein N-glycosylation. Some studies have reported that abnormal expression of DDOST is associated with congenital disorders of glycosylation, solid tumours and other diseases. To better understand the progress of research on DDOST in diseases, we herein provide a compr

In [7]:
url = "http://localhost:11434/api/generate"
     
abstract_specific_prompt = f"""For following XML: {xml_extracted}, please parse the content and extract specific information of all 215 articles.
    1. Title
    2. Core Metadata:
        - Disease name,
        - Disease category,
    3. Marker information:
        - Marker name (e.g. BRCA1, C-reative protein),
        - Marker type (e.g. gene, protein, metabolite),
        - Marker ID (e.g. HGNC ID for genes),
        - Marker location (e.g. chromosomal location for genetic markers),
    4. Association type:
        - Association type: type of association (e.g. expression, genetic variation),
        - Strength of association (e.g. score or qualitative measure),
        - Evidence level: Evidence qualtiy (e.g. clinical trial or pre-clinical trial),
        - Statistical metrics: metrics like p-value, odds ratio, confidence intervals,
        - Directionality: positive or negative association,
        - Functional impact: description of how the marker affects the disease (e.g. promotes progression or acts as a suppressor)

# Input Data: {xml_extracted}

Please answer in JSON only using the following template. Please keep the output within the ''. 

# Output -> List[JSON]:
    Abstract title: 'Title',
    Disease name: 'Name',
    Disease category: 'Category',
    Marker name: 'Name',
    Marker type: 'Type',
    Marker ID: 'ID',
    Marker location: 'Location',
    Association type: 'Type',
    Strength of association: 'Strength',
    Evidence level: 'Level',
    Statistical metrics: 'Metrics',
    Directionality: 'Direction',
    Functional impact: 'Impact description',


if none is found please put 'no information' instead of leaving it blank
"""

payload = json.dumps({
    "model": "phi4",
    "prompt": abstract_specific_prompt,
    "stream": False
})

headers = {
  'Content-Type': 'application/json'
}

response = requests.request("POST", url, headers=headers, data=payload)

print(response.text)

{"model":"phi4","created_at":"2025-07-02T20:26:51.5482317Z","response":"```json\n[\n    {\n        \"Abstract title\": \"Role of AD-MSCs-CM in Breast Cancer Treatment\",\n        \"Disease name\": \"Breast cancer\",\n        \"Disease category\": \"Cancer\",\n        \"Marker name\": \"no information\",\n        \"Marker type\": \"no information\",\n        \"Marker ID\": \"no information\",\n        \"Marker location\": \"no information\",\n        \"Association type\": \"no information\",\n        \"Strength of association\": \"no information\",\n        \"Evidence level\": \"no information\",\n        \"Statistical metrics\": \"no information\",\n        \"Directionality\": \"no information\",\n        \"Functional impact\": \"Induction of apoptosis and inhibition of migration in MDA-MB-231 breast cancer cells\"\n    },\n    {\n        \"Abstract title\": \"Investigation of Inflammatory Markers in AAA\",\n        \"Disease name\": \"Abdominal aortic aneurysm\",\n        \"Disease ca

In [9]:
response_output = response.json()['response'].replace('```', '').replace('json\n', '').replace('\n', '')
cleaned_response_output = json.loads(re.sub(' +', ' ', response_output))
cleaned_response_output

[{'Abstract title': 'Role of AD-MSCs-CM in Breast Cancer Treatment',
  'Disease name': 'Breast cancer',
  'Disease category': 'Cancer',
  'Marker name': 'no information',
  'Marker type': 'no information',
  'Marker ID': 'no information',
  'Marker location': 'no information',
  'Association type': 'no information',
  'Strength of association': 'no information',
  'Evidence level': 'no information',
  'Statistical metrics': 'no information',
  'Directionality': 'no information',
  'Functional impact': 'Induction of apoptosis and inhibition of migration in MDA-MB-231 breast cancer cells'},
 {'Abstract title': 'Investigation of Inflammatory Markers in AAA',
  'Disease name': 'Abdominal aortic aneurysm',
  'Disease category': 'Cardiovascular disease',
  'Marker name': 'no information',
  'Marker type': 'Proteins',
  'Marker ID': 'no information',
  'Marker location': 'Serum',
  'Association type': 'Inflammatory markers associated with AAA',
  'Strength of association': 'no information',
 

In [10]:
df = pd.DataFrame(cleaned_response_output)
df.to_csv('pubmed_results.csv')
df.head(50)

Unnamed: 0,Abstract title,Disease name,Disease category,Marker name,Marker type,Marker ID,Marker location,Association type,Strength of association,Evidence level,Statistical metrics,Directionality,Functional impact
0,Role of AD-MSCs-CM in Breast Cancer Treatment,Breast cancer,Cancer,no information,no information,no information,no information,no information,no information,no information,no information,no information,Induction of apoptosis and inhibition of migra...
1,Investigation of Inflammatory Markers in AAA,Abdominal aortic aneurysm,Cardiovascular disease,no information,Proteins,no information,Serum,Inflammatory markers associated with AAA,no information,no information,Expression levels of 92 inflammation-related p...,no information,Chronic inflammation associated with AAA
2,Detection of Gene Mutations in Endometrial Cancer,Endometrial cancer (EC),Cancer,no information,Genetic mutations,no information,no information,Mutation status and expression levels of genes...,no information,Data from 97 EC patients,Mutation rate data,no information,Understanding genetic basis for EC
3,Investigation of Immune Responses in RSV-Infec...,Respiratory syncytial virus (RSV) infection,Infectious disease,no information,Immunological markers,no information,Human airway organoids,Immune responses at different stages of differ...,no information,Experimental models using FLO and iAO,Expression levels of infection receptor protei...,Stronger immune response observed in mature ai...,Inhibition of RSV replication by mature airway...
