In [6]:
import requests
import xml.etree.ElementTree as ET
import re
import requests
import json
import pandas as pd

In [7]:
# Searching the Database
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
params = {
    "db": "pubmed",
    "term": "science[journal] AND biomarker OR protein marker OR gene expression",  # Query terms
    "retmax": 100,  # Number of results to fetch
    "usehistory": "y",  # Save search results for later retrieval
    "retmode": "json",  # Get results in JSON format
}
response = requests.get(url, params=params)
search_results = response.json()

# Extract WebEnv, QueryKey, and count of results
webenv = search_results['esearchresult']['webenv']
query_key = search_results['esearchresult']['querykey']
result_count = int(search_results['esearchresult']['count'])

print(f"Search Results Found: {result_count}")
print(f"WebEnv: {webenv}, QueryKey: {query_key}")

Search Results Found: 2244343
WebEnv: MCID_67994be06362eae263097cc8, QueryKey: 1


In [8]:
# Fetching detailed records 
fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
records = []
batch_size = 100
xml_extracted = []

# for retstart in range(0, result_count, batch_size):  # Fetch in batches of 100
for retstart in range(0, 80, batch_size):  # Fetch in batches of 100
    params_efetch = {
        "db": "pubmed",
        "WebEnv": webenv,
        "query_key": query_key,
        # "retstart": retstart,
        "retmax": batch_size,  # Maximum records to fetch at a time
        "retmode": "xml",  # XML format
        "rettype": "abstract",
    }
    response_efetch = requests.get(fetch_url, params=params_efetch)
    
    metadata = response_efetch.text
    root = ET.fromstring(metadata)

    # Extract details from each article in the XML response
    for article in root.findall(".//PubmedArticle"): # finds only elements with a tag which are direct children of the current element
        # Extract the title
        title = article.find(".//ArticleTitle") # finds the first child with a particular tag
        title = title.text if title is not None else "No Title Found"
        
        # Extract the abstract
        abstracts = article.findall(".//Article/Abstract/AbstractText")
        for abstract in abstracts:
            if abstract.text:
                xml_extracted.append(abstract.text)

        # Extract the authors (if any)
        authors = article.findall(".//Article/AuthorList/Author")
        author_names = []
        for author in authors:
            last_name = author.find("LastName")
            first_name = author.find("ForeName")
            # if last_name is not None and first_name is not None:
            if last_name is not None and last_name.text and first_name is not None and first_name.text:
                author_names.append(f"{first_name.text} {last_name.text}")
            else:
                print(f"Missing or empty names in author: {author}")
        authors = ", ".join(author_names) if author_names else "No Authors Found"

        # Print or process the extracted data
        print(f"")
        print(f"Title: {title}")
        print(f"Abstract: {abstract.text}")
        print(f"Authors: {authors}")
        print("-" * 80)


Title: Association of Suicidal Status, Inflammation Markers, and Resting-State Functional Activity and Connectivity in Patients With Major Depressive Disorder.
Abstract: None
Authors: Emilie Olié, Guillaume Clain, Manon Malestroit, Dimitri Fiedos, Fabrice Cognasse, Jeremy Deverdun, Emmanuelle Le Bars, Philippe Courtet
--------------------------------------------------------------------------------

Title: Lidocaine Inhibits the Proliferation of Non-Small Cell Lung Cancer and Exerts Anti-Inflammatory Effects Through the TLR-9/MyD88/NF-κB Pathway.
Abstract: Lung cancer represents a significant global health burden, with non-small cell lung cancer (NSCLC) being the most common subtype. The current standard of care for NSCLC has limited efficacy, highlighting the necessity for innovative treatment options. Lidocaine, traditionally recognized as a local anesthetic, has emerged as a compound with potential antitumor and anti-inflammatory capabilities. This study was designed to explore the 

In [9]:
url = "http://localhost:11434/api/generate"
     
abstract_specific_prompt = f"""For following XML: {xml_extracted}, please parse the content and extract specific information of articles.
    1. Title
    2. Core Metadata:
        - Disease name,
        - Disease category,
    3. Marker information:
        - Marker name (e.g. BRCA1, C-reative protein),
        - Marker type (e.g. gene, protein, metabolite),
        - Marker ID (e.g. HGNC ID for genes),
        - Marker location (e.g. chromosomal location for genetic markers),
    4. Association type:
        - Association type: type of association (e.g. expression, genetic variation),
        - Strength of association (e.g. score or qualitative measure),
        - Evidence level: Evidence qualtiy (e.g. clinical trial or pre-clinical trial),
        - Statistical metrics: metrics like p-value, odds ratio, confidence intervals,
        - Directionality: positive or negative association,
        - Functional impact: description of how the marker affects the disease (e.g. promotes progression or acts as a suppressor)

# Input Data: {xml_extracted}

Please answer in JSON only using the following template. Please keep the output within the []:
# Output:
Abstract title: [Title],
Disease name: [Name],
Disease category: [Category],
Marker name: [Name],
Marker type: [Type],
Marker ID: [ID],
Marker location: [Location],
Association type: [Type],
Strength of association: [Strength],
Evidence level: [Level],
Statistical metrics: [Metrics],
Directionality: [Direction],
Functional impact: [Impact description]

"""

payload = json.dumps({
    "model": "phi4",
    "prompt": abstract_specific_prompt,
    "stream": False
})

headers = {
  'Content-Type': 'application/json'
}

response = requests.request("POST", url, headers=headers, data=payload)

print(response.text)


{"model":"phi4","created_at":"2025-01-28T21:32:59.6302763Z","response":"```json\n[\n  {\n    \"Abstract title\": \"Effects of High Temperature Stress on Metabolism and Physiology in Pigs\",\n    \"Disease name\": \"High temperature stress\",\n    \"Disease category\": \"Environmental stress condition\",\n    \"Marker name\": \"\",\n    \"Marker type\": \"\",\n    \"Marker ID\": \"\",\n    \"Marker location\": \"\",\n    \"Association type\": \"\",\n    \"Strength of association\": \"\",\n    \"Evidence level\": \"\",\n    \"Statistical metrics\": \"\",\n    \"Directionality\": \"\",\n    \"Functional impact\": \"\"\n  },\n  {\n    \"Abstract title\": \"Extracellular Vesicles from Human Umbilical Cord Blood-Derived Mesenchymal Stromal Cells in Diabetic Kidney Disease\",\n    \"Disease name\": \"Diabetic kidney disease\",\n    \"Disease category\": \"Chronic kidney disease\",\n    \"Marker name\": \"\",\n    \"Marker type\": \"\",\n    \"Marker ID\": \"\",\n    \"Marker location\": \"\",

In [10]:
response_output = response.json()['response'].replace('```', '').replace('json\n', '').replace('\n', '')
cleaned_response_output = json.loads(re.sub(' +', ' ', response_output))
cleaned_response_output

[{'Abstract title': 'Effects of High Temperature Stress on Metabolism and Physiology in Pigs',
  'Disease name': 'High temperature stress',
  'Disease category': 'Environmental stress condition',
  'Marker name': '',
  'Marker type': '',
  'Marker ID': '',
  'Marker location': '',
  'Association type': '',
  'Strength of association': '',
  'Evidence level': '',
  'Statistical metrics': '',
  'Directionality': '',
  'Functional impact': ''},
 {'Abstract title': 'Extracellular Vesicles from Human Umbilical Cord Blood-Derived Mesenchymal Stromal Cells in Diabetic Kidney Disease',
  'Disease name': 'Diabetic kidney disease',
  'Disease category': 'Chronic kidney disease',
  'Marker name': '',
  'Marker type': '',
  'Marker ID': '',
  'Marker location': '',
  'Association type': '',
  'Strength of association': '',
  'Evidence level': '',
  'Statistical metrics': '',
  'Directionality': '',
  'Functional impact': ''},
 {'Abstract title': 'Role of microRNA-126 in Osteoarthritis',
  'Disease

In [11]:
df = pd.DataFrame(cleaned_response_output)
df.head()

Unnamed: 0,Abstract title,Disease name,Disease category,Marker name,Marker type,Marker ID,Marker location,Association type,Strength of association,Evidence level,Statistical metrics,Directionality,Functional impact
0,Effects of High Temperature Stress on Metaboli...,High temperature stress,Environmental stress condition,,,,,,,,,,
1,Extracellular Vesicles from Human Umbilical Co...,Diabetic kidney disease,Chronic kidney disease,,,,,,,,,,
2,Role of microRNA-126 in Osteoarthritis,Osteoarthritis,Degenerative joint disease,microRNA-126,Non-coding RNA/miRNA,,,Regulatory,,,,Negative regulation,Involvement in inflammation and cartilage degr...
3,Chitosan Nanoparticles for DNA Delivery,,,,,,,,,,,,
4,Myocardial Ischemia/Reperfusion Injury and miR...,Ischemic heart disease,Cardiovascular disease,miR-181c-5p,Non-coding RNA/miRNA,,,Regulatory,,,,Positive regulation of damage pathways,Involvement in inflammatory and apoptotic proc...
