In [2]:
import random
import requests
from Bio import Entrez

import warnings
warnings.filterwarnings("ignore")

In [None]:
def is_valid_pmid(pmid, timeout=10):
    """Return True when PubMed's ESummary endpoint has a record for ``pmid``.

    Args:
        pmid: PubMed identifier to look up; it is interpolated into the
            request URL as-is.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        bool: True if the JSON response lists at least one uid and none of
        the listed uid entries carries an ``error`` field. False otherwise,
        including on any network, HTTP-status, or JSON-decoding failure
        (the failure is printed, not raised).
    """
    try:
        # Query PubMed API to check if PMID exists
        response = requests.get(
            f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=json',
            timeout=timeout,
        )
        # Treat HTTP-level failures (e.g. rate limiting) as "not validated".
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    # A top-level "error" key means the whole request was rejected.
    if not isinstance(data, dict) or 'error' in data:
        return False

    # ESummary reports an unknown id as result[<uid>]["error"] rather than as
    # a top-level error, so the per-uid entries have to be inspected as well.
    result = data.get('result')
    if not isinstance(result, dict):
        return False
    uids = result.get('uids') or []
    if not uids:
        return False
    return all(
        isinstance(result.get(uid), dict) and 'error' not in result[uid]
        for uid in uids
    )

def generate_random_pmid(min_pmid=1, max_pmid=30000000, max_attempts=1000):
    """Draw random integers until one is accepted by ``is_valid_pmid``.

    Args:
        min_pmid: Smallest candidate identifier (inclusive).
        max_pmid: Largest candidate identifier (inclusive).
        max_attempts: Maximum number of candidates to try. ``None`` retries
            without limit, which can loop forever when validation keeps
            failing (for example while the network is unavailable).

    Returns:
        int: The first candidate for which ``is_valid_pmid`` returned a
        truthy value.

    Raises:
        RuntimeError: If ``max_attempts`` candidates were rejected in a row.
    """
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1

        # Generate a random number within the candidate PMID range
        random_pmid = random.randint(min_pmid, max_pmid)

        # Check if the generated PMID is valid
        if is_valid_pmid(random_pmid):
            return random_pmid

    raise RuntimeError(
        f"No valid PMID found after {max_attempts} attempts; "
        "check network access to the PubMed API."
    )

# Number of random, valid PMIDs to sample. (An earlier comment said 10, but
# the code requested 100; the executed value is kept.)
NUM_RANDOM_PMIDS = 100

# Generate the random, valid PMIDs and store them in a list
generated_pmids = [generate_random_pmid() for _ in range(NUM_RANDOM_PMIDS)]

# Print the generated PMIDs
print("Generated PMIDs:", generated_pmids)

# Contact address attached to the Entrez requests made below.
Entrez.email = "richard.finney@torontomu.ca"

# One dict per sampled PMID with keys PMID, Title, Abstract, MeshTerms, Keywords.
results = []

for PMID in generated_pmids:
    handle = Entrez.efetch(db="pubmed", id=PMID, retmode="xml")
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the handle, even when parsing raises.
        handle.close()
    articles = record['PubmedArticle']

    if not articles:
        # Nothing under 'PubmedArticle' for this PMID: keep a placeholder row
        # whose field types match the populated rows (strings for text,
        # lists for term collections).
        results.append({
            'PMID': PMID,
            'Title': '',
            'Abstract': '',
            'MeshTerms': [],
            'Keywords': []
        })
        continue

    medline_citation = articles[0].get('MedlineCitation', {})
    article = medline_citation.get('Article', {})
    title = article.get('ArticleTitle', 'Title not available')

    # AbstractText may be a list of parts or a single string; anything else
    # (including a missing abstract) yields an empty abstract.
    abstract_element = article.get('Abstract', {}).get('AbstractText', '')
    if isinstance(abstract_element, list):
        abstract = ' '.join(abstract_element)
    elif isinstance(abstract_element, str):
        abstract = abstract_element
    else:
        abstract = ''

    # Collect descriptor names, skipping headings that lack one.
    # NOTE(review): split(',')[0] keeps only the text before the first comma,
    # so a descriptor such as "X, Y" is recorded as "X" — confirm this
    # truncation is intended before relying on these terms in queries.
    mesh_terms = [
        mesh_heading['DescriptorName'].split(',')[0].strip("'").strip()
        for mesh_heading in medline_citation.get('MeshHeadingList', [])
        if 'DescriptorName' in mesh_heading
    ]

    # Only the first keyword list of the citation is used.
    keyword_list = medline_citation.get('KeywordList', [])
    keywords = [keyword.strip() for keyword in keyword_list[0]] if keyword_list else []

    results.append({
        'PMID': PMID,
        'Title': title,
        'Abstract': abstract,
        'MeshTerms': mesh_terms,
        'Keywords': keywords
    })
results

### **Dr. Ensan, Leandra, could I ask you to review the prompts given to ChatGPT and the responses retrieved, to see if we can improve these in any way?**

In [7]:
import os
import re

from openai import OpenAI

# SECURITY: the API key must never be committed with the notebook. It is read
# from the OPENAI_API_KEY environment variable; a missing variable raises
# KeyError here instead of failing later on the first request.
# The key that was previously hard-coded in this cell should be treated as
# compromised and revoked.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the ``content`` of the one user message.
        model: Model name forwarded to the chat-completions call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0,
    }
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

# Ask the model for one PubMed query per article and store it on the row
# under the 'Query' key (later cells read result['Query']).
for result in results:

    # The title, MeSH terms and keywords are interpolated with their default
    # string form, so the two lists appear in Python list notation.
    prompt = (
        f"Based on the following SLR Title ({result['Title']}) please provide a complex "
        "pubmed Entrez formatted query without descriptions, in plain text, such that "
        "it may be used directly on Pubmed’s website. "
        f"The following Mesh Terms ({result['MeshTerms']}) and Keywords ({result['Keywords']}) "
        "extracted from this article may be relevant as well. Please only include these "
        "in the query if they are relevant and improve the formulation of an effective query."
    )
    print(prompt)
    response = get_completion(prompt)

    # The reply is used verbatim; no post-processing is applied to it.
    query = response
    print('')

    print("Parsed Query:")
    print('')
    print(query)
    result['Query'] = query
    print('')
    print('-----------------------------------------------------------------------')

Based on the following SLR Title (Deformation of dorsal root ganglion due to pressure transients of venous blood and cerebrospinal fluid in the cervical vertebral canal.) please provide a complex pubmed Entrez formatted query without descriptions, in plain text, such that they may be used directly on Pubmed’s website. The follow Mesh Terms (['Blood Pressure', 'Cerebrospinal Fluid Pressure', 'Cervical Vertebrae', 'Ganglia', 'Humans', 'Neck', 'Spinal Canal', 'Whiplash Injuries']) and Keywords (['Cerebrospinal fluid', 'Dorsal root ganglion', 'Dura mater', 'Fluid-structure interaction', 'Pressure transients', 'Venous blood', 'Whiplash']) extracted from this article may be relevant as well. Please only include these in the query if they are revelent and improve the formulation an effective query.

Parsed Query:

("Blood Pressure"[Mesh] OR "Cerebrospinal Fluid Pressure"[Mesh] OR "Cervical Vertebrae"[Mesh] OR "Ganglia"[Mesh] OR "Humans"[Mesh] OR "Neck"[Mesh] OR "Spinal Canal"[Mesh] OR "Whipla

In [8]:
def search_pubmed(query, num_results=5):
    """Run ``query`` against PubMed and return the matching PubMed IDs.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        num_results: Upper bound on returned ids, passed as ``retmax``.

    Returns:
        The ``"IdList"`` entry of the parsed search result.
    """
    # Set here as well so the function works when its cell is run on its own.
    Entrez.email = "richard.finney@torontomu.ca"

    # Search query in Pubmed database
    handle = Entrez.esearch(db="pubmed", term=query, retmax=num_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()

    # Retrieve the list of PubMed IDs (PMID)
    return record["IdList"]

In [9]:
list_of_Golden_PMIDs = []

# Run every generated query against PubMed, store the hits on the row under
# 'Golden_PMIDs', and print a labelled report for each row.
for result in results:
    result['Golden_PMIDs'] = search_pubmed(result['Query'], num_results=5)
    print('--------------------------------------------------------------------------------------------------------------')

    # (heading, value) pairs in the order they are printed; each pair is
    # followed by one empty line.
    report_sections = (
        ("Original Generated PMID:", result['PMID']),
        ("Title Used to create Gold-Standard Query:", result['Title']),
        ("ChatGPT-created Golden Standard Query based on Title of original PMID:", result['Query']),
        ("Top 5 PMIDs retrieved from the Golden Standard Query:", result['Golden_PMIDs']),
    )
    for heading, value in report_sections:
        print(heading)
        print(value)
        print('')

--------------------------------------------------------------------------------------------------------------
Original Generated PMID:
29801662

Title Used to create Gold-Standard Query:
Deformation of dorsal root ganglion due to pressure transients of venous blood and cerebrospinal fluid in the cervical vertebral canal.

ChatGPT-created Golden Standard Query based on Title of original PMID:
("Blood Pressure"[Mesh] OR "Cerebrospinal Fluid Pressure"[Mesh] OR "Cervical Vertebrae"[Mesh] OR "Ganglia"[Mesh] OR "Humans"[Mesh] OR "Neck"[Mesh] OR "Spinal Canal"[Mesh] OR "Whiplash Injuries"[Mesh]) AND ("Cerebrospinal fluid"[Keyword] OR "Dorsal root ganglion"[Keyword] OR "Dura mater"[Keyword] OR "Fluid-structure interaction"[Keyword] OR "Pressure transients"[Keyword] OR "Venous blood"[Keyword] OR "Whiplash"[Keyword])

Top 5 PMIDs retrieved from the Golden Standard Query:
['38368403', '38368354', '38365357', '38363175', '38361318']

---------------------------------------------------------------

In [10]:
# Keep only the rows whose query returned exactly five PMIDs, copying the
# four fields needed downstream into a fresh dict per row.
cleansed_results = [
    {
        'PMID': row['PMID'],
        'Title': row['Title'],
        'Query': row['Query'],
        'Golden_PMIDs': row['Golden_PMIDs'],
    }
    for row in results
    if len(row['Golden_PMIDs']) == 5
]

In [11]:
# Look up the title of every golden PMID and attach the list of titles to
# its row under the 'Golden Titles' key.
for cleansed_result in cleansed_results:

    Golden_Titles = []

    for Golden_PMID in cleansed_result['Golden_PMIDs']:

        handle = Entrez.efetch(db="pubmed", id=Golden_PMID, retmode="xml")
        try:
            record = Entrez.read(handle)
        finally:
            # Always release the handle, even when parsing raises.
            handle.close()
        articles = record['PubmedArticle']

        if articles:
            medline_citation = articles[0].get('MedlineCitation', {})
            article = medline_citation.get('Article', {})
            title = article.get('ArticleTitle', 'Title not available')

            Golden_Titles.append(title)

        else:
            # Keep the positions aligned with 'Golden_PMIDs' when nothing
            # came back under 'PubmedArticle'.
            Golden_Titles.append('')

    cleansed_result['Golden Titles'] = Golden_Titles

In [12]:
import json

# Destination of the serialized rows
json_file_path = 'cleansed_results.json'

# Serialize the list of dictionaries with four-space indentation
with open(json_file_path, 'w') as json_file:
    json.dump(obj=cleansed_results, fp=json_file, indent=4)

print("JSON file created successfully.")

JSON file created successfully.


In [13]:
import json

def preprocess_intents_json(intents_file):
    """Render a JSON file of records as alternating User/Assistant lines.

    Args:
        intents_file: Path to a JSON file holding a list of objects, each
            with the keys ``'Golden Titles'`` and ``'Query'``.

    Returns:
        str: For every record, a ``User:`` line with the default string form
        of its ``'Golden Titles'`` value followed by an ``Assistant:`` line
        with its ``'Query'`` value; an empty string when there are no records.
    """
    with open(intents_file, "r") as source:
        records = json.load(source)

    return "".join(
        f"User: {record['Golden Titles']}\nAssistant: {record['Query']}\n"
        for record in records
    )


def save_preprocessed_data(preprocessed_data, output_file):
    """Write ``preprocessed_data`` to ``output_file`` as UTF-8 text.

    Args:
        preprocessed_data: The text to store.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, mode="w", encoding="utf-8") as sink:
        sink.write(preprocessed_data)


# Input written by the JSON export step, and the text file to produce from it.
intents_file, output_file = "cleansed_results.json", "Golden_Query_Titles.txt"

# Convert the JSON records to User/Assistant lines, then store them on disk.
preprocessed_data = preprocess_intents_json(intents_file)
save_preprocessed_data(preprocessed_data, output_file)