In [1]:
import random
import requests
from Bio import Entrez

### **The code cells below accomplishes the following:**

1) Find a way to get random pubmed ID’s (PMID) 

2) For each PMID, get the title, abstract, MeSH terms, and keywords, and save this data

In [None]:
def is_valid_pmid(pmid):
    try:
        # Query PubMed API to check if PMID exists
        response = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pmid}&retmode=json')
        data = response.json()
        return 'error' not in data
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

def generate_random_pmid():
    while True:
        # Generate a random number within the valid PMID range
        random_pmid = random.randint(1, 30000000)
        
        # Check if the generated PMID is valid
        if is_valid_pmid(random_pmid):
            return random_pmid

# Generate 10 random, valid PMIDs and store them in a list
generated_pmids = [generate_random_pmid() for _ in range(10)]

# Print the generated PMIDs
print("Generated PMIDs:", generated_pmids)

Entrez.email = "richard.finney@torontomu.ca"

results = []

for PMID in generated_pmids:
    handle = Entrez.efetch(db="pubmed", id=PMID, retmode="xml")
    record = Entrez.read(handle)
    articles = record['PubmedArticle']
    
    if articles:
        medline_citation = articles[0].get('MedlineCitation', {})
        article = medline_citation.get('Article', {})
        title = article.get('ArticleTitle', 'Title not available')

        # Retrieving abstract, handling KeyError if abstract is unavailable
        abstract = ''
        try:
            abstract_element = record['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
            if isinstance(abstract_element, list):
                abstract = ' '.join(abstract_element)
            elif isinstance(abstract_element, str):
                abstract = abstract_element
        except (KeyError, IndexError):
            pass

        mesh_headings = medline_citation.get('MeshHeadingList', [])
        mesh_terms = []

        # Retrieving mesh terms and handling KeyError if Mesh terms are unavailable
        for mesh_heading in mesh_headings:
            try:
                mesh_terms.append(mesh_heading['DescriptorName'])
            except KeyError:
                pass

        mesh_terms = [term.split(',')[0].strip("'").strip() for term in mesh_terms]

        # Retrieving keywords and handling KeyError if keywords are unavailable
        keywords = []
        try:
            keyword_list = record['PubmedArticle'][0]['MedlineCitation'].get('KeywordList', [])
            if keyword_list:
                keywords = [keyword.strip() for keyword in keyword_list[0]]
        except KeyError:
            pass

        results.append({
            'PMID': PMID,
            'Title': title,
            'Abstract': abstract,
            'MeshTerms': mesh_terms,
            'Keywords': keywords
        })

    else:
        results.append({
            'PMID': PMID,
            'Title': '',
            'Abstract': '',
            'MeshTerms': '',
            'Keywords': ''
        })
results


In [93]:
print("Generated PMIDs:", generated_pmids)

Generated PMIDs: [25264004, 14963222, 1300492, 27032909, 10026408, 29483605, 170368, 28446589, 27622469, 24304278]


The Output below is a List (results) containing the Title, Abstract, MesHTerms and Keywords extracted from PubMed based on each of the 10 randomly generated PMIDs

In [94]:
results

[{'PMID': 25264004,
  'Title': '[The clinical significance of typical reflux symptoms in diagnosing gastroesophageal reflux disease].',
  'Abstract': "To explore the clinical significance of typical reflux symptoms in the diagnosis of gastroesophageal reflux disease (GERD). Consecutive patients older than 16 years, who initially visited department of gastroenterology at clinic of Peking University Third Hospital from May 9, 2012 to Dec 31, 2012, were required to complete a self-reported GERD questionnaire. Upper endoscopy was performed in some selected patients. A total of 18 987 patients were enrolled with a response rate of 91.5%. The prevalence of symptom-defined GERD was 13.6% (2 579/18 987). A total of 4 357 (22.9%) patients underwent the upper endoscopy, and the diagnostic rates of reflux esophagitis, Barrett's esophagus, peptic ulcer disease, and upper gastrointestinal malignancy were 13.1% (572/4 357), 1.8% (78/4 357), 10.5% (456/4 357), and 1.7% (75/4 357), respectively. The i

### **In this part of the code, we:**

3) Create a ChatGPT prompt where you provide it with the title, abstract, MeSH terms, and keywords and it returns a PubMed runnable Boolean Query 

4) Post-process so that the query is runnable on PubMed

So as noted above, we are going to take the Title, Abstract, MeSH terms and keywords from each of the 10 randomly Generated PMIDs, and use ChatGPT to create a PubMed runnable Boolean Query. There is also post-fetching processing done here, as sometimes the reponse contains explanation to accompany the Query - we want just the Query for PubMed.

This Query forms the Golden-Standard, and will be appended to our original results list for organization.

In [95]:
import re

from openai import OpenAI

client = OpenAI(
    api_key="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",
)

def get_completion(prompt, model="gpt-4-0125-preview"):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
    temperature=0,
    )
    return response.choices[0].message.content

for result in results:

    prompt = "For the following Title, Abstract, MeSH Terms and Keywords, Can you generate a PubMed runnable Boolean Query. Title = " + str(result['Title']) + ", Abstract: " + str(result['Abstract']) + ", MeSH Terms: " + str(result['MeshTerms']) +", keywords: " + str(result['Keywords'])
    response = get_completion(prompt)
    response
    # Performing Post Processing to just grab the Query produced (ChatGPT writes out this whole explanation)
    matches = re.search(r'```(.*?)```', response, re.DOTALL)
    if matches:
        query = matches.group(1).strip()

    print('-----------------------------------------------------------------------')
    print('')
    
    print("PMID:", result['PMID'])
    print('')
    print("Parsed Query:")
    print(query)
    result['Query'] = query
    print('')
    print('-----------------------------------------------------------------------')

-----------------------------------------------------------------------

PMID: 25264004

Parsed Query:
("Gastroesophageal Reflux"[Mesh] AND "Barrett Esophagus"[Mesh]) OR ("Gastroesophageal Reflux/diagnosis"[Mesh] AND "Gastroscopy"[Mesh]) OR ("Gastroesophageal Reflux/epidemiology"[Mesh] AND "Incidence"[Mesh]) OR ("Gastroesophageal Reflux/epidemiology"[Mesh] AND "Prevalence"[Mesh]) OR ("Gastroesophageal Reflux"[Mesh] AND "Risk Factors"[Mesh]) AND ("Humans"[Mesh]) AND ("reflux symptoms" OR "diagnostic rates of reflux esophagitis" OR "Barrett's esophagus" OR "peptic ulcer disease" OR "upper gastrointestinal malignancy" OR "upper endoscopy" OR "GERD questionnaire")

-----------------------------------------------------------------------
-----------------------------------------------------------------------

PMID: 14963222

Parsed Query:
("Gastroesophageal Reflux"[Mesh] AND "Barrett Esophagus"[Mesh]) OR ("Gastroesophageal Reflux/diagnosis"[Mesh] AND "Gastroscopy"[Mesh]) OR ("Gastroesophagea

The cell below displays the list that contains each of the 10 randomly generated PMIDs along with their Titles, Abstracts, MesH Terms, Keywords and the Golden-Standard ChatGPT Generated PubMed runnable Boolean Query generated with all this information.

In [96]:
results

[{'PMID': 25264004,
  'Title': '[The clinical significance of typical reflux symptoms in diagnosing gastroesophageal reflux disease].',
  'Abstract': "To explore the clinical significance of typical reflux symptoms in the diagnosis of gastroesophageal reflux disease (GERD). Consecutive patients older than 16 years, who initially visited department of gastroenterology at clinic of Peking University Third Hospital from May 9, 2012 to Dec 31, 2012, were required to complete a self-reported GERD questionnaire. Upper endoscopy was performed in some selected patients. A total of 18 987 patients were enrolled with a response rate of 91.5%. The prevalence of symptom-defined GERD was 13.6% (2 579/18 987). A total of 4 357 (22.9%) patients underwent the upper endoscopy, and the diagnostic rates of reflux esophagitis, Barrett's esophagus, peptic ulcer disease, and upper gastrointestinal malignancy were 13.1% (572/4 357), 1.8% (78/4 357), 10.5% (456/4 357), and 1.7% (75/4 357), respectively. The i

### **Finally, we:**

5) Run each query through PubMed, get the top 5-10 results from the query (That boolean query will be considered the gold-standard for those 5-10 documents). You also need to get the title, abstract, MeSH terms, and Keywords for each of those documents and save all that data

In [97]:
def search_pubmed(query, num_results=5):
    Entrez.email = "richard.finney@torontomu.ca"

    # Search query in Pubmed database
    handle = Entrez.esearch(db="pubmed", term=query, retmax=num_results)
    record = Entrez.read(handle)
    handle.close()

    # Retrieve the list of PubMed IDs (PMID)
    pmids = record["IdList"]

    return pmids

In [98]:
list_of_Golden_PMIDs = []

for result in results:
    result['Golden_PMIDs'] = search_pubmed(result['Query'], num_results=5)
    print('--------------------------------------------------------------------------------------------------------------')
    print("Original Generated PMID:")
    print(result['PMID'])
    print('')
    print("ChatGPT-created Golden Standard Query based on Title, Abstract, MeSH Terms and Key words of original PMID:")
    print(result['Query'])
    print('')
    print("Top 5 PMIDs retrieved from the Golden Standard Query:")
    print(result['Golden_PMIDs'])
    print('')
    
    #I am also storting these 'Golden PMIDs' in a separate list so we can capture Title, Abstract, MeshTerms and Keywords from these
    
    #these are stored separately because the original list is getting too monstorous lol
    list_of_Golden_PMIDs.append(search_pubmed(result['Query'], num_results=5))

--------------------------------------------------------------------------------------------------------------
Original Generated PMID:
25264004

ChatGPT-created Golden Standard Query based on Title, Abstract, MeSH Terms and Key words of original PMID:
("Gastroesophageal Reflux"[Mesh] AND "Barrett Esophagus"[Mesh]) OR ("Gastroesophageal Reflux/diagnosis"[Mesh] AND "Gastroscopy"[Mesh]) OR ("Gastroesophageal Reflux/epidemiology"[Mesh] AND "Incidence"[Mesh]) OR ("Gastroesophageal Reflux/epidemiology"[Mesh] AND "Prevalence"[Mesh]) OR ("Gastroesophageal Reflux"[Mesh] AND "Risk Factors"[Mesh]) AND ("Humans"[Mesh]) AND ("reflux symptoms" OR "diagnostic rates of reflux esophagitis" OR "Barrett's esophagus" OR "peptic ulcer disease" OR "upper gastrointestinal malignancy" OR "upper endoscopy" OR "GERD questionnaire")

Top 5 PMIDs retrieved from the Golden Standard Query:
['38311635', '38183594', '38019753', '37979936', '37932595']

----------------------------------------------------------------

Now we must also retrieve the Titles, Abstracts, MeSH Terms and Keywords for each of these Golden-Standard Query retrieved PMIDs and save the data. I stored these in a separate list (results2), as the original list (results) is getting too big.

In [103]:
results2 = []

for sublist in list_of_Golden_PMIDs:
    for item in sublist:
        
        handle = Entrez.efetch(db="pubmed", id=item, retmode="xml")
        record = Entrez.read(handle)
        articles = record['PubmedArticle']
    
        if articles:
            medline_citation = articles[0].get('MedlineCitation', {})
            article = medline_citation.get('Article', {})
            title = article.get('ArticleTitle', 'Title not available')

            # Retrieving abstract, handling KeyError if abstract is unavailable
            abstract = ''
            try:
                abstract_element = record['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
                if isinstance(abstract_element, list):
                    abstract = ' '.join(abstract_element)
                elif isinstance(abstract_element, str):
                    abstract = abstract_element
            except (KeyError, IndexError):
                pass

            mesh_headings = medline_citation.get('MeshHeadingList', [])
            mesh_terms = []

            # Retrieving mesh terms and handling KeyError if Mesh terms are unavailable
            for mesh_heading in mesh_headings:
                try:
                    mesh_terms.append(mesh_heading['DescriptorName'])
                except KeyError:
                    pass

            mesh_terms = [term.split(',')[0].strip("'").strip() for term in mesh_terms]

            # Retrieving keywords and handling KeyError if keywords are unavailable
            keywords = []
            try:
                keyword_list = record['PubmedArticle'][0]['MedlineCitation'].get('KeywordList', [])
                if keyword_list:
                    keywords = [keyword.strip() for keyword in keyword_list[0]]
            except KeyError:
                pass

            results2.append({
                'PMID': item,
                'Title': title,
                'Abstract': abstract,
                'MeshTerms': mesh_terms,
                'Keywords': keywords
            })

        else:
            results2.append({
                'PMID': item,
                'Title': '',
                'Abstract': '',
                'MeshTerms': '',
                'Keywords': ''
            })
        

        

Here is our final list, that contains Title, Abstract, MeSH Terms, and Keywords for each of out top-5 PMIDs retrieved from each of our Golden-Standard Queries generated from the original 10 randomly generated PMIDs

In [104]:
results2

[{'PMID': '38311635',
  'Title': "Prevalence and associated factors of worry for cancer in patients with a Barrett's esophagus.",
  'Abstract': "Although the risk of cancer progression in a Barrett's esophagus (BE) is very low, worrying about cancer is known as an important factor affecting HRQoL. The aim of this study was to determine the proportion of BE patients with high levels of worry for cancer, to compare outcomes of patients endoscopically treated for BE neoplasia (DBE), non-dysplastic BE patients (NDBE) and patients with reflux symptoms, and to examine associated factors. We performed a cross sectional, exploratory, self-administered questionnaire study using the cancer worry scale, and the reflux disease questionnaire. A total of 192 DBE patients, 213 NDBE patients and 111 refractory reflux symptom patients were included from October 2019 until July 2021, 76.8% of BE participants were male and aged 66.9\xa0years. High cancer worry was reported in 40.6% of the DBE patients an