In [None]:
import random
import requests
from Bio import Entrez
import time

import warnings
warnings.filterwarnings("ignore")

In [None]:
def is_valid_pmid(pmid):
    """Return True if ``pmid`` resolves to an existing PubMed record.

    Queries the NCBI E-utilities ESummary endpoint in JSON mode.

    Args:
        pmid: Candidate PubMed identifier (int or numeric string).

    Returns:
        bool: True only when the response carries a summary for the requested
        record and no error is reported at either the request level or the
        record level. Any network or decoding failure is printed and treated
        as "not valid".
    """
    url = (
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
        f'?db=pubmed&id={pmid}&retmode=json'
    )
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        data = response.json()
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    # Request-level failures (e.g. rate limiting) show up as a top-level "error".
    if not isinstance(data, dict) or 'error' in data:
        return False

    # Record-level failures are nested: result[<uid>] holds its own "error"
    # entry when the id does not exist, so the top-level check is not enough.
    result = data.get('result')
    if not isinstance(result, dict):
        return False
    uids = result.get('uids') or []
    if not uids:
        return False
    return all(
        isinstance(result.get(uid), dict) and 'error' not in result[uid]
        for uid in uids
    )

def generate_random_pmid(max_pmid=30000000):
    """Draw random integers until one is accepted by ``is_valid_pmid``.

    Args:
        max_pmid: Inclusive upper bound of the range sampled from. Defaults to
            30000000, the bound this notebook has always used; pass a larger
            value to also sample more recently assigned identifiers.

    Returns:
        int: A value in ``[1, max_pmid]`` for which ``is_valid_pmid`` returned True.

    Note:
        Each rejected candidate costs one call to ``is_valid_pmid``, and the
        loop has no attempt limit: it only ends once a candidate is accepted.
    """
    while True:
        candidate = random.randint(1, max_pmid)
        if is_valid_pmid(candidate):
            return candidate

# Number of random, valid PMIDs to sample.
# NOTE(review): the earlier comment here said 10 while the code requested
# 100000; the code's value is kept — confirm which one was intended.
NUM_PMIDS = 100000

# Each element comes from its own generate_random_pmid() call, so the list may
# contain duplicates.
generated_pmids = [generate_random_pmid() for _ in range(NUM_PMIDS)]

Entrez.email = "richard.finney@torontomu.ca"

# How many times a single PMID is fetched before it is given up on.
MAX_FETCH_ATTEMPTS = 3

results = []

for PMID in generated_pmids:
    # Fetch and parse the record. A failed attempt is retried after a pause
    # (previously the PMID was dropped on the first error even though the
    # comment promised a retry); after MAX_FETCH_ATTEMPTS the PMID is skipped.
    for _attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            handle = Entrez.efetch(db="pubmed", id=PMID, retmode="xml")
            try:
                record = Entrez.read(handle)
            finally:
                # Release the connection whether or not parsing succeeded.
                handle.close()
            articles = record.get('PubmedArticle', [])  # may legitimately be empty
        except Exception as e:
            print(f"API access error for PMID {PMID}: {e}")
            time.sleep(5)  # pause before the next attempt / next PMID
        else:
            break
    else:
        print(f"Giving up on PMID {PMID} after {MAX_FETCH_ATTEMPTS} attempts")
        continue

    if articles:
        medline_citation = articles[0].get('MedlineCitation', {})
        article = medline_citation.get('Article', {})
        title = article.get('ArticleTitle', 'Title not available')

        # Abstract: stays '' when the record has none. A list of abstract
        # sections is flattened into one space-separated string.
        abstract = ''
        try:
            abstract_element = article['Abstract']['AbstractText']
            if isinstance(abstract_element, list):
                abstract = ' '.join(abstract_element)
            elif isinstance(abstract_element, str):
                abstract = abstract_element
        except (KeyError, IndexError):
            pass

        # MeSH terms: headings without a DescriptorName are ignored. Only the
        # text before the first comma of each descriptor is kept.
        mesh_headings = medline_citation.get('MeshHeadingList', [])
        mesh_terms = [
            mesh_heading['DescriptorName'].split(',')[0].strip("'").strip()
            for mesh_heading in mesh_headings
            if 'DescriptorName' in mesh_heading
        ]

        # Keywords: only the first keyword list of the record is used.
        keyword_list = medline_citation.get('KeywordList', [])
        keywords = [keyword.strip() for keyword in keyword_list[0]] if keyword_list else []

        results.append({
            'PMID': PMID,
            'Title': title,
            'Abstract': abstract,
            'MeshTerms': mesh_terms,
            'Keywords': keywords
        })

    else:
        # The fetch succeeded but returned no PubmedArticle entry: keep a
        # placeholder row so the PMID is still represented in `results`.
        results.append({
            'PMID': PMID,
            'Title': '',
            'Abstract': '',
            'MeshTerms': '',
            'Keywords': ''
        })


In [None]:
# Number of records collected in `results` by the fetch loop above.
len(results)

In [None]:
import re

from openai import OpenAI

# SECURITY: the API key must not live in the notebook. With no `api_key`
# argument the client reads it from the OPENAI_API_KEY environment variable and
# raises at construction time if that variable is unset.
# The key that used to be hard-coded here has been exposed and should be revoked.
client = OpenAI()

def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one user message of the conversation.
        model: Name of the chat model to call.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

for result in results:
    # Assemble the instruction sent to the model from the record's title,
    # MeSH terms and keywords (each rendered with str()).
    prompt = (
        "Based on the following SLR Title ("
        f"{result['Title']!s}"
        ") please provide a complex pubmed Entrez formatted query without descriptions, in plain text, such that they may be used directly on Pubmed’s website. The following Mesh Terms ("
        f"{result['MeshTerms']!s}"
        ") and Keywords ("
        f"{result['Keywords']!s}"
        ") extracted from this article may be relevant as well. Please only include these in the query if they are relevant and improve the formulation an effective query."
    )

    # Keep asking until a completion comes back; every failure is reported and
    # followed by a 5 second pause. There is no attempt limit.
    while True:
        try:
            query = get_completion(prompt)
        except Exception as e:
            print(f"Error: {e}. Retrying in 5 seconds...")
            time.sleep(5)
        else:
            break

    result['Query'] = query

In [None]:
def search_pubmed(query, num_results=5):
    """Run ``query`` against PubMed and return the matching PMIDs.

    Args:
        query: Search term passed to ESearch as ``term``.
        num_results: Maximum number of ids requested (``retmax``).

    Returns:
        The ``IdList`` entry of the parsed ESearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; callers in this
        notebook catch these and retry.
    """
    Entrez.email = "richard.finney@torontomu.ca"

    # Search query in Pubmed database
    handle = Entrez.esearch(db="pubmed", term=query, retmax=num_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails, so retries do not leak
        # one open connection per failed attempt.
        handle.close()

    # Retrieve the list of PubMed IDs (PMID)
    return record["IdList"]

In [None]:
# One entry per record in `results`: the PMIDs returned for that record's query.
list_of_Golden_PMIDs = []

for result in results:
    # Repeat the search until it succeeds, pausing 5 seconds after each failure.
    while True:
        try:
            golden_pmids = search_pubmed(result['Query'], num_results=5)
        except Exception as e:
            print(f"Error: {e}. Retrying in 5 seconds...")
            time.sleep(5)
        else:
            break

    result['Golden_PMIDs'] = golden_pmids
    list_of_Golden_PMIDs.append(golden_pmids)




#     print('--------------------------------------------------------------------------------------------------------------')
#     print("Original Generated PMID:")
#     print(result['PMID'])
#     print('')
#     print("Title Used to create Gold-Standard Query:")
#     print(result['Title'])
#     print('')
#     print("ChatGPT-created Golden Standard Query based on Title of original PMID:")
#     print(result['Query'])
#     print('')
#     print("Top 5 PMIDs retrieved from the Golden Standard Query:")
#     print(result['Golden_PMIDs'])
#     print('')

In [None]:
for result in results:

    # Parallel lists: exactly one entry is appended per id in
    # result['Golden_PMIDs'], so positions line up with that list.
    Golden_Titles = []
    Golden_Abstracts = []
    Golden_MeshTerms = []
    Golden_Keywords = []

    for Golden_PMID in result['Golden_PMIDs']:
        # Retry until the record has been fetched and parsed; every failure is
        # reported and followed by a 5 second pause.
        while True:
            try:
                handle = Entrez.efetch(db="pubmed", id=Golden_PMID, retmode="xml")
                try:
                    record = Entrez.read(handle)
                finally:
                    # Release the connection whether or not parsing succeeded.
                    handle.close()
            except Exception as e:
                print(f"API access error for PMID {Golden_PMID}: {e}. Retrying in 5 seconds...")
                time.sleep(5)
            else:
                break

        # .get() mirrors the first fetch cell: a response without a
        # 'PubmedArticle' entry is recorded as a blank row instead of raising.
        articles = record.get('PubmedArticle', [])

        if not articles:
            Golden_Titles.append('')
            Golden_Abstracts.append('')
            Golden_MeshTerms.append('')
            Golden_Keywords.append('')
            continue

        medline_citation = articles[0].get('MedlineCitation', {})
        article = medline_citation.get('Article', {})
        title = article.get('ArticleTitle', 'Title not available')

        # Abstract: stays '' when the record has none. A list of abstract
        # sections is flattened into one space-separated string.
        abstract = ''
        try:
            abstract_element = article['Abstract']['AbstractText']
            if isinstance(abstract_element, list):
                abstract = ' '.join(abstract_element)
            elif isinstance(abstract_element, str):
                abstract = abstract_element
        except (KeyError, IndexError):
            pass

        # MeSH terms: headings without a DescriptorName are ignored. Only the
        # text before the first comma of each descriptor is kept.
        mesh_headings = medline_citation.get('MeshHeadingList', [])
        mesh_terms = [
            mesh_heading['DescriptorName'].split(',')[0].strip("'").strip()
            for mesh_heading in mesh_headings
            if 'DescriptorName' in mesh_heading
        ]

        # Keywords: only the first keyword list of the record is used.
        keyword_list = medline_citation.get('KeywordList', [])
        keywords = [keyword.strip() for keyword in keyword_list[0]] if keyword_list else []

        Golden_Titles.append(title)
        Golden_Abstracts.append(abstract)
        Golden_MeshTerms.append(mesh_terms)
        Golden_Keywords.append(keywords)

    result['Golden Titles'] = Golden_Titles
    result['Golden Abstracts'] = Golden_Abstracts
    result['Golden MeshTerms'] = Golden_MeshTerms
    result['Golden Keywords'] = Golden_Keywords


In [None]:
# Keep only the records whose query returned at least one PMID, copying a fixed
# set of fields into a fresh dict per record.
# NOTE(review): 'Golden Titles' is not carried over, and the abstracts are
# stored under 'Golden_Abstracts' (underscore) while the MeSH terms and keywords
# keep their space-separated keys — confirm this naming is intended.
cleansed_results = [
    {
        'PMID': record['PMID'],
        'Title': record['Title'],
        'Abstract': record['Abstract'],
        'MeshTerms': record['MeshTerms'],
        'Keywords': record['Keywords'],
        'Query': record['Query'],
        'Golden_PMIDs': record['Golden_PMIDs'],
        'Golden_Abstracts': record['Golden Abstracts'],
        'Golden MeshTerms': record['Golden MeshTerms'],
        'Golden Keywords': record['Golden Keywords'],
    }
    for record in results
    if len(record['Golden_PMIDs']) >= 1
]

In [None]:
# Total number of records, before filtering on search hits.
len(results)

In [None]:
# Number of records kept after filtering (at least one golden PMID).
len(cleansed_results)

In [None]:
import json

# Destination of the exported dataset (relative to the working directory).
json_file_path = 'Method1_Training_Data.json'

# Serialize the list of record dicts with 4-space indentation.
# NOTE(review): this writes `results`, not the filtered `cleansed_results`
# built earlier — confirm which list is meant to be exported.
with open(json_file_path, mode='w') as out_file:
    json.dump(results, out_file, indent=4)

print("JSON file created successfully.")