In [None]:
import requests
import time
import firebase_admin
import openai
import os
from firebase_admin import credentials, firestore
from tqdm import tqdm
from dotenv import load_dotenv

### Fetching API version info

In [147]:
def get_api_version_info(timeout=10):
    """Fetch version metadata from the ClinicalTrials.gov v2 API.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook cell if the service is unreachable.

    Returns:
        The decoded JSON payload when the server answers with HTTP 200.
        Otherwise a dict with a single ``"error"`` key describing the failure
        (non-200 status, transport error/timeout, or a body that is not JSON).
    """
    url = "https://clinicaltrials.gov/api/v2/version"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        return {"error": f"Failed to get version info, status code: {response.status_code}"}
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        return {"error": str(e)}

# Example usage
# On failure get_api_version_info() returns {"error": ...} instead of raising,
# so the .get() below prints None rather than the version string.
version_info = get_api_version_info()
print(version_info.get('apiVersion'))


2.0.3


### Fetching Clinical Trials Data

In [151]:
# Largest page the studies endpoint will return per request (per the API docs,
# larger pageSize values are coerced down to this).
_MAX_PAGE_SIZE = 1000


def _first_or_empty(items):
    """Return the first element of ``items``, or ``{}`` when it is missing or empty."""
    return items[0] if items else {}


def _extract_study_info(study):
    """Flatten one raw API study record into the flat dict this notebook stores.

    Every module is looked up defensively: a study that lacks an optional
    module yields ``None`` (or ``"N/A"`` where the original default was
    ``"N/A"``) for the affected fields instead of raising ``KeyError``.
    """
    protocol = study.get("protocolSection") or {}
    identification = protocol.get("identificationModule") or {}
    design = protocol.get("designModule") or {}
    sponsors = protocol.get("sponsorCollaboratorsModule") or {}
    status = protocol.get("statusModule") or {}
    description = protocol.get("descriptionModule") or {}
    conditions = protocol.get("conditionsModule") or {}
    eligibility = protocol.get("eligibilityModule") or {}
    contacts_locations = protocol.get("contactsLocationsModule") or {}

    design_info = design.get("designInfo") or {}
    enrollment_info = design.get("enrollmentInfo") or {}
    responsible_party = sponsors.get("responsibleParty") or {}
    lead_sponsor = sponsors.get("leadSponsor") or {}
    organization = identification.get("organization") or {}

    # Only the first central contact / location is kept; an empty list is
    # treated the same as a missing one.
    central_contact = _first_or_empty(contacts_locations.get("centralContacts"))
    location = _first_or_empty(contacts_locations.get("locations"))

    document_section = (study.get("documentSection") or {}).get("largeDocumentModule") or {}
    large_docs = document_section.get("largeDocs") or []

    return {
        "nctId": identification.get("nctId"),
        "title": identification.get("officialTitle"),
        "studyType": design.get("studyType"),
        "investigator": responsible_party.get("investigatorFullName", "N/A"),
        "sponsorName": lead_sponsor.get("name"),
        "organization": organization.get("fullName"),
        "overallStatus": status.get("overallStatus"),
        "briefSummary": description.get("briefSummary"),
        "description": description.get("detailedDescription"),
        "conditions": conditions.get("conditions"),
        "keywords": conditions.get("keywords"),
        "purpose": design_info.get("primaryPurpose"),
        "phase": design.get("phases"),
        "interventionalModel": design_info.get("interventionModel"),
        "observationalModel": design_info.get("observationalModel"),
        "timePerspective": design_info.get("timePerspective"),
        "enrollmentCount": enrollment_info.get("count", "N/A"),
        "enrollmentType": enrollment_info.get("type", "N/A"),
        "targetDuration": design.get("targetDuration"),
        "eligibilityCriteria": eligibility.get("eligibilityCriteria"),
        "sex": eligibility.get("sex"),
        "minAge": eligibility.get("minimumAge"),
        "maxAge": eligibility.get("maximumAge"),
        "healthyVolunteers": eligibility.get("healthyVolunteers"),
        "centralContactName": central_contact.get("name", "N/A"),
        "centralContactPhone": central_contact.get("phone", "N/A"),
        "centralContactEmail": central_contact.get("email", "N/A"),
        "locationFacility": location.get("facility", "N/A"),
        "locationCity": location.get("city", "N/A"),
        "locationState": location.get("state", "N/A"),
        "locationZip": location.get("zip", "N/A"),
        "locationCountry": location.get("country", "N/A"),
        "hasProtocol": any(doc.get("hasProtocol") for doc in large_docs),
        "hasSAP": any(doc.get("hasSap") for doc in large_docs),
        "hasICF": any(doc.get("hasIcf") for doc in large_docs),
        "fileName": [doc.get("filename") for doc in large_docs],
    }


def fetch_clinical_trials(max_studies=None, batch_size=1000, timeout=30):
    """Download recruiting studies from the ClinicalTrials.gov v2 API.

    Args:
        max_studies: Upper bound on the number of studies returned. When
            ``None`` the API is first asked for the total number of recruiting
            studies and that count is used (falling back to 5 if the count
            cannot be obtained).
        batch_size: Number of studies requested per page (sent as ``pageSize``,
            capped at 1000). Larger pages mean far fewer HTTP round trips.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of flat dicts, one per study, in the order the API returned
        them. If a page request returns a non-200 status or a non-JSON body,
        the error is printed and the studies collected so far are returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.RequestException: On transport failures or timeouts.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    url = "https://clinicaltrials.gov/api/v2/studies"
    headers = {
        "Accept": "application/json"
    }

    # Fetch the total count of studies if max_studies is not provided
    if max_studies is None:
        response = requests.get(url, headers=headers, timeout=timeout, params={
            "countTotal": "true",
            "filter.overallStatus": "RECRUITING",
            "format": "json",
            "pageSize": 1,  # only the count is needed, not a page of studies
        })
        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError:
                print("Failed to decode JSON. Response was:", response.text)
                data = {}
            max_studies = data.get("totalCount", 5)
            print(f"Total count of recruiting studies: {max_studies}")
        else:
            print(f"Error: Unable to fetch total count, status code {response.status_code}")
            max_studies = 5  # Set a default value to avoid NoneType error

    studies = []
    page_token = None

    with tqdm(total=max_studies, desc="Fetching Clinical Trials") as pbar:
        while len(studies) < max_studies:
            remaining = max_studies - len(studies)
            params = {
                "filter.overallStatus": "RECRUITING",
                "pageSize": min(batch_size, _MAX_PAGE_SIZE, remaining),
                "pageToken": page_token,  # None on the first page; requests omits it
                "format": "json",
                "markupFormat": "markdown"
            }

            response = requests.get(url, headers=headers, params=params, timeout=timeout)

            if response.status_code != 200:
                print(f"Error: Received status code {response.status_code}")
                print("Response content:", response.text)
                break

            try:
                data = response.json()
            except ValueError:
                print("Failed to decode JSON. Response was:", response.text)
                break

            # Never keep more than was asked for, even if the server sends more.
            page = data.get("studies", [])[:remaining]
            studies.extend(_extract_study_info(study) for study in page)
            pbar.update(len(page))

            page_token = data.get("nextPageToken")
            # No token means the last page was reached. Stopping here (rather
            # than looping again with page_token=None) avoids re-downloading
            # the first page and appending duplicates. An empty page with a
            # token would otherwise loop forever.
            if not page_token or not page:
                break

    return studies

# max_studies is left unset, so the function first asks the API for the total
# number of recruiting studies and then fetches up to that many.
results = fetch_clinical_trials(batch_size=1000)


Total count of recruiting studies: 67237


Fetching Clinical Trials: 100%|██████████| 67237/67237 [16:14<00:00, 69.00it/s]


### Setting up Firestore

In [None]:
# Initialise the Firebase Admin SDK only once per kernel so this cell can be
# re-run safely. get_app() raises ValueError when no default app exists yet,
# which avoids reaching into the private firebase_admin._apps registry.
try:
    firebase_admin.get_app()
except ValueError:
    # The service-account key path can be overridden with FIREBASE_CREDENTIALS;
    # the default keeps the original file name so existing setups still work.
    cred = credentials.Certificate(
        os.getenv('FIREBASE_CREDENTIALS', 'dukeai-103f8-369df2b50aa4.json')
    )
    firebase_admin.initialize_app(cred)

db = firestore.client()

### Storing data into Firestore

In [2]:
def store_to_firestore(studies):
    """Add every study as its own new document in the "clinical_trials" collection.

    Documents are written one at a time through ``db`` (the notebook-level
    Firestore client) while a tqdm progress bar tracks the iteration.
    """
    progress = tqdm(studies, desc="Storing data to Firestore")
    for record in progress:
        target = db.collection("clinical_trials")
        target.add(record)

# Writes one document per study; the message below is printed once the loop
# finishes without raising.
store_to_firestore(results)
print("Data stored in Firestore successfully.")

In [None]:
def store_to_firestore_batch(studies, batch_size=500):
    """Write studies to the "clinical_trials" collection using batched commits.

    Args:
        studies: Sequence of study dicts to store. Each one becomes a new
            document with an auto-generated ID.
        batch_size: Number of documents queued per commit.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Zero would otherwise
            fail with ZeroDivisionError, and a negative value would silently
            store nothing.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total_batches = (len(studies) + batch_size - 1) // batch_size
    batch_starts = range(0, len(studies), batch_size)

    for batch_number, batch_start in enumerate(batch_starts, start=1):
        batch = db.batch()
        batch_studies = studies[batch_start:batch_start + batch_size]

        for study in tqdm(batch_studies, desc=f"Storing batch {batch_number}/{total_batches}"):
            # Create a new document reference for each study
            doc_ref = db.collection("clinical_trials").document()  # Auto-generate document ID
            batch.set(doc_ref, study)  # Queue the document for writing

        # Commit the batch
        batch.commit()
        print(f"Batch {batch_number}/{total_batches} stored in Firestore successfully.")

# Call the function with the results and batch size
# NOTE(review): both this function and store_to_firestore() create documents
# with auto-generated IDs, so running this cell after the store_to_firestore
# cell above writes every study a second time. Run only one of them.
store_to_firestore_batch(results, batch_size=500)


### Fetching data from Firestore

In [3]:
def get_existing_document_ids(collection_name):
    """Return the IDs of all documents currently in ``collection_name``.

    The collection is streamed through ``db`` (the notebook-level Firestore
    client) and the IDs are returned as a list in streaming order.
    """
    ids = []
    for snapshot in db.collection(collection_name).stream():
        ids.append(snapshot.id)
    return ids

def fetch_from_firestore(collection_name):
    """Load every document of ``collection_name`` as a dict.

    Each returned dict holds the document's fields plus a ``'doc_id'`` entry
    carrying the Firestore document ID (overriding any stored field of the
    same name), so later cells can write back to the same document.
    """
    snapshots = db.collection(collection_name).stream()
    return [{**snapshot.to_dict(), 'doc_id': snapshot.id} for snapshot in snapshots]

### Setting up OpenAI API

In [None]:
# Load environment variables from a .env file
load_dotenv()

# os.getenv returns None when OPENAI_API_KEY is not set, so a missing key is
# not reported here; it only surfaces later when the key is used in a request.
openai.api_key = os.getenv('OPENAI_API_KEY')

### Generating keywords and summaries with OpenAI


In [33]:
# System prompt shared by every request; built once instead of per document.
_SYSTEM_PROMPT = (
    "You are a 9th-grade teacher tasked with explaining the background, purpose, methodologies, "
    "and potential risks of specific clinical trials to an 9th grader. "
    "Your job is to create a summary and generate hashtags based on provided clinical trial information.\n\n"
    "Ensure the response follows this strict format:\n"
    "1. A single line of hashtags derived from 'keywords', using no more than six keywords.\n"
    "The line must start with 'Hashtag:' and separate each keyword by a comma without a # prefix.\n"
    "2. A one- to two-paragraph summary that:\n"
    "   - Must start with 'Summary:' and continue with a one to two-paragraph explanation.\n"
    "   - Must be more than 7 sentences.\n"
    "   - Provides relevant background using the keywords in one sentence.\n"
    "   - Clearly describes what participants will experience and the purpose of the trial.\n"
    "   - Uses simple language suitable for an 9th grader.\n"
    "   - State specific potential risks that participants may take during the trial.\n"
    "   - Starts with the label called'Summary'.\n\n"
    "Do not add any titles or extra explanations outside of the requested format.\n"
    "Ensure there are no double newline characters in the response. Each section should be separated by a single newline character.\n\n"
    "Ensure uniformity by maintaining the structure exactly as requested for each response:\n"
    "   'Hashtag: keyword1, keyword2, keyword3, …'\n"
    "   'Summary: [Summary text]'\n"
    "Maintain clarity in your response."
)


def openai_prompting(firestore_data, model_choice="gpt-3.5-turbo", limit=None, timeout=60):
    """Ask the OpenAI chat completions endpoint for hashtags and a summary per trial.

    Args:
        firestore_data: List of study dicts; the ``keywords``, ``briefSummary``
            and ``description`` entries are read from each.
        model_choice: Model name sent in the request.
        limit: If not ``None``, only the first ``limit`` documents are
            processed (``0`` processes none).
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list aligned with the processed documents. Each entry is the
        stripped response text, or ``None`` when that request failed
        (transport error, non-200 status, or an unexpected response body).
        A failure never discards results already collected.
    """
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}"
    }

    # Limit the number of documents processed if a limit is provided
    if limit is not None:
        firestore_data = firestore_data[:limit]

    results = []
    for doc in firestore_data:
        # Ensure keywords is a list
        keywords = doc.get('keywords', [])
        if not isinstance(keywords, list):
            keywords = [keywords]

        # Replace None values with 'N/A'; str() keeps join() safe for non-strings
        keywords = ['N/A' if kw is None else str(kw) for kw in keywords]

        # Prepare the assistant content using extracted trial information
        assistant_content = (
            "Here is the clinical trial information:\n"
            "- **Keywords**: " + ", ".join(keywords) + "\n"
            "- **Brief Summary**: " + (doc.get('briefSummary') or 'N/A') + "\n"
            "- **Description**: " + (doc.get('description') or 'N/A') + "\n"
            "\nPlease provide a summary and hashtags based on the above information."
        )

        data = {
            "model": model_choice,
            "messages": [
                {
                    "role": "system",
                    "content": _SYSTEM_PROMPT
                },
                {
                    "role": "assistant",
                    "content": assistant_content  # Provide the trial data as if the assistant already knows it
                },
                {
                    "role": "user",
                    "content": "Please provide the requested explanation and hashtags."  # Prompt the AI to respond
                }
            ],
            "max_tokens": 500
        }

        try:
            response = requests.post(url, json=data, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Treat a transport failure like any other per-document failure so
            # completions already generated are not lost.
            print(f"Error: request failed - {exc}")
            results.append(None)
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            results.append(None)
            continue

        try:
            output = response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError) as exc:
            print(f"Error: unexpected response body - {exc!r}")
            results.append(None)
            continue

        results.append(output.strip() if isinstance(output, str) else None)

    return results

# Example usage with Firestore data
fetched_data = fetch_from_firestore('clinical_trials') 
limit = 3
# NOTE(review): this rebinds the notebook-level name `results`, which the
# fetch cell earlier assigned the list of study dicts. After this cell it holds
# the generated text (or None) for the first `limit` documents instead.
results = openai_prompting(fetched_data, limit=limit)

# Print results for each trial
for idx, result in enumerate(results):
    # Failed requests are stored as None, so falsy entries are reported as failures.
    if result:
        print(f"Trial {idx + 1}:\n{result}\n")
    else:
        print(f"Trial {idx + 1}: Failed to generate a response.\n")

Trial 1:
Hashtag: buprenorphine, neuroinflammation, complex regional pain syndrome, PET

Summary: This clinical trial will study how buprenorphine affects inflammation in the brains of people with complex regional pain syndrome using a special PET scan. Participants in this trial will take buprenorphine medication and undergo PET scans to see if the medication helps reduce brain inflammation linked to their pain condition. The purpose is to see if buprenorphine can be a helpful treatment for this type of pain. However, participants may experience side effects from the medication such as nausea, constipation, or dizziness, and there could be risks associated with the PET scan, like allergic reactions to the imaging agent or discomfort during the procedure.

Trial 2:
Hashtag: Efficacy, Safety, Tolerability, Ertugliflozin, Sitagliptin

Summary: In this clinical trial, researchers want to see how well and how safe two diabetes medications, Ertugliflozin and Sitagliptin, work together in Pa

### Storing hashtags and summaries into the existing Firestore Database

In [None]:
def _parse_openai_response(response):
    """Split a 'Hashtag: ...' / 'Summary: ...' reply into (hashtags, summary).

    Everything after the 'Summary:' label is kept, including follow-on
    paragraphs on later lines (joined with a single newline), so a
    two-paragraph summary is not cut down to its first line.
    """
    hashtags = []
    summary_parts = []
    in_summary = False

    for raw_line in response.split('\n'):
        line = raw_line.strip()
        if line.startswith('Hashtag:'):
            # Split on bare commas so "a,b" and "a, b" both work; drop empties.
            tags = line[len('Hashtag:'):].split(',')
            hashtags = [tag.strip() for tag in tags if tag.strip()]
            in_summary = False
        elif line.startswith('Summary:'):
            summary_parts = [line[len('Summary:'):].strip()]
            in_summary = True
        elif in_summary and line:
            summary_parts.append(line)

    return hashtags, "\n".join(part for part in summary_parts if part)


def store_to_existing_firestore_batch(fetched_data, results, batch_size=500):
    """Attach generated hashtags and summaries to existing Firestore documents.

    ``fetched_data[i]`` is paired with ``results[i]``; only as many documents
    as there are results are processed, so a shorter ``results`` list (e.g.
    after prompting with a ``limit``) does not produce empty batches.

    Args:
        fetched_data: Study dicts carrying a ``'doc_id'`` entry.
        results: Response texts (or ``None`` for failed generations), in the
            same order as ``fetched_data``.
        batch_size: Number of documents handled per batch commit.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a processed study dict has no ``'doc_id'``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A set makes the per-document membership test O(1) instead of O(n).
    existing_ids = set(get_existing_document_ids("clinical_trials"))
    pairs = list(zip(fetched_data, results))
    total_batches = (len(pairs) + batch_size - 1) // batch_size

    for batch_number, batch_start in enumerate(range(0, len(pairs), batch_size), start=1):
        batch = db.batch()
        queued = 0
        batch_pairs = pairs[batch_start:batch_start + batch_size]

        for doc, response in tqdm(batch_pairs, desc=f"Processing batch {batch_number}/{total_batches}", total=len(batch_pairs)):
            doc_id = doc['doc_id']
            if not response:
                print(f"Failed to parse response for document ID: {doc_id}")
                continue

            if doc_id not in existing_ids:
                print(f"Document ID: {doc_id} does not exist in Firestore")
                continue

            hashtags_list, summary_text = _parse_openai_response(response)
            doc_ref = db.collection("clinical_trials").document(doc_id)
            # Add update to the batch
            batch.update(doc_ref, {
                "openai_hashtags": hashtags_list,
                "openai_summary": summary_text
            })
            queued += 1
            print(f"Prepared batch update for document ID: {doc_id}")

        # Commit the batch (skipped when nothing was queued)
        if queued:
            batch.commit()
        print(f"Batch {batch_number}/{total_batches} update completed.")

# fetched_data[i] is paired with results[i]; results was generated from the
# first `limit` documents of fetched_data in the prompting cell above.
store_to_existing_firestore_batch(fetched_data, results, batch_size=500)