### Fetching API version info

In [None]:
import requests

def get_api_version_info():
    url = "https://clinicaltrials.gov/api/v2/version"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        else:
            return {"error": f"Failed to get version info, status code: {response.status_code}"}
    except requests.RequestException as e:
        return {"error": str(e)}

# Example usage
version_info = get_api_version_info()
print(version_info.get('apiVersion'))


: 

### Fetching Clinical Trials Data

In [133]:
import requests

def fetch_clinical_trials(max_studies=None):
    url = "https://clinicaltrials.gov/api/v2/studies"
    page_token = None
    studies = []

    headers = {
        "Accept": "application/json"
    }

    # Fetch the total count of studies if max_studies is not provided
    if max_studies is None:
        response = requests.get(url, headers=headers, params={
            "countTotal": "true",
            "filter.overallStatus": "RECRUITING",
            "format": "json"
        })
        if response.status_code == 200:
            data = response.json()
            max_studies = data.get("totalCount", 5)
            print(f"Total count: {max_studies}")
        else:
            print(f"Error: Unable to fetch total count, status code {response.status_code}")
            max_studies = 5  # Set a default value to avoid NoneType error
    
    # Fetch the studies
    while len(studies) < max_studies:
        params = {
            # "query.titles": "cancer",
            "filter.overallStatus": "RECRUITING",
            "countTotal": "true",
            # "pageSize": int(2), # number of studies returned per page
            "pageToken": page_token,
            "format": "json",
            "markupFormat": "markdown"
        }

        response = requests.get(url, headers=headers, params=params)

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            print("Response content:", response.text)
            break

        try:
            data = response.json()
            print(f"Fetched {len(data.get('studies', []))} studies.")
        except requests.JSONDecodeError:
            print("Failed to decode JSON. Response was:", response.text)
            break

        # Extract totalCount from the first page
        if not studies:
            total_count = data.get("totalCount")
            print(f"Total count: {total_count}")

        for study in data.get("studies", []):
            if len(studies) >= max_studies:
                break

            document_section = study.get("documentSection", {}).get("largeDocumentModule", {})

            study_info = {
                "nctId": study["protocolSection"]["identificationModule"].get("nctId"),
                "title": study["protocolSection"]["identificationModule"].get("officialTitle"),
                "studyType": study["protocolSection"]["designModule"].get("studyType"),
                "investigator": study["protocolSection"]["sponsorCollaboratorsModule"]["responsibleParty"].get("investigatorFullName"),
                "sponsorName": study["protocolSection"]["sponsorCollaboratorsModule"]["leadSponsor"].get("name"),
                "organization": study["protocolSection"]["identificationModule"]["organization"].get("fullName"),
                "overallStatus": study["protocolSection"]["statusModule"]["overallStatus"],

                # Description of the study
                "briefSummary": study["protocolSection"]["descriptionModule"].get("briefSummary"),
                "description": study["protocolSection"]["descriptionModule"].get("detailedDescription"),
                "conditions": study["protocolSection"]["conditionsModule"].get("conditions"),
                "keywords": study["protocolSection"]["conditionsModule"].get("keywords"),
                "purpose": study["protocolSection"]["designModule"]["designInfo"].get("primaryPurpose"),
                "phase": study["protocolSection"]["designModule"].get("phases"),

                # For interventional study designs only
                "interventionalModel": study["protocolSection"]["designModule"]["designInfo"].get("interventionModel"),

                # For observational study designs only
                "observationalModel": study["protocolSection"]["designModule"]["designInfo"].get("observationalModel"),
                "timePerspective": study["protocolSection"]["designModule"]["designInfo"].get("timePerspective"),
                "enrollmentCount": study["protocolSection"]["designModule"]["enrollmentInfo"].get("count"),
                "enrollmentType": study["protocolSection"]["designModule"]["enrollmentInfo"].get("type"),
                "targetDuration": study["protocolSection"]["designModule"].get("targetDuration"),

                # Eligibility 
                "eligibilityCriteria" : study["protocolSection"]["eligibilityModule"].get("eligibilityCriteria"),
                "sex": study["protocolSection"]["eligibilityModule"].get("sex"),
                "minAge": study["protocolSection"]["eligibilityModule"].get("minimumAge"),
                "maxAge": study["protocolSection"]["eligibilityModule"].get("maximumAge"),
                "healthyVolunteers": study["protocolSection"]["eligibilityModule"].get("healthyVolunteers"),

                # Contact information, location, and country
                "centralContactName": study["protocolSection"]["contactsLocationsModule"].get("centralContacts", [{}])[0].get("name"),
                "centralContactPhone": study["protocolSection"]["contactsLocationsModule"].get("centralContacts", [{}])[0].get("phone"),
                "centralContactEmail": study["protocolSection"]["contactsLocationsModule"].get("centralContacts", [{}])[0].get("email"),
                "locationFacility": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("facility"),
                "locationCity": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("city"),
                "locationState": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("state"),
                "locationZip": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("zip"),
                "locationCountry": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("country"),
                
                # Additional documents (protocols, informed consent)
                "hasProtocol": any(doc.get("hasProtocol") for doc in document_section.get("largeDocs", [])),
                "hasSAP": any(doc.get("hasSap") for doc in document_section.get("largeDocs", [])),
                "hasICF": any(doc.get("hasIcf") for doc in document_section.get("largeDocs", [])),
                "fileName": [doc.get("filename") for doc in document_section.get("largeDocs", [])]
            }
            studies.append(study_info)

        page_token = data.get("nextPageToken")
        if not page_token:
            break

    return studies

# Example call to test functionality
results = fetch_clinical_trials()
for study in results:
    print(study.get("eligibilityCriteria"))


Total count: 67237
Fetched 10 studies.
Total count: 67237
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.
Fetched 10 studies.


KeyboardInterrupt: 

In [145]:
import requests
from tqdm import tqdm

def fetch_clinical_trials(max_studies=None):
    url = "https://clinicaltrials.gov/api/v2/studies"
    page_token = None
    studies = []

    headers = {
        "Accept": "application/json"
    }

    # Fetch the total count of studies if max_studies is not provided
    if max_studies is None:
        response = requests.get(url, headers=headers, params={
            "countTotal": "true",
            "filter.overallStatus": "RECRUITING",
            "format": "json"
        })
        if response.status_code == 200:
            data = response.json()
            max_studies = data.get("totalCount", 5)
            print(f"Total count of recruiting studies: {max_studies}")
        else:
            print(f"Error: Unable to fetch total count, status code {response.status_code}")
            max_studies = 5  # Set a default value to avoid NoneType error

    # Fetch the studies with a progress bar
    with tqdm(total=max_studies, desc="Fetching Clinical Trials") as pbar:
        while len(studies) < max_studies:
            params = {
                "filter.overallStatus": "RECRUITING",
                "countTotal": "true",
                "pageToken": page_token,
                "format": "json",
                "markupFormat": "markdown"
            }

            response = requests.get(url, headers=headers, params=params)

            if response.status_code != 200:
                print(f"Error: Received status code {response.status_code}")
                print("Response content:", response.text)
                break

            try:
                data = response.json()
            except requests.JSONDecodeError:
                print("Failed to decode JSON. Response was:", response.text)
                break

            fetched_studies = data.get("studies", [])
            pbar.update(len(fetched_studies))

            for study in fetched_studies:
                if len(studies) >= max_studies:
                    break

                document_section = study.get("documentSection", {}).get("largeDocumentModule", {})
                design_info = study["protocolSection"]["designModule"].get("designInfo", {})
                
                study_info = {
                    "nctId": study["protocolSection"]["identificationModule"].get("nctId"),
                    "title": study["protocolSection"]["identificationModule"].get("officialTitle"),
                    "studyType": study["protocolSection"]["designModule"].get("studyType"),
                    "investigator": study["protocolSection"]["sponsorCollaboratorsModule"]["responsibleParty"].get("investigatorFullName"),
                    "sponsorName": study["protocolSection"]["sponsorCollaboratorsModule"]["leadSponsor"].get("name"),
                    "organization": study["protocolSection"]["identificationModule"]["organization"].get("fullName"),
                    "overallStatus": study["protocolSection"]["statusModule"]["overallStatus"],
                    
                    # Description of the study
                    "briefSummary": study["protocolSection"]["descriptionModule"].get("briefSummary"),
                    "description": study["protocolSection"]["descriptionModule"].get("detailedDescription"),
                    "conditions": study["protocolSection"]["conditionsModule"].get("conditions"),
                    "keywords": study["protocolSection"]["conditionsModule"].get("keywords"),
                    "purpose": design_info.get("primaryPurpose"),
                    "phase": study["protocolSection"]["designModule"].get("phases"),

                    # For interventional study designs only
                    "interventionalModel": design_info.get("interventionModel"),

                    # For observational study designs only
                    "observationalModel": design_info.get("observationalModel"),
                    "timePerspective": design_info.get("timePerspective"),
                    "enrollmentCount": study["protocolSection"]["designModule"]["enrollmentInfo"].get("count"),
                    "enrollmentType": study["protocolSection"]["designModule"]["enrollmentInfo"].get("type"),
                    "targetDuration": study["protocolSection"]["designModule"].get("targetDuration"),

                    # Eligibility 
                    "eligibilityCriteria" : study["protocolSection"]["eligibilityModule"].get("eligibilityCriteria"),
                    "sex": study["protocolSection"]["eligibilityModule"].get("sex"),
                    "minAge": study["protocolSection"]["eligibilityModule"].get("minimumAge"),
                    "maxAge": study["protocolSection"]["eligibilityModule"].get("maximumAge"),
                    "healthyVolunteers": study["protocolSection"]["eligibilityModule"].get("healthyVolunteers"),

                    # Contact information, location, and country
                    "centralContactName": study["protocolSection"]["contactsLocationsModule"].get("centralContacts", [{}])[0].get("name"),
                    "centralContactPhone": study["protocolSection"]["contactsLocationsModule"].get("centralContacts", [{}])[0].get("phone"),
                    "centralContactEmail": study["protocolSection"]["contactsLocationsModule"].get("centralContacts", [{}])[0].get("email"),
                    "locationFacility": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("facility"),
                    "locationCity": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("city"),
                    "locationState": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("state"),
                    "locationZip": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("zip"),
                    "locationCountry": study["protocolSection"]["contactsLocationsModule"].get("locations", [{}])[0].get("country"),
                    
                    # Additional documents (protocols, informed consent)
                    "hasProtocol": any(doc.get("hasProtocol") for doc in document_section.get("largeDocs", [])),
                    "hasSAP": any(doc.get("hasSap") for doc in document_section.get("largeDocs", [])),
                    "hasICF": any(doc.get("hasIcf") for doc in document_section.get("largeDocs", [])),
                    "fileName": [doc.get("filename") for doc in document_section.get("largeDocs", [])]
                }
                studies.append(study_info)

            page_token = data.get("nextPageToken")
            if not page_token:
                break

    return studies

results = fetch_clinical_trials(max_studies=100)
# for study in results:
#     print(study.get("centralContactEmail"))

Fetching Clinical Trials: 100%|██████████| 100/100 [00:01<00:00, 76.55it/s]


### Setting up & storing data in Firestore

In [103]:
import firebase_admin
from firebase_admin import credentials, firestore

if not firebase_admin._apps:
    cred = credentials.Certificate('dukeai-103f8-369df2b50aa4.json')
    firebase_admin.initialize_app(cred)

db = firestore.client()

def store_to_firestore(studies):
    for study in studies:
        db.collection("clinical_trials").add(study)  # Adds each study as a new document

store_to_firestore(results)
print("Data stored in Firestore successfully.")

Fetched 10 studies.
Total count: 67237
Data stored in Firestore successfully.


### Functions for fetching data from Firestore

In [104]:
def get_existing_document_ids(collection_name):
    collection_ref = db.collection(collection_name)
    docs = collection_ref.stream()
    
    document_ids = [doc.id for doc in docs]
    return document_ids

def fetch_from_firestore(collection_name):
    collection_ref = db.collection(collection_name)
    
    # Retrieve all documents in the collection
    docs = collection_ref.stream()
    
    # Store fetched data in a list
    studies = []
    for doc in docs:
        study = doc.to_dict()
        study['doc_id'] = doc.id  # Ensure the document ID is included
        studies.append(study)
    
    return studies

# fetched_data = fetch_from_firestore("clinical_trials")
# for study in fetched_data:
#     print(study)

### Function for generating overviews with OpenAI


In [94]:
import openai
import requests
import os
from dotenv import load_dotenv

# Load environment variables from a .env file
load_dotenv()

openai.api_key = os.getenv('OPENAI_API_KEY')

# Function to generate an overview using OpenAI API
def openai_prompting(prompt, model_choice="gpt-3.5-turbo"):
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}"
    }
    
    data = {
        "model": model_choice,
        "messages": [
            {
                "role": "system",
                "content": "You are an assistant that generates clear and simple clinical trial overviews.\
                    Focus on summarizing the trial's purpose and background in an accessible way."
            },
            {
                "role": "assistant",
                "content": "Here is the clinical trial data:"
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 500
    }

    response = requests.post(url, json=data, headers=headers)
    if response.status_code == 200:
        output = response.json()['choices'][0]['message']['content']
        return output.strip()
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

### Storing overviews in Firestore

In [105]:
import time
# Function to store overview in Firestore
def store_to_existing_firestore(trial_id, overview):
    doc_ref = db.collection("clinical_trials").document(trial_id)
    doc_ref.update({"overview": overview})
    print(f"Stored overview for trial ID: {trial_id}")


# Function to create prompt, generate overview, and store in Firestore with rate limiting
def create_overviews_for_trials():
    trials = fetch_from_firestore("clinical_trials")
    request_count = 0

    for trial in trials:
        prompt = (
            f"Generate an overview for a clinical trial with the following details:\n\n"
            f"Title: {trial.get('briefTitle', 'N/A')}\n"
            f"Overview:"
        )
        
        # Generate overview
        overview = openai_prompting(prompt)
        print(overview)
        if overview:
            # Store overview in Firestore
            store_to_existing_firestore(trial['doc_id'], overview)
            request_count += 1

            # Rate limit: Pause every 3 requests to stay within the rate limit
            if request_count % 3 == 0:
                print("Rate limit reached. Waiting for 20 seconds...")
                time.sleep(20)

# Run the function
create_overviews_for_trials()

Study Overview:

This clinical trial aims to evaluate the effectiveness of a new medication for treating patients with advanced stage breast cancer. The medication works by targeting specific proteins found in cancer cells, which may help slow down the growth of the tumors and potentially improve patient outcomes. The trial will compare the new medication with standard treatment options to assess its safety and efficacy. This research is crucial in developing better therapeutic options for patients facing this challenging diagnosis.
Stored overview for trial ID: 09iul2opn4u1QV7Otfd1
Study Title: Effect of Vitamin D Supplementation on Bone Health in Postmenopausal Women

Background: This clinical trial aims to investigate the impact of vitamin D supplementation on bone health in postmenopausal women. Postmenopausal women are at a higher risk of developing osteoporosis due to decreased estrogen levels, which results in accelerated bone loss. Vitamin D is essential for bone health as it h

KeyboardInterrupt: 