### Using map-reduce

In [1]:
import os
import streamlit as st
import pandas as pd
from langchain.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredXMLLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain import PromptTemplate

In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
import sys
import streamlit as st
from streamlit_chat import message
import pandas as pd
from langchain.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredXMLLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
from langchain.agents import Tool
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.memory import ConversationBufferMemory
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackHandler
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.prompts import MessagesPlaceholder

In [3]:
loader = CSVLoader(
    "./data/train.csv",
)
docs = loader.load()

In [4]:
llm = OpenAI()

In [12]:
  char_text_splitter =  RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=0)
  docs_split = char_text_splitter.split_documents(docs)

In [13]:
len(docs_split)

187

In [30]:
map_prompt = """
suggest some trials for breast cancer
"{text}"
ANSWER:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """
suggest some trials for breast cancer
"{text}"
ANSWER:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [31]:
model = load_summarize_chain(llm=llm, chain_type="map_reduce", map_prompt=map_prompt_template,combine_prompt=combine_prompt_template)
model.run(docs_split)

'\n1. A Randomized Phase III Trial of Adjuvant Chemotherapy with or without Atezolizumab in Patients with Triple Negative Breast Cancer\n2. A Randomized Phase III Trial of Neoadjuvant Chemotherapy with or without Atezolizumab in Patients with Early Stage Breast Cancer\n3. A Phase III Trial Comparing Neoadjuvant Chemotherapy Regimens in Women with Large Operable Breast Cancer\n4. A Phase III Trial of Adjuvant Chemotherapy with or without Anthracycline-Based Regimens in Women with Early Breast Cancer\n5. A Randomized Phase III Trial Comparing Hormonal Therapy to Chemotherapy in Women with Estrogen Receptor Positive, Node Positive Breast Cancer\n6. A Randomized Phase III Trial Comparing Adjuvant Hormonal Therapy to Chemotherapy in Postmenopausal Women with Estrogen Receptor Positive, Node Positive Breast Cancer\n7. A Clinical Trial to Test the Effectiveness of Combining Atezolizumab with Radiation Therapy and Chemotherapy in Treating Breast Cancer\n8. A Clinical Trial to Compare the Effic

### Chunking + summarizing individual chunks

In [85]:
import platform
import os
import openai
import tiktoken

In [86]:
def break_up_file_to_chunks(filename, chunk_size=2000, overlap=100):
    encoding = tiktoken.get_encoding("gpt2")
    with open(filename, 'r') as f:
        text = f.read()    
        tokens = encoding.encode(text)
    num_tokens = len(tokens)
    
    chunks = []
    for i in range(0, num_tokens, chunk_size - overlap):
        chunk = tokens[i:i + chunk_size]
        chunks.append(chunk)
    
    return chunks

In [89]:
filename = './data/clinical_trials_modified.csv'

In [90]:
chunks = break_up_file_to_chunks(filename)
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}: {len(chunk)} tokens")

Chunk 0: 2000 tokens
Chunk 1: 2000 tokens
Chunk 2: 2000 tokens
Chunk 3: 2000 tokens
Chunk 4: 2000 tokens
Chunk 5: 2000 tokens
Chunk 6: 2000 tokens
Chunk 7: 2000 tokens
Chunk 8: 2000 tokens
Chunk 9: 2000 tokens
Chunk 10: 2000 tokens
Chunk 11: 2000 tokens
Chunk 12: 2000 tokens
Chunk 13: 2000 tokens
Chunk 14: 2000 tokens
Chunk 15: 2000 tokens
Chunk 16: 2000 tokens
Chunk 17: 2000 tokens
Chunk 18: 2000 tokens
Chunk 19: 2000 tokens
Chunk 20: 2000 tokens
Chunk 21: 2000 tokens
Chunk 22: 2000 tokens
Chunk 23: 2000 tokens
Chunk 24: 2000 tokens
Chunk 25: 2000 tokens
Chunk 26: 2000 tokens
Chunk 27: 2000 tokens
Chunk 28: 2000 tokens
Chunk 29: 2000 tokens
Chunk 30: 2000 tokens
Chunk 31: 2000 tokens
Chunk 32: 2000 tokens
Chunk 33: 2000 tokens
Chunk 34: 2000 tokens
Chunk 35: 2000 tokens
Chunk 36: 2000 tokens
Chunk 37: 2000 tokens
Chunk 38: 2000 tokens
Chunk 39: 2000 tokens
Chunk 40: 2000 tokens
Chunk 41: 2000 tokens
Chunk 42: 2000 tokens
Chunk 43: 2000 tokens
Chunk 44: 2000 tokens
Chunk 45: 2000 token

In [120]:
def query_engine(query):
        prompt_response = []
        encoding = tiktoken.get_encoding("gpt2")
        chunks = break_up_file_to_chunks(filename)
        for i, chunk in enumerate(chunks):    
                prompt_request = query + encoding.decode(chunks[i])
                messages = [{"role": "system", "content": "You help query the clinical trial data provided and answer any questions about the same"}]    
                messages.append({"role": "user", "content": prompt_request})    
                response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=messages,
                        temperature=0,
                        max_tokens=500,
                        top_p=1,
                        frequency_penalty=0,
                        presence_penalty=0
                )
                
                prompt_response.append(response["choices"][0]["message"]['content'].strip())
        
        prompt_request = "Consolidate the answer and give a cohesive response: " + str(prompt_response)
        messages = [{"role": "system", "content": "You help query the clinical trial data provided and answer any questions about the same"}]    
        messages.append({"role": "user", "content": prompt_request})   

        response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
        return response


In [None]:
response = query_engine("suggest 5 trials related to the nose")

In [28]:
response["choices"][0]["message"]['content'].strip()

'Here are 5 clinical trials related to the nose:\n\n1. Trial: NRG-HN001\n   - Title: Randomized Phase II and Phase III Studies of Individualized Treatment for Nasopharyngeal Carcinoma Based on Biomarker Epstein Barr Virus (EBV) Deoxyribonucleic Acid (DNA)\n   - NCT ID: NCT02135042\n   - Investigator Name: Rupali Nabar\n   - Status: Open to Accrual\n   - Eligibility: 18 Years and older (Adult, Older Adult)\n   - Description: This trial aims to study individualized treatment for nasopharyngeal carcinoma based on the biomarker EBV DNA.\n   - Phase: Phase II/III\n   - Treatment Type: Not specified\n   - Age Description: 18 Years and older (Adult, Older Adult)\n   - Scope Description: Not specified\n   - Location Name: Not specified\n   - Summary: This trial investigates the use of individualized treatment for nasopharyngeal carcinoma based on the biomarker EBV DNA. Patients will undergo standard concurrent chemotherapy and radiation therapy, followed by additional treatment based on their 

### Processing data (mainly just some pandas work)

In [2]:
import pandas as pd
import numpy as np
from langchain.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredXMLLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain import PromptTemplate


In [76]:
df = pd.read_csv('./data/clinical_trials.csv')

In [77]:
columns_to_combine = [f'DISEASE_SITES/DISEASE_SITE/{i}' for i in range(42)]

In [78]:
df['disease_sites'] = df[columns_to_combine].apply(lambda row: ', '.join(row.dropna()), axis=1)
df.drop(columns=columns_to_combine, inplace=True)

In [79]:
columns_to_combine = [f'SPONSOR_NAMES/SPONSOR_NAME/{i}' for i in range(4)]
df['sponsor_names'] = df[columns_to_combine].apply(lambda row: ', '.join(row.dropna()), axis=1)
df.drop(columns=columns_to_combine, inplace=True)

In [80]:
columns_to_combine = [f'THERAPY_NAMES/THERAPY_NAME/{i}' for i in range(3)]
df['therapy_names'] = df[columns_to_combine].apply(lambda row: ', '.join(row.dropna()), axis=1)
df.drop(columns=columns_to_combine, inplace=True)

In [81]:
columns_to_combine = [f'DRUG_NAMES/DRUG_NAME/{i}' for i in range(5)]
df['drug_names'] = df[columns_to_combine].apply(lambda row: ', '.join(row.dropna()), axis=1)
df.drop(columns=columns_to_combine, inplace=True)

In [85]:
df.to_csv('./data/clinical_trials_modified.csv',index=False)

In [12]:
df = pd.read_xml('./data/clinical_trials_original.xml',xpath="/TRIAL/PROTOCOL")

In [14]:
with open('./data/clinical_trials_original.xml', 'r') as file_in, open('./data/test.txt', 'w') as file_out:
    data = file_in.read()
    file_out.write(data)

In [26]:
not os.path.isfile('./data/clinical_trials_original.xml')

False

### Using FAISS/chroma (with modified/prep-processed CSV)

In [1]:
import os
import streamlit as st
from streamlit_chat import message
import pandas as pd
from langchain.agents import create_pandas_dataframe_agent
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredXMLLoader
from langchain.document_loaders import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader

In [12]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file
tree = ET.parse('./data/clinical_trials_original.xml')
root = tree.getroot()

# Initialize empty lists to store data
protocol_data = []

# Iterate through each <PROTOCOL> element
for protocol in root.findall('.//PROTOCOL'):
    protocol_dict = {}
    # Extract data from XML tags and add it to the dictionary
    #protocol_dict['PROTOCOL_NO'] = protocol.find('PROTOCOL_NO').text
    protocol_dict['TITLE'] = protocol.find('TITLE').text
    #protocol_dict['NCT_ID'] = protocol.find('NCT_ID').text
    protocol_dict['SHORT_TITLE'] = protocol.find('SHORT_TITLE').text
    protocol_dict['INVESTIGATOR_NAME'] = protocol.find('INVESTIGATOR_NAME').text
    protocol_dict['STATUS'] = protocol.find('STATUS').text
    protocol_dict['ELIGIBILITY'] = protocol.find('ELIGIBILITY').text
    protocol_dict['DETAILED_ELIGIBILITY'] = protocol.find('DETAILED_ELIGIBILITY').text if protocol.find('DETAILED_ELIGIBILITY') is not None else ''
    protocol_dict['DESCRIPTION'] = protocol.find('DESCRIPTION').text
    protocol_dict['PHASE_DESC'] = protocol.find('PHASE_DESC').text
    protocol_dict['TREATMENT_TYPE_DESC'] = protocol.find('TREATMENT_TYPE_DESC').text
    protocol_dict['AGE_DESCRIPTION'] = protocol.find('AGE_DESCRIPTION').text
    protocol_dict['SCOPE_DESC'] = protocol.find('SCOPE_DESC').text
    protocol_dict['MODIFIED_DATE'] = protocol.find('MODIFIED_DATE').text
    protocol_dict['DEPARTMENT_NAME'] = protocol.find('DEPARTMENT_NAME').text
    # Extract SPONSOR_NAMES
    sponsor_names = [sponsor.text for sponsor in protocol.findall('.//SPONSOR_NAME')]
    protocol_dict['SPONSOR_NAMES'] = ', '.join(sponsor_names)
    # Extract DISEASE_SITES
    disease_sites = [site.text for site in protocol.findall('.//DISEASE_SITE')]
    protocol_dict['DISEASE_SITES'] = ', '.join(disease_sites)
    # Extract DRUG_NAMES (if available)
    drugs = protocol.findall('.//DRUG_NAMES')
    if drugs:
        protocol_dict['DRUG_NAMES'] = ', '.join([drug.text if drug.text is not None else '' for drug in drugs])
    else:
        protocol_dict['DRUG_NAMES'] = ''

    # Extract THERAPY_NAMES (if available)
    therapies = protocol.findall('.//THERAPY_NAMES')
    if therapies:
        protocol_dict['THERAPY_NAMES'] = ', '.join([therapy.text if therapy.text is not None else '' for therapy in therapies])
    else:
        protocol_dict['THERAPY_NAMES']= ''

    

    # Append the protocol data to the list
    protocol_data.append(protocol_dict)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(protocol_data)
df.to_csv('./data/clinical_trials_from_xml.csv',  index=False)



In [4]:
loader = CSVLoader('./data/train.csv')
docs = loader.load()

In [79]:
docs[21]

Document(page_content='TITLE: Genetic Testing in Guiding Treatment for Patients with Brain Metastases\nSHORT_TITLE: Genetic Testing in Guiding Treatment for Patients with Brain Metastases\nINVESTIGATOR_NAME: Yoon Jae Choi\nSTATUS: OPEN TO ACCRUAL\nELIGIBILITY: Adults\nDETAILED_ELIGIBILITY: Pre-registration Eligibility:\n\n- Tissue available for biomarker testing (any brain metastasis tissue and extracranial site from any prior resection or biopsy)\n\nRegistration Eligibility:\n\n- Participants must have histologically confirmed parenchymal metastatic disease to the brain from any solid tumor\n- Female participants must not be pregnant or breastfeeding\n- Ability to obtain MRIs with contrast\nDESCRIPTION: This phase II trial studies how well genetic testing works in guiding treatment for patients with solid tumors that have spread to the brain. Several genes have been found to be altered or mutated in brain metastases such as NTRK, ROS1, CDK or PI3K. Medications that target these genes 

In [7]:
embeddings = OpenAIEmbeddings()

In [80]:
# Using FAISS 
vectorstore = FAISS.from_documents(docs, embeddings)
#retriever = vectorstore.as_retriever()

# Create vector store + save using chroma
vectordb = Chroma.from_documents(documents = docs, embedding = embeddings, persist_directory="./embeddings")
vectordb.persist()


In [119]:
vectordb = Chroma(persist_directory = "./embeddings", embedding_function=embeddings)

In [7]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [None]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 
chain = ConversationalRetrievalChain.from_llm(llm=ChatOpenAI(temperature=0.0, model_name='gpt-3.5-turbo', streaming=True, callbacks=[StreamingStdOutCallbackHandler()]),retriever=vectordb.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.5}), memory=memory)

In [34]:
chain( {"question":"trials for brain cancer"},  return_only_outputs=True) 

Are there any ongoing clinical trials for brain cancer?Yes, there are ongoing clinical trials for brain cancer. Here are a few examples:

1. Title: Genetic Testing in Guiding Treatment for Patients with Brain Metastases
   Investigator: Yoon Jae Choi
   Status: Open to accrual
   Description: This phase II trial is studying how well genetic testing works in guiding treatment for patients with solid tumors that have spread to the brain. The goal is to tailor treatment based on genetic mutations.
   Disease Sites: Melanoma, Skin, Brain and Nervous System, Breast, Lung

2. Title: Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients with Recurrent Select Rare CNS Cancers
   Investigator: Daniela Bota
   Status: Open to accrual
   Description: This phase II trial is testing the effectiveness of the immunotherapy drug nivolumab in treating patients with rare central nervous system (CNS) tumors.
   Disease Sites: Brain and Nervous System

3. Title: Phase II Trial of BRAF/ME

{'answer': 'Yes, there are ongoing clinical trials for brain cancer. Here are a few examples:\n\n1. Title: Genetic Testing in Guiding Treatment for Patients with Brain Metastases\n   Investigator: Yoon Jae Choi\n   Status: Open to accrual\n   Description: This phase II trial is studying how well genetic testing works in guiding treatment for patients with solid tumors that have spread to the brain. The goal is to tailor treatment based on genetic mutations.\n   Disease Sites: Melanoma, Skin, Brain and Nervous System, Breast, Lung\n\n2. Title: Phase II Trial of the Immune Checkpoint Inhibitor Nivolumab in Patients with Recurrent Select Rare CNS Cancers\n   Investigator: Daniela Bota\n   Status: Open to accrual\n   Description: This phase II trial is testing the effectiveness of the immunotherapy drug nivolumab in treating patients with rare central nervous system (CNS) tumors.\n   Disease Sites: Brain and Nervous System\n\n3. Title: Phase II Trial of BRAF/MEK Inhibitors in Papillary Cra

In [35]:
chain.run( "name of the first clinical trial in the list reabove?" )

What is the name of the first clinical trial mentioned in the list above?The name of the first clinical trial mentioned in the list above is "PLS Natural History Study (PNHS)".

'The name of the first clinical trial mentioned in the list above is "PLS Natural History Study (PNHS)".'

#### Using compression retriever

In [48]:
from langchain.llms import OpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=vectordb_persist.as_retriever())

# compressed_docs = compression_retriever.get_relevant_documents("What did the president say about Ketanji Jackson Brown")
# pretty_print_docs(compressed_docs)

In [49]:
chain_new =  RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0.0, model_name='gpt-3.5-turbo'),retriever=compression_retriever)

In [90]:
chain_new.run({"query": "suggest some trials for brain cancer","chat_history":[]})



'Based on the provided context, here are a couple of clinical trials that may be relevant for brain cancer:\n\n1. Trial Name: "Study of Abemaciclib in Patients With Brain Metastases"\n   - Description: This trial is investigating the effectiveness of abemaciclib, a medication that targets CDK enzymes, in patients with brain metastases from any solid tumor.\n   - Eligibility Criteria: Participants must have histologically confirmed parenchymal metastatic disease to the brain from any solid tumor.\n   - Sponsor: Multiple sponsors, including academic institutions and pharmaceutical companies.\n\n2. Trial Name: "Vemurafenib and Cobimetinib in Treating Patients With BRAF V600E Mutation-Positive Craniopharyngioma"\n   - Description: This trial is evaluating the efficacy of vemurafenib and cobimetinib, which block enzymes needed for cell growth, in patients with papillary craniopharyngioma that has the BRAF V600E mutation.\n   - Eligibility Criteria: Participants must have histologically prov

### Using pandas multi-dataframe agent with chunking(with modified/pre-processed CSV)

In [2]:
df = pd.read_csv('./data/clinical_trials_modified.csv')

In [3]:
chunk_size = 20  # adjust this value to suit your needs
chunks = [df[i:i + chunk_size] for i in range(0, df.shape[0], chunk_size)]

In [4]:
from langchain.chat_models import ChatOpenAI
agent = create_pandas_dataframe_agent( ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo'), chunks, verbose=True)

In [6]:
# agent.run("suggest some trials for brain cancer") -- THROWS RATE LIMIT ERROR

### Using vectorstore + agents (via tools)

In [3]:
df = pd.read_csv('./data/train.csv')

In [4]:
with open('./data/train.txt', 'w') as file:
    for index, row in df.iterrows():
        file.write(f"Trial {index+1}\n")
        for col_name, value in row.items():
            file.write(f"{col_name} - {value}\n")
          

In [13]:
loader = CSVLoader('./data/train.csv')
docs = loader.load()


In [14]:
# Create vector store + save using chroma
embeddings = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents = docs, embedding = embeddings, persist_directory="./embeddings")
vectordb.persist()

In [4]:
from langchain.prompts import SystemMessagePromptTemplate
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import ChatPromptTemplate

general_system_template = """ 
Given a specific context, please give the most relevant answer to the question using the context given, covering the required advices in general. If there is no direct answer, try to give the closest match before saying you don't know the answer. If the request is for a clinical trial then look for the keyword and any relevant keywords in 'Disease sites' before saying nothing exists. For ex: If the user asks about trials related to the Nose, but it doesn't exist, look for trials in the closest surrounding areas like nose/throat and suggest the same. Only answer based on the context, nothing outside of it.
 ----
{context}
----
"""
general_user_template = "Question:```{question}```"
messages = [
            SystemMessagePromptTemplate.from_template(general_system_template),
            HumanMessagePromptTemplate.from_template(general_user_template)
]
qa_prompt = ChatPromptTemplate.from_messages( messages )


In [101]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA
# embeddings = OpenAIEmbeddings()
# # vectordb_text = Chroma(persist_directory = "./text_embeddings", embedding_function=embeddings)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

llm = ChatOpenAI(temperature=0.0, model_name='gpt-3.5-turbo', streaming=True, callbacks=[StreamingStdOutCallbackHandler()])

chain =ConversationalRetrievalChain.from_llm(llm=llm,retriever=vectordb.as_retriever(), memory=memory )

In [102]:
chat_history = []
res = chain({"question":"some trials for the nose", "chat_history": chat_history})

Here are two clinical trials related to nose conditions:

1. TITLE: A Phase III, Randomized, Blinded, Controlled, Parallel-Group Trial to Evaluate the Efficacy and Safety of LYR-210 for the Treatment of Chronic Rhinosinusitis (CRS) in Adults.
   SHORT_TITLE: Enlighten 2
   DESCRIPTION: This trial aims to evaluate the efficacy and safety of LYR-210 compared to a sham control for the treatment of chronic rhinosinusitis (CRS) in adults. It is a 24-week study with a 24-week treatment period.
   PHASE_DESC: III
   TREATMENT_TYPE_DESC: Drug
   DISEASE_SITES: Ear/ Nose/ Throat (ENT) - Otolaryngologic
   SPONSOR_NAMES: Medpace, Inc., Lyra Therapeutics, Inc.

2. TITLE: Phase II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas
   SHORT_TITLE: Ph II Trial of BRAF/MEK Inhibitors in Papillary Craniopharyngiomas
   DESCRIPTION: This phase II trial investigates the effectiveness of vemurafenib and cobimetinib, which are BRAF/MEK inhibitors, in treating patients with papillary craniopharyn

In [113]:
chat_history = [("some trials for the nose", res["answer"])]
 

In [115]:
res2 = chain({"question":"eligibility criteria for trial 1 from above list", "chat_history": chat_history})

What are the eligibility criteria for trial 1 from the list provided above?The eligibility criteria for trial 1 are as follows:

Inclusion Requirements:
- Must be 18 years of age or older

Exclusion Requirements:
- Pregnant or breastfeeding

Please note that this may not be a complete list of eligibility criteria. It is recommended to consult with the study team for a thorough assessment of eligibility.

In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.memory import ConversationBufferMemory


In [61]:
from langchain.retrievers.multi_query import MultiQueryRetriever
tools = []
tools.append(
                    Tool(
                         
                        name="search_clinical_trials_database"  ,
                        description="useful when you want to answer questions about the clinical trial database",
                        func=RetrievalQA.from_chain_type(llm=ChatOpenAI(temperature=0.0, model_name='gpt-3.5-turbo', streaming=True, callbacks=[StreamingStdOutCallbackHandler()]),retriever=vectordb.as_retriever()),
                    )
                )

In [49]:
from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
from langchain.memory import ConversationBufferMemory
# agent_kwargs = {
#     "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
# }
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
prefix = """Have a conversation with a human, answering the following questions as best you can. Display results in a bulleted list where possible. You have access to the following tools:"""
suffix = """Begin!"

{chat_history}
Question: {input}
{agent_scratchpad}"""

prompt = ZeroShotAgent.create_prompt(
    tools,
    prefix=prefix,
    suffix=suffix,
    input_variables=["input", "chat_history", "agent_scratchpad"],
)
memory = ConversationBufferMemory(memory_key="chat_history")

In [58]:
# llm = ChatOpenAI(
#     temperature=0,
#     model="gpt-3.5-turbo",
#     # model_kwargs={"prompt":prompt}
# )
# llm_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)

# agent = initialize_agent(tools, llm_chain, agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION, memory=memory,handle_parsing_errors="Check your output and make sure it conforms!")
from langchain import LLMChain

llm_chain = LLMChain(llm=OpenAI(temperature=0), prompt=prompt)
agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools)
agent_chain = AgentExecutor.from_agent_and_tools(
    agent=agent, tools=tools, memory=memory,
)

In [60]:
# agent("trials for blood cancer")['output']
agent_chain.run(input="trials for nose" )

I'm sorry, but I don't have any information related to the nose in the provided context.I'm sorry, but I don't have any information about a study specifically related to nasal conditions.The first study mentioned, titled "A Phase III, Randomized, Blinded, Controlled, Parallel-Group Trial to Evaluate the Efficacy and Safety of LYR-210 for the Treatment of Chronic Rhinosinusitis (CRS) in Adults," is specifically focused on the treatment of chronic rhinosinusitis (CRS) in adults. It is a randomized, controlled trial evaluating the efficacy and safety of LYR-210 compared to a sham control.

The second study mentioned, titled "A Randomized, Double-blind, Placebo-controlled, Phase 3 Study of the Efficacy and Safety of Inhaled Treprostinil in Subjects with Idiopathic Pulmonary Fibrosis," is not related to sinus conditions. It is a study evaluating the efficacy and safety of inhaled treprostinil in subjects with idiopathic pulmonary fibrosis.

The third study mentioned, titled "Gronigen Intern

'The first study mentioned, titled "A Phase III, Randomized, Blinded, Controlled, Parallel-Group Trial to Evaluate the Efficacy and Safety of LYR-210 for the Treatment of Chronic Rhinosinusitis (CRS) in Adults," is specifically focused on the treatment of chronic rhinosinusitis (CRS) in adults. It is a randomized, controlled trial evaluating the efficacy and safety of LYR-210 compared to a sham control.'

In [35]:
agent_chain.run(input="okay give me those trials then")

Based on the given context, there is no specific clinical trial mentioned for nose-related conditions. However, since the context mentions chronic rhinosinusitis (CRS), which is a condition affecting the nose and sinuses, it is recommended to consider participating in the clinical trial titled "A Phase III, Randomized, Blinded, Controlled, Parallel-Group Trial to Evaluate the Efficacy and Safety of LYR-210 for the Treatment of Chronic Rhinosinusitis (CRS) in Adults" (Enlighten 2). This trial is specifically for adults diagnosed with CRS and aims to evaluate the efficacy and safety of LYR-210 for the treatment of CRS. It is always advisable to consult with a healthcare professional for personalized advice and to determine if participating in a clinical trial is suitable for your specific condition.

'Based on the given context, it is recommended to consider participating in the clinical trial titled "A Phase III, Randomized, Blinded, Controlled, Parallel-Group Trial to Evaluate the Efficacy and Safety of LYR-210 for the Treatment of Chronic Rhinosinusitis (CRS) in Adults" (Enlighten 2). This trial is specifically for adults diagnosed with CRS and aims to evaluate the efficacy and safety of LYR-210 for the treatment of CRS. It is always advisable to consult with a healthcare professional for personalized advice and to determine if participating in a clinical trial is suitable for your specific condition.'

In [36]:
agent_chain.run(input="eligibility for trial")

Enlighten 2 is a phase III clinical trial for the treatment of Chronic Rhinosinusitis (CRS) in adults. The trial is evaluating the efficacy and safety of LYR-210 compared to a sham control. To be eligible for the trial, participants must be 18 years or older, diagnosed with CRS, have undergone at least 2 trials of medical treatments in the past, have a mean 3 cardinal symptom (3CS) score, and have bilateral ethmoid disease confirmed on CT. It is important to note that this trial is currently open to accrual.

'To be eligible for the trial, participants must be 18 years or older, diagnosed with CRS, have undergone at least 2 trials of medical treatments in the past, have a mean 3 cardinal symptom (3CS) score, and have bilateral ethmoid disease confirmed on CT. It is important to note that this trial is currently open to accrual.'

In [37]:
res = agent_chain.run(input="what about exclusion criteria")

The Phase III trial is evaluating the efficacy and safety of LYR-210 for the treatment of Chronic Rhinosinusitis (CRS) in adults. The trial is randomized, blinded, controlled, and conducted in parallel groups. The eligibility criteria include being 18 years or older, diagnosed with CRS, having undergone at least 2 trials of medical treatments in the past, and having a mean 3 cardinal symptom (3CS) score. Bilateral ethmoid disease confirmed on CT is also required. It is important to note that the trial is currently open to accrual.

##### Using conversational retrieval agent

In [17]:
vectordb = FAISS.from_documents(docs, embeddings)
vectordb.save_local("faiss_embeddings")

In [18]:
docsearch = FAISS.load_local("faiss_embeddings", embeddings)

In [None]:
# Reach out about Oncore + Qualy
# Get scripts from Daniella
# Prompt people to ask the right questions

In [19]:
from langchain.agents.agent_toolkits import create_retriever_tool
tool = create_retriever_tool(
    docsearch.as_retriever(), 
    "search_clinical_trials_database",
    "Searches and returns documents regarding clinical trials"
)
tools = [tool]

In [20]:
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent
llm = ChatOpenAI(temperature = 0, 
                 model_name="gpt-3.5-turbo",
                 streaming=True, callbacks=[StreamingStdOutCallbackHandler()])
agent_executor = create_conversational_retrieval_agent(llm, tools )

In [21]:
res = agent_executor ({"input": "trials for blood cancer"})['output']

Here are some clinical trials related to blood cancer:

1. Title: A Study to Evaluate Long-Term Safety of CAR-T Cell Therapy in Patients with Hematologic Malignancies
   - Investigator: Susan O'Brien
   - Status: Open to accrual
   - Description: This is a non-interventional, long-term safety study of allogeneic CAR-T cell therapy in patients with hematologic malignancies. The purpose is to collect long-term observational data to identify and understand potential late side effects in patients who have received CAR-T cell therapies.
   - Disease Sites: Multiple Myeloma, Non-Hodgkin's Lymphoma, Hodgkin's Lymphoma, Lymphoid Leukemia, Myeloid and Monocytic Leukemia, Other Hematopoietic Leukemia
   - Sponsor: Caribou Biosciences, Inc.

2. Title: Blood Collection Protocol for the Analysis of Exosomes in Patients with Breast Cancer
   - Investigator: Ritesh Parajuli
   - Status: Open to accrual
   - Description: The purpose of this research study is to determine a group of particles in the bl

In [22]:
agent_executor ({"input": "trials for the nose"})['output']

Here are some clinical trials related to the nose:

1. Title: A Phase III, Randomized, Blinded, Controlled, Parallel-Group Trial to Evaluate the Efficacy and Safety of LYR-210 for the Treatment of Chronic Rhinosinusitis (CRS) in Adults.
   - Investigator: Naveen Bhandarkar
   - Status: Open to accrual
   - Description: This is a 24-week, multicenter, phase III trial to evaluate the efficacy and safety of LYR-210 compared with sham control for the treatment of chronic rhinosinusitis in adults. The trial aims to assess the effectiveness of LYR-210 in improving symptoms and quality of life in patients with CRS.
   - Disease Sites: Ear/ Nose/ Throat (ENT) - Otolaryngologic
   - Sponsors: Medpace, Inc., Lyra Therapeutics, Inc.

2. Title: Randomized Phase II and Phase III Studies of Individualized Treatment for Nasopharyngeal Carcinoma Based on Biomarker Epstein Barr Virus (EBV) Deoxyribonucleic Acid (DNA)
   - Investigator: Rupali Nabar
   - Status: Open to accrual
   - Description: This is

'Here are some clinical trials related to the nose:\n\n1. Title: A Phase III, Randomized, Blinded, Controlled, Parallel-Group Trial to Evaluate the Efficacy and Safety of LYR-210 for the Treatment of Chronic Rhinosinusitis (CRS) in Adults.\n   - Investigator: Naveen Bhandarkar\n   - Status: Open to accrual\n   - Description: This is a 24-week, multicenter, phase III trial to evaluate the efficacy and safety of LYR-210 compared with sham control for the treatment of chronic rhinosinusitis in adults. The trial aims to assess the effectiveness of LYR-210 in improving symptoms and quality of life in patients with CRS.\n   - Disease Sites: Ear/ Nose/ Throat (ENT) - Otolaryngologic\n   - Sponsors: Medpace, Inc., Lyra Therapeutics, Inc.\n\n2. Title: Randomized Phase II and Phase III Studies of Individualized Treatment for Nasopharyngeal Carcinoma Based on Biomarker Epstein Barr Virus (EBV) Deoxyribonucleic Acid (DNA)\n   - Investigator: Rupali Nabar\n   - Status: Open to accrual\n   - Descrip

In [23]:
agent_executor({"input": "eligibility criteria for trial 2"})['output']

The eligibility criteria for Trial 2, "Randomized Phase II and Phase III Studies of Individualized Treatment for Nasopharyngeal Carcinoma Based on Biomarker Epstein Barr Virus (EBV) Deoxyribonucleic Acid (DNA)", are as follows:

Inclusion Criteria:
- Biopsy-proven (from primary lesion and/or lymph nodes) diagnosis of cancer of the nasopharynx.
- Patients must have detectable pretreatment plasma EBV DNA, determined by the central lab prior to Step 2 registration.
- Stage II-IVB disease (AJCC, 7th ed.) with no evidence of distant metastasis.
- History/physical examination by a Medical Oncologist or Clinical Oncologist or Radiation Oncologist or ENT, which must include an endoscopic evaluation, a complete list of current medications, and assessment of weight and weight loss in the past 6 months within 21 days prior to registration.
- Evaluation of tumor extent required within 28 days prior to registration: MRI of the nasopharynx and neck; or CT of the nasopharynx and neck with ≤ 3 mm cont

'The eligibility criteria for Trial 2, "Randomized Phase II and Phase III Studies of Individualized Treatment for Nasopharyngeal Carcinoma Based on Biomarker Epstein Barr Virus (EBV) Deoxyribonucleic Acid (DNA)", are as follows:\n\nInclusion Criteria:\n- Biopsy-proven (from primary lesion and/or lymph nodes) diagnosis of cancer of the nasopharynx.\n- Patients must have detectable pretreatment plasma EBV DNA, determined by the central lab prior to Step 2 registration.\n- Stage II-IVB disease (AJCC, 7th ed.) with no evidence of distant metastasis.\n- History/physical examination by a Medical Oncologist or Clinical Oncologist or Radiation Oncologist or ENT, which must include an endoscopic evaluation, a complete list of current medications, and assessment of weight and weight loss in the past 6 months within 21 days prior to registration.\n- Evaluation of tumor extent required within 28 days prior to registration: MRI of the nasopharynx and neck; or CT of the nasopharynx and neck with ≤ 3

## Different approaches

### Convert CSV to PDFs/text files, embed and see results with both RetrievalQA and conversational agent

There's a lot of talk about how it is not very meaningful or efficient to generate embeddings directly from CSVs since gen AI models don't function very well with tabular dat and vector embeddings are better suited to more natural language datasets. Hence, I'm trying to convert the data into a more human readable text format and then generating embeddings

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
import streamlit as st
import pandas as pd
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.memory import ConversationBufferMemory
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackHandler
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.prompts import MessagesPlaceholder
from langchain.prompts import SystemMessagePromptTemplate
from langchain.prompts import HumanMessagePromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.agents import ZeroShotAgent, AgentExecutor, Tool
from langchain import LLMChain
from langchain.callbacks.streaming_stdout_final_only import (
    FinalStreamingStdOutCallbackHandler,
)
from langchain.agents.agent_toolkits import create_conversational_retrieval_agent, create_retriever_tool
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain


In [14]:
# CSV to human readable text file
import pandas as pd

# Function to convert a row to a human-readable sentence format
def row_to_sentence(row):
    sentence_parts = []
    for col in df.columns:
        # Skip columns with NaN values
        if pd.isna(row[col]):
            continue
        # Format each column value into a readable sentence part
        sentence_part = f"{col.replace('_', ' ').upper()} is {row[col]}"
        sentence_parts.append(sentence_part)
    # Combine all parts into a single sentence
    return '. '.join(sentence_parts) + '.'

# Load the CSV file
csv_file_path = './data/train.csv'
df = pd.read_csv(csv_file_path)

# Apply the function to each row in the dataframe
sentences = [f"{index + 1}. {row_to_sentence(row)}" for index, row in df.iterrows()]

# Saving the sentences to a text file
text_file_path = './data/train.txt'
with open(text_file_path, 'w') as file:
    for sentence in sentences:
        file.write(sentence + '\n')


In [2]:
loader = TextLoader("./data/train.txt")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=400)
texts = text_splitter.split_documents(docs)

In [3]:
embeddings = OpenAIEmbeddings()
vectordb = FAISS.from_documents(documents=docs, embedding=embeddings)
vectordb.save_local("<path to folder>")

### Using GPT4ALL

In [4]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate

In [5]:
local_path = (
    "./models/mistral-7b-openorca.Q4_0.gguf"  # replace with your desired local file path
)
llm = GPT4All(model=local_path, callbacks=[StreamingStdOutCallbackHandler()], verbose=True)
 
chain =ConversationalRetrievalChain.from_llm(llm=llm,retriever=vectordb.as_retriever())

In [6]:
chat_history = []
res = chain({"question":"some trials for the nose", "chat_history": chat_history})

ERROR: The prompt size exceeds the context window size and cannot be processed.

LLaMA ERROR: The prompt is 209401 tokens and the context window is 2048!


In [7]:
# Create tool
tool = create_retriever_tool(
    vectordb.as_retriever(),
    "search_clinical_trials_database",
    "Searches and returns documents regarding clinical trials")

tools = [tool]

# Define llm
llm = ChatOpenAI(temperature=0,
                    model="gpt-3.5-turbo",
                    streaming=True, callbacks=[StreamingStdOutCallbackHandler()])

# Create agent
agent_executor = create_conversational_retrieval_agent(llm, tools)

In [8]:
agent_executor ({"input": "trials for the nose"})['output']

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Request too large for gpt-3.5-turbo in organization org-E6HBJxFs9ulEaKRWe6XnwCJS on tokens per min (TPM): Limit 160000, Requested 179788. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Request too large for gpt-3.5-turbo in organization org-E6HBJxFs9ulEaKRWe6XnwCJS on tokens per min (TPM): Limit 160000, Requested 179788. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitErro

RateLimitError: Request too large for gpt-3.5-turbo in organization org-E6HBJxFs9ulEaKRWe6XnwCJS on tokens per min (TPM): Limit 160000, Requested 179788. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.