In [5]:
#%%writefile shakti3_index.py
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes. 
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.  
# import nest_asyncio
# nest_asyncio.apply()

# PydanticMultipleSelector
# Use the OpenAI Function API to generate/parse pydantic objects under the hood for the router selector.
from llama_index.query_engine.router_query_engine import RouterQueryEngine
from llama_index.selectors.llm_selectors import LLMSingleSelector, LLMMultiSelector
from llama_index.selectors.pydantic_selectors import PydanticMultiSelector, PydanticSingleSelector
from llama_index.tools.query_engine import QueryEngineTool
from llama_index.objects import ObjectIndex, SimpleToolNodeMapping
from llama_index.query_engine import ToolRetrieverRouterQueryEngine
from langchain.chat_models import ChatOpenAI

import openai
import os
import json
os.environ["OPENAI_API_KEY"] = "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"
openai.api_key = "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"

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    VectorStoreIndex,
    ListIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SimpleKeywordTableIndex,
    LLMPredictor,
)

def llama_listvectorkeyword_index(uploaded_file, query):
    # load documents
    documents = SimpleDirectoryReader(input_files = [uploaded_file]).load_data()
    
    # initialize service context (set chunk size)
    #service_context = ServiceContext.from_defaults(chunk_size=1024)
    llm_predictor_gpt4 = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-4-32k"))
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor_gpt4, chunk_size=1024, chunk_overlap=100)
    nodes = service_context.node_parser.get_nodes_from_documents(documents)

    # initialize storage context (by default it's in-memory)
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    #Define List Index and Vector Index over Same Data
    list_index = ListIndex(nodes, storage_context=storage_context)
    vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

    #Define Query Engines and Set Metadata
    list_query_engine = list_index.as_query_engine(
        response_mode='tree_summarize',
        use_async=True,
    )
    vector_query_engine = vector_index.as_query_engine()

    list_tool = QueryEngineTool.from_defaults(
        query_engine=list_query_engine,
        description='Useful for summarization questions',
    )

    vector_tool = QueryEngineTool.from_defaults(
        query_engine=vector_query_engine,
        description='Useful for retrieving specific context from a particular section or paragraph.',

    )

    #Define Keyword Query Engine
    keyword_index = SimpleKeywordTableIndex(nodes, storage_context=storage_context)
    keyword_query_engine = keyword_index.as_query_engine()
    keyword_tool = QueryEngineTool.from_defaults(
        query_engine=vector_query_engine,
        description='Useful for retrieving specific context using keywords',
    )

    query_engine = RouterQueryEngine(
        selector=PydanticMultiSelector.from_defaults(),
        query_engine_tools=[
            list_tool,
            vector_tool,
            keyword_tool,
        ]
    )

    #Define Retrieval-Augmented Router Query Engine
    tool_mapping = SimpleToolNodeMapping.from_objects([list_tool, vector_tool, keyword_tool])
    obj_index = ObjectIndex.from_objects(
        [list_tool, vector_tool, keyword_tool], 
        tool_mapping,
        VectorStoreIndex,
    )

    query_engine = ToolRetrieverRouterQueryEngine(obj_index.as_retriever())

    return query_engine.query(query)

def prompt(file):
    with open(file, encoding="utf8") as f:
        return f.read()
    

In [5]:
intro_query = "Strictly following the above instructions and the clinical trial document provided, write the content of 'Introduction' \
section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided \
in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"


intro_response = llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + intro_query))

Combining responses from multiple query engines.


In [10]:
intro_summary_query = "Strictly following the above instructions and the clinical trial document provided, write the content of 'Intro Summary' \
section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided \
in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + "Here is the Introduction section generated: "+ 
                                    str(intro_response) + intro_summary_query)))

Combining responses from multiple query engines.

This study investigated the efficacy of pembrolizumab and encorafenib plus cetuximab compared with pembrolizumab alone as a first-line treatment for BRAF V600E-mutant microsatellite instability-high (MSI-H)/mismatch repair-deficient (dMMR) metastatic colorectal cancer (CRC). The SEAMARK study was a randomized phase 2 clinical trial sponsored by Novartis Pharmaceuticals Corporation and conducted at multiple sites in the United States, Canada, and Europe. A total of 120 participants with BRAF V600E-mutant MSI-H/dMMR metastatic CRC were randomized in a 1:1 ratio to receive either pembrolizumab alone or pembrolizumab plus encorafenib plus cetuximab. The primary endpoint was progression-free survival (PFS) and the secondary endpoints were overall survival (OS), objective response rate (ORR), and safety.


In [12]:
inclusion_criteria_query = "Strictly following the above instructions and the clinical trial document provided, write the content of \
'Inclusion criteria' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided \
in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + inclusion_criteria_query)))

Combining responses from multiple query engines.

Inclusion criteria:
- Participants were aged 18 years or older
- Participants had histologically or cytologically confirmed metastatic colorectal cancer
- Participants had BRAF V600E mutation
- Participants had microsatellite instability-high (MSI-H) or mismatch repair-deficient (dMMR) tumors
- Participants had an Eastern Cooperative Oncology Group (ECOG) performance status of 0 or 1
- Participants had adequate organ function


In [13]:
exclusion_criteria_query = "Strictly following the above instructions and the clinical trial document provided, write the content of \
'Exclusion criteria' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided \
in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + exclusion_criteria_query)))

Combining responses from multiple query engines.

The Exclusion criteria for this study were:
- Participants with a history of any other malignancy within the past 5 years, except for adequately treated non-melanoma skin cancer or carcinoma in situ of the cervix.
- Participants with known brain metastases or leptomeningeal disease.
- Participants with known active hepatitis B or C virus infection.
- Participants with known human immunodeficiency virus (HIV) infection.
- Participants with known active tuberculosis.
- Participants with known hypersensitivity to any of the study drugs or their excipients.


In [15]:
participants_query = "Strictly following the above instructions and the clinical trial document provided, generate in one word the \
'Participants' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided \
in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + participants_query)))

Combining responses from multiple query engines.

545


In [18]:
disease_condition_query = "Strictly following the above instructions and the clinical trial document provided, generate in 2-3 words the specific \
'Disease condition' mentioned in the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided \
in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + disease_condition_query)))

Combining responses from multiple query engines.
Colorectal cancer


In [19]:
demographics_query = "Strictly following the above instructions and the clinical trial document provided, write the content of \
'Demographics' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided \
in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + demographics_query)))

Combining responses from multiple query engines.

The study included 441 participants aged 18 to 75 years, with a median age of 57 years. The majority of the participants were male (68.2%) and white (83.2%). The participants were from the United States (68.2%), Europe (25.2%), and other countries (6.6%).


In [26]:
treatment_arm_query = "Write in one number the count of participants only in the treatment arm\n"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + treatment_arm_query)))

Combining responses from multiple query engines.

39 participants


In [23]:
control_arm_query = "Write in one number the count of participnats in the control arm"


print(llama_listvectorkeyword_index('RCT.pdf', (prompt('apls_persona_2606.txt') + "\n\n\n" + control_arm_query)))

Combining responses from multiple query engines.

50


In [2]:
#from shakti_stream_index import llama_listvectorkeyword_index, prompt
summary_query = "With the above instructions and the clinical trial document provided, write a detailed APLS with mentioned sections in past tense, comprehendable by a grade 1 student. Provide the response in JSON format with section names as keys. Enclose each key in JSON in angular brackets for example '<Title>'. Enclose each value in JSON in double quotes. Make sure to complete the JSON with opening and closing flower brackets.\n"
for text in llama_listvectorkeyword_index('RCT.pdf',prompt('apls_persona_2606.txt') + "\n\n\n" + summary_query):
    print(text)


Combining responses from multiple query engines.


TypeError: 'Response' object is not iterable

In [27]:
print(response['<Results>'])

The main thing the researchers looked at was the percentage of people who had a hemoglobin response. The results showed that the combination of pembrolizumab and encorafenib plus cetuximab was more effective than pembrolizumab alone. The researchers also looked at other things, like the absolute change in hemoglobin level, the relative change in indirect bilirubin level, the relative change in the percentage of reticulocytes, and the relative change in lactate dehydrogenase level. The results showed that the combination of pembrolizumab and encorafenib plus cetuximab was more effective than pembrolizumab alone for all of these things.


In [25]:
summary_query = "Strictly following the above instructions and the clinical trial document provided, write the content of 'Key takeaway' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"
llama_listvectorkeyword_index(prompt('apls_persona_2606.txt') + "\n\n\n" + summary_query)

> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 13499 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 62 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 1118 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2451 tokens
> [get_response] Total embedding token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2451 tokens
> [get_response] Total embedding token usage: 0 t

In [28]:
summary_query = "Strictly following the above instructions and the clinical trial document provided, write the content of 'Study details' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"
llama_listvectorkeyword_index(prompt('apls_persona_2606.txt') + "\n\n\n" + summary_query)

> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 13499 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 62 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 1118 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2364 tokens
> [get_response] Total embedding token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2364 tokens
> [get_response] Total embedding token usage: 0 t

In [32]:
summary_query = "Strictly following the above instructions and the clinical trial document provided, write the content of 'Aims' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"
aims = llama_listvectorkeyword_index(prompt('apls_persona_2606.txt') + "\n\n\n" + summary_query)


summary_query = "Strictly following the above instructions and the clinical trial document provided, write the content of 'Results' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"
results = llama_listvectorkeyword_index(prompt('apls_persona_2606.txt') + "\n\n\n" + summary_query)

> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 13499 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 62 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 1118 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2196 tokens
> [get_response] Total embedding token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2196 tokens
> [get_response] Total embedding token usage: 0 t

In [23]:
from shakti3_index import llama_listvectorkeyword_index, prompt
summary_query = "Strictly following the above instructions and the clinical trial document provided, write the content of 'Title' section of the APLS in past tense, comprehendable by a scientific researcher. Do not violate the section-wise instructions provided in any case. The content should be strictly inferred from the clinical trial document provided only and not any other sources.\n"
llama_listvectorkeyword_index("/home/cdsw/experimentation_project1/PLS_project/data/RCT.pdf", prompt('apls_persona_2606.txt') + "\n\n\n" + summary_query)

> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 13499 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total embedding token usage: 54 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 1117 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2193 tokens
> [get_response] Total embedding token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total embedding token usage: 0 tokens
> [get_response] Total LLM token usage: 2193 tokens
> [get_response] Total embedding token usage: 0 t

'\nTitle: Effect of Pembrolizumab and Encorafenib Plus Cetuximab Compared with Pembrolizumab Alone in People with BRAF V600E-Mutant Microsatellite Instability-High (MSI-H)/Mismatch Repair-Deficient (dMMR) Metastatic Colorectal Cancer (CRC): Results of the SEAMARK Study'

In [8]:
%%writefile shakti_stream_index.py
#current working version for streaming
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes. 
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.  
# import nest_asyncio
# nest_asyncio.apply()

# PydanticMultipleSelector
# Use the OpenAI Function API to generate/parse pydantic objects under the hood for the router selector.

from langchain.chat_models import ChatOpenAI
from llama_index.llms import OpenAI
import streamlit as st
import time
import openai
import os
import json
os.environ["OPENAI_API_KEY"] = "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"
openai.api_key = "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"

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    VectorStoreIndex,
    ListIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SimpleKeywordTableIndex,
    LLMPredictor,
)

def llama_vector_index(uploaded_file, query):
    # load documents
    documents = SimpleDirectoryReader(input_files = [uploaded_file]).load_data()
    
    service_context = ServiceContext.from_defaults(
            llm=OpenAI(model="gpt-4-32k", temperature=0, streaming=True, ), chunk_size=4096, chunk_overlap = 512, 
    )
    
    index = VectorStoreIndex.from_documents(
        documents, service_context=service_context
    )
    index_query_engine = index.as_query_engine(streaming=True, similarity_top_k=2)
    streaming_response = index_query_engine.query(query)


    placeholder = st.empty()
    full_text = ""

    for text in streaming_response.response_gen:
        full_text += text
        time.sleep(0.1)
        placeholder.write(full_text)
    

    return full_text
    
    

def prompt(file):
    with open(file,encoding="utf8") as f:
        return f.read()
    

Overwriting shakti_stream_index.py


In [10]:
streaming_response = llama_listvectorkeyword_index('RCT.pdf', 'summarize and conclude the voxelotor efficacy for parents of child patients in plain language"')
streaming_response.print_response_stream()


Voxelotor was found to be an effective treatment for sickle cell disease in children. The 1500 mg dose of voxelotor was associated with a significant decrease in hemoglobin levels and a decrease in indirect bilirubin levels from baseline to week 24. Additionally, the relative change in the percentage of reticulocytes was significantly greater in the 1500-mg voxelotor group than in the placebo group. All four measures of hemolysis showed a response that improved with increasing whole-blood concentrations of voxelotor. These results suggest that voxelotor is an effective treatment for sickle cell disease in children.