<a href="https://colab.research.google.com/github/polyexplorer/open-llm/blob/main/RAG%2BEval(Llama_Index).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

# Huggingface LLM (Integrated with LlamaIndex)

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import TextStreamer, pipeline
import torch

from llama_index.llms.huggingface import HuggingFaceLLM

torch.cuda.empty_cache()
model_name_or_path = "TheBloke/neural-chat-7B-v3-2-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                            #  trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

def generate_response(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(
        input_ids,
        max_length=256,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    response_ids = outputs[0]
    response_text = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response_text

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,
            top_k=40,
            top_p=0.95,
            repetition_penalty=1.15,
            # streamer=streamer,
        )

# langchain_llm = HuggingFacePipeline(pipeline=pipe)

system_prompt = "### System: You are a good Q/A chatbot who always answers the question based on the context only."

prompt_template = """
### User:
{query_str}

### Assistant:
"""

# prompt_template = "GPT4 Correct User: {query_str}<|end_of_turn|>GPT4 Correct Assistant:"

llm = HuggingFaceLLM(
    model = model,
    tokenizer = tokenizer,
    context_window = 4096,
    max_new_tokens = 256,
    query_wrapper_prompt = prompt_template,
    system_prompt = system_prompt,

)

# Basic RAG Pipeline

## Ingestion

### File Upload

### DocumentStore

In [19]:
from llama_index import Document,SimpleDirectoryReader
import os
filename = "/home/ubuntu/open-llm/pdfs/X06-201-00001_Protocol_Amendment_4"

documents = SimpleDirectoryReader(
    input_files=[filename]
).load_data()
document = Document(text="\n\n".join([doc.text for doc in documents]))

### Embeddings

In [4]:
# VectorStore Embeddings
from llama_index.embeddings import InstructorEmbedding, HuggingFaceEmbedding
from llama_index import ServiceContext

# model_name = "AnnaWegmann/Style-Embedding"
# model_name = "hkunlp/instructor-large"
# model_name = "BAAI/bge-large-en-v1.5"

# text_instruction = "Represent the Medical document for retrieving important points where answer can be found:"
# query_instruction = "Represent the Medical question for retrieving supporting documents:"

# embed_model = InstructorEmbedding(
#     model_name= model_name,
#     text_instruction=text_instruction,
#     query_instruction=query_instruction
#     )

# embed_model = HuggingFaceEmbedding(
#     model_name= model_name
#     )



# service_context = ServiceContext.from_defaults(
#     llm=llm, embed_model=embed_model
# )

### Index

In [5]:
! rm -r sentence_index
! rm -r merging_index

In [22]:
# Auto-Merging Index

from llama_index.node_parser import HierarchicalNodeParser

from llama_index.node_parser import get_leaf_nodes
from llama_index import StorageContext
from llama_index.retrievers import AutoMergingRetriever
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine


def build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-large-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)
    merging_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
    )
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    if not os.path.exists(save_dir):
        automerging_index = VectorStoreIndex(
            leaf_nodes, storage_context=storage_context, service_context=merging_context
        )
        automerging_index.storage_context.persist(persist_dir=save_dir)
    else:
        automerging_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=merging_context,
        )
    return automerging_index

def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=2,
):
    # base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
    # retriever = AutoMergingRetriever(
    #     base_retriever, automerging_index.storage_context, verbose=True
    # )
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )
    auto_merging_engine = automerging_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[rerank]
        )
    return auto_merging_engine


# Sentence Window Index

from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage
import os


# index = VectorStoreIndex.from_documents(documents,
#                                         service_context=service_context)

def build_sentence_window_index(
    documents, llm, embed_model="local:BAAI/bge-large-en-v1.5", save_dir="sentence_index"
):
    # create the sentence window node parser w/ default settings
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=2,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )
    if not os.path.exists(save_dir):
        sentence_index = VectorStoreIndex.from_documents(
            documents, service_context=sentence_context
        )
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    return sentence_index



def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    # define postprocessors
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine


sentence_index = build_sentence_window_index(documents=documents,llm=llm, save_dir="sentence_index")
automerging_index = build_automerging_index(
    documents=documents,
    llm=llm,
    save_dir="merging_index"
)

In [21]:
! rm -r sentence_index
! rm -r merging_index

In [26]:
from llama_index import Document,SimpleDirectoryReader
import os

def create_sentence_query_engine(pdf_path):
    documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    sentence_index = build_sentence_window_index(documents=documents,llm=llm, save_dir='sentence_index')
    return get_sentence_window_query_engine(sentence_index)


def create_automerging_query_engine(pdf_path):
    documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()
    automerging_index = build_automerging_index(documents=documents,llm=llm, save_dir='merging_index')
    return get_automerging_query_engine(automerging_index,)


## Retreival

### Intialize RAG Pipeline

In [27]:
import re
import math
from trulens_eval.feedback import Groundedness
from trulens_eval import Feedback, TruLlama
from trulens_eval.feedback.provider.hugs import Huggingface
import numpy as np



huggingface_provider = Huggingface()



def convert_score(score):
    if score > 1:
        # Find the next highest power of 10
        power = math.ceil(math.log10(score))
        # Divide the score by 10 to the power
        score /= 10 ** power
    return float(score)

pat_0_10 = re.compile(r"\s*([0-9]+)[\s\\]*$")

def extract_first_number(s):
    """
    Extracts the first number mentioned in a string.

    :param s: A string that may contain numbers.
    :return: The first number found in the string, or None if no number is found.
    """
    match = re.search(r'\d+', s)
    return int(match.group()) if match else None

def re_0_10_rating(str_val):
    matches = pat_0_10.fullmatch(str_val)
    if not matches:
        # Try soft match
        matches = re.search('([0-9]+)(?=\D*$)', str_val)
        if not matches:
            print(f"0-10 rating regex failed to match on: '{str_val}'")
            return -10  # so this will be reported as -1 after division by 10

    return float(matches.group())

def _extract_score_and_reasons_from_response(
    response,
    normalize = 10.0
):
  if "Supporting Evidence" in response:
    score = 0.0
    supporting_evidence = ""
    for line in response.split('\n'):
      if "Score" in line:
        score = re_0_10_rating(line) / normalize
      if "Criteria" in line:
        parts = line.split(":")
        if len(parts) > 1:
          criteria = ":".join(parts[1:]).strip()
      if "Supporting Evidence" in line:
        parts = line.split(":")
        if len(parts) > 1:
          supporting_evidence = ":".join(parts[1:]).strip()
    reasons = {
      'reason':
          (
            f"{'Criteria: ' + str(criteria) + ' ' if criteria else ''}\n"
            f"{'Supporting Evidence: ' + str(supporting_evidence) if supporting_evidence else ''}"
          )
    }
    return score, reasons
  else:
    return re_0_10_rating(response) / normalize

def llm_output_parser(prompt,response):
    final_prompt = f'''### System: You are a good parser. If a user question and a user answer is given, convert it to a Python dictionary with a key named presence or number and value as True or False or a number depending on the user answer. Understand user question with user answer and convert it to a Python dictionary. Assistant has to figure out boolean or integer values of the dictionary based on the user answer. 
Below are some examples.

User Question: Does this text mention Translational Medicine Research, which is a research approach that aims to 'translate' findings from fundamental research into medical practice and meaningful health outcomes? One of the examples is utilization of Neurocart.
User Answer: The text does not mention Translational Medicine Research.
Assistant Response:
{{
"presence": False,
}}

User Question: How many patients are planned for the  study?
User Answer: As per the given text, there are 88 patients planned for the study in total.
Assistant Response:
{{
"number":88,
}} 

User Question: Does this text mention Translational Medicine Research, which is a research approach that aims to 'translate' findings from fundamental research into medical practice and meaningful health outcomes? One of the examples is utilization of Neurocart.
User Answer: The text mentions Translational Medicine Research.
Assistant Response:
{{
"presence": True,
}}
User Question: How many countries will this trial take place over?
User Answer: As per the given text, the trial will only be conducted in the UK.
Assistant Response:
{{
"number":1,
}} 


User Question: Is it an oncology study?
User Answer: The study text is a study for cancer. Therefore, it is an oncology study.
Assistant Response:
{{
"presence": True,
}}

User Question: What is the Age Range of the  Patients in the trial?
User Answer: As per the given text, the patient ages will be 24-39, inclusive.
Assistant Response:
{{
"min_number":24,
"max_number":39,
}} 

User Question: Is there any mention of Observational period in the text?
User Answer: I apologize but the information on the observational period is not mentioned in the text.
Assistant Response:
{{
"presence": False,
}}
### User:
User Question is {prompt} and User Answer is {response}.
Without making assumptions or inferring meanings beyond the dataset description provided, convert User Answer to a Python Dictionary and give an Assistant Response for the user question-user answer pair
If you dont know the answer, just say that you dont know, dont try to make up an answer. No explanation of the assistant response is required.
### Assistant:
    '''
    response = pipe(final_prompt)[0]['generated_text'].split('### Assistant:')[-1]
    return response


def select_ground_truth(inp, questions_list):
    for q_obj in questions_list:
        if q_obj['question'] == inp:
            return q_obj['ground_truth']    
    
    return "<NOT AVAILABLE>"

def gt_relevance_with_cot_reasons(prompt,response):
  ground_truth = select_ground_truth(prompt,questions_list)
  parsed_response = llm_output_parser(prompt,response)
  parsed_ground_truth = llm_output_parser(prompt,ground_truth)
  final_prompt = f"""### System:

You are an JSON DIFFERENCE CHECKER; providing whether the 'presence' or 'number' values in the given JSON strings are the same or not.

Given two JSON strings, Please answer with this template:

TEMPLATE FORMAT:
REASONS: <reasoning for your answer.>
SCORE: <The score between 0 and 10, based on similarity of values>

Do not give high scores until absolutely sure. Even if the keys are same or different , only compare the values. 

### User:
JSON STRINGS:
First:
{parsed_response}
Second:
{parsed_ground_truth }
### Assistant:
"""
  response = pipe(final_prompt)[0]['generated_text'].split('### Assistant:')[-1]
  print("Response: \n ",response)
  score = convert_score(extract_first_number(response.split('SCORE')[-1]))
  reasons = response.split('SCORE')[0].split('REASONS')[-1]
  reasons =   {'reason':reasons}
  # return score,reasons
  return score,reasons



def relevance_with_cot_reasons(prompt, response):

  final_prompt = f"""You are a RELEVANCE grader; providing the relevance of the given RESPONSE to the given PROMPT.
Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant.

Please answer with this template:

TEMPLATE FORMAT:
Criteria: <The criteria for your evaluation>
Supporting Evidence: <Your reasons for your scoring.>
Score: <The score 0-10 based on the given criteria>

A few additional scoring guidelines:

- Long RESPONSES should score equally well as short RESPONSES.

- Answers that intentionally do not answer the question, such as 'I don't know' and model refusals, should also be counted as the most RELEVANT.

- RESPONSE must be relevant to the entire PROMPT to get a score of 10.

- RELEVANCE score should increase as the RESPONSE provides RELEVANT context to more parts of the PROMPT.

- RESPONSE that is RELEVANT to none of the PROMPT should get a score of 0.

- RESPONSE that is RELEVANT to some of the PROMPT should get as score of 2, 3, or 4. Higher score indicates more RELEVANCE.

- RESPONSE that is RELEVANT to most of the PROMPT should get a score between a 5, 6, 7 or 8. Higher score indicates more RELEVANCE.

- RESPONSE that is RELEVANT to the entire PROMPT should get a score of 9 or 10.

- RESPONSE that is RELEVANT and answers the entire PROMPT completely should get a score of 10.

- RESPONSE that confidently FALSE should get a score of 0.

- RESPONSE that is only seemingly RELEVANT should get a score of 0.

PROMPT: {prompt}

RESPONSE: {response}
  """
  response = pipe(prompt_template.format(query_str=final_prompt))[0]['generated_text']
  score,reasons = _extract_score_and_reasons_from_response(response)
  score = convert_score(score)  
  return score,reasons


huggingface_provider = Huggingface()

grounded = Groundedness(groundedness_provider=huggingface_provider)

def groundedness_measure_with_cot_reasons(source, statement):
  prompt_template = f"""### System:
  You are a INFORMATION OVERLAP classifier providing the overlap of information between a SOURCE and STATEMENT.For every sentence in the statement, please answer with this template:
TEMPLATE:
Statement Sentence: <Sentence>,
Supporting Evidence: <Choose the exact unchanged sentences in the source that can answer the statement, if nothing matches, say NOTHING FOUND>
Score: <Output a number between 0-10 where 0 is no information overlap and 10 is all information is overlapping>

### User:
Give me the INFORMATION OVERLAP of this SOURCE and STATEMENT.
SOURCE: {source}

STATEMENT: {statement}</s>
### Assistant
"""
  groundedness_scores = {}
  plausible_junk_char_min = 4
  if len(statement) > plausible_junk_char_min:
    reason = pipe(prompt_template)[0]['generated_text'].split('### Assistant')[-1]
  i = 0
  for line in reason.split('\n'):
    if "Score" in line:
      groundedness_scores[f"statement_{i}"] = re_0_10_rating(line) / 10
      i += 1
  for k,v in groundedness_scores.items():
     groundedness_scores[k] = convert_score(v)
  return groundedness_scores, {"reason": reason}


groundedness = (
    Feedback(groundedness_measure_with_cot_reasons, name="Groundedness")
        .on(TruLlama.select_source_nodes().node.text)
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
)



qa_relevance = (
    Feedback(relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

gt_relevance = (
    Feedback(gt_relevance_with_cot_reasons, name="Ground-Truth Relevance")
    .on_input_output()
)

qs_relevance = (
    Feedback(relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

feedbacks = [qa_relevance, qs_relevance,gt_relevance, groundedness]


def get_prebuilt_trulens_recorder(query_engine, feedbacks, app_id):
    tru_recorder = TruLlama(
        query_engine,
        app_id=app_id,
        feedbacks=feedbacks
        )
    return tru_recorder

Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8fb1e5a670> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8fb1e5a670> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8fb1e5a670> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8fb1e5a670> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function relevance_with_cot_reasons at 0x7f8f87a3fc10> cannot be serialized: Module __main__ is not importable.. This may b

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Ground-Truth Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground-Truth Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .


In [28]:
from trulens_eval import Tru, TruLlama
tru = Tru()
# tru.reset_database()

def run_evals(filename, questions_list):
    tot = len(questions_list)

    print(f"Creating RAG Pipelines for document {filename.split('/')[-1]}...")
    sentence_pipeline = create_sentence_query_engine(filename)
    print("Created Sentence Window RAG.")
    automerging_pipeline = create_automerging_query_engine(filename)
    print("Created Auto-Merging RAG.")

    app_name = f"RAG - {filename.split('/'[-1])}"

    sentence_recorder = get_prebuilt_trulens_recorder(sentence_pipeline,feedbacks, app_id = f"Sentence Window {app_name}")
    automerging_recorder = get_prebuilt_trulens_recorder(automerging_pipeline,feedbacks, app_id = f"Auto-Merging {app_name}")
    
    print("Evaluating on Questions List:")
    for i,question in enumerate(questions_list):
        with sentence_recorder as recording:
            response = sentence_pipeline.query(question)
        print(f"Sentence Window RAG ({i+1}/{tot})")
        with automerging_recorder as recording:
            response = automerging_pipeline.query(question)
        print(f"Auto-Merging RAG ({i+1}/{tot})")
    
    del sentence_pipeline, automerging_pipeline, sentence_recorder, automerging_recorder

    torch.cuda.empty_cache()
    


    




In [8]:
filename = "/home/ubuntu/open-llm/pdfs/X06-201-00001_Protocol_Amendment_4"
questions_list  = [
    {
        'question':"Does this trial involve Psychedelic Research, which is the study of the effects of psychedelic substances, like LSD and psilocybin, on the human brain and mental health?",
        'ground_truth':"To answer your question, the text does not mention anything about psychedelic research or the use of substances like LSD and psilocybin. The trial is focused on the investigation of B-124a, an orally bioavailable NAM of the NMDA receptor with conferred selectivity to the NR2B subunit, and its effects on blood pressure, quantitative electroencephalogram (qEEG) measures, and neurological scale scores.",
    },
    {
        'question':"Patients, subjects, and participants are used interchangeably. Other synonyms are Enrollees, study volunteers, research recruits, cohort members, survey respondents. How many patients are planned for study according to the text?",
        'ground_truth':"Regarding your question, Arm 1 is a randomized, double-blind, placebo-controlled, single ascending oral dose administration of B-124a to healthy subjects in a fasted state where dosing is planned to be conducted in 7 cohorts of 8 subjects each.  Therefore, for Arm 1, 56  subjects are planned. Arm 2 is a randomized, double-blind, placebo-controlled, single ascending oral dose administration of B-124a to healthy subjects in a fasted state where dosing is planned to be conducted in 3 or 4 cohorts of 8 subjects each.  Therefore, for Arm 2, 32 subjects are planned. So, in total, 88 subjects are planned for the study ",
    },
    {
        'question':"What is the age of participants mentioned in the text?",
        'ground_truth':"The age of participants mentioned in the text is between 18 and 55 years of age, inclusive. This information can be found on Page 33 of the protocol.",
    },
    {
        'question':"Are Adverse Events of Special Interest mentioned and applicable for the study? They are sometimes abbreviated as AESI or AEs of Special Interest.",
        'ground_truth':"To answer your question, Adverse Events of Special Interest (AESIs) is mentioned in the study but not applicable.",
    },
    {
        'question':"What is the treatment period mentioned in the study?",
        'ground_truth':"The treatment period is defined as the time period during which subjects are evaluated for primary and/or secondary objectives of the trial irrespective of whether or not the subject actually consumes all doses of the IMP. Subjects who are evaluated at the last scheduled visit during the treatment period will be defined as trial completers. For purposes of this trial, subjects who complete all PK and PD assessments all the way through discharge from the clinic, assuming PK washout and scheduled in-clinic AE assessment completion (ie, complete the Day 8 visit, will be defined as trial completers.",
    },
    {
        'question':"In an inpatient study, participants are admitted to a study site or are admitted to a clinic. In an inpatient study, the text also might mention subjects checking in and getting discharged. Is it an inpatient study?",
        'ground_truth':"It appears that this is an inpatient study . The trial will consist of a screening period (Day ?45 through Day ?2), check-in (Day ?1), in-clinic stay (minimum of 8 days), and a safety follow-up telephone call 30 (+ 2) days after the last dose of B-124a (the investigational medicinal product [IMP]) to assess any new or ongoing adverse events (AEs) and to record concomitant medications. ",
    },
    {
        'question':"In an outpatient study, participants visit the study site or must visit a clinic or must visit a hospital. In an outpatient study, participants do not stay overnight. Is it an outpatient study?",
        'ground_truth':"It appears that this is not an outpatient study, but an inpatient study.",
    },
    {
        'question':"How many trial centers or sites are planned for the study?",
        'ground_truth':"To answer your question, there is a single site planned for the study",
    },
    {
        'question':"Is this trial Placebo-controlled or an open trial?",
        'ground_truth':"To answer your question, the trial is a randomized, double-blind, placebo-controlled trial.",
    },
    {
        'question':"Is Food effect mentioned in this text? Food effect describes how the presence or absence of food in the stomach can affect the rate and extent to which a drug is absorbed into the bloodstream. It is sometimes abbreviated as FE.",
        'ground_truth':"To answer your question, yes, the effect of food on B-124a safety, tolerability, and PK will be determined in this trial.",
    },
    {
        'question':"Are Healthy adult participants a part of this trial?",
        'ground_truth':"To answer your question, yes, healthy adult participants are a part of this trial. The trial population will consist of healthy males and females, 18 to 55 years of age, inclusive. Approximately 88 subjects (56 subjects in Arm 1 and 32 subjects in Arm 2) are expected to be enrolled in the trial.",
    },
    {
        'question':"Is it a double blind randomization clinical study?",
        'ground_truth':"To answer your question, yes, the study is a double-blind randomized clinical study. For each cohort in Arm 1 and Arm 2, subjects will be randomized on Day 1 to a single oral dose of B-124a or matching placebo in a 6:2 ratio (6 on B-124a subjects and 2 placebo subjects)",
    },
    {
        'question':"How many total number of drugs are being assessed as Investigational Medicinal Product in this study?",
        'ground_truth':"To answer your question, there is only one investigational medicinal product being assessed in this study, which is B-124a. This is mentioned in the protocol summary on page 1 and in various sections throughout the document.",
    },
    {
        'question':"Is dose administration mentioned in text of treatment section? Other synonyms of dose administration are IMP administration and study medication.",
        'ground_truth':"""To answer your question, yes, the text mentions dose administration in several places. Synonyms used include IMP administration and study medication. For example, it states that "all doses of IMP will be administered while the subjects are in the clinic." Additionally, it mentions that the IMP will be supplied as API and will be labeled with instructions for use and route of administration.""",
    },
    {
        'question':"Are the lab samples sent to one central lab or a different location? Lab samples are sometimes referred to as biomarkers, FBR, Fasting Blood referrals or bodily fluid samples.",
        'ground_truth':"""To answer your question, the lab samples collected for clinical laboratory assessments in this trial will be sent to a local laboratory. The protocol does not use the terms biomarkers, FBR, Fasting Blood referrals, or bodily fluid samples to refer to the lab samples collected""",
    },
    
    ]



In [None]:

sentence_pipeline = create_sentence_query_engine(filename)
automerging_pipeline = create_automerging_query_engine(filename)



    
{
"presence": False,
}


### Q/A

In [8]:
# response_sentence = sentence_pipeline.query("Does the study title mention that it is about monitoring Drug-Drug Interactions? Drug drug Interactions occur when two or more drugs interact with each other in a way that affects their effectiveness or safety It is sometimes abbreviated as DDI.  ")
# print(str(response_sentence), "\n-----------------------------------\n")

# response_automerging = automerging_pipeline.query("Does the study title mention that it is about monitoring Drug-Drug Interactions? Drug drug Interactions occur when two or more drugs interact with each other in a way that affects their effectiveness or safety It is sometimes abbreviated as DDI.  ")
# print(str(response_automerging))

In [9]:
# response_2 = sentence_pipeline.query("How many patients are planned to be taken for the trial?  ")
# print(str(response_2))

In [10]:
# response_2.source_nodes[0].node.metadata

In [11]:
# response_3 = sentence_pipeline.query("What is the age of participants mentioned in the text? ")
# print(str(response_3))

In [12]:
# response_full = sentence_pipeline.query("What are the demographics (age, gender, ethnicity etc) of the patients/subjects being taken for the trial?")
# print(str(response_full))

In [13]:
# response_4 = sentence_pipeline.query("In an inpatient study, participants are admitted to a study site or are admitted to a clinic. In an inpatient study, the text also might mention subjects checking in and getting discharged. Is it an inpatient study? ")
# print(str(response_4))

In [14]:
# response_5 = sentence_pipeline.query("In an outpatient study, participants visit the study site or must visit a clinic or must visit a hospital. In an outpatient study, participants do not stay overnight. Is it an outpatient study?   ")
# print(str(response_5))

## Evaluation

In [9]:
eval_questions = ["""What is the total number of study arms the study is dealing with? """
,"""What is the total number of cohorts the study is dealing with? """
,"""What is the total number of treatment groups the study is dealing with? """
,"""What is the total number of treatment sequences the study is dealing with? """
,"""Does this text mention Translational Medicine Research, which is a research approach that aims to 'translate' findings from fundamental research into medical practice and meaningful health outcomes? One of the examples is utilization of Neurocart. """
,"""Does this trial involve Psychedelic Research, which is the study of the effects of psychedelic substances, like LSD and psilocybin, on the human brain and mental health? """
,"""Does this trial mention Abuse Liability, which refers to the potential of a drug to be misused, leading to addiction or dependence? It is also sometimes referred to as Human Abuse Liability or HAL. """
,"""Is this a Basket trial, which is a clinical trial design where multiple subgroups (baskets) of patients, usually with different types of cancer, are tested with a single drug based on a common biomarker? """
,"""Is this an Umbrella trial, a type of clinical trial that tests the impact of different drugs on different mutations in a single type of disease, usually cancer, in one 'umbrella' study? """
,"""Is 'First in Human' mentioned in this text, which refers to the first time a new treatment or procedure is tested in humans? It is also sometimes abbreviated as FIH. """
,"""Is it an oncology study? """
,"""Is 'Single Ascending Dose' mentioned in this text, referring to a phase in clinical trials where the dosage is gradually increased to evaluate the body's reactions? It is sometimes abbreviated as SAD. """
,"""Is 'Multiple Ascending Dose' mentioned in this text, which refers to a method in clinical trials where small groups of subjects receive multiple low doses of the drug, which are gradually increased? It is sometimes abbreviated as MAD. """
,"""Is 'Thorough QTc' mentioned in this text, referring to a clinical trial design used to assess the impact of a drug on the heart's QT interval, which is a measure of the time between the start of the Q wave and the end of the T wave in the heart's electrical cycle? """
,"""Is there any mention of run in in the text? """
,"""During the run in period, participants undergo specific procedures to establish baseline measurements or assess eligibility criteria. Does the study involve run in period? It is also sometimes called washout period or lead-in period. """
,"""Is there any mention of Observational period in the text?  """
,"""Observational period refers where researchers observe participants and collect various types of data including medical history, symptoms, outcomes, biomarkers etc. Does the study involve an Observational period? """
,"""Is there any mention of titration period in full text? """
,"""Titration period is when the dosage of a medication is gradually adjusted until an optimal dose is reached. Does the study involve a titration period? """
,"""Is the trial investigating the drug’s bioavailability? Bioavailability represents the extent and rate at which a drug is absorbed. It is sometimes abbreviated as BA. """
,"""Is the trial investigating bioequivalence of drugs? Bioequivalence is the similarity in the rate and extent of drug absorption between two drug products, typically a generic and a brand-name drug. It is sometimes abbreviated as BE. """
,"""Does the study title mention that it is about monitoring Drug-Drug Interactions? Drug drug Interactions occur when two or more drugs interact with each other in a way that affects their effectiveness or safety It is sometimes abbreviated as DDI.  """
,"""Is mass balance being confirmed in the study? Mass Balance involves accounting for the total amount of a drug that enters and exits a biological system. It is sometimes abbreviated as MB. """
,"""Is Food effect mentioned in this text? Food effect describes how the presence or absence of food in the stomach can affect the rate and extent to which a drug is absorbed into the bloodstream. It is sometimes abbreviated as FE. """
,"""Patients, subjects, and participants are used interchangeably. Other synonyms are Enrollees, study volunteers, research recruits, cohort members, survey respondents. How many patients are planned for study according to the text?  """
,"""What is the age of participants mentioned in the text? """
,"""Are adult participants a part of this trial? """
,"""Is it a study checking for influence of the therapy on subjects of a specific ethnicity? """
,"""Is competing trial mentioned in the text? Competing trial investigates a similar or related intervention as the trial in question. """
,"""Is it mentioned in the text that the target population is uncommon? """
,"""Is the “target is common” mentioned in the text? """
,"""Is it a study related to orphan diseases mentioned in the text? Orphan diseases are rare diseases that affect a relatively small number of individuals. """
,"""Does the study involve any disease which has a limited number of medications available in the market to treat it? """
,"""Is paediatric population involved in the study? They are sometimes referred to as children or adolescents. """
,"""Is molecular screening a criterion to select patients in the study? Molecular screening criteria refers to the specific genetic, molecular, or biomarker-based characteristics used to identify and select patients for participation in a clinical trial. """
,"""Does the study involve inclusion criteria of highly selective eligibility?  """
,"""Is it an open label enrolment study? It is a study where participants are aware of the treatment they are receiving and can enroll themselves directly into a particular treatment group or study arm without randomization by researchers. """
,"""Is it a study with no randomization during enrolment? """
,"""Is it an open label randomization study? It is a study where both the researchers and the participants are aware of the treatment they are receiving. The randomization process is still used to assign participants to different treatment groups, but everyone involved knows which treatment or intervention the participant is receiving. """
,"""Is it a double blind randomization clinical study? """
,"""Oral dosing refers to the administration of drugs through the mouth usually swallowed. Does the study involve oral dosing? """
,"""Is multiple drug formulation mentioned in the text of dosing section? Multiple drug formulation means a medicinal product being administered in multiple forms. For example, a drug being presented as a powder in capsule as well as a liquid-filled capsule.  """
,"""Is rescue medication allowed to be used as mentioned in the text of dosing section? Rescue medications are medications that help in managing conditions that involve sudden symptom exacerbations by providing quick relief. Common rescue medications are Epipen, adrenaline, steroids, triptans, antihistamines etc. """
,"""What is the treatment period mentioned in the study? """
,"""In an inpatient study, participants are admitted to a study site or are admitted to a clinic. In an inpatient study, the text also might mention subjects checking in and getting discharged. Is it an inpatient study? """
,"""In an outpatient study, participants visit the study site or must visit a clinic or must visit a hospital. In an outpatient study, participants do not stay overnight. Is it an outpatient study?   """
,"""Are the lab samples sent to one central lab? Lab samples are sometimes referred to as biomarkers, FBR, Fasting Blood referrals or bodily fluid samples. """
,"""Are the lab samples sent to a different location other than a central lab? Lab samples are sometimes referred to as biomarkers, FBR, Fasting Blood referrals or bodily fluid samples. """
,"""Do we require complex frozen packaging for shipment of the lab samples? Complex frozen packaging can involve dry ice, liquid nitrogen, refrigeration, freezer boxes, vacuum insulated dry shipper containers, thermoformed packaging, insulated shipping kits or any kind of customized packagings. """
,"""Is Cohort Safety Review mentioned in the text? """
,"""Is Dose Escalation Review Team a part of the study? They are sometimes referred to as DERT. """
,"""Is Dose Review Committee a part of the study? They are sometimes referred to as DRC. """
,"""Is the data collection happening at the site for this study? """
,"""Are Adverse Events of Special Interest mentioned and applicable for the study? They are sometimes abbreviated as AESI or AEs of Special Interest. """
,"""Alanine aminotransferase is sometimes abbreviated as ALT. Aspartate aminotransferase is sometimes abbreviated as AST. Upper Limit of Normal is sometimes abbreviated as ULN. Is elevation of alanine aminotransferase or aspartate aminotransferase being compared with Upper limit of normal in the study? """
,"""Upper Limit of Normal is sometimes abbreviated as ULN. Is total bilirubin level being compared with upper limit of normal in the study? """
,"""Does any section name in the pdf contain Interim as a substring?  """
,"""Are there section names mentioning IA? """
,"""Are there any mentions of snapshots in the text? """
,"""Is there unblinding prior to full database lock? """
,"""How many sites are planned for the study? """
,"""How many countries are planned for the study? """
,"""Is a Non-USA country involved in the study? """
,"""How many protocol amendments were made according to the text? """
,"""Are there any country specific amendments made to the protocol? """
,"""Does the study involve Clinical Outcome Assessments? They are sometimes referred to as COA, ClinRO or Clinician Reported Outcomes. """
,"""Does the study involve Patient Reported Outcomes? They are sometimes referred to as PRO. """
,"""Case Report Form is sometimes abbreviated as CRF. Does the study mention that some data needs to be transcribed or entered in a CRF?  """
,"""Are any of these terms mentioned in the text: adhd-rs-5, adpkd outcome, conners-3, ids-sr, msfq, pgi-c, pgi-s, qol questionnaire, sds, smwq-p, smwq-po, Diary, EDE-Q, SF-36? """
,"""Does the study involve electronic Clinical Outcome Assessments? They are sometimes referred to as eCOA, eClinRO or electronic Clinician Reported Outcomes. """
,"""Does the study involve electronic Patient Reported Outcomes? They are sometimes referred to as ePRO. """
,"""Does the study involve self-reporting? """
,"""Does the study involve observer-reporting? """
,"""Does the study involve caregiver reporting?  """
,"""Does the study mention performance outcomes? """
,"""Does the study involve arater? """
,"""Does the study involve a certified clinician? """
,"""Is it mentioned in the text that any of these assessments would be performed electronically: adhd-rs-5, adpkd outcome, conners-3, ids-sr, msfq, pgi-c, pgi-s, qol questionnaire, sds, smwq-p, smwq-po, Diary, EDE-Q, SF-36? """
]


15

### Groundedness

In [17]:
# select_ground_truth("What is the treatment period mentioned in the study?", questions_list)

In [18]:
# response = sentence_pipeline.query("What is the treatment period mentioned in the study?")
# print(str(response))

### QS Relevance With CoT reasons (Open Source Implementation)

Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8f87a755e0> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8f87a755e0> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8f87a755e0> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function groundedness_measure_with_cot_reasons at 0x7f8f87a755e0> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.


✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [39]:
prompt = "How many total number of drugs are being assessed as Investigational Medicinal Product in this study?"
response = automerging_pipeline.query(prompt)
relevance = gt_relevance_with_cot_reasons(prompt, response)
print(relevance)



Response: 
  
 REASONS: Both JSON objects have a key "number" with the value 1. The structure is identical, so all values match.
SCORE: 10
(1.0, {'reason': ': Both JSON objects have a key "number" with the value 1. The structure is identical, so all values match.\n'})


### Initialize all Evaluation Functions

Feedback implementation <function relevance_with_cot_reasons at 0x7f8f87a75550> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function relevance_with_cot_reasons at 0x7f8f87a75550> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function relevance_with_cot_reasons at 0x7f8f87a75550> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function gt_relevance_with_cot_reasons at 0x7f8f87a754c0> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred feedback mode.
Feedback implementation <function gt_relevance_with_cot_reasons at 0x7f8f87a754c0> cannot be serialized: Module __main__ is not importable.. This may be ok unless you are using the deferred

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Ground-Truth Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Ground-Truth Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .


## Setup Eval Pipeline

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [46]:
del sentence_pipeline, automerging_pipeline

In [47]:
torch.cuda.empty_cache()

In [42]:
questions = [x['question'] for x in questions_list]

tot=len(questions)
with sentence_recorder as recording:
    for i,question in enumerate(questions):
        response = sentence_pipeline.query(question)
        print(f"Ran Question {i+1}/{tot} for Sentence Window RAG")


with automerging_recorder as recording:
    for i,question in enumerate(questions):
        response = automerging_pipeline.query(question)
        print(f"Ran Question {i+1}/{tot} for Auto-Merging RAG")


records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function BaseQueryEngine.query at 0x7f9c1a5a1b80>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.
A new object of type <class 'llama_index.indices.vector_store.retrievers.retriever.VectorIndexRetriever'> at 0x7f9c04705dc0 is calling an instrumented method <function BaseRetriever.retrieve at 0x7f9c1a59adc0>. The path of this call may be incorrect.
Guessing path of new object is app.retriever based on other object (0x7f9c1979e6d0) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function CompactAndRefine.get_response at 0x7f9c1918daf0>. The path of this call may be incorrect.
Guessing path of new object is app._response

Ran Question 1/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.


Ran Question 2/15 for Sentence Window RAG


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.


Response: 
  
 REASONS: Both JSON objects have the exact same key "presence" with the same value "False". Therefore, they represent identical data.
SCORE: 10


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 3/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 4/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.
Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has both "number" and "healthy_subjects", while the second one only contains "number". There is no comparison possible since they have different structures.
SCORE: 0


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 5/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.
Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has keys'min_number' and'max_number', while the second one has different keys like 'age_range', 'page_no', and 'protocol_mention'. There is no direct comparison possible between these sets of data.
SCORE: 0


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 6/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.
Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has a key "presence" with value 'None', while the second one has both "presence" and another key "applicability". Since they have different keys and values, their presence is not the same.
SCORE: 0
Response: 
  
 REASONS: Both JSON objects have the key "presence" with the value set to False. Since all the relevant information is identical, we can conclude that they represent the same data.
SCORE: 10
Response: 
  
 REASONS: Both JSON objects have a key "presence" with the value set to True. The structure and values are identical.
SCORE: 10


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 7/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.


Ran Question 8/15 for Sentence Window RAG


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based 

Ran Question 9/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.


Response: 
  
 REASONS: Both JSON objects have the exact same key "presence" with the same value "False". Therefore, they represent the same data.
SCORE: 10


Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has a key "presence" with value False while the second one has a different key "number" with value 1. There is no matching key-value pair between these two objects.
SCORE: 0


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 10/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 11/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.


Response: 
  
 REASONS: Both JSON objects have a key "presence" with the value set to True. The structure and values are identical.
SCORE: 10


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 12/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.


Ran Question 13/15 for Sentence Window RAG


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.


Response: 
  
 REASONS: The first JSON has "placebo_controlled" as a key while the second one has "placebo-controlled". Both have the same value (True). However, there is no matching key for "open", "randomized", "double-blind", and "unknown" in the first JSON. These additional keys make both JSONs different from each other.

SCORE: 5


A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9c04705d00 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f9c2cb34e50>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9c1979e5e0) using this function.


Ran Question 14/15 for Sentence Window RAG


A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9c04705fd0 is calling an instrumented method <function Refine.get_response at 0x7f9c1918d700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9c1979e070) using this function.
Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has three key-value pairs while the second one has just one. There is no comparison possible as they have different structures and keys.
SCORE: 0


Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has a key "presence" with value False while the second one has a different key "number" with value 1. There is no matching key-value pair between these two JSONs.
SCORE: 0
Ran Question 15/15 for Sentence Window RAG


Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has additional key-value pairs like 'gender' which is missing from the second one. Also, there are new keys present in the second JSON such as 'total_subjects', 'arm_1_subjects', and 'arm_2_subjects'. These differences make them dissimilar.
SCORE: 0
Response: 
  
 REASONS: The first JSON has more information like dose_administration, IMP_administration, while the second one is missing those details. Only presence is common to both.
SCORE: 3
Ran Question 1/15 for Auto-Merging RAG


Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The value "presence" is different in both objects as it's either True or False.
SCORE: 0
Ran Question 2/15 for Auto-Merging RAG
Response: 
  
 REASONS: Both JSON objects have the key "presence" with a value of False. The structure and content of both objects are identical.
SCORE: 10
Ran Question 3/15 for Auto-Merging RAG
Ran Question 4/15 for Auto-Merging RAG


Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has a key "presence" with value False while the second one has a different key "number" with value 88. There is no commonality between these two JSONs.
SCORE: 0
Ran Question 5/15 for Auto-Merging RAG


Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has a key "presence" with value False while the second one also has a key "presence" but its value is True. There's no other matching key-value pair. So, they don't have the same presence value.
SCORE: 0


Feedback Function exception caught: Traceback (most recent call last):
  File "/home/ubuntu/venv/lib/python3.8/site-packages/trulens_eval/feedback/feedback.py", line 519, in run
    assert isinstance(
AssertionError: Feedback function output must be a float or dict but was <class 'int'>.



Response: 
  
 REASONS: The first JSON has keys'min_number' and'max_number', while the second one has different keys like 'age_range', 'page_number', and 'protocol_reference'. Although both have numbers related to age range (18-55), they don't share the exact same structure.
SCORE: 0
Ran Question 6/15 for Auto-Merging RAG
Response: 
  
 REASONS: Both JSON objects have the exact same key "presence" with the same value "False". Therefore, they represent identical data.
SCORE: 10
Ran Question 7/15 for Auto-Merging RAG
Response: 
  
 REASONS: Both JSON objects have a key "presence" with the value set to True. The structure and content of both dictionaries are identical.
SCORE: 10
Ran Question 8/15 for Auto-Merging RAG
Ran Question 9/15 for Auto-Merging RAG
Response: 
  
 REASONS: Both JSON objects have the exact same key "presence" with the same value "False". Therefore, they represent identical data.
SCORE: 10
Ran Question 10/15 for Auto-Merging RAG
Response: 
  
 REASONS: Both JSON objec

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,...,Answer Relevance,Context Relevance,Groundedness,Ground-Truth Relevance_calls,Answer Relevance_calls,Context Relevance_calls,Groundedness_calls,latency,total_tokens,total_cost
0,Sentence Window RAG,"{""app_id"": ""Sentence Window RAG"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_e5f466c8fd8b34ba1b24ee83e833054a,"""Does this trial involve Psychedelic Research,...","""The context information does not specifically...",-,"{""record_id"": ""record_hash_e5f466c8fd8b34ba1b2...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-13T08:14:35.409204"", ""...",...,0.4,0.2,0.65,[{'args': {'prompt': 'Does this trial involve ...,[{'args': {'prompt': 'Does this trial involve ...,[{'args': {'prompt': 'Does this trial involve ...,[{'args': {'source': 'Assessments for safety: ...,2,0,0.0
1,Sentence Window RAG,"{""app_id"": ""Sentence Window RAG"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_075dcee660ec0eeba2e36c0a7d87142d,"""Patients, subjects, and participants are used...","""According to the text, approximately 88 healt...",-,"{""record_id"": ""record_hash_075dcee660ec0eeba2e...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-13T08:14:38.202246"", ""...",...,1.0,0.9,1.0,[],"[{'args': {'prompt': 'Patients, subjects, and ...","[{'args': {'prompt': 'Patients, subjects, and ...",[{'args': {'source': 'The principal investigat...,26,0,0.0
2,Sentence Window RAG,"{""app_id"": ""Sentence Window RAG"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_500c3a4781e62e6b15a78fd1998187d6,"""What is the age of participants mentioned in ...","""The age of participants mentioned in the text...",-,"{""record_id"": ""record_hash_500c3a4781e62e6b15a...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-13T08:15:04.941123"", ""...",...,1.0,1.0,0.85,[],[{'args': {'prompt': 'What is the age of parti...,[{'args': {'prompt': 'What is the age of parti...,[{'args': {'source': 'Subjects will not be all...,25,0,0.0
3,Sentence Window RAG,"{""app_id"": ""Sentence Window RAG"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_6b639bb1e3d5450f7a7a2663dc912408,"""Are Adverse Events of Special Interest mentio...","""Adverse Events of Special Interest (AESIs) ar...",-,"{""record_id"": ""record_hash_6b639bb1e3d5450f7a7...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-13T08:15:30.428140"", ""...",...,0.6,1.0,0.75,[],[{'args': {'prompt': 'Are Adverse Events of Sp...,[{'args': {'prompt': 'Are Adverse Events of Sp...,[{'args': {'source': '• Other medically signif...,84,0,0.0
4,Sentence Window RAG,"{""app_id"": ""Sentence Window RAG"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_fbd0bb4d286dbea375431e294eb09810,"""What is the treatment period mentioned in the...","""The treatment period mentioned in the study i...",-,"{""record_id"": ""record_hash_fbd0bb4d286dbea3754...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-13T08:16:55.200876"", ""...",...,0.9,0.9,0.9,[{'args': {'prompt': 'What is the treatment pe...,[{'args': {'prompt': 'What is the treatment pe...,[{'args': {'prompt': 'What is the treatment pe...,[{'args': {'source': '4.4 End of Trial Definit...,65,0,0.0


In [34]:
# from trulens_eval import Tru, TruLlama
tru = Tru()
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Ground-Truth Relevance,Context Relevance,Groundedness,Answer Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Auto-Merging Window RAG,0.82,0.458333,0.7,0.783333,38.666667,0.0
Sentence Window RAG,0.709091,0.481818,0.677273,0.545455,29.545455,0.0


In [17]:
tru.stop_dashboard(force=True)

Force stopping dashboard ...


In [25]:
# tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://172.31.28.48:8501 .


<subprocess.Popen at 0x7f270632e4f0>



Response: 
  
 REASONS: The provided statement gives clear details on the number of subjects per arm and their respective cohort sizes, allowing us to calculate the total number of subjects involved in the study.

SCORE: 9
Response: 
  
 REASONS: This answer provides a clear explanation of how many dose cohorts are planned for each arm (Arm 1 has 7, Arm 2 has either 3 or 4) and gives a summary of the overall number of cohorts (at least 11). It directly addresses the information asked about the study's arms and their respective cohort plans.

SCORE: 9
Response: 
  
 REASONS: Both answers provide similar information about the age range of participants being from 18 to 55 years old. However, the correct answer includes "of age" which makes it clear that the ages are included within the range (18-55), while the provided answer does not explicitly mention inclusion or exclusion. Additionally, the correct answer also mentions where the information was found (Page 33 of the protocol). These d