<a href="https://colab.research.google.com/github/polyexplorer/open-llm/blob/main/RAG%2BEval(Llama_Index).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [1]:
! pip install transformers optimum accelerate langchain llama_index sentence_transformers peft trulens-eval
! pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7
! pip install pypdf pymupdf chromadb InstructorEmbedding
! mkdir pdfs

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/autogptq-index/whl/cu118/
mkdir: cannot create directory ‘pdfs’: File exists


# Huggingface LLM (Integrated with LlamaIndex)

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import TextStreamer, pipeline
import torch

from llama_index.llms.huggingface import HuggingFaceLLM

torch.cuda.empty_cache()
model_name_or_path = "TheBloke/zephyr-7B-beta-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                            #  trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

def generate_response(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    outputs = model.generate(
        input_ids,
        max_length=256,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    response_ids = outputs[0]
    response_text = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response_text

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,
            top_k=40,
            top_p=0.95,
            repetition_penalty=1.15,
            # streamer=streamer,
        )

# langchain_llm = HuggingFacePipeline(pipeline=pipe)

system_prompt = "You are a good Q/A chatbot who always answers the question based on the context only.</s>"

prompt_template = """<|user|>
{query_str}</s>
<|assistant|>
"""

# prompt_template = "GPT4 Correct User: {query_str}<|end_of_turn|>GPT4 Correct Assistant:"

llm = HuggingFaceLLM(
    model = model,
    tokenizer = tokenizer,
    context_window = 4096,
    max_new_tokens = 256,
    query_wrapper_prompt = prompt_template,
    system_prompt = "",

)

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


# Basic RAG Pipeline

## Ingestion

### File Upload

In [2]:
import os
from google.colab import files


uploaded = files.upload()

for fn, content in uploaded.items():
  filename = os.path.join("pdfs",fn)
  with open(filename, 'wb') as f:
    f.write(content)


### DocumentStore

In [3]:
from llama_index import Document,SimpleDirectoryReader
import os
# filename = "/content/pdfs/263-102-00006_Protocol_Amendment_1_14Nov2019.pdf"

documents = SimpleDirectoryReader(
    input_files=[filename]
).load_data()
document = Document(text="\n\n".join([doc.text for doc in documents]))

### Embeddings

In [4]:
# VectorStore Embeddings
from llama_index.embeddings import InstructorEmbedding
from llama_index import ServiceContext

# model_name = "AnnaWegmann/Style-Embedding"
model_name = "hkunlp/instructor-large"
text_instruction = "Represent the Medical document for retrieving important points where answer can be found:"
query_instruction = "Represent the Medical question for retrieving supporting documents:"

embed_model = InstructorEmbedding(
    model_name= model_name,
    text_instruction=text_instruction,
    query_instruction=query_instruction
    )

service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model
)

load INSTRUCTOR_Transformer
max_seq_length  512


### Index

In [5]:
! rm -r sentence_index
! rm -r merging_index

In [6]:
# Auto-Merging Index

from llama_index.node_parser import HierarchicalNodeParser

from llama_index.node_parser import get_leaf_nodes
from llama_index import StorageContext
from llama_index.retrievers import AutoMergingRetriever
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine


def build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)
    merging_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
    )
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    if not os.path.exists(save_dir):
        automerging_index = VectorStoreIndex(
            leaf_nodes, storage_context=storage_context, service_context=merging_context
        )
        automerging_index.storage_context.persist(persist_dir=save_dir)
    else:
        automerging_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=merging_context,
        )
    return automerging_index

def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=2,
):
    base_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
    retriever = AutoMergingRetriever(
        base_retriever, automerging_index.storage_context, verbose=True
    )
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )
    auto_merging_engine = automerging_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[rerank]
        )
    return auto_merging_engine


# Sentence Window Index

from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage
import os


# index = VectorStoreIndex.from_documents(documents,
#                                         service_context=service_context)

def build_sentence_window_index(
    documents, llm, embed_model="local:BAAI/bge-small-en-v1.5", save_dir="sentence_index"
):
    # create the sentence window node parser w/ default settings
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=2,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )
    if not os.path.exists(save_dir):
        sentence_index = VectorStoreIndex.from_documents(
            documents, service_context=sentence_context
        )
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )

    return sentence_index



def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
):
    # define postprocessors
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )

    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine


sentence_index = build_sentence_window_index(documents=documents,llm=llm, embed_model=embed_model, save_dir="sentence_index")
automerging_index = build_automerging_index(
    documents=documents,
    llm=llm,
    embed_model=embed_model,
    save_dir="merging_index"
)

## Retreival

### Intialize RAG Pipeline

In [7]:
sentence_pipeline = get_sentence_window_query_engine(sentence_index)

In [8]:
auto_merging_pipeline = get_automerging_query_engine(automerging_index,)


### Q/A

In [None]:
response_sentence = sentence_pipeline.query("Does the study title mention that it is about monitoring Drug-Drug Interactions? Drug drug Interactions occur when two or more drugs interact with each other in a way that affects their effectiveness or safety It is sometimes abbreviated as DDI.  ")
print(str(response_sentence), "\n-----------------------------------\n")

response_automerging = auto_merging_pipeline.query("Does the study title mention that it is about monitoring Drug-Drug Interactions? Drug drug Interactions occur when two or more drugs interact with each other in a way that affects their effectiveness or safety It is sometimes abbreviated as DDI.  ")
print(str(response_automerging))

No, the study title does not mention monitoring Drug-Drug Interactions. The study aims to determine the absorption, metabolism, and excretion of OPC-61815 and characterize its metabolites in healthy male Japanese subjects following a single intravenous dose. The study's objective is to evaluate the likelihood of effects of renal or hepatic impairment on the disposition of OPC-61815 and the likelihood for drug-drug interactions with OPC-61815. However, the study does not explicitly state that it is about monitoring 
-----------------------------------

No, the study title does not mention monitoring Drug-Drug Interactions. The study seems to be focused on evaluating the safety and pharmacokinetics of OPC-61815, a potential treatment for patients with advanced solid tumors, in a Phase I clinical trial. The context information provided does not indicate that the study is specifically designed to monitor Drug-Drug Interactions. However, the study does mention the need to evaluate the likel

In [None]:
response_2 = sentence_pipeline.query("How many patients are planned to be taken for the trial?  ")
print(str(response_2))

According to the context information provided, up to a maximum of 10 subjects will be dosed in total for the trial. Therefore, the number of patients planned to be taken for the trial is up to 10.


In [None]:
response_2.source_nodes[0].node.metadata

{'window': 'Protocol 263- 102-00006  \n28 \nConfidential - Proprietary Information   Amendment 1 Approval: 1 4 November 2019 5 Trial Population  \nIt is planned for at least 8  healthy male Japanese subjects to be dosed to ensure that \n6 subjects complete the IV infusion.  Up to a maximum of 10 subjects will be dosed in \ntotal.   \n',
 'original_text': 'Protocol 263- 102-00006  \n28 \nConfidential - Proprietary Information   Amendment 1 Approval: 1 4 November 2019 5 Trial Population  \nIt is planned for at least 8  healthy male Japanese subjects to be dosed to ensure that \n6 subjects complete the IV infusion. ',
 'page_label': '28',
 'file_name': '263-102-00006_Protocol_Amendment_1_14Nov2019.pdf',
 'file_path': '/content/pdfs/263-102-00006_Protocol_Amendment_1_14Nov2019.pdf',
 'file_type': 'application/pdf',
 'file_size': 895973,
 'creation_date': '2023-12-06',
 'last_modified_date': '2023-12-06',
 'last_accessed_date': '2023-12-06'}

In [None]:
response_3 = sentence_pipeline.query("What is the age of participants mentioned in the text? ")
print(str(response_3))

The participants mentioned in the text are healthy male Japanese subjects between the ages of 35 and 55 years old.


In [None]:
response_full = sentence_pipeline.query("What are the demographics (age, gender, ethnicity etc) of the patients/subjects being taken for the trial?")
print(str(response_full))

The demographic information (collection date, year of birth, age, sex, race, ethnicity, and country) of the patients/subjects will be recorded in the eCRF at the screening visit, as stated in section 5.2 of the protocol. Therefore, the specific demographics of the patients/subjects being taken for the trial will be available in the eCRF data. However, the provided context information does not include prior knowledge of the specific demographics of the patients/subjects.


In [None]:
response_4 = sentence_pipeline.query("In an inpatient study, participants are admitted to a study site or are admitted to a clinic. In an inpatient study, the text also might mention subjects checking in and getting discharged. Is it an inpatient study? ")
print(str(response_4))

Based on the context information provided, it is unclear whether this is an inpatient study or an outpatient study. The text mentions "Final Discharge from trial" and "residential treatment period," which could suggest an inpatient component, but it also mentions "2 additional 24-hour nonresidential collections" and "outpatient visits," which could suggest an outpatient component. Without further information, it is not possible to determine whether this is an inpatient or outpatient study.


In [None]:
response_5 = sentence_pipeline.query("In an outpatient study, participants visit the study site or must visit a clinic or must visit a hospital. In an outpatient study, participants do not stay overnight. Is it an outpatient study?   ")
print(str(response_5))

Yes, based on the provided context information, the study described in the query is an outpatient study. The text states that "Up to 2 additional 24-hour nonresidential collections (urine and feces) for total radioactivity may occur if discharge criteria have not been met by Day 10. Subjects will collect excreta samples at home for the 24-hour period prior to the clinic visit and deliver them to the trial site at the end of the collection interval, within 24 hours." This indicates that some participants may need to make additional visits to the study site for non


## Evaluation

### Groundedness

In [37]:
from trulens_eval.feedback import Groundedness
from trulens_eval import Feedback, TruLlama
from trulens_eval.feedback.provider.hugs import Huggingface

huggingface_provider = Huggingface()

grounded = Groundedness(groundedness_provider=huggingface_provider)

def groundedness_measure_with_cot_reasons(source, statement):
  prompt_template = """<|system|>
  You are a INFORMATION OVERLAP classifier providing the overlap of information between a SOURCE and STATEMENT.
For every sentence in the statement, please answer with this template:

TEMPLATE:
Statement Sentence: <Sentence>,
Supporting Evidence: <Choose the exact unchanged sentences in the source that can answer the statement, if nothing matches, say NOTHING FOUND>
Score: <Output a number between 0-10 where 0 is no information overlap and 10 is all information is overlapping>
</s>
<|user|>
Give me the INFORMATION OVERLAP of this SOURCE and STATEMENT.

SOURCE: {source}

STATEMENT: {statement}</s>
<|assistant|>
"""
  groundedness_scores = {}
  plausible_junk_char_min = 4
  if len(statement) > plausible_junk_char_min:
    reason = pipe(prompt_template)[0]['generated_text']
  i = 0
  for line in reason.split('\n'):
    if "Score" in line:
      groundedness_scores[f"statement_{i}"] = re_0_10_rating(line) / 10
      i += 1
  return groundedness_scores, {"reason": reason}


groundedness = (
    Feedback(groundedness_measure_with_cot_reasons, name="Groundedness")
        .on(TruLlama.select_source_nodes().node.text)
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
)



✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


### QS Relevance With CoT reasons (Open Source Implementation)

In [38]:
import re

pat_0_10 = re.compile(r"\s*([0-9]+)\s*$")


def re_0_10_rating(str_val):
    matches = pat_0_10.fullmatch(str_val)
    if not matches:
        # Try soft match
        matches = re.search('([0-9]+)(?=\D*$)', str_val)
        if not matches:
            print(f"0-10 rating regex failed to match on: '{str_val}'")
            return -10  # so this will be reported as -1 after division by 10

    return int(matches.group())

def _extract_score_and_reasons_from_response(
    response,
    normalize = 10.0
):
  if "Supporting Evidence" in response:
    score = 0.0
    supporting_evidence = ""
    for line in response.split('\n'):
      if "Score" in line:
        score = re_0_10_rating(line) / normalize
      if "Criteria" in line:
        parts = line.split(":")
        if len(parts) > 1:
          criteria = ":".join(parts[1:]).strip()
      if "Supporting Evidence" in line:
        parts = line.split(":")
        if len(parts) > 1:
          supporting_evidence = ":".join(parts[1:]).strip()
    reasons = {
      'reason':
          (
            f"{'Criteria: ' + str(criteria) + ' ' if criteria else ''}\n"
            f"{'Supporting Evidence: ' + str(supporting_evidence) if supporting_evidence else ''}"
          )
    }
    return score, reasons
  else:
    return re_0_10_rating(response) / normalize



def relevance_with_cot_reasons(prompt, response):

  final_prompt = f"""You are a RELEVANCE grader; providing the relevance of the given RESPONSE to the given PROMPT.
Respond only as a number from 0 to 10 where 0 is the least relevant and 10 is the most relevant.

Please answer with this template:

TEMPLATE FORMAT:
Criteria: <The criteria for your evaluation>
Supporting Evidence: <Your reasons for your scoring.>
Score: <The score 0-10 based on the given criteria>

A few additional scoring guidelines:

- Long RESPONSES should score equally well as short RESPONSES.

- Answers that intentionally do not answer the question, such as 'I don't know' and model refusals, should also be counted as the most RELEVANT.

- RESPONSE must be relevant to the entire PROMPT to get a score of 10.

- RELEVANCE score should increase as the RESPONSE provides RELEVANT context to more parts of the PROMPT.

- RESPONSE that is RELEVANT to none of the PROMPT should get a score of 0.

- RESPONSE that is RELEVANT to some of the PROMPT should get as score of 2, 3, or 4. Higher score indicates more RELEVANCE.

- RESPONSE that is RELEVANT to most of the PROMPT should get a score between a 5, 6, 7 or 8. Higher score indicates more RELEVANCE.

- RESPONSE that is RELEVANT to the entire PROMPT should get a score of 9 or 10.

- RESPONSE that is RELEVANT and answers the entire PROMPT completely should get a score of 10.

- RESPONSE that confidently FALSE should get a score of 0.

- RESPONSE that is only seemingly RELEVANT should get a score of 0.

PROMPT: {prompt}

RESPONSE: {response}
  """
  response = pipe(prompt_template.format(query_str=final_prompt))[0]['generated_text']
  return _extract_score_and_reasons_from_response(response)


### Initialize all Evaluation Functions

In [39]:
import numpy as np

from trulens_eval import (
    Feedback,
    TruLlama
)

qa_relevance = (
    Feedback(relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

qs_relevance = (
    Feedback(relevance_with_cot_reasons, name = "Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)
feedbacks = [qa_relevance, qs_relevance, groundedness]



✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .


## Setup Eval Pipeline

In [41]:
from trulens_eval import Tru, TruLlama
tru = Tru()
tru.reset_database()


eval_questions = [

"Study arms, treatment groups and cohorts are used interchangeably. How many number of study arms are mentioned in this text"
,
"""Does this text mention Translational Medicine Research, which is a research approach that aims to 'translate' findings from fundamental research into medical practice and meaningful health outcomes? One of the examples is utilization of Neurocart. """
,
"""Does this trial involve Psychedelic Research, which is the study of the effects of psychedelic substances, like LSD and psilocybin, on the human brain and mental health? """
,
"""Does this trial mention Abuse Liability, which refers to the potential of a drug to be misused, leading to addiction or dependence? """
,
"""Is this a Basket trial, which is a clinical trial design where multiple subgroups (baskets) of patients, usually with different types of cancer, are tested with a single drug based on a common biomarker? """
,
"""Is this an Umbrella trial, a type of clinical trial that tests the impact of different drugs on different mutations in a single type of disease, usually cancer, in one 'umbrella' study? """
,
"""Is this trial conducted over Multiple Phases? """
,
"""Is this trial Placebo-controlled or an open trial? """
,
"""Is 'First in Human' mentioned in this text, which refers to the first time a new treatment or procedure is tested in humans? It is also sometimes abbreviated as FIH. """
,
"""Is it an oncology study? """
,
"""Is 'Single Ascending Dose' mentioned in this text, referring to a phase in clinical trials where the dosage is gradually increased to evaluate the body's reactions? It is sometimes abbreviated as SAD. """
,
"""Is 'Multiple Ascending Dose' mentioned in this text, which refers to a method in clinical trials where small groups of subjects receive multiple low doses of the drug, which are gradually increased? It is sometimes abbreviated as MAD. """
,
"""Is 'Thorough QTc' mentioned in this text, referring to a clinical trial design used to assess the impact of a drug on the heart's QT interval, which is a measure of the time between the start of the Q wave and the end of the T wave in the heart's electrical cycle? """
,
"""Is Hepatic impairment mentioned in this text, indicating a reduced liver function? """
,
"""Is Renal impairment mentioned in the text, indicating a reduced kidney function? """
,
"""Is the trial investigating the drug’s bioavailability? Bioavailability represents the extent and rate at which a drug is absorbed. It is sometimes abbreviated as BA. """
,
"""Is the trial investigating bioequivalence of drugs? Bioequivalence is the similarity in the rate and extent of drug absorption between two drug products, typically a generic and a brand-name drug. It is sometimes abbreviated as BE. """
,
"""Does the study title mention that it is about monitoring Drug-Drug Interactions? Drug drug Interactions occur when two or more drugs interact with each other in a way that affects their effectiveness or safety It is sometimes abbreviated as DDI."""
,
"""Is the trial monitoring Drug-Drug Interactions? Drug drug Interactions occur when two or more drugs interact with each other in a way that affects their effectiveness or safety It is sometimes abbreviated as DDI. """
,
"""Is mass balance being confirmed in the study? Mass Balance involves accounting for the total amount of a drug that enters and exits a biological system. It is sometimes abbreviated as MB. """
,
"""Is Food effect mentioned in this text? Food effect describes how the presence or absence of food in the stomach can affect the rate and extent to which a drug is absorbed into the bloodstream. It is sometimes abbreviated as FE. """
,
"""Patients, subjects, and participants are used interchangeably. Other synonyms are Enrollees, study volunteers, research recruits, cohort members, survey respondents. How many patients are planned for study according to the text?  """
,
"""Patient Availability(Inclusion/Exclusion criteria and competitive landscape) """
,
"""What is the age of participants mentioned in the text? """
,
"""Are Healthy adult participants a part of this trial? """
,
"""Is the study checking for the influence of the therapy on subjects of diverse ethnicities? """
,
"""Are any ethnicities mentioned in the text? """
,
"""Is competing trial mentioned in the text? Competing trial investigates a similar or related intervention as the trial in question. """
,
"""Is it mentioned in the text that the target population is uncommon? """
,
"""Is the “target is common” mentioned in the text? """
,
"""Is it a study related to orphan diseases mentioned in the text? Orphan diseases are rare diseases that affect a relatively small number of individuals. """
,
"""Is it a study related to a disease with a limited number of medications? """
,
"""Is paediatric population involved in the study? They are sometimes referred to as children or adolescents. """
,
"""Is molecular screening a criterion to select patients in the study? Molecular screening criteria refers to the specific genetic, molecular, or biomarker-based characteristics used to identify and select patients for participation in a clinical trial. """
,
"""Does the study involve inclusion criteria of highly selective eligibility?  """
,
"""Is it an open label enrolment study? It is a study where participants are aware of the treatment they are receiving and can enroll themselves directly into a particular treatment group or study arm without randomization by researchers. """
,
"""Is it an open label randomization study? It is a study where both the researchers and the participants are aware of the treatment they are receiving. The randomization process is still used to assign participants to different treatment groups, but everyone involved knows which treatment or intervention the participant is receiving. """
,
"""Is it a double blind randomization clinical study? """
,
"""How many treatment modalities are mentioned in the text of treatment section? Treatment modalities are various treatment methods used in the study. They could be medication, surgery, physical therapy, Psychological therapy, radiation therapy, Alternative Medicine, Behavioral interventions. """
,
"""How many total number of drugs are being assessed as Investigational Medicinal Product in this study? """
,
"""Count the Trues for below questions """
,
"""Does the study involve behavioral therapy as per the text of treatment section? """
,
"""Is dose administration mentioned in text of treatment section? Other synonyms of dose administration are IMP administration and study medication. """
,
"""Is chemotherapy mentioned in text of treatment section? """
,
"""Is surgery mentioned in text of treatment section? """
,
"""Is injection mentioned in text of treatment section? """
,
"""Is infusion mentioned in text of in treatment section? """
,
"""Is it mentioned in the text of treatment section that no dose adjustments are allowed? """
,
"""Are there multiple arms SOA tables? """
,
"""Are there multiple paths SOA tables? """
,
"""Is chemotherapy a part of the study? """
,
"""Is biologics a part of the study? Biologics are drugs produced using biological systems such as bacteria, yeast, or mammalian cells. Biologics are also referred to as biological therapeutics or biopharmaceuticals> """
,
"""Is rescue medication allowed to be used as mentioned in the text of dosing section? Rescue medications are medications that help in managing conditions that involve sudden symptom exacerbations by providing quick relief. Common rescue medications are Epipen, adrenaline, steroids, triptans, antihistamines etc.  """
,
"""Is the study related to a high risk toxicity profile? """
,
"""Is it a life threatening study? """
,
"""Is pharmacist mentioned in the text? """
,
"""Is clinical pharmacy mentioned in the text? """
,
"""Is multiple drug formulation mentioned in the text of dosing section? Multiple drug formulation means a medicinal product contains two or more active ingredients, or drugs, in a single dosage form. It is also sometimes referred to as a combination drug or fixed-dose combination (FDC). """
,
"""What is the treatment period mentioned in the study? """
,
"""A study can be inpatient, outpatient or a mix of both. In an inpatient study, participants are admitted to a study site. In an outpatient study, participants visit the study site but do not stay overnight. What type of study is this?  """
,
"""Are the lab samples sent to one central lab or a different location? Lab samples are sometimes referred to as biomarkers, FBR, Fasting Blood referrals or bodily fluid samples. """
,
"""Do we require complex frozen packaging for shipment of the lab samples? Complex frozen packaging can involve dry ice, liquid nitrogen, refrigeration, freezer boxes, vacuum insulated dry shipper containers, thermoformed packaging, insulated shipping kits or any kind of customized packagings. """
,
"""Is Data Monitoring Committee mentioned in the text?It is sometimes abbreviated as DMC or IDMC. """
,
"""Is Cohort Safety Review mentioned in the text? """
,
"""Is Dose Escalation Review Team a part of the study? They are sometimes referred to as DERT. """
,
"""Is Data Safety Monitoring Board a part of the study? They are sometimes referred to as DSMB. """
,
"""Events requiring adjudication refer to specific events that occur during a clinical trial and need an independent and systematic review or evaluation by a panel of experts or an adjudication committee. Does the study mention about any events requiring adjudication? """
,
"""Is the data collection happening at a lab for this study? """
,
"""Are Adverse Events of Special Interest mentioned in the study? They are sometimes abbreviated as AESI or AEs of Special Interest. """
,
"""Alanine aminotransferase is sometimes abbreviated as ALT. Aspartate aminotransferase is sometimes abbreviated as AST. Upper Limit of Normal is sometimes abbreviated as ULN. Is elevation of alanine aminotransferase or aspartate aminotransferase being compared with Upper limit of normal in the study? """
,
"""Upper Limit of Normal is sometimes abbreviated as ULN. Is total bilirubin level being compared with upper limit of normal in the study? """
,
"""Are there sections mentioning Interim?  """
,
"""Are there sections mentioning IA? """
,
"""How many sites are planned for the study? """
,
"""How many countries are planned for the study? """
,
"""Is a Non-USA country involved in the study? """
,
"""How many protocol amendments were made according to the text? """
,
"""Are there any country specific amendments made to the protocol? """
]

def get_prebuilt_trulens_recorder(query_engine, feedbacks, app_id):
    tru_recorder = TruLlama(
        query_engine,
        app_id=app_id,
        feedbacks=feedbacks
        )
    return tru_recorder

tru_recorder = get_prebuilt_trulens_recorder(sentence_pipeline, feedbacks,
                                             app_id="Direct Query Engine")

In [42]:
from trulens_eval import Tru
tru = Tru()
records , feedback = tru.get_records_and_feedback(app_ids = [])

In [25]:
records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,Answer Relevance,Context Relevance,Answer Relevance_calls,Groundedness_calls,Context Relevance_calls,latency,total_tokens,total_cost
0,Direct Query Engine,"{""app_id"": ""Direct Query Engine"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_8d67ecd687443344f406500c301a2c81,"""How many number of Site(s) will this trial ta...","""Based on the provided context information, th...",-,"{""record_id"": ""record_hash_8d67ecd687443344f40...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-06T12:35:34.816441"", ""...",2023-12-06T12:35:41.606703,1.0,1.0,[{'args': {'prompt': 'How many number of Site(...,[],[{'args': {'prompt': 'How many number of Site(...,6,0,0.0
1,Direct Query Engine,"{""app_id"": ""Direct Query Engine"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_cfa2c16b55027c7c6f4bec3033070d29,"""How many patients are planned to be taken for...","""Up to a maximum of 10 subjects will be dosed ...",-,"{""record_id"": ""record_hash_cfa2c16b55027c7c6f4...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-06T12:35:42.017344"", ""...",2023-12-06T12:36:03.155157,1.0,1.0,[{'args': {'prompt': 'How many patients are pl...,[],[{'args': {'prompt': 'How many patients are pl...,21,0,0.0
2,Direct Query Engine,"{""app_id"": ""Direct Query Engine"", ""tags"": ""-"",...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_1f71b1c53f34286454585afca3100538,"""What is the age of participants mentioned in ...","""The participants mentioned in the text are he...",-,"{""record_id"": ""record_hash_1f71b1c53f342864545...","{""n_requests"": 0, ""n_successful_requests"": 0, ...","{""start_time"": ""2023-12-06T12:36:03.608728"", ""...",2023-12-06T12:36:18.033568,1.0,1.0,[{'args': {'prompt': 'What is the age of parti...,[],[{'args': {'prompt': 'What is the age of parti...,14,0,0.0


In [None]:
with tru_recorder as recording:
    for question in eval_questions:
        response = sentence_pipeline.query(question)

records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()






0-10 rating regex failed to match on: 'Score: N/A (not applicable)'


In [26]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Context Relevance,Answer Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Direct Query Engine,1.0,1.0,13.666667,0.0


In [27]:
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Submit this IP Address: 35.223.170.144



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>