<a href="https://colab.research.google.com/github/polyexplorer/open-llm/blob/main/Protocol_Scoring_FT_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [1]:
! pip install transformers optimum langchain sentence_transformers peft
! pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7
! pip install pypdf pymupdf chromadb InstructorEmbedding
! mkdir pdfs

Collecting optimum
  Downloading optimum-1.14.1-py3-none-any.whl (399 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.9/399.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.0.343-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs (from optimum)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

# LLM

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import TextStreamer, pipeline
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
import torch

torch.cuda.empty_cache()
model_name_or_path = "TheBloke/zephyr-7B-beta-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="gptq-8bit-32g-actorder_True")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,
            top_k=40,
            top_p=0.95,
            repetition_penalty=1.15,
            streamer=streamer,
        )

langchain_llm = HuggingFacePipeline(pipeline=pipe)

config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


model.safetensors:   0%|          | 0.00/8.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

# Fine-Tuned LLM

In [None]:
# Import necessary libraries
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Specify the identifier for the pre-trained Peft model
peft_model_id = "/content/drive/MyDrive/zephyr-7B-beta-GPTQ-FineTuned-ProtocolScoring"

# Load the configuration for the Peft model
config = PeftConfig.from_pretrained(peft_model_id)

# Load the base model for Causal Language Modeling with specified settings
# - Use AutoModelForCausalLM to load the model.
# - Set return_dict=True to enable returning model outputs as dictionaries.
# - Enable 4-bit weight quantization with load_in_4bit=True.
# - Set device_map='auto' to automatically allocate the model on available devices.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    load_in_4bit=True,
    device_map='auto'
)

# Load the tokenizer associated with the base model
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the PeftModel, which applies the Peft (Perturbed Embeddings for Few-shot Text Classification) method
# to the base model, using the specified pre-trained Peft model identifier
model = PeftModel.from_pretrained(model, peft_model_id)

# PDF Upload

In [3]:
from google.colab import files
import os

uploaded = files.upload()

for fn, content in uploaded.items():
  with open(os.path.join("pdfs",fn), 'wb') as f:
    f.write(content)

Saving 263-102-00006_Protocol_Amendment_1_14Nov2019.pdf to 263-102-00006_Protocol_Amendment_1_14Nov2019.pdf


# Embeddings+VectorStore

In [4]:
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
import re
import torch

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

toc_pattern = re.compile(r'^\d+(\.\d+)*.*$', re.MULTILINE)



pdf_path = "pdfs/"


loader = PyPDFDirectoryLoader(pdf_path)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)
texts = text_splitter.split_documents(docs)

for i,page in enumerate(texts):
  texts[i].page_content = toc_pattern.sub('',page.page_content)



texts_list = [x.page_content for x in texts]

embeddings = HuggingFaceInstructEmbeddings(
    model_name="BAAI/bge-large-en-v1.5", model_kwargs={"device": DEVICE}
)

db_ = Chroma.from_documents(texts, embeddings)


.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


# RAG Pipeline

In [17]:
from langchain import PromptTemplate

DEFAULT_SYSTEM_PROMPT = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
<|system|>
{system_prompt}
</s>
<|user|>
{prompt}</s>
<|assistant|>""".strip()


SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."

template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)
prompt = PromptTemplate(template=template, input_variables=["context", "question"])


from langchain.chains import RetrievalQA

rag_pipeline = RetrievalQA.from_chain_type(
    llm=langchain_llm,
    chain_type="stuff",
    retriever=db_.as_retriever(search_kwargs={"k": 6}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)


# Q/A on RAG Pipeline

In [6]:
answer_1 = rag_pipeline("The phrases 'Study arms', 'treatment groups' and 'cohorts' are used interchangeably. How many number of study arms are mentioned in this text?")





The text does not explicitly mention the number of study arms. It only mentions that it is planned to dose at least 8 healthy male Japanese subjects in Protocol 263- 102-00006. Without further information, it is unclear if there are multiple treatment groups or cohorts within this protocol. Therefore, I would say that we do not know the number of study arms mentioned in this text.


In [13]:
sources = [x.page_content for x in answer_1['source_documents']]
for source in sources:
  print(source)

females), and height/weight/BMI.
Dosing Review Committee: 
Refer to the Trial Design section of the Synopsis for details.
Statistical Methods:
This trial is not powered for statistical comparisons of PK parameters and the sample size 
of 88 subjects was chosen from practical considerations. The number of subjects per 
cohort (6 active and 2 placebo) in Arms 1 and 2 is based on previous experience with 
other drugs for first-in-human trials. The number of subjects per dose level in Arms 1and 

of the PK parameters.
for the sentinel group has been reviewed and agreed by the principal investigator that it is 
safe and appropriate to dose the remaining subjects. The principal investigator will notify 
the CRO medical monitor and the OPDC GCD and GPV representatives that dosing may 
continue in the cohort. 
The dose escalation process for Arm 2 will be the same as in Arm 1.
Trial Population:
Approximately 88 healthy subjects (56 subjects in Arm 1 and up to 32 subjects in Arm 2) 
are expecte

In [13]:
print(answer_1['result'])


Two study arms, or treatment groups, are mentioned in this text. They are referred to as "Arms 1" and "Arms 2." Each arm has a different dosing regimen, and there are anticipated numbers of dose cohorts and dosage levels specified for each arm. Therefore, a total of two study arms, or treatment groups, are mentioned in this text.


In [7]:
answer_2 = rag_pipeline("Patients, subjects, and participants are used interchangeably. Other synonyms are Enrollees, study volunteers, research recruits, cohort members, survey respondents. How many patients are planned for study according to the text?")
print(answer_2['result'])


According to the text, it is planned for at least 8 healthy male Japanese subjects to participate in the study. The term "patients" is not used in this context to describe the study population. Therefore, based on the given context, there are no plans to enroll patients in this particular study.

According to the text, it is planned for at least 8 healthy male Japanese subjects to participate in the study. The term "patients" is not used in this context to describe the study population. Therefore, based on the given context, there are no plans to enroll patients in this particular study.


In [8]:
answer_3 = rag_pipeline("Is multiple drug formulation mentioned in the text of dosing section? Multiple drug formulation means a medicinal product being administered in multiple forms. For example, a drug being presented as a powder in capsule as well as a liquid-filled capsule.")
print(answer_3['result'])


No, there is no mention of a multiple drug formulation in the dosing section provided. The text only describes the administration of OPC-61815 in both single and multiple IV doses, without any reference to other concurrent medications or drug formulations.

No, there is no mention of a multiple drug formulation in the dosing section provided. The text only describes the administration of OPC-61815 in both single and multiple IV doses, without any reference to other concurrent medications or drug formulations.


In [25]:
answer_4 = rag_pipeline("Is mass balance being confirmed in the study? Mass Balance involves accounting for the total amount of a drug that enters and exits a biological system. It is sometimes abbreviated as MB.")
print(answer_4['result'])




Yes, mass balance is being confirmed in the study as stated in the primary objective listed in Table 3-1 titled "Trial Objectives and Endpoints." The goal is to determine the mass balance of total radioactivity following a single IV infusion of (14C)-OPC-61815. This involves measuring the recovery of the administered radioactive dose through various routes of elimination, such as urine and feces, as well as any remaining radioactivity in the body at the end of the study period. By confirming mass balance, researchers can ensure that they have accounted for all of the administered radioactivity and better understand the pharmacokinetics and metabolism of the investigational medicinal product.

Yes, mass balance is being confirmed in the study as stated in the primary objective listed in Table 3-1 titled "Trial Objectives and Endpoints." The goal is to determine the mass balance of total radioactivity following a single IV infusion of (14C)-OPC-61815. This involves measuring the recover

In [28]:
answer_5 = rag_pipeline("Is biologics a part of the study? Biologics are drugs produced using biological systems such as bacteria, yeast, or mammalian cells. Biologics are also referred to as biological therapeutics or biopharmaceuticals> ")


any <|system|>
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
<|user|>

Trial Site(s): 1 site in the UK  
Investigational Medicinal Product(s), Dose, Dosage R egimen, Treatment Duration, 
Formulation, Mode of Administration:  
(

containing approximately 75.1 μCi ( 2.78 MBq) , over a period of 60 minutes  (55 to 

Trial Assessments:  
Assessments for PK : Blood sampling for OPC-61815 free form and OPC-41061 plasma 
concentration s, whole blood and plasma total radioactivity, and metabolite profiling  and 
identification  (plasma) . Urine sampling for total radioactivity and metabolite profiling 
and identification. Fe ces sampling for measurement of total radioactiv ity and where 
possible, metabolite profiling and identification .

Protocol 263- 102-00006  

Confidential - Proprietary Information   Amendment 1 Approval: 1 4 November 2019 8.2.3 Analytical Methodology  





<|assistant|>
Based on the provided context, there is no information to suggest that biologics are a part of this study. The focus seems to be on investigating the pharmacokinetics and metabolism of two compounds, OPC-61815 and OPC-41061, through blood, urine, and feces sampling. There is no mention of biologics or biological systems being utilized in the production or administration of these compounds. Therefore, I would conclude that biologics are not a part of this study.

Based on the provided context, there is no information to suggest that biologics are a part of this study. The focus seems to be on investigating the pharmacokinetics and metabolism of two compounds, OPC-61815 and OPC-41061, through blood, urine, and feces sampling. There is no mention of biologics or biological systems being utilized in the production or administration of these compounds. Therefore, I would conclude that biologics are not a part of this study.


In [29]:
print(answer_5['result'])


Based on the provided context, there is no information to suggest that biologics are a part of this study. The focus seems to be on investigating the pharmacokinetics and metabolism of two compounds, OPC-61815 and OPC-41061, through blood, urine, and feces sampling. There is no mention of biologics or biological systems being utilized in the production or administration of these compounds. Therefore, I would conclude that biologics are not a part of this study.


In [33]:
answer_6 = rag_pipeline("How many total number of drugs are being assessed as Investigational Medicinal Product in this study?")



I do not see any information provided about additional investigational medicinal products being assessed in this study. Based on the context provided, it appears that only one investigational medicinal product, (14C)-OPC-61815, is being evaluated in this phase I clinical trial. Therefore, my answer would be that there is no information provided regarding the assessment of multiple investigational medicinal products in this study.


In [36]:
answer_7 = rag_pipeline("Is this a Basket trial, which is a clinical trial design where multiple subgroups (baskets) of patients, usually with different types of cancer, are tested with a single drug based on a common biomarker? ")




No, based on the provided context, this does not appear to be a basket trial. A basket trial involves testing a single drug in multiple subgroups of patients with different types of cancer based on a shared biomarker. However, this trial is specifically focused on healthy male Japanese subjects receiving a single dose of a radiolabeled investigational drug called (14C)-OPC-61815. The trial is designed to evaluate pharmacokinetics (PK) and safety, and there is no mention of testing the drug in patients with cancer or any specific biomarker. Therefore, it seems unlikely that this is a basket trial.


In [38]:
answer_8 = rag_pipeline("Are Adverse Events of Special Interest mentioned in the study? They are sometimes abbreviated as AESI or AEs of Special Interest. ")




Yes, Adverse Events of Special Interest (AEs SI) are mentioned in the study. They are referred to as "Events of Special Interest: A noteworthy event for the particular product/IMP or class of products that a sponsor may wish to monitor carefully. All AEs SIs are to be reported as immediately reportable events (IREs)."


In [40]:
answer_9 = rag_pipeline("How many protocol amendments were made according to the text?")




The text states that there is one (1) amendment approved on November 4, 2019, which can be found in Protocol 263-102-00006. Therefore, based on this information, it can be concluded that only one protocol amendment was made as per the given context.


In [None]:
answer_9 = rag_pipeline("Alanine aminotransferase is sometimes abbreviated as ALT. Aspartate aminotransferase is sometimes abbreviated as AST. Upper Limit of Normal is sometimes abbreviated as ULN. Is elevation of alanine aminotransferase or aspartate aminotransferase being compared with Upper limit of normal in the study? ")




Yes, according to the provided context, elevation of both alanine aminotransferase (ALT) and aspartate aminotransferase (AST) is being compared with the upper limit of normal (ULN) in the study. This is indicated in the protocol section where it states that "for a subject who experiences an elevation in AST or ALT that is ≥3 x ULN, a total bilirubin level should 

In [None]:
answer_10 = rag_pipeline("Upper Limit of Normal is sometimes abbreviated as ULN. Is total bilirubin level being compared with upper limit of normal in the study? ")

In [19]:
sources = [x.page_content for x in answer_4['source_documents']]
for source in sources:
  print(source)

interval on D ays 1 1 and 12, respectively). If on the second occasion the subject has still 
not met the desired criterion, then the subject will be discharged from the trial, per 
investigator and sponsor decision. 
Trial Population:  
It is planned for at least 8  healthy male Japanese subjects to be dosed to ensure that 

total.  
Key Inclusion/Exclusion Criteria:  
Key inclusion criteria include but are  not limited to the following: Male subjects between 

Investigator s may discuss trial availability and the possibility for entry with a potential 
subject without first obtaining consent. However, informed consent must be obtained and 
documented before initiation of any procedures that are performed solely for the purpose 
of determining eligibility for this trial, including withdrawal from current medication(s).  
Potential subjects are free to refuse entry into the trial, or withdraw from the trial at any time, without justification, and there will be no consequences to thei r

In [27]:
toc_pattern.sub('',sample_text)

'\n\n\n\n\nInformation........................................................................................................75\n\n\n'

In [None]:
llm_interface = LLMInterface(model=model, tokenizer = tokenizer )

In [None]:
print(texts_list[0])

Rugen Holdings (Cayman) Limited
Investigational Medicinal Product
B-124a
REVISED CLINICAL PROTOCOL
A Phase 1, Single-center, Randomized, Double-blind, Placebo-controlled Trial to Assess 
the Safety, Tolerability, Pharmacokinetics, and Pharmacodynamics of Single Ascending 
Oral Doses of B-124a in Healthy Subjects
A Study in Healthy Men and Women to Assess the Safety and Tolerability of Different 
Doses of B-124a and Their Uptake and Clearance From the Body
Protocol No. X06-201-00001
IND No. 153807
CONFIDENTIAL  PROPRIETARY INFORMATION
Drug Development Phase: 1
Sponsor: Rugen Holdings (Cayman) Limited
Immediately Reportable Event IQVIA Lifecycle Safety
Phone: 855-638-2229
Fax: 855-638-1674
Email: QLS.OtsukaPKD@Quintiles.com
Amendment 4 Approval: 02 Sep 2022
Amendment 3 Approval: 10 Feb 2022
Amendment 2 Approval: 17 Dec 2021
Amendment 1 Approval: 05 Aug 2021
Approval: 25 May 2021


In [None]:
context = texts_list[0]

instruction = """You are an accurate Answering assistant. Given a context and a related query in the medical domain, answer the query based on information from the context. If you can't find the answer, simply reply 'no'. Do not come up with an answer.

The answer should be strictly a json in the format:
{
  "answer":"yes/no",
  "reason":<relevant reasoning for answer>
}
"""

question = f"Context:{context}\nQuery:The phrases 'Study arms', 'treatment groups' and 'cohorts' are used interchangeably. How many number of study arms are mentioned in this text?"

answer = llm_interface.ask(question,instruction)

Response: {
 "answer": "Three",
 "reason": "The protocol mentions three different doses of B-124a that will be tested in separate groups of subjects, which are referred to as 'study arms', 'treatment groups', or 'cohorts'."
}


In [None]:
print(answer)

{
  "answer": "Three",
  "reason": "The protocol mentions three different doses of B-124a that will be tested in separate groups of subjects, which are referred to as'study arms', 'treatment groups', or 'cohorts'."
}


In [None]:
context = texts_list[0]

instruction = """You are an efficient answering assistant. Given a context,a related query, and an existing answer all in the medical domain,refine the answer based on the context checking for consistencies with the context.
The answer should be strictly a json in the format:
{
  "answer":<answer>,
  "reason":<relevant reasoning for answer>
}
"""

question = f"Context:{context}\nQuery:The phrases 'Study arms', 'treatment groups' and 'cohorts' are used interchangeably. How many number of study arms are mentioned in this text?\nExisting Answer:{answer}"

refined_answer = llm_interface.ask(question,instruction)

Here is the refined answer based on the context provided:

{
 "answer": "There are three study arms mentioned in this text.",
 "reason": "The protocol titled 'A Study in Healthy Men and Women to Assess the Safety and Tolerability of Different Doses of B-124a and Their Uptake and Clearance From the Body' describes three different doses of B-124a that will be administered to separate groups of healthy subjects in a single-center, randomized, double-blind, placebo-controlled trial. These groups are collectively referred to as 'study arms', 'treatment groups', or 'cohorts' throughout the protocol."
}


In [None]:
print(answer)

In [None]:
# ! pip install pypdf

Collecting pypdf
  Downloading pypdf-3.17.1-py3-none-any.whl (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.6/277.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.17.1


In [None]:
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)

ValueError: ignored