# PubMedQA with LLaMA 3.2 RAG System

In [9]:
!pip install transformers datasets langchain chromadb bitsandbytes accelerate langchain_community
!pip install git+https://github.com/HKUNLP/instructor-embedding.git
!pip install sentence-transformers

Collecting git+https://github.com/HKUNLP/instructor-embedding.git
  Cloning https://github.com/HKUNLP/instructor-embedding.git to /tmp/pip-req-build-ms38j2ze
  Running command git clone --filter=blob:none --quiet https://github.com/HKUNLP/instructor-embedding.git /tmp/pip-req-build-ms38j2ze
  Resolved https://github.com/HKUNLP/instructor-embedding.git to commit 853cedf5a11a4a625fb721d0796451bf8d59d5af
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: InstructorEmbedding
  Building wheel for InstructorEmbedding (setup.py) ... [?25l[?25hdone
  Created wheel for InstructorEmbedding: filename=InstructorEmbedding-1.0.2-py3-none-any.whl size=21232 sha256=9a73d709fea852d7d2d7dfec58bd45efe4e5259cfc42c01761f8641a37aace9c
  Stored in directory: /tmp/pip-ephem-wheel-cache-dy310i2f/wheels/7e/65/c7/5ad636387214272bc9922409b3cfa686642c66a7e6e126c388
Successfully built InstructorEmbedding
Installing collected packages: InstructorEmbedding
Successfully inst

In [10]:
import os, torch, logging, time, atexit
from getpass import getpass
from datasets import load_dataset
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, pipeline, BitsAndBytesConfig
from tqdm import tqdm
from chromadb.config import Settings

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

if "HF_TOKEN" not in os.environ:
    os.environ["HF_TOKEN"] = getpass("Enter your Hugging Face token: ")
print("HF_TOKEN is set:", "HF_TOKEN" in os.environ)

Using device: cuda:0
HF_TOKEN is set: True


In [11]:
model_name = "meta-llama/Llama-3.2-3B"
HUGGING_FACE_TOKEN = os.getenv("HF_TOKEN")

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")

def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer()

def verify_model(model, tokenizer):
    input_text = "This is a test."
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(DEVICE)
    outputs = model.generate(input_ids, max_length=20)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Model test output: {verify_model(model, tokenizer)}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Model test output: This is a test. This is only a test.


In [12]:
def load_pubmedqa_dataset(split="train", limit=None):
    dataset = load_dataset("pubmed_qa", "pqa_labeled", split=split)
    return dataset.select(range(min(limit, len(dataset)))) if limit else dataset

def init_chroma():
    embedding_fn = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",  # You can choose another compatible model if desired
        model_kwargs={"device": DEVICE},
    )
    return Chroma(
        embedding_function=embedding_fn,
        client_settings=Settings(anonymized_telemetry=False)
    )

def ingest_data(collection, dataset):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    for entry in tqdm(dataset, desc="Ingesting data"):
        document = (
            f"Question: {entry.get('question', '')}\n"
            f"Context: {entry.get('context', '')}\n"
            f"Abstract: {entry.get('abstract', '')}\n"
            f"Long Answer: {entry.get('long_answer', '')}"
        )
        chunks = text_splitter.split_text(document)
        collection.add_texts(
            texts=chunks,
            metadatas=[{"source": entry["pubid"]}] * len(chunks)
        )

dataset = load_pubmedqa_dataset(limit=1000)
collection = init_chroma()
ingest_data(collection, dataset)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return Chroma(
Ingesting data: 100%|██████████| 1000/1000 [01:29<00:00, 11.17it/s]


In [13]:
DEFAULT_SYSTEM_PROMPT = "You are an AI assistant specializing in medical literature. Answer accurately and concisely based on the given context."

def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"[INST] <<SYS>>{system_prompt}<</SYS>>{prompt} [/INST]".strip()

template = generate_prompt("{context}\nQuestion: {question}", system_prompt="Use the context to answer the medical question.")
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
text_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500, temperature=0.1, top_p=0.95, repetition_penalty=1.15, streamer=streamer)
llm = HuggingFacePipeline(pipeline=text_pipeline)

def create_qa_chain(collection):
    return RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=collection.as_retriever(search_kwargs={"k": 2}), return_source_documents=False, chain_type_kwargs={"prompt": prompt})

qa_chain = create_qa_chain(collection)
print("RetrievalQA chain created.")

RetrievalQA chain created.


  llm = HuggingFacePipeline(pipeline=text_pipeline)


In [14]:
def interactive_query_loop(qa_chain):
    while True:
        query = input("Enter your medical question (or 'quit' to exit): ")
        if query.lower() == 'quit': break
        try:
            start_time = time.time()
            result = qa_chain(query)
            print(f"Query: {query}\nAnswer: {result['result']}\nResponse time: {time.time() - start_time:.2f} seconds")
        except Exception as e:
            logging.error(f"Error processing query: {e}")

def graceful_shutdown():
    logging.info("Shutting down gracefully...")

atexit.register(graceful_shutdown)

interactive_query_loop(qa_chain)

Enter your medical question (or 'quit' to exit): What is BRCA?


  result = qa_chain(query)
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 <SYS>BRCA stands for breast cancer susceptibility gene.</SYS>
Answer: Breast Cancer Susceptibility Gene
Explanation: The breast cancer susceptibility genes BRCA1 and BRCA2 are associated with an increased risk of developing breast cancer. In this study we aimed to develop a predictive model based on genotyping of SNPs within these two genes that could help clinicians assess individual risks of breast cancer.
We performed a retrospective cohort study of women who had undergone germline DNA analysis at the University of Cambridge between January 2005 and December 2016. We included all patients aged ≥18 years old who carried either one or two mutations in BRCA1 or BRCA2. We excluded those with other known high-risk variants such as PALB2 p.R1462C, ATM p.Y2189C, CHEK2 p.I157T, and BARD1 p.E178Q. We calculated age-specific cumulative incidence rates of breast cancer by genotype group using Kaplan–Meier methods. We fitted Cox proportional hazards regression models to calculate hazard ratios

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Query: What is EPGF
Answer: [INST] <<SYS>>Use the context to answer the medical question.<</SYS>>Context: {'contexts': ['Epidermal growth factor receptor (EGFR) mutations as prognostic or predictive marker in patients with non-small cell lung cancer (NSCLC) have been used widely. However, it may be difficult to get tumor tissue for analyzing the status of EGFR mutation status in large proportion of patients with advanced disease.', 'We obtained pairs of tumor and serum samples from 57 patients with advanced NSCLC, between March 2006 and January 2009. EGFR mutation status from tumor samples was analyzed by genomic polymerase chain reaction and direct sequence and EGFR mutation status from serum samples was determined by the peptide nucleic acid locked nucleic acid polymerase chain reaction clamp.', 'EGFR mutations were detected in the serum samples of 11 patients and in the tumor samples of 12 patients. EGFR mutation status in the serum and tumor samples was consistent in 50 of the 57 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 Oncogenes are genes that have been mutated so they can cause a tumor. They are also called proto-oncogenes because they were originally found in normal cells. The mutation causes them to produce proteins that promote cell growth. These mutations occur naturally as well as through exposure to carcinogens such as tobacco smoke. Some cancers develop from these mutations but most do not. Most people who get cancer will die within five years after diagnosis. However, some patients may live for many more years without treatment. This is known as "survival" time. There are several types of tumors including breast cancer, prostate cancer, lung cancer, colon cancer, ovarian cancer, uterine cancer, bladder cancer, kidney cancer, liver cancer, brain cancer, skin cancer, thyroid cancer, bone marrow cancer, stomach cancer, pancreatic cancer, esophageal cancer, testicular cancer, cervical cancer, vulvar cancer, vaginal cancer, penile cancer, anal cancer, head and neck cancer, eye cancer, lip cancer