### Data Ingestion

In [1]:
# convert the pdf files into documents
from langchain_community.document_loaders import DirectoryLoader, PDFPlumberLoader
# Load every file under ../data/pdf that matches the glob, using PDFPlumberLoader.
loader = DirectoryLoader("../data/pdf", glob="**/*.pdf", loader_cls=PDFPlumberLoader)
documents = loader.load()

In [2]:
# Number of Document objects returned by loader.load().
len(documents)

2

In [3]:
# Inspect the raw loaded documents (page_content still contains the PDF's line breaks).
documents

[Document(metadata={'source': '..\\data\\pdf\\medical_text.pdf', 'file_path': '..\\data\\pdf\\medical_text.pdf', 'page': 0, 'total_pages': 2, 'Producer': 'pdfmake', 'Creator': 'pdfmake', 'CreationDate': 'D:20251105170459Z'}, page_content="Example 1: Abdominal Pain\nThe location of the pain is a critical pivotal point for framing the differential diagnosis:\nPain Location Possible Diagnoses\nRight Upper Quadrant Biliary disease (colic, cholecystitis), hepatitis, pancreatitis.\n(RUQ)\nEpigastrium Peptic ulcer disease, pancreatitis, biliary disease.\nDiffuse/Periumbilical Appendicitis (early), bowel obstruction, irritable bowel syndrome,\nmesenteric ischemia.\nRight Lower Quadrant Appendicitis, Crohn's disease, ovarian pathology (in women).\n(RLQ)\nLeft Lower Quadrant Diverticulitis, ovarian pathology (in women).\n(LLQ)\nExample 2: Headache\nThe pattern and quality of the headache help distinguish between primary headache disorders\nand more serious secondary causes.\nHeadache Type Possib

In [4]:
# preprocessing the text to remove \n
import re
def clean_text(text: str) -> str:
    """
    Normalize text extracted from a PDF page.

    - Re-joins words that were hyphenated across a line break or left with a
      dangling "hyphen + space" (e.g. 'pri-\\nmarily' / 'pri- marily' -> 'primarily').
    - Replaces single line breaks (soft wraps inside a paragraph) with a space.
    - Collapses runs of blank lines into exactly one paragraph break ('\\n\\n').
    - Collapses repeated spaces/tabs and trims the result.

    Args:
        text: Raw page text.

    Returns:
        The cleaned text. Paragraph breaks are preserved as a single blank line;
        text without blank lines comes back as one line.
    """
    # Normalize Windows / old-Mac line endings so the rules below only see '\n'.
    text = re.sub(r'\r\n?', '\n', text)

    # Fix hyphenated line breaks. This has to run while the newlines still
    # exist, otherwise the pattern can never match.
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

    # Two or more newlines (optionally separated by spaces/tabs) mark a
    # paragraph boundary: collapse them to exactly one blank line.
    text = re.sub(r'\n(?:[^\S\n]*\n)+', '\n\n', text)

    # A lone newline is a soft wrap inside a paragraph: turn it into a space.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Collapse runs of horizontal whitespace without touching paragraph breaks.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)

    # Drop stray spaces/tabs hugging a paragraph break.
    text = re.sub(r'[^\S\n]*\n\n[^\S\n]*', '\n\n', text)

    # Remove hyphen + space (e.g., 'pri- marily' → 'primarily')
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)

    return text.strip()



In [5]:
# Clean every loaded document in place: page_content is overwritten with the
# output of clean_text, so the raw text is no longer available after this cell.
for doc in documents:
  # pass into the cleaning function
  doc.page_content = clean_text(doc.page_content)

In [6]:
# Inspect the documents again after cleaning.
documents

[Document(metadata={'source': '..\\data\\pdf\\medical_text.pdf', 'file_path': '..\\data\\pdf\\medical_text.pdf', 'page': 0, 'total_pages': 2, 'Producer': 'pdfmake', 'Creator': 'pdfmake', 'CreationDate': 'D:20251105170459Z'}, page_content="Example 1: Abdominal Pain The location of the pain is a critical pivotal point for framing the differential diagnosis: Pain Location Possible Diagnoses Right Upper Quadrant Biliary disease (colic, cholecystitis), hepatitis, pancreatitis. (RUQ) Epigastrium Peptic ulcer disease, pancreatitis, biliary disease. Diffuse/Periumbilical Appendicitis (early), bowel obstruction, irritable bowel syndrome, mesenteric ischemia. Right Lower Quadrant Appendicitis, Crohn's disease, ovarian pathology (in women). (RLQ) Left Lower Quadrant Diverticulitis, ovarian pathology (in women). (LLQ) Example 2: Headache The pattern and quality of the headache help distinguish between primary headache disorders and more serious secondary causes. Headache Type Possible Diagnoses Pr

### Chunking

In [7]:
import torch
# Environment check: torch version and whether a CUDA device is usable.
print(torch.__version__)
cuda_available = torch.cuda.is_available()
print(cuda_available)
# get_device_name(0) raises when no CUDA device is present, so only query it
# when CUDA is actually available (keeps the notebook runnable on CPU-only hosts).
print(torch.cuda.get_device_name(0) if cuda_available else "No CUDA device available; running on CPU")


2.5.1+cu121
True
NVIDIA GeForce RTX 4060 Laptop GPU


In [8]:
import os
from langchain_experimental.text_splitter import SemanticChunker
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings

In [10]:
def build_embedding_model(model_name="NeuML/pubmedbert-base-embeddings", device=None):
    """
    Create the HuggingFace embedding model used for chunking and retrieval.

    Args:
        model_name: Model identifier passed to HuggingFaceEmbeddings.
            Defaults to the PubMedBERT embedding model used by this notebook.
        device: Device string handed to the model ("cuda", "cpu", ...).
            When None, "cuda" is used if torch reports it as available,
            otherwise "cpu".

    Returns:
        A HuggingFaceEmbeddings instance configured to emit normalized embeddings.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": True},  # important for cosine
    )

def split_documents(
    documents,
    embedding_model=None,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=60,
):
    """
    Split documents into chunks with SemanticChunker.

    Args:
        documents: Documents to split.
        embedding_model: Embedding model handed to SemanticChunker. When None,
            one is created with build_embedding_model().
        breakpoint_threshold_type: Forwarded to SemanticChunker.
        breakpoint_threshold_amount: Forwarded to SemanticChunker.

    Returns:
        The list of chunk documents. Each chunk's metadata gets a "chunk_id"
        key holding its position in the returned list.
    """
    # Nothing to split: skip building (and loading) the embedding model.
    if not documents:
        return []

    if embedding_model is None:
        embedding_model = build_embedding_model()

    text_splitter = SemanticChunker(
        embeddings=embedding_model,
        breakpoint_threshold_type=breakpoint_threshold_type,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )

    chunks = text_splitter.split_documents(documents)

    # numeric id per chunk for debugging
    for i, doc in enumerate(chunks):
        doc.metadata = doc.metadata or {}
        doc.metadata["chunk_id"] = i

    return chunks

In [11]:
# Chunk the cleaned documents; no embedding model is passed, so
# split_documents builds one via build_embedding_model().
chunks = split_documents(documents)
chunks

  return HuggingFaceEmbeddings(


[Document(metadata={'source': '..\\data\\pdf\\medical_text.pdf', 'file_path': '..\\data\\pdf\\medical_text.pdf', 'page': 0, 'total_pages': 2, 'Producer': 'pdfmake', 'Creator': 'pdfmake', 'CreationDate': 'D:20251105170459Z', 'chunk_id': 0}, page_content='Example 1: Abdominal Pain The location of the pain is a critical pivotal point for framing the differential diagnosis: Pain Location Possible Diagnoses Right Upper Quadrant Biliary disease (colic, cholecystitis), hepatitis, pancreatitis. (RUQ) Epigastrium Peptic ulcer disease, pancreatitis, biliary disease.'),
 Document(metadata={'source': '..\\data\\pdf\\medical_text.pdf', 'file_path': '..\\data\\pdf\\medical_text.pdf', 'page': 0, 'total_pages': 2, 'Producer': 'pdfmake', 'Creator': 'pdfmake', 'CreationDate': 'D:20251105170459Z', 'chunk_id': 1}, page_content="Diffuse/Periumbilical Appendicitis (early), bowel obstruction, irritable bowel syndrome, mesenteric ischemia. Right Lower Quadrant Appendicitis, Crohn's disease, ovarian pathology 

### Embedding and vector store

In [12]:

from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from uuid import uuid4

from langchain_core.documents import Document
import re

In [26]:
import numpy as np
import faiss
import re

class EmbeddingEncoder:
    """
    Embeds document chunks, stores the vectors in a FAISS inner-product index
    and retrieves chunks with Maximal Marginal Relevance (MMR) re-ranking.

    State:
        embedding_model: object providing embed_documents() / embed_query().
        texts: cleaned chunk texts from the last embed_docs() call.
        metadatas: metadata of those chunks, in the same order as `texts`.
        index: FAISS index, or None until FaissVectorStore()/load_faiss() ran.

    retrieve_mmr() looks `texts`/`metadatas` up by FAISS row id, so the index
    must have been built from the embeddings of the current `texts`, in order.
    """

    # Default on-disk location of the index, shared by save and load.
    DEFAULT_INDEX_PATH = "C:/AI_PROJECTS/medical_chatbot/data/faiss_index/faiss_index.bin"

    def __init__(self, embedding_model=None):
        # Reuse same model as chunker
        self.embedding_model = embedding_model or build_embedding_model()
        self.texts = []
        self.metadatas = []
        self.index = None

    def clean_text(self, text: str) -> str:
        """Strip '<pad>' / '<EOS>' markers and collapse all whitespace runs to one space."""
        text = re.sub(r'<pad>|<EOS>', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def embed_docs(self, documents):
        """
        Embed the given documents and remember their texts and metadata.

        documents: list[Document]
        returns: np.ndarray of shape (n_docs, dim), float32, L2-normalized

        Raises:
            ValueError: if `documents` is empty.
        """
        documents = list(documents)
        if not documents:
            raise ValueError("embed_docs() needs at least one document.")

        extracted_text = [self.clean_text(doc.page_content) for doc in documents]

        embeddings = self.embedding_model.embed_documents(extracted_text)
        embeddings = np.array(embeddings, dtype="float32")

        # If your model is not already normalized, this keeps things safe
        faiss.normalize_L2(embeddings)

        # Only replace the stored texts once embedding succeeded, so a failed
        # call does not leave texts/metadatas out of sync with the index.
        self.texts = extracted_text
        self.metadatas = [doc.metadata for doc in documents]

        return embeddings

    def FaissVectorStore(self, embeddings, index_path=None):
        """
        Build an inner-product FAISS index from `embeddings`, write it to disk
        and keep it on `self.index`.

        embeddings: np.ndarray (n_docs, dim)
        index_path: target file; DEFAULT_INDEX_PATH when None. Missing parent
            directories are created.

        Raises:
            ValueError: if `embeddings` is not 2-dimensional.
        """
        embeddings = np.array(embeddings, dtype="float32")
        if embeddings.ndim != 2:
            raise ValueError("embeddings must be a 2-D array of shape (n_docs, dim).")
        dim = embeddings.shape[1]

        # Cosine similarity via inner product (embeddings are normalized)
        index = faiss.IndexFlatIP(dim)
        index.add(embeddings)

        # Choose default path if not provided
        if index_path is None:
            index_path = self.DEFAULT_INDEX_PATH
        index_path = os.fspath(index_path)

        # write_index does not create directories; make sure the folder exists.
        parent_dir = os.path.dirname(index_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        faiss.write_index(index, index_path)

        self.index = index

    def load_faiss(self, path=DEFAULT_INDEX_PATH):
        """Load a FAISS index from `path` into `self.index`."""
        self.index = faiss.read_index(path)
        print("FAISS index loaded successfully.")

        # You still need to reload self.texts and self.metadatas yourself
        # e.g. from a pickle file where you saved them.

    def mmr(self, query_emb, doc_embs, k=5, lambda_mult=0.5):
        """
        Greedy Maximal Marginal Relevance selection.

        query_emb: (d,)
        doc_embs: (N, d)
        k: maximum number of items to select
        lambda_mult: weight of query similarity versus redundancy with the
            already selected items (1.0 = relevance only)
        returns list of selected row indices into doc_embs (size k or less);
            empty when there are no candidates or k <= 0
        """
        doc_embs = np.asarray(doc_embs)
        n_docs = len(doc_embs)
        if n_docs == 0 or k <= 0:
            return []

        # similarity to query
        sim_to_query = np.dot(doc_embs, query_emb)

        # similarity between docs
        sim_between_docs = np.dot(doc_embs, doc_embs.T)

        # choose first = best to query
        first = int(np.argmax(sim_to_query))
        selected = [first]
        candidates = [i for i in range(n_docs) if i != first]

        while len(selected) < k and candidates:
            mmr_score = []
            for c in candidates:
                # Highest similarity to anything already picked = redundancy penalty.
                redundancy = max(sim_between_docs[c][s] for s in selected)
                score = lambda_mult * sim_to_query[c] - (1 - lambda_mult) * redundancy
                mmr_score.append((score, c))

            _, best_c = max(mmr_score, key=lambda x: x[0])
            selected.append(best_c)
            candidates.remove(best_c)

        return selected


    def retrieve_mmr(self, query, top_k=5, pool_size=20, lambda_mult=0.5):
        """
        Retrieve up to `top_k` chunks for `query`: fetch a candidate pool from
        FAISS, then re-rank it with mmr().

        query: query string
        top_k: number of results wanted
        pool_size: number of FAISS candidates to re-rank (raised to top_k if smaller)
        lambda_mult: forwarded to mmr()
        returns list of dicts with keys "rank", "text", "metadata", "score"
            ("score" is the FAISS inner product for that chunk)

        Raises:
            ValueError: if no index is loaded, or the index refers to rows for
                which no text is stored (e.g. after load_faiss() without
                restoring texts/metadatas).
        """
        if self.index is None:
            raise ValueError("Index is not initialized.")
        if top_k <= 0:
            return []

        q = np.array(self.embedding_model.embed_query(query), dtype="float32").reshape(1, -1)
        faiss.normalize_L2(q)

        # get a larger pool first (never smaller than the number of results asked for)
        pool_size = max(pool_size, top_k)
        scores, idxs = self.index.search(q, pool_size)
        idxs = idxs[0]
        scores = scores[0]

        # FAISS pads with -1 when fewer than pool_size vectors exist.
        valid_mask = idxs >= 0
        idxs = idxs[valid_mask]
        scores = scores[valid_mask]

        if idxs.size == 0:
            return []
        if int(idxs.max()) >= len(self.texts) or int(idxs.max()) >= len(self.metadatas):
            raise ValueError(
                "Index contains more vectors than stored texts/metadatas; "
                "restore self.texts and self.metadatas before retrieving."
            )

        # take their embeddings again (or store them)
        # since we normalized docs before, reuse those embeddings from index build time if you saved them
        # for now, recompute (still doc-agnostic)
        doc_texts = [self.texts[i] for i in idxs]
        doc_embs = np.array(self.embedding_model.embed_documents(doc_texts), dtype="float32")
        faiss.normalize_L2(doc_embs)

        selected_idx_pos = self.mmr(q[0], doc_embs, k=top_k, lambda_mult=lambda_mult)

        results = []
        for rank, pos in enumerate(selected_idx_pos, start=1):
            real_idx = int(idxs[pos])
            results.append({
                "rank": rank,
                "text": self.texts[real_idx],
                "metadata": self.metadatas[real_idx],
                "score": float(scores[pos]),
            })
        return results


In [27]:
# Build one embedding model instance to hand to the encoder below.
embedding_model = build_embedding_model()

In [28]:
import numpy as np
# Embed all chunks; embed_docs also stores the cleaned texts and metadata on the encoder.
encoder = EmbeddingEncoder(embedding_model=embedding_model)
embeddings = encoder.embed_docs(chunks)
# One embedding row per chunk.
print(len(embeddings))

7


In [29]:
# Build the FAISS index from the chunk embeddings and write it to the default index path.
encoder.FaissVectorStore(embeddings)

In [37]:

# Example retrieval: ask for the top 4 chunks (second positional argument is top_k).
query = "shortness of breath"

retrieval_results = encoder.retrieve_mmr(query, 4)

In [38]:
# Inspect the ranked results (rank, text, metadata, score).
retrieval_results

[{'rank': 1,
  'text': 'dry) are key pivotal points. Context Possible Diagnoses Acute Common cold, acute bronchitis, pneumonia, COVID-19, heart failure. Cough Chronic COPD, asthma, gastroesophageal reflux disease (GERD), postnasal drip, certain Cough medications (e.g., ACE inhibitors).',
  'metadata': {'source': '..\\data\\pdf\\medical_text.pdf',
   'file_path': '..\\data\\pdf\\medical_text.pdf',
   'page': 1,
   'total_pages': 2,
   'Producer': 'pdfmake',
   'Creator': 'pdfmake',
   'CreationDate': 'D:20251105170459Z',
   'chunk_id': 6},
  'score': 0.44128426909446716},
 {'rank': 2,
  'text': 'Secondary Headaches Meningitis, subarachnoid hemorrhage, brain tumor, giant cell arteritis. Example 3: Shortness of Breath (Dyspnea) This symptom can arise from problems in the cardiac, pulmonary, or other systems.',
  'metadata': {'source': '..\\data\\pdf\\medical_text.pdf',
   'file_path': '..\\data\\pdf\\medical_text.pdf',
   'page': 0,
   'total_pages': 2,
   'Producer': 'pdfmake',
   'Creat

In [None]:
import os
from groq import Groq

# create client once
import os
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # Fail here with a clear message instead of at the first LLM call.
    raise RuntimeError("GROQ_API_KEY environment variable is not set.")

# llm_generate() below uses this module-level client.
client = Groq(api_key=groq_api_key)

MODEL_NAME = "llama-3.1-8b-instant"

def llm_generate(prompt: str) -> str:
    """
    Send `prompt` as a single user message to the chat-completions endpoint
    and return the text of the first choice.

    Relies on the module-level `client` and `MODEL_NAME` being defined before
    the call.
    """
    messages = [
        {"role": "system", "content": "You are a helpful medical assistant."},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.1,
        max_tokens=800,
    )
    first_choice = response.choices[0]
    return first_choice.message.content


In [32]:
# Smoke-test the LLM call on its own, without any retrieved context.
llm_generate("Explain what fever is.")


"A fever is a common medical condition characterized by an elevated body temperature above the normal range. In adults, a normal body temperature typically ranges from 97.7°F (36.5°C) to 99.5°F (37.7°C). \n\nWhen the body detects the presence of an infection, inflammation, or other illness, it triggers a response to fight off the invading pathogens. This response involves the release of chemical signals, such as pyrogens, which stimulate the hypothalamus, the body's temperature regulation center. The hypothalamus then raises the body's temperature set point, causing the body to produce more heat.\n\nThere are several types of fever, including:\n\n1. **Pyretic fever**: This is the most common type of fever, caused by an infection, such as a cold or flu.\n2. **Neurogenic fever**: This type of fever is caused by a problem with the brain or nervous system, such as a brain infection or a tumor.\n3. **Infectious fever**: This type of fever is caused by an infection, such as pneumonia or meni

In [39]:
def build_context(retrieval_results, max_chars=3000):
    """
    Join retrieved chunk texts into one context string of at most `max_chars`.

    Args:
        retrieval_results: iterable of dicts with a "text" key, in rank order.
        max_chars: upper bound on the length of the returned string,
            separators included.

    Returns:
        The chunk texts joined by "\\n\\n---\\n\\n". Empty texts are skipped.
        Chunks are taken in order until the next one would not fit. If even
        the first non-empty chunk is too long, its first `max_chars`
        characters are used so the context is not left empty.
    """
    separator = "\n\n---\n\n"
    context_parts = []
    used = 0

    for r in retrieval_results:
        txt = r["text"]
        if not txt:
            continue

        # Every chunk after the first also costs one separator.
        cost = len(txt) + (len(separator) if context_parts else 0)
        if used + cost > max_chars:
            if not context_parts and max_chars > 0:
                # Top-ranked chunk alone exceeds the budget: keep a truncated
                # prefix rather than returning no context at all.
                context_parts.append(txt[:max_chars])
            break

        context_parts.append(txt)
        used += cost

    return separator.join(context_parts)


In [42]:
def rag_answer_from_results(question: str, retrieval_results):
    """
    Answer `question` with the LLM, restricted to the retrieved chunks.

    The chunks are merged with build_context(), wrapped in an instruction
    prompt together with the question, and the prompt is passed to
    llm_generate(), whose return value is returned unchanged.
    """
    context = build_context(retrieval_results)

    # Instruction block, then the context, then the question, ending on "Answer:".
    prompt_lines = [
        "Answer the following medical question using ONLY the context below.",
        "If the answer is not clearly present in the context, say:",
        "\"I'm not sure based on the provided context.\"",
        "",
        "Context:",
        f"{context}",
        "",
        "Question:",
        f"{question}",
        "",
        "Answer:",
    ]
    prompt = "\n".join(prompt_lines)

    return llm_generate(prompt)


In [47]:
# End-to-end example: retrieve 4 chunks with MMR, then let the LLM answer from them.
query = "What are the possible diagnoses for shortness of breath?"
retrieval_results = encoder.retrieve_mmr(query, 4)

answer = rag_answer_from_results(query, retrieval_results)
print(answer)



This symptom can arise from problems in the cardiac, pulmonary, or other systems. 

Possible diagnoses include:
- Pulmonary: asthma, COPD, pneumonia, pulmonary embolism, pleural effusion
- Cardiac: heart failure, coronary artery disease, arrhythmias
