In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
os.environ['HUGGINGFACE_API_KEY'] = huggingface_api_key

In [2]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

In [3]:
import ragas
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity # Often used with answer_correctness
)
from ragas.llms import LangchainLLMWrapper # To wrap Ollama for RAGAS
from datasets import Dataset
import asyncio # Ragas evaluation is often async

In [4]:
pdf_files = [
    r"E:\DiabetIQ\LLM\PDFs\BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf",
    r"E:\DiabetIQ\LLM\PDFs\BES-Ramadan-Guideline-2020-min.pdf",
    r"E:\DiabetIQ\LLM\PDFs\Diabetes_Care_BADAS_guideline2019-3.pdf",
    r"E:\DiabetIQ\LLM\PDFs\Insulin-Guideline-min.pdf"
]

all_docs = []
print("Loading and Processing PDFs...")
for pdf_path in pdf_files:
    try:
        file_name = os.path.basename(pdf_path)
        print(f"-> Loading: {file_name}")
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split() # load_and_split is often sufficient
        for page_doc in pages:
            page_doc.metadata['source'] = file_name
        all_docs.extend(pages)
        print(f"   Loaded {len(pages)} pages.")
    except Exception as e:
        print(f"Error loading {pdf_path}: {e}")

print(f"\nTotal documents loaded: {len(all_docs)}")
if not all_docs:
    print("\nNo documents were loaded successfully. Exiting.")
    exit()
else:
    print("\nSample Document Metadata (first doc):")
    print(all_docs[0].metadata)
    print("\nSample Document Content (first 500 chars of first doc):")
    print(all_docs[0].page_content[:500])

Loading and Processing PDFs...
-> Loading: BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf
   Loaded 38 pages.
-> Loading: BES-Ramadan-Guideline-2020-min.pdf
   Loaded 46 pages.
-> Loading: Diabetes_Care_BADAS_guideline2019-3.pdf
   Loaded 79 pages.
-> Loading: Insulin-Guideline-min.pdf
   Loaded 93 pages.

Total documents loaded: 256

Sample Document Metadata (first doc):
{'producer': 'Nitro PDF PrimoPDF', 'creator': 'PrimoPDF http://www.primopdf.com', 'creationdate': '2020-06-07T20:17:39-06:00', 'moddate': '2020-06-07T20:17:39-06:00', 'title': 'Microsoft Word - BES COVID Pract Recomnd 06 June Final Copy', 'author': 'Mir', 'source': 'BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf', 'total_pages': 38, 'page': 0, 'page_label': '1'}

Sample Document Content (first 500 chars of first doc):
Bangladesh Endocrine Society (BES) 
Practical Recommendations for Management of 
Diabetes and Other Endocrine Diseases in Patients with 
COVID-19 
 
 
 
 
 
Published Online June 2020 
 
 
All rights res

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", ", ", " ", ""],
    length_function=len,
)
chunks = text_splitter.split_documents(all_docs)
print(f"\nTotal chunks created: {len(chunks)}")
if not chunks:
    print("\nNo chunks were created. Check splitting process. Exiting.")
    exit()
else:
    print("\nSample Chunk Metadata (first chunk):")
    print(chunks[0].metadata)
    print("\nSample Chunk Content (first 500 chars):")
    print(chunks[0].page_content[:500])


Total chunks created: 702

Sample Chunk Metadata (first chunk):
{'producer': 'Nitro PDF PrimoPDF', 'creator': 'PrimoPDF http://www.primopdf.com', 'creationdate': '2020-06-07T20:17:39-06:00', 'moddate': '2020-06-07T20:17:39-06:00', 'title': 'Microsoft Word - BES COVID Pract Recomnd 06 June Final Copy', 'author': 'Mir', 'source': 'BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf', 'total_pages': 38, 'page': 0, 'page_label': '1'}

Sample Chunk Content (first 500 chars):
Bangladesh Endocrine Society (BES) 
Practical Recommendations for Management of 
Diabetes and Other Endocrine Diseases in Patients with 
COVID-19 
 
 
 
 
 
Published Online June 2020 
 
 
All rights reserved by: Bangladesh Endocrine Society (BES) 
 
 
Published by 
Bangladesh Endocrine Society (BES) 
Website: http://bes-org.net 
E-mail: 
endobd2012@gmail.com


In [6]:
print("\nInitializing Embedding Model...")
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")

print("\nCreating Vector Store (ChromaDB)...")
# Using in-memory Chroma for simplicity in this example
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model
)
print("Vector Store Created.")


Initializing Embedding Model...

Creating Vector Store (ChromaDB)...
Vector Store Created.


In [7]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 5}) # Retrieve top 5 chunks
print(f"Retriever configured (using k={retriever.search_kwargs.get('k', 'default')}).")

Retriever configured (using k=5).


In [8]:
prompt_template = """
You are DiabetIQ, an AI assistant specializing in diabetes management for patients in Bangladesh, based *strictly* on the provided context documents.

Context Documents:
{context}

Based *only* on the information in the numbered context documents above, answer the following question concisely and directly.
Your advice should be actionable and consider general practices relevant to Bangladesh where possible (e.g., common foods mentioned in context, local guidelines if present in context).
Do *not* add information that is not present in the context.
If the context does not contain the answer, state that clearly.
Always conclude your response by advising the user to consult a healthcare professional for personalized medical advice.

Question: {question}

Answer:
"""
prompt = PromptTemplate.from_template(prompt_template)

In [9]:
print("Initializing LLM (Ollama - Mistral)...")
llm = OllamaLLM(model="mistral")

Initializing LLM (Ollama - Mistral)...


In [10]:
def format_docs_with_metadata(docs: list[Document]) -> str:
    """Formats retrieved documents including source and page."""
    formatted_strings = []
    for i, doc in enumerate(docs):
        # Ensure metadata keys exist, provide defaults if not
        source = doc.metadata.get('source', 'N/A')
        page = doc.metadata.get('page', 'N/A') # PyPDFLoader adds 'page'
        metadata_str = f"Source: {source}, Page: {page}"
        content_str = doc.page_content.replace('\n', ' ').strip()
        formatted_strings.append(f"{i+1}. [{metadata_str}] {content_str}")
    return "\n\n".join(formatted_strings)

In [11]:
def retrieve_and_format(query: str) -> dict:
    docs = retriever.invoke(query)
    formatted_context = format_docs_with_metadata(docs)
    # We also need the raw context content for RAGAS
    raw_context_list = [doc.page_content for doc in docs]
    return {"formatted_context": formatted_context, "raw_contexts": raw_context_list, "question": query}


In [12]:
generate_answer_chain = (
    prompt
    | llm
    | StrOutputParser()
)

In [13]:
# --- RAGAS Evaluation Section ---

print("\n--- Preparing for RAGAS Evaluation ---")

# 1. Define Evaluation Questions and Ground Truths

eval_data = [
    {
        "question": "How can I control my blood sugar level with diet according to the textbook?",
        "ground_truth": "Dietary control involves emphasizing fruits, legumes, whole grains, dairy, learning carbohydrate counting, avoiding sugary drinks, ensuring sufficient protein (e.g., 1g/kg for older people), consuming 2-3 servings of fruits/vegetables daily, favoring mono/polyunsaturated fats (like from fatty fish, nuts, seeds), maintaining a regular schedule, and seeking medical help if unable to eat/hydrate. Personalized advice from a healthcare professional is essential."
    },
    {
        "question": "What does the BADAS guideline say about insulin initiation?",
        "ground_truth": "The BADAS Guideline 2019 recommends starting glucose-insulin infusion for all major surgeries. Outside surgery, if already on insulin, intermediate/long-acting insulin is continued (dose might need reduction), and short-acting insulin is adjusted based on blood glucose and food. Always consult a healthcare professional."
    },
    {
        "question": "Tell me about managing diabetes during Ramadan based on the provided texts.",
        "ground_truth": "Management during Ramadan includes a balanced diet (considering common Bangladeshi foods), adequate hydration during non-fasting hours, taking medication as prescribed (possibly adjusted), taking suhoor before dawn and iftar at sunset, regular blood glucose monitoring, moderate physical activity (avoiding intense workouts near meals), rest, stress management, and consulting a healthcare professional for personalized advice. [Reference specific advice from BES-Ramadan-Guideline-2020-min.pdf if possible]"
    },
    {
        "question": "Can people with diabetes eat mangoes during Ramadan?",
        "ground_truth": "The provided context does not explicitly state whether people with diabetes can eat mangoes during Ramadan. General advice emphasizes balanced meals and carbohydrate counting. Mangoes are high in sugar, so portion control and monitoring blood glucose would be crucial. Consult a healthcare professional for personalized advice regarding specific foods like mangoes."
    }
]


--- Preparing for RAGAS Evaluation ---


In [14]:
# 2. Collect Data for RAGAS (Question, Answer, Contexts, Ground Truth)
print("Collecting data for RAGAS evaluation...")
evaluation_results = []
for item in eval_data:
    question = item["question"]
    ground_truth = item["ground_truth"]
    print(f"  Processing question: {question[:50]}...")

    # a. Retrieve contexts
    retrieved_docs = retriever.invoke(question)
    contexts_list = [doc.page_content for doc in retrieved_docs]
    formatted_context_string = format_docs_with_metadata(retrieved_docs)

    # b. Generate Answer using the LLM
    try:
        response = generate_answer_chain.invoke({
            "context": formatted_context_string,
            "question": question
        })
        answer = response.strip() # Get the generated answer
    except Exception as e:
        print(f"    Error generating answer for '{question[:50]}...': {e}")
        answer = "[Error generating answer]"
        contexts_list = [] # Avoid evaluating if generation failed badly

    # c. Store results
    evaluation_results.append({
        "question": question,
        "answer": answer,
        "contexts": contexts_list, # List of strings (document content)
        "ground_truth": ground_truth
    })

print(f"Data collection complete. Collected {len(evaluation_results)} results.")

Collecting data for RAGAS evaluation...
  Processing question: How can I control my blood sugar level with diet a...
  Processing question: What does the BADAS guideline say about insulin in...
  Processing question: Tell me about managing diabetes during Ramadan bas...
  Processing question: Can people with diabetes eat mangoes during Ramada...
Data collection complete. Collected 4 results.


In [19]:
# 3. Convert to Hugging Face Dataset
if evaluation_results:
    dataset_dict = {
        "question": [item["question"] for item in evaluation_results],
        "answer": [item["answer"] for item in evaluation_results],
        "contexts": [item["contexts"] for item in evaluation_results],
        "ground_truth": [item["ground_truth"] for item in evaluation_results],
    }
    eval_dataset = Dataset.from_dict(dataset_dict) # <-- This line creates it
    print("\nEvaluation dataset created:")
    print(eval_dataset)
else:
    print("\nNo evaluation results collected, skipping RAGAS evaluation.")
    # exit() # You might have an exit() here, which would stop the notebook


Evaluation dataset created:
Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 4
})


In [20]:
print("\n--- Configuring RAGAS ---")

judge_llm_model = OllamaLLM(model="llama3") # Or change to a faster model if needed
print(f"  Judge LLM created: LangChain OllamaLLM (model='{judge_llm_model.model}')")


--- Configuring RAGAS ---
  Judge LLM created: LangChain OllamaLLM (model='llama3')


In [21]:
# 2. Configure the Embeddings for RAGAS (using the same as your pipeline)

base_embeddings = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")
print(f"  Embeddings created: LangChain HuggingFaceEmbeddings (model='{base_embeddings.model_name}')")

  Embeddings created: LangChain HuggingFaceEmbeddings (model='intfloat/e5-small-v2')


In [22]:
# 3. Define the metrics for evaluation
metrics = [
    faithfulness,        # How factually consistent is the answer with the context? (LLM judged)
    answer_relevancy,    # How relevant is the answer to the question? (LLM + Embedding judged)
    context_precision,   # Signal-to-noise ratio in retrieved contexts. (LLM judged)
    context_recall,      # How well does the context capture the necessary info from ground_truth? (LLM judged)
    answer_correctness   # How accurate is the answer compared to the ground_truth? (LLM judged)
]
print("  RAGAS Metrics defined:")
for m in metrics:
    print(f"    - {m.name}")

print("--- RAGAS Configuration Complete ---")

  RAGAS Metrics defined:
    - faithfulness
    - answer_relevancy
    - context_precision
    - context_recall
    - answer_correctness
--- RAGAS Configuration Complete ---


In [None]:
from ragas import evaluate
import pandas as pd
import warnings

try:
    loop = asyncio.get_running_loop()
    print("Asyncio event loop found.")
except RuntimeError:
    print("No running asyncio event loop, creating one.")
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

if eval_dataset:
    print("\n--- Starting RAGAS Evaluation ---")
    print(f"Using Judge LLM: {judge_llm_model.model}")
    print(f"Using Embeddings: {base_embeddings.model_name}")
    print(f"Evaluating {len(eval_dataset)} samples with {len(metrics)} metrics each.")
    print("This may take a while, especially with local LLMs...")
    print("Ensure Ollama is running and accessible.")

    try:
        # Key Change: Pass the LangChain LLM and Embeddings objects directly
        # RAGAS v0.1.0+ prefers this over the older LangchainLLMWrapper approach.
        result = evaluate(
            dataset=eval_dataset,       # The dataset prepared earlier
            metrics=metrics,            # The list of base metrics
            llm=judge_llm_model,        # The LangChain LLM object for judging
            embeddings=base_embeddings, # The LangChain Embeddings object
            # raise_exceptions=False    # Set to False to log errors and continue, True (default) to stop on first error.
                                        # Let's keep it True for now to see errors clearly.
        )

        print("--- RAGAS Evaluation Complete ---")

        # Display results
        print("\nEvaluation Results:")
        results_df = result.to_pandas()

        # Improve Display
        pd.set_option('display.max_colwidth', None) # Show full text in columns
        pd.set_option('display.max_columns', None)  # Show all columns
        pd.set_option('display.float_format', '{:.4f}'.format) # Format scores

        print(results_df)

        print("\n--- Interpreting Scores (Scale 0.0 to 1.0, higher is better) ---")
        print("- faithfulness: Does the answer stick to the provided context? High = Less hallucination.")
        print("- answer_relevancy: Is the answer relevant to the question? High = On-topic answer.")
        print("- context_precision: Are the retrieved contexts relevant? High = Less noise in context.")
        print("- context_recall: Did the retriever find all necessary contexts (based on ground_truth)? High = Good retrieval coverage.")
        print("- answer_correctness: Is the answer factually correct compared to the ground truth? High = Accurate answer.")
        print("\nNaN scores indicate the metric calculation failed, often due to timeouts or errors from the Judge LLM.")

    except Exception as e:
        print(f"\n--- RAGAS Evaluation Failed ---")
        print(f"An error occurred during evaluation: {e}")
        print("Potential Causes & Fixes:")
        print("  1. Ollama Server Down: Ensure 'ollama serve' is running and the judge model ({judge_llm_model.model}) is available.")
        print("  2. Network Issues: Check if the script can reach the Ollama server address (usually http://localhost:11434).")
        print("  3. Judge LLM Too Slow (Timeout):")
        print("     - Use a smaller/faster judge LLM model in Ollama (e.g., 'phi3:mini', 'mistral').")
        print("     - Increase hardware resources (RAM, CPU/GPU) for Ollama.")
        print("     - If using LangChain's Ollama wrapper, try increasing 'request_timeout' when creating `judge_llm_model` (see config cell).")
        print("  4. Insufficient Ground Truth: Some metrics rely heavily on good ground truth.")
        print("  5. RAGAS/Dependency Issues: Ensure RAGAS and its dependencies (langchain, datasets, etc.) are up-to-date.")

else:
    print("\nSkipping RAGAS evaluation as eval_dataset is empty.")

Asyncio event loop found.

--- Starting RAGAS Evaluation ---
Using Judge LLM: llama3
Using Embeddings: intfloat/e5-small-v2
Evaluating 4 samples with 5 metrics each.
This may take a while, especially with local LLMs...
Ensure Ollama is running and accessible.


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

Exception raised in Job[0]: TimeoutError()
Exception raised in Job[2]: TimeoutError()
Exception raised in Job[3]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[5]: TimeoutError()
Exception raised in Job[7]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[10]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[13]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
Exception raised in Job[1]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[11]: TimeoutError()
