In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
huggingface_api_key = os.getenv('HUGGINGFACE_API_KEY')
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
os.environ['HUGGINGFACE_API_KEY'] = huggingface_api_key

In [2]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader # Use this
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
from langchain_core.documents import Document

In [3]:
pdf_files = [
    r"F:\DiabetIQ\LLM\PDFs\BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf",
    r"F:\DiabetIQ\LLM\PDFs\BES-Ramadan-Guideline-2020-min.pdf",
    r"F:\DiabetIQ\LLM\PDFs\Diabetes_Care_BADAS_guideline2019-3.pdf",
    r"F:\DiabetIQ\LLM\PDFs\Insulin-Guideline-min.pdf"
]

all_docs = [] # Will store LangChain Document objects

print("Loading and Processing PDFs...")
for pdf_path in pdf_files:
    try:
        # Extract filename for metadata
        file_name = os.path.basename(pdf_path)
        print(f"-> Loading: {file_name}")

        loader = PyPDFLoader(pdf_path)
        # Load pages as individual documents. Each doc will have metadata['page']
        pages = loader.load_and_split() # This does basic splitting

        # Add source filename to metadata for each page/document
        for page_doc in pages:
            page_doc.metadata['source'] = file_name
            # Optional: clean up page content slightly if needed
            # page_doc.page_content = page_doc.page_content.replace('\n', ' ').strip()

        all_docs.extend(pages)
        print(f"   Loaded {len(pages)} pages.")

    except Exception as e:
        print(f"Error loading {pdf_path}: {e}")

print(f"\nTotal documents loaded: {len(all_docs)}")
if all_docs:
    print("\nSample Document Metadata (first doc):")
    print(all_docs[0].metadata)
    print("\nSample Document Content (first 500 chars of first doc):")
    print(all_docs[0].page_content[:500])
else:
    print("\nNo documents were loaded successfully.")
    # Consider exiting or handling this error appropriately
    exit()

Loading and Processing PDFs...
-> Loading: BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf
   Loaded 38 pages.
-> Loading: BES-Ramadan-Guideline-2020-min.pdf
   Loaded 46 pages.
-> Loading: Diabetes_Care_BADAS_guideline2019-3.pdf
   Loaded 79 pages.
-> Loading: Insulin-Guideline-min.pdf
   Loaded 93 pages.

Total documents loaded: 256

Sample Document Metadata (first doc):
{'producer': 'Nitro PDF PrimoPDF', 'creator': 'PrimoPDF http://www.primopdf.com', 'creationdate': '2020-06-07T20:17:39-06:00', 'moddate': '2020-06-07T20:17:39-06:00', 'title': 'Microsoft Word - BES COVID Pract Recomnd 06 June Final Copy', 'author': 'Mir', 'source': 'BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf', 'total_pages': 38, 'page': 0, 'page_label': '1'}

Sample Document Content (first 500 chars of first doc):
Bangladesh Endocrine Society (BES) 
Practical Recommendations for Management of 
Diabetes and Other Endocrine Diseases in Patients with 
COVID-19 
 
 
 
 
 
Published Online June 2020 
 
 
All rights res

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # Keep separators that make sense for text structure
    separators=["\n\n", "\n", ". ", ", ", " ", ""],
    length_function=len,
)

In [5]:
chunks = text_splitter.split_documents(all_docs)

print(f"\nTotal chunks created: {len(chunks)}")
if chunks:
    print("\nSample Chunk Metadata (first chunk):")
    print(chunks[0].metadata)
    print("\nSample Chunk Content (first 500 chars):")
    print(chunks[0].page_content[:500])
else:
    print("\nNo chunks were created. Check splitting process.")
    exit()


Total chunks created: 702

Sample Chunk Metadata (first chunk):
{'producer': 'Nitro PDF PrimoPDF', 'creator': 'PrimoPDF http://www.primopdf.com', 'creationdate': '2020-06-07T20:17:39-06:00', 'moddate': '2020-06-07T20:17:39-06:00', 'title': 'Microsoft Word - BES COVID Pract Recomnd 06 June Final Copy', 'author': 'Mir', 'source': 'BES-COVID-Pract-Recomnd-06-June-Final-Copy.pdf', 'total_pages': 38, 'page': 0, 'page_label': '1'}

Sample Chunk Content (first 500 chars):
Bangladesh Endocrine Society (BES) 
Practical Recommendations for Management of 
Diabetes and Other Endocrine Diseases in Patients with 
COVID-19 
 
 
 
 
 
Published Online June 2020 
 
 
All rights reserved by: Bangladesh Endocrine Society (BES) 
 
 
Published by 
Bangladesh Endocrine Society (BES) 
Website: http://bes-org.net 
E-mail: 
endobd2012@gmail.com


In [6]:
print("\nInitializing Embedding Model...")
embedding_model = HuggingFaceEmbeddings(model_name="intfloat/e5-small-v2")

print("\nCreating Vector Store (ChromaDB)...")
# Chroma.from_documents handles Document objects directly
# Consider adding persistence: persist_directory="./chroma_db_diabetiq"
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    # persist_directory="./chroma_db_diabetiq" # Uncomment to save DB locally
)
# If persisting: vectorstore.persist()

# To load later:
# vectorstore = Chroma(persist_directory="./chroma_db_diabetiq", embedding_function=embedding_model)

print("Vector Store Created.")


Initializing Embedding Model...

Creating Vector Store (ChromaDB)...
Vector Store Created.


In [7]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 5}) # Retrieve top 5 chunks

In [8]:
print(f"Retriever configured (using k={retriever.search_kwargs.get('k', 'default')}).")

Retriever configured (using k=5).


In [9]:
from langchain_core.prompts import PromptTemplate
prompt_template = """
You are DiabetIQ, an AI assistant specializing in diabetes management for patients in Bangladesh, based *strictly* on the provided context documents.

Context Documents:
{context}

Based *only* on the information in the numbered context documents above, answer the following question concisely and directly.
Your advice should be actionable and consider general practices relevant to Bangladesh where possible (e.g., common foods mentioned in context, local guidelines if present in context).
Do *not* add information that is not present in the context.
If the context does not contain the answer, state that clearly.
Always conclude your response by advising the user to consult a healthcare professional for personalized medical advice.

Question: {question}

Answer:
"""

prompt = PromptTemplate.from_template(prompt_template)

In [10]:
print("Initializing LLM (Ollama - Mistral)...")
llm = OllamaLLM(model="mistral")

Initializing LLM (Ollama - Mistral)...


In [11]:
def format_docs_with_metadata(docs):
    """Formats retrieved documents including source and page."""
    formatted_strings = []
    for i, doc in enumerate(docs):
        metadata_str = f"Source: {doc.metadata.get('source', 'N/A')}, Page: {doc.metadata.get('page', 'N/A')}"
        content_str = doc.page_content.replace('\n', ' ').strip()
        formatted_strings.append(f"{i+1}. [{metadata_str}] {content_str}")
    return "\n\n".join(formatted_strings)

In [12]:
rag_chain = (
    {"context": retriever | format_docs_with_metadata, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG Chain constructed.")

RAG Chain constructed.


In [19]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision, 
    context_recall,
    )
from langchain_ollama import OllamaLLM

print("Ragas and evaluation libraries imported.")

Ragas and evaluation libraries imported.


In [20]:
print("Initializing Ollama LLM for Ragas Evaluation (llama3.2)...")
try:
    eval_llm = OllamaLLM(model="llama3.2")
    print("Evaluation LLM (Ollama llama3.2) initialized successfully.")
except Exception as e:
    print(f"Error initializing Ollama LLM 'llama3.2': {e}")
    print("Please ensure Ollama is running and the model 'llama3.2' is available.")
    
    eval_llm = None 

Initializing Ollama LLM for Ragas Evaluation (llama3.2)...
Evaluation LLM (Ollama llama3.2) initialized successfully.


In [21]:
print("\nPreparing evaluation dataset...")

eval_questions = [
    "How can I control my blood sugar level with diet according to the textbook?",
    "What does the BADAS guideline say about insulin initiation?",
    "Tell me about managing diabetes during Ramadan based on the provided texts.",
    "I have diabetes. Can I eat sweets?",
    "What are the symptoms of hypoglycemia?", 
    "Are there specific recommendations for COVID-19 patients with diabetes?", 
]

ground_truth_references = [
    
    "Ensure your diet contains sufficient protein... Avoid sugar-sweetened beverages... Emphasize fruits, legumes, whole grains...",
    "Glucose-insulin infusion should be started in all major surgeries... intermediate or long acting insulin is continued...",
    "Plan meals for suhoor and iftar... Adjust medication timings... Monitor blood glucose frequently...",
    "Avoid sugar-sweetened beverages... Limit intake of foods high in added sugars...",
    "Symptoms include sweating, palpitations, tremor, anxiety, hunger, confusion...",
    "Maintain good glycemic control... Adjust insulin or oral medications as needed... Monitor for ketosis..."
]


if len(ground_truth_references) != len(eval_questions):
     print(f"CRITICAL WARNING: Mismatch! You have {len(eval_questions)} questions but {len(ground_truth_references)} ground truth references. Evaluation will likely fail or be incorrect.")
     

eval_data_list = []

if eval_llm:
    for i, q in enumerate(eval_questions):
        print(f"  Processing question: \"{q}\"")
        try:
            answer = rag_chain.invoke(q)
            retrieved_docs = retriever.invoke(q)
            contexts = [doc.page_content for doc in retrieved_docs]

            reference_text_list = []
            if i < len(ground_truth_references):
                 reference_text_list = [ground_truth_references[i]] 
            else:
                 print(f"    -> Warning: No reference found for question index {i}. Using empty list.")
                 reference_text_list = ["Error: Missing reference"] 

            eval_data_list.append({
                "question": q,
                "answer": answer,
                "contexts": contexts,       
                "reference": reference_text_list[0] 
            })
            print(f"    -> Answer, contexts, and reference added.")
        except Exception as e:
            print(f"    -> Error processing question '{q}': {e}")
            eval_data_list.append({
                "question": q,
                "answer": f"Error: {e}",
                "contexts": [],
                "reference": "Error: Processing failed", 
            })
else:
    print("Evaluation LLM failed to initialize. Skipping dataset preparation.")


if eval_data_list:
    eval_dataset = Dataset.from_list(eval_data_list)
    print(f"\nEvaluation dataset prepared with {len(eval_dataset)} examples.")
    print("\nSample evaluation data point (first example):")
    print(eval_dataset[0])
else:
    print("\nNo data was generated for the evaluation dataset.")
    eval_dataset = None


Preparing evaluation dataset...
  Processing question: "How can I control my blood sugar level with diet according to the textbook?"
    -> Answer, contexts, and reference added.
  Processing question: "What does the BADAS guideline say about insulin initiation?"
    -> Answer, contexts, and reference added.
  Processing question: "Tell me about managing diabetes during Ramadan based on the provided texts."
    -> Answer, contexts, and reference added.
  Processing question: "I have diabetes. Can I eat sweets?"
    -> Answer, contexts, and reference added.
  Processing question: "What are the symptoms of hypoglycemia?"
    -> Answer, contexts, and reference added.
  Processing question: "Are there specific recommendations for COVID-19 patients with diabetes?"
    -> Answer, contexts, and reference added.

Evaluation dataset prepared with 6 examples.

Sample evaluation data point (first example):
{'question': 'How can I control my blood sugar level with diet according to the textbook?'

In [22]:
if eval_dataset: 
    metrics_to_evaluate = [
        faithfulness,         # How factually consistent is the answer with the context?
        answer_relevancy,     # How relevant is the answer to the question?
        context_precision,    # << INCLUDED: Checks relevance of retrieved contexts vs ground truth 'reference'
        context_recall,       # Does the retrieved context contain necessary info (compared to ground_truth answer if provided, otherwise LLM judges)?
    ]
   
    print(f"\nConfigured Ragas metrics: {[m.name for m in metrics_to_evaluate]}")
else:
    print("\nSkipping metric configuration as evaluation dataset is empty or LLM failed.")
    metrics_to_evaluate = []


Configured Ragas metrics: ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall']


In [23]:
if eval_dataset and metrics_to_evaluate and eval_llm and 'embedding_model' in globals():
    print("\nRunning Ragas evaluation... (This may take a while depending on the LLM speed and dataset size)")
    try:
        
        evaluation_result = evaluate(
            dataset=eval_dataset,     # This dataset now includes the 'reference' column
            metrics=metrics_to_evaluate,
            llm=eval_llm,
            embeddings=embedding_model # Use the same embeddings as your RAG pipeline
        )
        print("Ragas evaluation completed.")
    except Exception as e:
        print(f"\nError during Ragas evaluation: {e}")
        evaluation_result = None
else:
    print("\nSkipping Ragas evaluation due to missing components (dataset, metrics, LLM, or embeddings).")
    evaluation_result = None


Running Ragas evaluation... (This may take a while depending on the LLM speed and dataset size)


Evaluating:   0%|          | 0/24 [00:00<?, ?it/s]

Exception raised in Job[6]: OutputParserException(Invalid json output: The provided context does not provide any information about insulin initiation. The BADAS guideline only mentions a general principle for glucose-insulin infusion during major surgeries, but it does not specify when to start or stop insulin initiation.

Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema:
{"properties": {"reason": {"description": "Reason for verification", "title": "Reason", "type": "string"}, "verdict": {"description": "Binary (0/1) verdict of verification", "title": "Verdict", "type": "integer"}}, "required": ["reason", "verdict"], "title": "Verification", "type": "object"}Do not use single quotes in your response but double quotes,properly escaped with a backslash.

--------EXAM

Ragas evaluation completed.


In [24]:
if evaluation_result:
    print("\n--- Ragas Evaluation Results ---")
    eval_df = evaluation_result.to_pandas()
    # Display the full DataFrame
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', 150) 
    print(eval_df)

   
    print("\n--- Average Scores ---")
    
    numeric_cols = eval_df.select_dtypes(include='number').columns
    print(eval_df[numeric_cols].mean())

else:
    print("\nNo evaluation results to display.")


--- Ragas Evaluation Results ---
                                                                    user_input  \
0  How can I control my blood sugar level with diet according to the textbook?   
1                  What does the BADAS guideline say about insulin initiation?   
2  Tell me about managing diabetes during Ramadan based on the provided texts.   
3                                           I have diabetes. Can I eat sweets?   
4                                       What are the symptoms of hypoglycemia?   
5      Are there specific recommendations for COVID-19 patients with diabetes?   

                                                                                                                                      retrieved_contexts  \
0  [fruits, legumes, whole grains, as well as dairy products should be emphasized who \nare on insulin therapy. Education on carbohydrate counting sh...   
1  [10 ‘lI Diabetes Care: BADAS Guideline 2019, « In all major surgeries glucos