##Andrew Huebner - ID: 2231994

## CSC 380 HW\#4: LLM and RAG

Fall 2025 HW\#4 starter code

In [None]:
# -*- coding: utf-8 -*-
# Setup
# ----------------------------
!pip install -qU requests==2.32.4 chromadb langchain langchain-chroma langchain-huggingface langchain_openai langchain_community sentence-transformers tiktoken openai pypdf

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m100.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.6/326.6 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00

In [None]:
# Imports
# ----------------------------
import os, re, shutil
import requests
import chromadb
from typing import List, Dict, Tuple
from IPython.display import Markdown, display

# LangChain imports
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain_openai import ChatOpenAI, OpenAI
from langchain.prompts import PromptTemplate

In [None]:
# Load API keys from CoLab secret keys
from google.colab import userdata

# Read the three credentials from the Colab secret store.
langchain_api = userdata.get('langchain_api')
openai_api = userdata.get('openai_api')
hf_token = userdata.get('HF_TOKEN')

# Publish them as environment variables (same keys, same order as before).
os.environ.update({
    "OPENAI_API_KEY": openai_api,
    "LANGCHAIN_API_KEY": langchain_api,
    "HUGGINGFACEHUB_API_TOKEN": hf_token,
})

## Part 0: Define Core Functions

In [None]:
from typing import List, Dict, Tuple

# 1-1: Load and preprocess documents
# ----------------------------
def load_documents() -> List[Document]:
    """
    Download the three course text files and load them as LangChain documents.

    Each file named below is fetched from the course GitHub repository, written
    to the current working directory under its own file name, and then read
    back with ``TextLoader``.

    Returns:
        The Document objects produced by the loaders, in file-list order.

    Raises:
        requests.HTTPError: if any download answers with an error status.
        requests.RequestException: on connection failure or timeout.
    """
    url = "https://raw.githubusercontent.com/ntomuro/CSC380/main/LLM_RAG/data"
    document_filenames = [
        "Human-Nutrition-2020-Edition-1598491699.txt",
        "dci190009_pdf.txt",
        "dci190014_pdf.txt"
    ]
    documents = []

    for filename in document_filenames:
        response = requests.get(f"{url}/{filename}", timeout=60)
        # Stop here on 4xx/5xx; otherwise the error page itself would be saved
        # to disk and indexed as if it were the document.
        response.raise_for_status()

        with open(filename, 'wb') as f:
            f.write(response.content)

        documents.extend(TextLoader(filename).load())

    return documents

def preprocess_documents(documents: List[Document]) -> List[Document]:
    """
    Cut the loaded documents into overlapping chunks for the vector store.

    The split is done by RecursiveCharacterTextSplitter configured with
    chunk_size=1000 and chunk_overlap=200; its output is returned unchanged.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    ).split_documents(documents)

# 1-2: Vector Store Population
# ----------------------------
def create_vector_store(documents: List[Document], embedding_model_name: str, client):
    """
    Embed the document chunks into a Chroma vector store and return a retriever.

    Args:
        documents: The chunks to index.
        embedding_model_name: Model name handed to HuggingFaceEmbeddings.
        client: The chromadb client that backs the store.

    Returns:
        A retriever over the new store, configured with search_kwargs={"k": 3}.
    """
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

    # Storage location is decided by the caller-supplied client, so no separate
    # persist_directory is passed; the old hard-coded "./chroma_db" disagreed
    # with the per-model paths the callers create.
    # NOTE(review): re-running against the same persistent client presumably
    # appends the same chunks again -- confirm, and clear the store between
    # runs if duplicates show up in retrieval.
    vectorstore = Chroma.from_documents(documents, embeddings, client=client)

    return vectorstore.as_retriever(search_kwargs={"k": 3})

# 1-3: Query & Evaluation Set
# ----------------------------
def create_test_queries() -> List[str]:
    """
    Return the six hard-coded evaluation questions used by the experiments.

    A fresh list is built on every call.
    """
    return [
        "How many people are affected by diabetes?",
        "How does diabetes affect sugar cravings?",
        "What are the dietary recommendations for pregnant women regarding iron intake?",
        "How does exercise affect blood sugar levels?",
        "What are the potential health risks associated with excessive sugar consumption?",
        "Explain the process of digestion and absorption of fats in the human digestive system.",
    ]

# 1-4: The RAG Chain
# ----------------------------

def run_rag_query(query: str, retriever) -> Tuple[str, List[Document]]:
    """
    Execute a RAG query: retrieve relevant context and generate an answer.

    Args:
        query: The user question.
        retriever: Retriever used to fetch the context chunks for the question.

    Returns:
        A tuple (answer, retrieved_docs): the text generated by
        "gpt-3.5-turbo-instruct" at temperature 0, and the documents that were
        placed in the prompt, so both can be evaluated.
    """
    # invoke() replaces the deprecated get_relevant_documents() / llm(...)
    # calls, which emitted deprecation warnings on every query.
    retrieved_docs = retriever.invoke(query)

    context = "\n".join(doc.page_content for doc in retrieved_docs)

    prompt = """Please answer the question based upon the context. If the question cannot be answered using the information provided answer with "I don't know".

    Context: {context}

    Question: {query}

    Answer:"""

    # OpenAI here is the langchain_openai class imported at the top of the
    # file; the legacy langchain.llms import that shadowed it is gone.
    llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)
    answer = llm.invoke(prompt.format(context=context, query=query))

    return answer, retrieved_docs

# 1-5: Evaluation
# ----------------------------
def _read_score(prompt: str, low: int = 1, high: int = 3) -> int:
    """Keep prompting until the user types a whole number in [low, high]."""
    while True:
        raw = input(prompt)
        try:
            score = int(raw)
        except ValueError:
            print(f"Please enter a whole number from {low} to {high}.")
            continue
        if low <= score <= high:
            return score
        print(f"Score must be between {low} and {high}.")


def evaluate_results(query: str, retrieved_docs: List[Document], answer: str) -> Tuple[int, int]:
    """
    Manually evaluate the retrieval quality and answer quality.

    Prints the query, the number of retrieved documents and the answer, then
    asks the user for two scores on the keyboard.

    Args:
        query: The question that was asked.
        retrieved_docs: The documents returned by the retriever.
        answer: The generated answer.

    Returns:
        (retrieval_score, answer_score), each an int on the scale 1-3.
    """
    print(f"Query: {query}")
    print(f"Retrieved {len(retrieved_docs)} documents")
    print(f"Answer: {answer}")
    print("---")

    # A typo or an out-of-range value is re-prompted instead of aborting the
    # whole experiment or skewing the averages.
    retrieval_score = _read_score("Retrieval score (1-3): ")
    answer_score = _read_score("Answer score (1-3): ")

    return retrieval_score, answer_score

## **Experiment 1**: Sentence Transformers

#### This cell is mostly complete. You can make slight modifications to do more experiments, but do not make large changes.



In [None]:
# 2: The Main Experiment 1
# ----------------------------
def main():
    """
    Run Experiment 1 from start to finish.

    Loads and chunks the documents, builds the test queries, then for every
    embedding model creates its own Chroma store, answers each query through
    the RAG chain and collects the manually entered scores.

    Returns:
        (results, processed_documents, test_queries), where results maps each
        model name to its score lists, per-query details and averages.
    """
    def safe_name(s):
        """Replace every character other than letters, digits, '.', '_' and '-' with '_'."""
        return re.sub(r'[^0-9A-Za-z._-]', '_', s)

    def mean(values):
        """Arithmetic mean of a list of scores."""
        return sum(values) / len(values)

    # Embedding models to compare
    embedding_models = [
        "all-MiniLM-L6-v2",  # 384
        "multi-qa-MiniLM-L6-cos-v1",  # 384
    ]

    # Corpus: download, then chunk
    print("Loading documents...")
    raw_documents = load_documents()
    print(f"Loaded {len(raw_documents)} raw documents")

    print("Preprocessing documents...")
    processed_documents = preprocess_documents(raw_documents)
    print(f"Created {len(processed_documents)} document chunks")

    # Evaluation questions
    test_queries = create_test_queries()
    print(f"Created {len(test_queries)} test queries")

    banner = '=' * 50
    results = {}

    for model_name in embedding_models:
        print(f"\n{banner}")
        print(f"Testing model: {model_name}")
        print(banner)

        # Each model gets its own on-disk database
        db_path = f"./chroma_db_{safe_name(model_name)}"
        chroma_client = chromadb.PersistentClient(path=db_path)
        retriever = create_vector_store(processed_documents, model_name, chroma_client)

        retrieval_scores = []
        answer_scores = []
        details = []

        for number, query in enumerate(test_queries, start=1):
            print(f"\nQuery {number}: {query}")

            answer, retrieved_docs = run_rag_query(query, retriever)
            retrieval_score, answer_score = evaluate_results(query, retrieved_docs, answer)

            retrieval_scores.append(retrieval_score)
            answer_scores.append(answer_score)
            details.append({
                'query': query,
                'answer': answer,
                'retrieved_docs': retrieved_docs
            })

        # Key order is kept as before because the dict is printed later
        model_results = {
            'retrieval_scores': retrieval_scores,
            'answer_scores': answer_scores,
            'details': details,
            'avg_retrieval': mean(retrieval_scores),
            'avg_answer': mean(answer_scores),
        }
        results[model_name] = model_results

        print(f"\nModel {model_name} - Avg Retrieval: {model_results['avg_retrieval']:.2f}, Avg Answer: {model_results['avg_answer']:.2f}")

    return results, processed_documents, test_queries

### Experiment 1 (Run Experiment)

In [None]:
# Run Experiment 1 and keep its three outputs for the cells that follow
final_results, documents, queries = main()
print(final_results)

Loading documents...
Loaded 3 raw documents
Preprocessing documents...
Created 2107 document chunks
Created 6 test queries

Testing model: all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Query 1: How many people are affected by diabetes?


  retrieved_docs = retriever.get_relevant_documents(query)
  llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)
  answer = llm(prompt.format(context=context, query=query))


Query: How many people are affected by diabetes?
Retrieved 3 documents
Answer:  As of 2010, 25.8 million Americans, which is 8.3 percent of the population, are affected by diabetes.
---
Retrieval score (1-3): 3
Answer score (1-3): 3

Query 2: How does diabetes affect sugar cravings?
Query: How does diabetes affect sugar cravings?
Retrieved 3 documents
Answer:  I don't know.
---
Retrieval score (1-3): 3
Answer score (1-3): 1

Query 3: What are the dietary recommendations for pregnant women regarding iron intake?
Query: What are the dietary recommendations for pregnant women regarding iron intake?
Retrieved 3 documents
Answer:  Pregnant women should take a prenatal supplement to ensure an adequate intake of iron.
---
Retrieval score (1-3): 3
Answer score (1-3): 3

Query 4: How does exercise affect blood sugar levels?
Query: How does exercise affect blood sugar levels?
Retrieved 3 documents
Answer:  I don't know.
---
Retrieval score (1-3): 3
Answer score (1-3): 1

Query 5: What are the po

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Query 1: How many people are affected by diabetes?
Query: How many people are affected by diabetes?
Retrieved 3 documents
Answer:  25.8 million Americans have diabetes, which is 8.3 percent of the population.
---
Retrieval score (1-3): 3
Answer score (1-3): 3

Query 2: How does diabetes affect sugar cravings?
Query: How does diabetes affect sugar cravings?
Retrieved 3 documents
Answer:  I don't know.
---
Retrieval score (1-3): 3
Answer score (1-3): 1

Query 3: What are the dietary recommendations for pregnant women regarding iron intake?
Query: What are the dietary recommendations for pregnant women regarding iron intake?
Retrieved 3 documents
Answer:  Pregnant women should eat iron-rich or iron-fortified foods and take a prenatal supplement to ensure an adequate intake of iron.
---
Retrieval score (1-3): 3
Answer score (1-3): 3

Query 4: How does exercise affect blood sugar levels?
Query: How does exercise affect blood sugar levels?
Retrieved 3 documents
Answer:  Exercise can help lo

### Experiment 1 (Print Results) -- Run this cell **after you finish writing your code**. Report the results table in your write-up!



In [None]:
# Analysis Helper Functions
# ----------------------------

def print_results_table(results: Dict):
    """Print each model's average retrieval and answer score as a fixed-width table."""
    heavy_rule = "=" * 60
    light_rule = "-" * 60
    header = f"{'Model':<25} {'Avg Retrieval':<15} {'Avg Answer':<15}"

    print("\n" + heavy_rule)
    print("EXPERIMENT RESULTS SUMMARY")
    print(heavy_rule)
    print(header)
    print(light_rule)

    for name in results:
        row = results[name]
        print(f"{name:<25} {row['avg_retrieval']:<15.2f} {row['avg_answer']:<15.2f}")

def compare_retrieval_for_query(results: Dict, query_index: int):
    """Print, for one query position, every model's retrieval and answer score."""
    print(f"\nRetrieval comparison for Query {query_index + 1}:")
    for name in results:
        per_model = results[name]
        retrieval = per_model['retrieval_scores'][query_index]
        answer = per_model['answer_scores'][query_index]
        print("  " + f"{name}: Retrieval={retrieval}, Answer={answer}")

# Print the final results
# ----------------------------
print_results_table(final_results)
# Walk the positions directly: queries.index(query) rescans the list on every
# pass and would return the first match if a query ever appeared twice.
for query_index, _query in enumerate(queries):
    compare_retrieval_for_query(final_results, query_index)


EXPERIMENT RESULTS SUMMARY
Model                     Avg Retrieval   Avg Answer     
------------------------------------------------------------
all-MiniLM-L6-v2          3.00            2.33           
multi-qa-MiniLM-L6-cos-v1 3.00            2.67           

Retrieval comparison for Query 1:
  all-MiniLM-L6-v2: Retrieval=3, Answer=3
  multi-qa-MiniLM-L6-cos-v1: Retrieval=3, Answer=3

Retrieval comparison for Query 2:
  all-MiniLM-L6-v2: Retrieval=3, Answer=1
  multi-qa-MiniLM-L6-cos-v1: Retrieval=3, Answer=1

Retrieval comparison for Query 3:
  all-MiniLM-L6-v2: Retrieval=3, Answer=3
  multi-qa-MiniLM-L6-cos-v1: Retrieval=3, Answer=3

Retrieval comparison for Query 4:
  all-MiniLM-L6-v2: Retrieval=3, Answer=1
  multi-qa-MiniLM-L6-cos-v1: Retrieval=3, Answer=3

Retrieval comparison for Query 5:
  all-MiniLM-L6-v2: Retrieval=3, Answer=3
  multi-qa-MiniLM-L6-cos-v1: Retrieval=3, Answer=3

Retrieval comparison for Query 6:
  all-MiniLM-L6-v2: Retrieval=3, Answer=3
  multi-qa-MiniLM-L6

## **Experiment 2**: Temperature parameter

In [None]:
"""You run just answerable queries (three of them from above).
For each query, try three temperatures: one low and one high value of your choice, plus 0.5."""

##
## IMPLEMENT YOUR OWN
##
def temperature_affect(retriever, answerable_queries):
    """
    Answer each query at several sampling temperatures and collect the outputs.

    For every query the context is retrieved once, and the same filled-in
    prompt is then sent to "gpt-3.5-turbo-instruct" at temperatures 0.1, 0.5
    and 1.0. Each result is printed as it arrives.

    Args:
        retriever: Retriever used to fetch context for each query.
        answerable_queries: The questions to run.

    Returns:
        A list of (query, temperature, answer) tuples in generation order.
    """
    temp = [0.1, 0.5, 1.0]

    # Built once instead of on every temperature pass. The indentation inside
    # the literal is part of the prompt text and is deliberately unchanged.
    prompt = """Please answer the question based upon the context. If the question cannot be answered using the information provided answer with "I don't know".

      Context: {context}

      Question: {query}

      Answer:"""

    results = []

    for query in answerable_queries:
        # invoke() replaces the deprecated get_relevant_documents() / llm(...)
        retrieved_docs = retriever.invoke(query)
        context = "\n".join(doc.page_content for doc in retrieved_docs)
        filled_prompt = prompt.format(context=context, query=query)

        for t in temp:
            # OpenAI is the langchain_openai class imported at the top of the file
            llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=t)
            answer = llm.invoke(filled_prompt)

            print(f"Query: {query}")
            print(f"Temperature: {t}")
            print(f"Answer: {answer}")
            print("---")
            results.append((query, t, answer))

    return results
# The three questions used for the temperature sweep
answerable_queries = [
    "How many people are affected by diabetes?",
    "How does exercise affect blood sugar levels?",
    "What are the dietary recommendations for pregnant women regarding iron intake?",
]

# Build a store for this experiment from the chunks returned by main()
chroma_client = chromadb.PersistentClient(path="./chroma_db")
retriever = create_vector_store(
    documents,
    "multi-qa-MiniLM-L6-cos-v1",
    chroma_client,
)

temps_answers = temperature_affect(retriever, answerable_queries)

Query: How many people are affected by diabetes?
Temperature: 0.1
Answer:  25.8 million Americans have diabetes, which is 8.3 percent of the population.
---
Query: How many people are affected by diabetes?
Temperature: 0.5
Answer:  As of 2010, 25.8 million Americans have diabetes, which is 8.3 percent of the population.
---
Query: How many people are affected by diabetes?
Temperature: 1.0
Answer:  25.8 million Americans, which is 8.3 percent of the population, are estimated to have diabetes according to the CDC in 2010. However, the prevalence has likely increased since then. 
---
Query: How does exercise affect blood sugar levels?
Temperature: 0.1
Answer:  Exercise can help lower blood sugar levels in individuals with Type 2 diabetes by promoting weight loss and improving insulin sensitivity.
---
Query: How does exercise affect blood sugar levels?
Temperature: 0.5
Answer:  Exercise can help lower blood sugar levels by promoting weight loss and increasing lean body mass, which in turn 

In [None]:
# Dump every (query, temperature, answer) tuple collected by temperature_affect
for items in temps_answers:
  print(items)

('How many people are affected by diabetes?', 0.1, ' 25.8 million Americans have diabetes, which is 8.3 percent of the population.')
('How many people are affected by diabetes?', 0.5, ' As of 2010, 25.8 million Americans have diabetes, which is 8.3 percent of the population.')
('How many people are affected by diabetes?', 1.0, ' 25.8 million Americans, which is 8.3 percent of the population, are estimated to have diabetes according to the CDC in 2010. However, the prevalence has likely increased since then. ')
('How does exercise affect blood sugar levels?', 0.1, ' Exercise can help lower blood sugar levels in individuals with Type 2 diabetes by promoting weight loss and improving insulin sensitivity.')
('How does exercise affect blood sugar levels?', 0.5, ' Exercise can help lower blood sugar levels by promoting weight loss and increasing lean body mass, which in turn leads to an increase in basal metabolism. This can help improve overall metabolic fitness and decrease the risk for de

## **Experiment 3**: LLM as Judge

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load Mistral 7B model from Hugging Face -- the Judge (independent from RAG-LLM)!
# The tokenizer and the model weights are both fetched by the same model id.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): device_map="auto" presumably leaves device placement to the
# library -- confirm the weights fit on the available accelerator.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# Create a text generation pipeline around the loaded model and tokenizer;
# this `generator` is what llm_as_judge calls.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

##
## YOU FILL THE REST
##
def _parse_judge_scores(output):
    """Return the first JSON object found in the judge's reply, or a None-filled fallback."""
    import json
    import re

    def fallback(reason):
        return {
            "medical_correctness": None,
            "language_simplicity": None,
            "explanation": reason,
        }

    # No brace pair at all: the judge did not produce anything JSON-like.
    if not re.search(r"\{.*\}", output, re.DOTALL):
        return fallback("Could not extract JSON")

    # Try each opening brace in turn and let the decoder find where the object
    # ends; a greedy "first { to last }" match breaks as soon as the reply
    # contains any other brace.
    decoder = json.JSONDecoder()
    for brace in re.finditer(r"\{", output):
        try:
            candidate, _ = decoder.raw_decode(output, brace.start())
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict):
            return candidate

    return fallback("JSON parse error")


def llm_as_judge(generator, query, retrieved_docs, answer):
    """
    Ask the judge model to score one RAG answer and return the parsed scores.

    Args:
        generator: Callable text-generation pipeline; called with the prompt
            plus max_new_tokens and return_full_text keyword arguments.
        query: The question that was asked.
        retrieved_docs: Retrieved documents; the page_content of the first
            three is shown to the judge as context.
        answer: The RAG system's answer to be judged.

    Returns:
        The dict parsed from the judge's JSON reply (expected keys:
        "medical_correctness", "language_simplicity", "explanation"). When no
        JSON can be extracted or parsed, a dict with those keys is returned
        with both scores set to None and the reason in "explanation".
    """
    context_text = "\n".join([doc.page_content for doc in retrieved_docs][:3])

    # The reply is parsed as JSON below, so the prompt has to ask for JSON;
    # without the final section the model has no reason to produce any.
    judge_prompt = f""" You are acting as an independent evaluator (LLM-as-a-Judge) for answers generated by a medical RAG system.

  Evaluate the following response on two criteria:

  1. MEDICAL CORRECTNESS (1–3):
    - 1 = Contains incorrect medical statements OR contradicts guidelines (ADA, CDC, WHO).
    - 2 = Mostly correct but missing nuance or slightly outdated.
    - 3 = Accurate and medically sound, reflects current clinical guidelines.

  2. LANGUAGE SIMPLICITY (1–3):
    - 1 = Too technical, confusing, or filled with jargon.
    - 2 = Somewhat understandable, but still technical in places.
    - 3 = Clear, simple, layperson-friendly language.

  Question:
  {query}

  Retrieved Context:
  {context_text}

  System Response:
  {answer}

  Output format:
  Respond with ONLY one JSON object and no other text. It must have exactly these keys:
  "medical_correctness" (integer 1-3), "language_simplicity" (integer 1-3), "explanation" (one short sentence).
  Example of the format: {{"medical_correctness": 2, "language_simplicity": 3, "explanation": "Short reason."}}
  """

    # return_full_text=False keeps the echoed prompt (including the format
    # example above) out of the text that gets parsed.
    raw = generator(judge_prompt, max_new_tokens=200, return_full_text=False)

    # Ensure we always get a string
    if isinstance(raw, list):
        first = raw[0] if raw else {}
        output = first.get("generated_text", "")
    elif isinstance(raw, dict):
        output = raw.get("generated_text", "")
    else:
        output = str(raw)

    print(output)  # optional debug print

    return _parse_judge_scores(output)

# Judge three of the Experiment 2 generations (entries 0, 1 and 4 of temps_answers)
best_outputs = [temps_answers[0], temps_answers[1], temps_answers[4]]
for query, _temperature, answer in best_outputs:
    # invoke() replaces the deprecated get_relevant_documents() call
    retrieved_docs = retriever.invoke(query)

    scores = llm_as_judge(generator, query, retrieved_docs, answer)
    print(scores)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 You are acting as an independent evaluator (LLM-as-a-Judge) for answers generated by a medical RAG system.

  Evaluate the following response on two criteria:

  1. MEDICAL CORRECTNESS (1–3):
    - 1 = Contains incorrect medical statements OR contradicts guidelines (ADA, CDC, WHO).
    - 2 = Mostly correct but missing nuance or slightly outdated.
    - 3 = Accurate and medically sound, reflects current clinical guidelines.

  2. LANGUAGE SIMPLICITY (1–3):
    - 1 = Too technical, confusing, or filled with jargon.
    - 2 = Somewhat understandable, but still technical in places.
    - 3 = Clear, simple, layperson-friendly language.

  Question:
  How many people are affected by diabetes?

  Retrieved Context:
  The Centers for Disease Control Prevention (CDC) estimates that as of 2010, 25.8 million Americans have diabetes, which is 8.3 percent of the population.9
9. 
Diabetes Research and Statistics.Centers for Disease Control and Prevention. https://www.cdc.gov/diabetes/data/index.htm

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 You are acting as an independent evaluator (LLM-as-a-Judge) for answers generated by a medical RAG system.

  Evaluate the following response on two criteria:

  1. MEDICAL CORRECTNESS (1–3):
    - 1 = Contains incorrect medical statements OR contradicts guidelines (ADA, CDC, WHO).
    - 2 = Mostly correct but missing nuance or slightly outdated.
    - 3 = Accurate and medically sound, reflects current clinical guidelines.

  2. LANGUAGE SIMPLICITY (1–3):
    - 1 = Too technical, confusing, or filled with jargon.
    - 2 = Somewhat understandable, but still technical in places.
    - 3 = Clear, simple, layperson-friendly language.

  Question:
  How many people are affected by diabetes?

  Retrieved Context:
  The Centers for Disease Control Prevention (CDC) estimates that as of 2010, 25.8 million Americans have diabetes, which is 8.3 percent of the population.9
9. 
Diabetes Research and Statistics.Centers for Disease Control and Prevention. https://www.cdc.gov/diabetes/data/index.htm