In [1]:
from deepeval import evaluate
from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

### Test Correctness

In [2]:
import sys
from pathlib import Path

# Make the `all_rag_techniques` package importable from this notebook.
# Assumption: the notebook runs from a directory that sits next to (or inside)
# the package, so the package's container is the parent of the working
# directory. (`__file__` is not reliable inside notebooks, hence Path.cwd().)
current_dir = Path.cwd()
project_root = current_dir.parent

# Register the project root on sys.path exactly once.
if str(project_root) in sys.path:
    print("Project root already in path.")
else:
    # Position 0 means this directory is searched before every other entry.
    sys.path.insert(0, str(project_root))
    print(f"Added project root to path: {project_root}")

# Import the helpers and run them. Note that a failure in the import *or* in
# either helper call ends up in the same handler and is reported with the
# same "import failed" message.
try:
    from all_rag_techniques import setup_environment, check_keys
    print("✅ Package imported successfully!")
    setup_environment()
    check_keys()
except Exception as exc:
    print(f"❌ Final import failed: {exc}")

Added project root to path: /Users/ruhwang/Desktop/AI/my_projects/context-engineering/advanced-rag
✅ Package imported successfully!
LANGCHAIN_API_KEY not set (empty in .env file)
Environment setup complete!
=== API Keys from config.py ===
  GROQ_API_KEY: Loaded
  COHERE_API_KEY: Loaded
  OPENAI_API_KEY: Loaded
  LANGCHAIN_API_KEY: Missing

=== Environment Variables ===
  os.environ['GROQ_API_KEY']: Set
  os.environ['COHERE_API_KEY']: Set

All essential keys loaded!


In [None]:
!pip install ipywidgets

In [8]:
# Inspect what kind of object LLMTestCaseParams is (the saved output shows an enum type).
type(LLMTestCaseParams)

enum.EnumType

In [9]:
# Display the EXPECTED_OUTPUT member; the saved output shows its value is 'expected_output'.
LLMTestCaseParams.EXPECTED_OUTPUT

<LLMTestCaseParams.EXPECTED_OUTPUT: 'expected_output'>

In [11]:
# Reference answer and model answer for a single factual question.
gt_answer = "Madrid is the capital of Spain."
pred_answer = "MadriD."

# GEval metric named "Correctness", judged by gpt-4o-mini. It is configured to
# look at the expected and actual outputs and to follow the steps listed below.
# NOTE(review): "perserved" in the last step looks like a typo for "preserved".
# It is left untouched here because the string is part of the judge prompt and
# editing it could shift scores.
correctness_metric = GEval(
    name="Correctness",
    model="gpt-4o-mini",
    evaluation_steps=[
        "Determine whether the actual output is factually correct based on the expected output",
        "Do not heavily penalize minor semantic differences, such as capitalization or punctuation",
        "if the meaning is perserved, but with different words, still give a reasonably high score",
    ],
    evaluation_params=[
        LLMTestCaseParams.EXPECTED_OUTPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ],
)

test_case_correctness = LLMTestCase(
    input="What is the capital of Spain?",
    actual_output=pred_answer,
    expected_output=gt_answer,
)

# Score the test case, then show the resulting score.
correctness_metric.measure(test_case_correctness)
print(correctness_metric.score)

Output()

0.2916029953786671


In [None]:
# Second correctness check: the two answers differ only in letter case.
# Note: this rebinds gt_answer, pred_answer and test_case_correctness, which
# an earlier cell also assigns.
gt_answer = "graph rag"
pred_answer = "Graph RAG"

test_case_correctness = LLMTestCase(
    input="alternative strategies to RAG for relational, knowledge-intensive tasks?",
    actual_output=pred_answer,
    expected_output=gt_answer,
)

# Reuse the already-configured correctness metric on the new test case.
correctness_metric.measure(test_case_correctness)
print(correctness_metric.score)

Output()

0.8125496737909847


### Test faithfulness

In [13]:
# Display the imported FaithfulnessMetric class (the saved output shows its fully qualified path).
FaithfulnessMetric

deepeval.metrics.faithfulness.faithfulness.FaithfulnessMetric

In [4]:
# Minimal faithfulness example: the answer is exactly the retrieved context.
question = "what is 3+3?"
context = ["6"]
generated_answer = "6"

# Faithfulness metric judged by gpt-4o-mini, with a pass threshold of 0.7.
# include_reason=False means no explanation is requested, which is why the
# `reason` printed at the end of this cell shows up as None in the saved output.
faithfulness_metric = FaithfulnessMetric(
    model="gpt-4o-mini",
    threshold=0.7,
    include_reason=False,
)

test_case = LLMTestCase(
    input=question,
    retrieval_context=context,
    actual_output=generated_answer,
)

faithfulness_metric.measure(test_case)
print(faithfulness_metric.score)
print(faithfulness_metric.reason)

Output()

1.0
None


### Test contextual relevancy 

In [None]:
# Contextual relevancy example: three retrieved snippets, only the last of
# which restates the question's topic.
actual_output = "then go somewhere else."
retrieval_context = [
    "this is a test context",
    "mike is a cat",
    "if the shoes don't fit, then go somewhere else.",
]
gpt_answer = "if the shoes don't fit, then go somewhere else."

# Contextual relevancy metric judged by gpt-4o-mini; threshold=1 is the
# strictest setting used in this notebook, and a textual reason is requested.
relevance_metric = ContextualRelevancyMetric(
    model="gpt-4o-mini",
    threshold=1,
    include_reason=True,
)

relevance_test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    expected_output=gpt_answer,
    actual_output=actual_output,
    retrieval_context=retrieval_context,
)

# Score the test case, then show both the score and the judge's explanation.
relevance_metric.measure(relevance_test_case)
print(relevance_metric.score)
print(relevance_metric.reason)

Output()

0.0
The score is 0.00 because there are no relevant statements in the retrieval context that address the concern about whether the shoes fit, as all provided statements are unrelated.


In [6]:
# Stand-alone test case carrying all four fields (question, expected answer,
# actual answer, retrieved context) so that it can be scored by several
# different metrics.
new_test_case = LLMTestCase(
    input="What is the capital of Spain?",
    actual_output="MadriD.",
    expected_output="Madrid is the capital of Spain.",
    retrieval_context=["Madrid is the capital of Spain."],
)

### Evaluate two test cases against several metrics at once

In [7]:
# Batch evaluation: hand both test cases and all three metrics to deepeval's
# `evaluate` in a single call.
# NOTE(review): the saved output of this cell is a LengthFinishReasonError
# (completion_tokens=16384), i.e. the recorded run did not finish. Which
# metric / test case combination triggered it is not visible here — confirm
# and fix before relying on this cell under "Restart & Run All".
evaluate(
    test_cases=[relevance_test_case, new_test_case],
    metrics=[correctness_metric, faithfulness_metric, relevance_metric]
)

Output()

LengthFinishReasonError: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=16384, prompt_tokens=514, total_tokens=16898, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

### Function to create multiple LLMTestCases based on four lists: 
* Questions
* Ground Truth Answers
* Generated Answers
* Retrieved Documents - Each element is a list

In [None]:
def create_deep_eval_test_cases(questions, gpt_answers, generated_answers, retrieved_documents):
    """Build one ``LLMTestCase`` per aligned entry of the four input sequences.

    Args:
        questions: Iterable of input questions.
        gpt_answers: Iterable of ground-truth (expected) answers, one per question.
        generated_answers: Iterable of answers produced by the system under test.
        retrieved_documents: Iterable whose elements are each the list of
            retrieved documents for the matching question.

    Returns:
        A list of ``LLMTestCase`` objects in input order. The inputs are
        combined with ``zip``, so the result is as long as the shortest input.
    """
    return [
        LLMTestCase(
            input=question,
            # Use the per-item expected answer bound by the loop. The previous
            # code read `gt_answer`, an unrelated notebook-level variable, so
            # every test case received the same stale expected output (or a
            # NameError on a fresh kernel).
            expected_output=gpt_answer,
            actual_output=generated_answer,
            retrieval_context=retrieved_document,
        )
        for question, gpt_answer, generated_answer, retrieved_document in zip(
            questions, gpt_answers, generated_answers, retrieved_documents
        )
    ]

![](https://europe-west1-rag-techniques-views-tracker.cloudfunctions.net/rag-techniques-tracker?notebook=evaluation--evaluation-deep-eval)