In [None]:

%pip install python-dotenv
%pip uninstall -y deepeval
%pip install "deepeval>=3.2.6"




Note: you may need to restart the kernel to use updated packages.
Found existing installation: deepeval 3.2.6
Uninstalling deepeval-3.2.6:
  Successfully uninstalled deepeval-3.2.6
Note: you may need to restart the kernel to use updated packages.
Collecting deepeval>=3.2.6
  Using cached deepeval-3.2.6-py3-none-any.whl.metadata (17 kB)
Using cached deepeval-3.2.6-py3-none-any.whl (582 kB)
Installing collected packages: deepeval
Successfully installed deepeval-3.2.6
Note: you may need to restart the kernel to use updated packages.


In [1]:
from dotenv import load_dotenv
import os
from deepeval.metrics import FaithfulnessMetric, ContextualPrecisionMetric, AnswerRelevancyMetric, ContextualRecallMetric
from deepeval.test_case import LLMTestCase

load_dotenv()
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("OPENAI_API_KEY not found in .env file.")

examples = [
    {
        "question": "Why do leaves change color in the fall?",
        "contexts": [
            "Leaves turn color in autumn due to changes in daylight and temperature.",
            "Photosynthesis slows and chlorophyll breaks down, revealing red and yellow pigments.",
            "Some tree species have different pigment compounds that show up prominently in fall."
        ],
        "response": "Leaves change color in autumn because shorter days and cooler temperatures cause chlorophyll to break down, revealing other pigments."
    },
    {
        "question": "Why do we have seasons on Earth?",
        "contexts": [
            "The tilt of Earth's axis relative to its orbit causes different parts of Earth to receive varying amounts of sunlight throughout the year.",
            "This axial tilt is approximately 23.5 degrees.",
            "Changes in distance from the sun during the orbit do not cause seasons."
        ],
        "response": "Seasons occur because the Earth's axis is tilted, which changes how sunlight hits different parts of the planet throughout the year."
    },
    {
        "question": "Why do we have seasons on Earth?",
        "contexts": [
            "Earth revolves around the Sun in an elliptical orbit.",
            "There are four main seasons: spring, summer, autumn, and winter."
        ],
        "response": "We have seasons because Earth moves around the Sun."
    },
    {
        "question": "Why do leaves change color in the fall?",
        "contexts": [
            "Fish migrate upstream to spawn in freshwater rivers.",
            "This migration ensures the survival of the next generation."
        ],
        "response": "Leaves change color because fish swim upstream in autumn."
    }
]

for idx, ex in enumerate(examples, start=1):
    print(f"\n### Example {idx}")
    print(f"**Question:** {ex['question']}")
    print(f"**Response:** {ex['response']}")
    
    test_case = LLMTestCase(
        input=ex["question"],
        actual_output=ex["response"],
        expected_output=ex["response"],
        retrieval_context=ex["contexts"] 
    )
    
    faithfulness_metric = FaithfulnessMetric()
    context_precision_metric = ContextualPrecisionMetric()
    answer_relevancy_metric = AnswerRelevancyMetric()
    contextual_recall_metric = ContextualRecallMetric()  

    faithfulness_score = faithfulness_metric.measure(test_case)
    context_precision_score = context_precision_metric.measure(test_case)
    answer_relevancy_score = answer_relevancy_metric.measure(test_case)
    contextual_recall_score = contextual_recall_metric.measure(test_case)  

    print("\n| Metric            | Score |")
    print("|-------------------|-------|")
    print(f"| Faithfulness       | {faithfulness_score:.3f} |")
    print(f"| Context Precision  | {context_precision_score:.3f} |")
    print(f"| Answer Relevancy   | {answer_relevancy_score:.3f} |")
    print(f"| Contextual Recall  | {contextual_recall_score:.3f} |")  


Output()


### Example 1
**Question:** Why do leaves change color in the fall?
**Response:** Leaves change color in autumn because shorter days and cooler temperatures cause chlorophyll to break down, revealing other pigments.


Output()

Output()

Output()

Output()


| Metric            | Score |
|-------------------|-------|
| Faithfulness       | 1.000 |
| Context Precision  | 1.000 |
| Answer Relevancy   | 1.000 |
| Contextual Recall  | 1.000 |

### Example 2
**Question:** Why do we have seasons on Earth?
**Response:** Seasons occur because the Earth's axis is tilted, which changes how sunlight hits different parts of the planet throughout the year.


Output()

Output()

Output()

Output()


| Metric            | Score |
|-------------------|-------|
| Faithfulness       | 1.000 |
| Context Precision  | 1.000 |
| Answer Relevancy   | 1.000 |
| Contextual Recall  | 1.000 |

### Example 3
**Question:** Why do we have seasons on Earth?
**Response:** We have seasons because Earth moves around the Sun.


Output()

Output()

Output()

Output()


| Metric            | Score |
|-------------------|-------|
| Faithfulness       | 1.000 |
| Context Precision  | 1.000 |
| Answer Relevancy   | 1.000 |
| Contextual Recall  | 1.000 |

### Example 4
**Question:** Why do leaves change color in the fall?
**Response:** Leaves change color because fish swim upstream in autumn.


Output()

Output()

Output()


| Metric            | Score |
|-------------------|-------|
| Faithfulness       | 0.000 |
| Context Precision  | 1.000 |
| Answer Relevancy   | 0.000 |
| Contextual Recall  | 0.000 |


In [3]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval import assert_test
from deepeval.metrics import GEval 
test_case = LLMTestCase(
    input="Why do leaves change color in autumn?",
    actual_output="Leaves change color in autumn due to chlorophyll breakdown.",
    expected_output="Leaves change color because chlorophyll fades, revealing other pigments."
)

correctness_metric = GEval(
    name="Correctness",
    criteria="Determine if the actual_output correctly and completely explains why leaves change color in autumn based on the expected_output.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    threshold=0.6
)

assert_test(test_case, [correctness_metric])

Output()

AssertionError: Metrics: Correctness [GEval] (score: 0.5970535200706688, threshold: 0.6, strict: False, error: None, reason: The actual output correctly identifies chlorophyll breakdown as the main reason for leaves changing color, aligning with the expected output's main point. However, it omits the detail that other pigments are revealed when chlorophyll fades, which is a key component in the expected output. There are no significant inaccuracies, but the omission of the role of other pigments prevents a higher score.) failed.

In [5]:

from deepeval import assert_test

test_case = LLMTestCase(
    input="Why do leaves change color in autumn?",
    actual_output="Leaves change color in autumn due to chlorophyll breakdown.",
    expected_output="Leaves change color because chlorophyll fades, revealing other pigments.",
    retrieval_context=[
        "In autumn, cooler temperatures and shorter days reduce chlorophyll production.",
        "As chlorophyll breaks down, yellows and reds become visible."
    ]
)

faithfulness = FaithfulnessMetric(threshold=0.8)
precision = ContextualPrecisionMetric(threshold=0.7)
recall = ContextualRecallMetric(threshold=0.7)
relevancy = AnswerRelevancyMetric(threshold=0.7)
correctness = GEval(
    name="Correctness",
    criteria="Does the answer correctly and completely explain why leaves change color in autumn?",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    threshold=0.7
)

assert_test(test_case, [faithfulness, precision, recall, relevancy, correctness])


Output()

AssertionError: Metrics: Correctness [GEval] (score: 0.40293624525784616, threshold: 0.7, strict: False, error: None, reason: The Actual Output correctly identifies chlorophyll breakdown as the reason for leaves changing color, aligning with the Expected Output's mention of chlorophyll fading. However, it omits the explanation that other pigments are revealed, and does not mention carotenoids, anthocyanins, or environmental triggers like temperature and daylight changes. The explanation is less complete and lacks clarity compared to the Expected Output.) failed.