In [3]:
from dotenv import load_dotenv
import os
load_dotenv()  # load variables from a local .env file (if any) into the process environment

# Fail fast when a required key is absent. Copying os.getenv() straight back into
# os.environ raises an opaque "TypeError: str expected, not NoneType" for a missing
# key, so check explicitly and name the variable(s) that need to be set.
# Empty values are treated as missing because a blank key cannot authenticate.
# CONFIDENT_AI_API_KEY is read by the Confident AI login cell below; OPENAI_API_KEY
# is presumably consumed by the gpt-4o-backed metrics -- TODO confirm.
REQUIRED_ENV_VARS = ("CONFIDENT_AI_API_KEY", "OPENAI_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Add them to your .env file or export them before running this notebook."
    )

## Logging In to Confident AI

In [2]:
import deepeval

# Log in to Confident AI with the API key taken from the environment.
# Indexing os.environ directly raises KeyError if CONFIDENT_AI_API_KEY is unset.
deepeval.login_with_confident_api_key(os.environ["CONFIDENT_AI_API_KEY"])

## First DeepEval Tests

### Answer Relevancy Metric

In [None]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

# Describe the interaction to be scored: the question, the answer under
# evaluation, and the passage supplied as retrieval context.
test_case = LLMTestCase(
    input="Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?",
    actual_output="Abhishek Sharma",
    retrieval_context=[
        "Abhishek Sharma is the number one ranked T20I batsman in the world as of latest ICC Rankings.",
    ],
)

# The metric is created with no overrides, i.e. whatever defaults the library uses.
answer_relevancy_metric = AnswerRelevancyMetric()
answer_relevancy_metric.measure(test_case)

# After measure() has run, the score is read back from the metric object itself.
print(f"Answer Relevancy Score: {answer_relevancy_metric.score}")

Output()

Answer Relevancy Score: 1.0


### Contextual Precision Metric

Here we also log the metric results to the Confident AI dashboard by running the test case through the `evaluate` function.

In [15]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualPrecisionMetric
from deepeval import evaluate

# Test case for the evaluation. In a real pipeline `actual_output` should be the
# output of an LLM, a RAG pipeline or an agentic system, and `retrieval_context`
# should come from a vector DB, an agent tool, etc.; both are hard-coded here.
test_case = LLMTestCase(
    input="Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?",
    actual_output="Abhishek Sharma",
    expected_output="Abhishek Sharma",
    retrieval_context=[
        "Abhishek Sharma is the number one ranked T20I batsman in the world as of latest ICC Rankings.",
    ],
)

metric = ContextualPrecisionMetric(
    threshold=0.5,  # threshold for the precision score
    model="gpt-4o",  # model used to run the evaluation
    include_reason=True,  # include the reasoning in the evaluation result
)

# Kept as the cell's last expression so the returned result is displayed.
evaluate(test_cases=[test_case], metrics=[metric])

Output()



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant node is perfectly ranked at the top, providing the precise information needed. Great job on achieving the highest accuracy!, error: None)

For test case:

  - input: Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?
  - actual output: Abhishek Sharma
  - expected output: Abhishek Sharma
  - context: None
  - retrieval context: ['Abhishek Sharma is the number one ranked T20I batsman in the world as of latest ICC Rankings.']


Overall Metric Pass Rates

Contextual Precision: 100.00% pass rate




I0000 00:00:1753957022.331906 15703634 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Contextual Precision', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the relevant node is perfectly ranked at the top, providing the precise information needed. Great job on achieving the highest accuracy!', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0033950000000000004, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context directly states that \'Abhishek Sharma is the number one ranked T20I batsman in the world as of latest ICC Rankings,\' which matches the expected output."\n    }\n]')], conversational=False, multimodal=False, input='Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?', actual_output='Abhishek Sharma', expected_output='Abhishek Sharma', context=None, retrieval_context=['Abhishek Sharma is the number one ranked T20I batsman in the worl

In [24]:
# Show the `input` field stored on the current `test_case`.
print(test_case.input)

Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?


### Creating Evaluation Datasets To Evaluate our Tests

In [29]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.dataset import EvaluationDataset, Golden

# The single test case that will be placed in the dataset.
test_case = LLMTestCase(
    input="Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?",
    actual_output="Abhishek Sharma",
    expected_output="Abhishek Sharma",
    retrieval_context=[
        "Abhishek Sharma is the number one ranked T20I batsman in the world as of latest ICC Rankings.",
    ],
)

# Start from an empty dataset and register the test case on it.
# NOTE(review): an earlier draft seeded the dataset from goldens instead
# (EvaluationDataset(goldens=[...])); that route is not used here.
dataset = EvaluationDataset()
dataset.add_test_case(test_case)

# Metric that the dataset's test cases are evaluated against in the next cell.
metric = ContextualPrecisionMetric(
    threshold=0.5,  # threshold for the precision score
    model="gpt-4o",  # model used to run the evaluation
    include_reason=True,  # include the reasoning in the evaluation result
)

In [30]:
# Evaluate the test cases held by the dataset against the metric. The call is the
# cell's last expression, so its return value is displayed.
evaluate(test_cases=dataset.test_cases, metrics=[metric])

Output()



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant node is perfectly ranked at the top, providing the precise information needed. Great job on achieving perfect precision!, error: None)

For test case:

  - input: Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?
  - actual output: Abhishek Sharma
  - expected output: Abhishek Sharma
  - context: None
  - retrieval context: ['Abhishek Sharma is the number one ranked T20I batsman in the world as of latest ICC Rankings.']


Overall Metric Pass Rates

Contextual Precision: 100.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Contextual Precision', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the relevant node is perfectly ranked at the top, providing the precise information needed. Great job on achieving perfect precision!', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.003385, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context directly states that \'Abhishek Sharma is the number one ranked T20I batsman in the world as of latest ICC Rankings,\' which matches the expected output."\n    }\n]')], conversational=False, multimodal=False, input='Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?', actual_output='Abhishek Sharma', expected_output='Abhishek Sharma', context=None, retrieval_context=['Abhishek Sharma is the number one ranked T20I batsman in the world as of latest I

### Creating Goldens (Dataset) for Evaluation

In [31]:
# Raw evaluation records: one dict per question, holding the prompt under
# "input" and the reference answer under "expected_output".
test_data = [
    {"input": question, "expected_output": answer}
    for question, answer in (
        (
            "Who is the number one ranked T20I batsman in the world as of latest ICC Rankings?",
            "Abhishek Sharma",
        ),
        ("What is the capital of France?", "Paris"),
    )
]

In [32]:
## Convert each test-data dictionary in the list into a Golden

from deepeval.dataset import Golden, EvaluationDataset

# A list of all goldens: one Golden per raw record, carrying over that record's
# "input" and "expected_output" values.
goldens = [
    Golden(input=record["input"], expected_output=record["expected_output"])
    for record in test_data
]
