# End-to-End DeepEval (Single Question)

In [1]:
# If needed, uncomment to install/upgrade DeepEval
!pip install -U deepeval


zsh:1: command not found: pip


In [2]:
import os
import sys
from pathlib import Path
import requests
import pandas as pd

from dotenv import load_dotenv
load_dotenv('./../.env')

ROOT = Path('..').resolve().parent
sys.path.append(str(ROOT))

BASE_URL = os.getenv('BASE_URL', 'http://localhost:8000')
FILE_PATH = Path(os.getenv('SAMPLE_FILE', '../sample_docs/Match_Summary.pdf')).resolve()
PUBLISH = os.getenv('DEEPEVAL_PUBLISH', 'false').lower() == 'true'

print('Backend:', BASE_URL)
print('File:', FILE_PATH)
print('Publish to Confident AI:', PUBLISH)


Backend: http://localhost:8000
File: /Users/shubhanshurastogi_1/Learning/FinancialAssistant/rag-session-qa/eval/sample_docs/Match_Summary.pdf
Publish to Confident AI: False


In [3]:
from pathlib import Path
from dotenv import load_dotenv

ROOT = Path('..').resolve().parent  # repo root
load_dotenv(ROOT / 'backend' / '.env')


True

In [4]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.evaluate import evaluate, AsyncConfig
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
)

try:
    from deepeval.metrics import ContextualRelevancyMetric
except Exception:
    ContextualRelevancyMetric = None

try:
    from deepeval.metrics import CompletenessMetric
except Exception:
    CompletenessMetric = None

try:
    from deepeval.metrics import GEval
except Exception:
    GEval = None

def build_metrics():
    metrics = [
        ContextualPrecisionMetric(),
        ContextualRecallMetric(),
        AnswerRelevancyMetric(),
        FaithfulnessMetric(),
    ]
    if ContextualRelevancyMetric is not None:
        metrics.append(ContextualRelevancyMetric())
    elif GEval is not None:
        metrics.append(
            GEval(
                name='Context Relevance',
                criteria='Evaluate how relevant the retrieval context is to the question. Score 0 to 1.',
                evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
            )
        )
    if CompletenessMetric is not None:
        metrics.append(CompletenessMetric())
    elif GEval is not None:
        metrics.append(
            GEval(
                name='Completeness',
                criteria='Assess if the answer is complete given the context. Score 0 to 1.',
                evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
            )
        )
    return metrics


## Question

In [5]:
QUESTION = 'How many sixes did Tilak Varma hit?'


## Upload Document

In [6]:
with open(FILE_PATH, 'rb') as f:
    files = {'file': (FILE_PATH.name, f)}
    upload_res = requests.post(f'{BASE_URL}/upload', files=files)

upload_res.raise_for_status()
session_id = upload_res.json().get('session_id')
print('Session:', session_id)


Session: 185f2ade-c73f-43b6-a499-bc1dca64c466


## Ask + Evaluate

In [7]:
payload = {'session_id': session_id, 'question': QUESTION}
ask_res = requests.post(f'{BASE_URL}/ask', json=payload)
ask_res.raise_for_status()
ask_data = ask_res.json()
answer = ask_data.get('answer', '')
retrieval_context = ask_data.get('retrieval_context', [])
print('Answer:', answer)

test_case = LLMTestCase(
    input=QUESTION,
    actual_output=answer,
    expected_output=answer,
    retrieval_context=retrieval_context,
)

metrics = build_metrics()
evaluate(
    test_cases=[test_case],
    metrics=metrics,
    async_config=AsyncConfig(run_async=False)
)


Answer: Tilak Varma hit 3 sixes.


Output()



Metrics Summary

  - ‚úÖ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the top-ranked node in the retrieval contexts directly states 'Tilak Varma hit 3 sixes', perfectly matching the input question. All relevant information is ranked at the top, with no irrelevant nodes present. Great job!, error: None)
  - ‚úÖ Contextual Recall (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the node(s) in retrieval context directly confirm that Tilak Varma hit 3 sixes, perfectly supporting the expected output., error: None)
  - ‚úÖ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job staying focused and concise!, error: None)
  - ‚úÖ Faithfulness (score: 1.0, threshold: 0.5, strict: Fal

EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Contextual Precision', threshold=0.5, success=True, score=1.0, reason="The score is 1.00 because the top-ranked node in the retrieval contexts directly states 'Tilak Varma hit 3 sixes', perfectly matching the input question. All relevant information is ranked at the top, with no irrelevant nodes present. Great job!", strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.003576, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context contains the line \'Tilak Varma b Marco Jansen 45(19) [4s-3 6s-3]\', which directly states that Tilak Varma hit 3 sixes, matching the expected output."\n    }\n]'), MetricData(name='Contextual Recall', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the node(s) in retrieval context directly confirm that Tilak Varma hit 3 sixes, perfectly supporting the expected outpu

## Metric Scores

In [8]:
score_rows = []
for metric in build_metrics():
    metric.measure(test_case)
    score_rows.append({
        'metric': getattr(metric, 'name', metric.__class__.__name__),
        'score': getattr(metric, 'score', None),
        'reason': getattr(metric, 'reason', None),
        'success': getattr(metric, 'success', None),
    })
pd.DataFrame(score_rows)


Output()

Output()

Output()

Output()

Output()

Output()

Unnamed: 0,metric,score,reason,success
0,ContextualPrecisionMetric,1.0,The score is 1.00 because the first node in th...,True
1,ContextualRecallMetric,1.0,The score is 1.00 because the node(s) in retri...,True
2,AnswerRelevancyMetric,1.0,The score is 1.00 because the answer was fully...,True
3,FaithfulnessMetric,1.0,The score is 1.00 because there are no contrad...,True
4,ContextualRelevancyMetric,0.25,"The score is 0.25 because only one statement, ...",False
5,Completeness,1.0,The Actual Output correctly states that Tilak ...,True
