# Layer 1 Metrics: Contextual Precision, Contextual Recall, and Context Relevance

In [15]:
!pip install -U deepeval



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Setup
This notebook pulls real values from your running application.

Backend requirement: `uvicorn app.main:app --reload --port 8000`

Environment:
- `BASE_URL` (default: `http://localhost:8000`)
- `SAMPLE_FILE` (default: `../sample_docs/Match_Summary.pdf`)
- `QUESTION` (default set below)
- `EXPECTED_OUTPUT` (default set below)



In [16]:
import os
from pathlib import Path

import requests
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.evaluate import evaluate, AsyncConfig
from deepeval.metrics import ContextualPrecisionMetric, ContextualRecallMetric

try:
    from deepeval.metrics import ContextualRelevancyMetric
except Exception:
    ContextualRelevancyMetric = None

try:
    from deepeval.metrics import GEval
except Exception:
    GEval = None

try:
    from dotenv import load_dotenv
    load_dotenv('./../.env')
    load_dotenv('./../../backend/.env')
except Exception:
    pass

BASE_URL = os.getenv('BASE_URL', 'http://localhost:8000')
FILE_PATH = Path(os.getenv('SAMPLE_FILE', '../sample_docs/Match_Summary.pdf')).resolve()

print('Backend:', BASE_URL)
print('File:', FILE_PATH)

if not FILE_PATH.exists():
    raise FileNotFoundError(f'Sample file not found: {FILE_PATH}')


def fetch_case(question: str):
    with open(FILE_PATH, 'rb') as f:
        files = {'file': (FILE_PATH.name, f)}
        upload_res = requests.post(f'{BASE_URL}/upload', files=files, timeout=120)

    upload_res.raise_for_status()
    session_id = upload_res.json().get('session_id')

    payload = {'session_id': session_id, 'question': question}
    ask_res = requests.post(f'{BASE_URL}/ask', json=payload, timeout=120)
    ask_res.raise_for_status()

    ask_data = ask_res.json()
    answer = ask_data.get('answer', '')
    retrieval_context = ask_data.get('retrieval_context', []) or []

    return answer, retrieval_context, session_id



Backend: http://localhost:8000
File: /Users/shubhanshurastogi_1/Learning/rag-session-qa-eval/eval/sample_docs/Match_Summary.pdf


## Get Real Input, Output, Retrieval Context, and Expected Output


In [17]:
print('Each metric section below uses its own question and expected output.')



Each metric section below uses its own question and expected output.


## Evaluate Contextual Precision

**Purpose of this metric:** Contextual Precision checks whether the top retrieved chunks are relevant to the expected answer. A higher score means relevant chunks are ranked earlier.

In [18]:
precision_question = 'How many sixes did Tilak Varma hit?'
precision_expected_output = 'Tilak Varma hit 3 sixes.'

precision_answer, precision_retrieval_context, precision_session_id = fetch_case(precision_question)

print('Session:', precision_session_id)
print('Input:', precision_question)
print('Expected Output:', precision_expected_output)
print('Actual Output:', precision_answer)
print('Retrieved Context Chunks:', len(precision_retrieval_context))
print(f'Retrieved Context ({len(precision_retrieval_context)}): {[c[:200] for c in precision_retrieval_context]}')

contextual_precision_metric = ContextualPrecisionMetric()

precision_test_case = LLMTestCase(
    input=precision_question,
    actual_output=precision_answer,
    expected_output=precision_expected_output,
    retrieval_context=precision_retrieval_context,
)

evaluate(
    test_cases=[precision_test_case],
    metrics=[contextual_precision_metric],
    async_config=AsyncConfig(run_async=False),
)



Session: eab3e64c-ac18-42dd-9da7-34fb79967d42
Input: How many sixes did Tilak Varma hit?
Expected Output: Tilak Varma hit 3 sixes.
Actual Output: Tilak Varma hit 3 sixes.
Retrieved Context Chunks: 1
Retrieved Context (1): ["aren't in use today so it took some time for everyone to realise what happened. A yorker from round the wicket, looks to flick it away, misses and it clips leg stump on the way. Tilak Varma b Marco Ja"]


Output()



Metrics Summary

  - ‚úÖ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the top-ranked node directly states 'Tilak Varma hit 3 sixes', perfectly matching the input. Great job!, error: None)

For test case:

  - input: How many sixes did Tilak Varma hit?
  - actual output: Tilak Varma hit 3 sixes.
  - expected output: Tilak Varma hit 3 sixes.
  - context: None
  - retrieval context: ["aren't in use today so it took some time for everyone to realise what happened. A yorker from round the wicket, looks to flick it away, misses and it clips leg stump on the way. Tilak Varma b Marco Jansen 45(19) [4s-3 6s-3] 10.5 4 Marco Jansen to Tilak Varma, FOUR, round the wicket, short of length across off, and it's swatted wide of mid-on. Fielder gets a hand diving across but it still runs away 10.2 Marco Jansen to Suryakumar Yadav, 1 run, dropped! Suryakumar having all the luck out there! Length ball on leg, tries his tra

EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Contextual Precision', threshold=0.5, success=True, score=1.0, reason="The score is 1.00 because the top-ranked node directly states 'Tilak Varma hit 3 sixes', perfectly matching the input. Great job!", strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.003416, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context contains the line \'Tilak Varma b Marco Jansen 45(19) [4s-3 6s-3]\', which directly states that Tilak Varma hit 3 sixes, matching the expected output."\n    }\n]')], conversational=False, multimodal=False, input='How many sixes did Tilak Varma hit?', actual_output='Tilak Varma hit 3 sixes.', expected_output='Tilak Varma hit 3 sixes.', context=None, retrieval_context=["aren't in use today so it took some time for everyone to realise what happened. A yorker from round the wicket, looks to flick it away, misse

### Contextual Recall

**Purpose of this metric:** Contextual Recall checks whether the retrieved context covers the important facts needed for the expected answer. A higher score means less missing evidence.

In [19]:
recall_question = 'On which delivery was Tilak Varma dismissed and how?'
recall_expected_output = 'Tilak Varma was dismissed on the 11th delivery by Marco Jansen, bowled behind his legs.'

recall_answer, recall_retrieval_context, recall_session_id = fetch_case(recall_question)

print('Session:', recall_session_id)
print('Input:', recall_question)
print('Expected Output:', recall_expected_output)
print('Actual Output:', recall_answer)
print('Retrieved Context Chunks:', len(recall_retrieval_context))
print(f'Retrieved Context ({len(recall_retrieval_context)}): {[c[:200] for c in recall_retrieval_context]}')

contextual_recall_metric = ContextualRecallMetric()

recall_test_case = LLMTestCase(
    input=recall_question,
    actual_output=recall_answer,
    expected_output=recall_expected_output,
    retrieval_context=recall_retrieval_context,
)

evaluate(
    test_cases=[recall_test_case],
    metrics=[contextual_recall_metric],
    async_config=AsyncConfig(run_async=False),
)



Session: cf840788-bb4b-40ef-8f1e-0dd71af2dfa1
Input: On which delivery was Tilak Varma dismissed and how?
Expected Output: Tilak Varma was dismissed on the 11th delivery by Marco Jansen, bowled behind his legs.
Actual Output: Tilak Varma was dismissed on the 11th delivery by Marco Jansen, bowled behind his legs.
Retrieved Context Chunks: 5
Retrieved Context (5): ["aren't in use today so it took some time for everyone to realise what happened. A yorker from round the wicket, looks to flick it away, misses and it clips leg stump on the way. Tilak Varma b Marco Ja", 'f 8.5 6 Nortje to Tilak Varma, SIX, wow, just wow! Advances down to a fast length ball and smokes it over mid-on. Nortje is travelling the distance! 8.2 6 Nortje to Suryakumar Yadav, SIX, first-ball s', "or Linde at deep backward square and Surya's scratchy knock is over. Just never found his timing. Despite that, he walks off with 30 off just 16. Suryakumar Yadav c Linde b Kwena Maphaka 30(16) [4s-2 ", "kumar Yadav, FOUR, an

Output()



Metrics Summary

  - ‚úÖ Contextual Recall (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because every detail in the expected output is directly confirmed by node 3 in the retrieval context. Great job!, error: None)

For test case:

  - input: On which delivery was Tilak Varma dismissed and how?
  - actual output: Tilak Varma was dismissed on the 11th delivery by Marco Jansen, bowled behind his legs.
  - expected output: Tilak Varma was dismissed on the 11th delivery by Marco Jansen, bowled behind his legs.
  - context: None
  - retrieval context: ["aren't in use today so it took some time for everyone to realise what happened. A yorker from round the wicket, looks to flick it away, misses and it clips leg stump on the way. Tilak Varma b Marco Jansen 45(19) [4s-3 6s-3] 10.5 4 Marco Jansen to Tilak Varma, FOUR, round the wicket, short of length across off, and it's swatted wide of mid-on. Fielder gets a hand diving across but it still

EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Contextual Recall', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because every detail in the expected output is directly confirmed by node 3 in the retrieval context. Great job!', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.004824, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The 3rd node states: \'11 W Marco Jansen to Tilak Varma, out Bowled!! Tilak has been bowled behind his legs... A yorker from round the wicket...\' confirming the dismissal, bowler, and method.",\n        "expected_output": "Tilak Varma was dismissed on the 11th delivery by Marco Jansen, bowled behind his legs."\n    }\n]')], conversational=False, multimodal=False, input='On which delivery was Tilak Varma dismissed and how?', actual_output='Tilak Varma was dismissed on the 11th delivery by Marco Jansen, bowled behind his 

### Context Relevance

**Purpose of this metric:** Context Relevance checks whether the retrieved context is on-topic for the user question. A higher score means less noisy or irrelevant retrieval.


In [20]:
relevance_question = 'Which bowler conceded 49 runs in two overs and what key events happened during that spell?'
relevance_expected_output = 'Nortje conceded 49 runs in his first two overs and later dismissed Rinku Singh, caught by Stubbs.'

relevance_answer, relevance_retrieval_context, relevance_session_id = fetch_case(relevance_question)

print('Session:', relevance_session_id)
print('Input:', relevance_question)
print('Expected Output:', relevance_expected_output)
print('Actual Output:', relevance_answer)
print('Retrieved Context Chunks:', len(relevance_retrieval_context))
print(f'Retrieved Context ({len(relevance_retrieval_context)}): {[c[:200] for c in relevance_retrieval_context]}')

if ContextualRelevancyMetric is not None:
    context_relevance_metric = ContextualRelevancyMetric()
elif GEval is not None:
    context_relevance_metric = GEval(
        name='Context Relevance',
        criteria='Evaluate how relevant the retrieval context is to the user question. Score 0 to 1.',
        evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
    )
else:
    raise ImportError('Neither ContextualRelevancyMetric nor GEval is available in this Deepeval version.')

relevance_test_case = LLMTestCase(
    input=relevance_question,
    actual_output=relevance_answer,
    expected_output=relevance_expected_output,
    retrieval_context=relevance_retrieval_context,
)

evaluate(
    test_cases=[relevance_test_case],
    metrics=[context_relevance_metric],
    async_config=AsyncConfig(run_async=False),
)



Session: e350d598-6008-4c7b-b000-447fcc1721b6
Input: Which bowler conceded 49 runs in two overs and what key events happened during that spell?
Expected Output: Nortje conceded 49 runs in his first two overs and later dismissed Rinku Singh, caught by Stubbs.
Actual Output: The bowler who conceded 49 runs in two overs is Nortje. During that spell, he was smashed for runs, but he later took a wicket by dismissing Rinku Singh, who was caught by Stubbs after attempting to hit a pitched-up delivery downtown.
Retrieved Context Chunks: 5
Retrieved Context (5): ["or Linde at deep backward square and Surya's scratchy knock is over. Just never found his timing. Despite that, he walks off with 30 off just 16. Suryakumar Yadav c Linde b Kwena Maphaka 30(16) [4s-2 ", "aren't in use today so it took some time for everyone to realise what happened. A yorker from round the wicket, looks to flick it away, misses and it clips leg stump on the way. Tilak Varma b Marco Ja", 'e past the keeper and through 

Output()



Metrics Summary

  - ‚ùå Contextual Relevancy (score: 0.47058823529411764, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 0.47 because only one statement directly addresses the input ('Nortje was smashed for 49 runs in his first two overs but then took the wicket of Rinku Singh, who was caught by Stubbs.'), while most other statements are about different bowlers or unrelated events, as noted in the irrelevancy reasons., error: None)

For test case:

  - input: Which bowler conceded 49 runs in two overs and what key events happened during that spell?
  - actual output: The bowler who conceded 49 runs in two overs is Nortje. During that spell, he was smashed for runs, but he later took a wicket by dismissing Rinku Singh, who was caught by Stubbs after attempting to hit a pitched-up delivery downtown.
  - expected output: Nortje conceded 49 runs in his first two overs and later dismissed Rinku Singh, caught by Stubbs.
  - context: None
  - retrieval conte

EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Contextual Relevancy', threshold=0.5, success=False, score=0.47058823529411764, reason="The score is 0.47 because only one statement directly addresses the input ('Nortje was smashed for 49 runs in his first two overs but then took the wicket of Rinku Singh, who was caught by Stubbs.'), while most other statements are about different bowlers or unrelated events, as noted in the irrelevancy reasons.", strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.02443, verbose_logs='Verdicts:\n[\n    {\n        "verdicts": [\n            {\n                "statement": "Suryakumar Yadav c Linde b Kwena Maphaka 30(16) [4s-2 6s-2]",\n                "verdict": "yes",\n                "reason": "This statement is relevant because it mentions Kwena Maphaka, who is the bowler in question, and provides information about a wicket and runs scored during his spell."\n       

### Combined View (reuse previous results)

This section does not run evaluation again. It only formats scores from the three metric objects already computed above.



In [21]:
import pandas as pd

# Reuse already-computed metric objects from previous sections
metric_objects = [
    ('Contextual Precision', globals().get('contextual_precision_metric')),
    ('Contextual Recall', globals().get('contextual_recall_metric')),
    ('Context Relevance', globals().get('context_relevance_metric')),
]

rows = []
for fallback_name, metric_obj in metric_objects:
    if metric_obj is None:
        rows.append({
            'metric': fallback_name,
            'status_icon': '‚¨ú',
            'result': 'NOT RUN',
            'score': None,
            'threshold': None,
            'score_pct': None,
            'threshold_pct': None,
            'reason': 'Run its section above first.',
        })
        continue

    score = getattr(metric_obj, 'score', None)
    threshold = getattr(metric_obj, 'threshold', None)
    success = getattr(metric_obj, 'success', None)
    reason = getattr(metric_obj, 'reason', None)

    if success is True:
        result = 'PASS'
        status_icon = '‚úÖ'
    elif success is False:
        result = 'FAIL'
        status_icon = '‚ùå'
    else:
        result = 'N/A'
        status_icon = '‚¨ú'

    rows.append({
        'metric': getattr(metric_obj, 'name', fallback_name),
        'status_icon': status_icon,
        'result': result,
        'score': score,
        'threshold': threshold,
        'score_pct': round(score * 100, 2) if isinstance(score, (int, float)) else None,
        'threshold_pct': round(threshold * 100, 2) if isinstance(threshold, (int, float)) else None,
        'reason': reason,
    })

summary_df = pd.DataFrame(rows)

display(summary_df[['metric', 'status_icon', 'result', 'score', 'threshold', 'score_pct', 'threshold_pct', 'reason']])


def color_by_threshold(row):
    styles = [''] * len(row.index)
    score_pct = row.get('score_pct')
    threshold_pct = row.get('threshold_pct')

    if pd.notna(score_pct) and pd.notna(threshold_pct):
        passed = score_pct >= threshold_pct
        color = '#166534' if passed else '#b91c1c'
        score_idx = row.index.get_loc('score_pct')
        styles[score_idx] = f'background-color: {color}; color: white;'

    return styles


styled_summary = (
    summary_df[['metric', 'status_icon', 'result', 'score_pct', 'threshold_pct', 'reason']]
    .style
    .hide(axis='index')
    .format(
        {
            'score_pct': lambda v: '' if pd.isna(v) else f'{v:.2f}%',
            'threshold_pct': lambda v: '' if pd.isna(v) else f'{v:.2f}%',
        }
    )
    .apply(color_by_threshold, axis=1)
)

styled_summary



Unnamed: 0,metric,status_icon,result,score,threshold,score_pct,threshold_pct,reason
0,Contextual Precision,‚úÖ,PASS,1.0,0.5,100.0,50.0,The score is 1.00 because the top-ranked node ...
1,Contextual Recall,‚úÖ,PASS,1.0,0.5,100.0,50.0,The score is 1.00 because every detail in the ...
2,Context Relevance,‚ùå,FAIL,0.470588,0.5,47.06,50.0,The score is 0.47 because only one statement d...


metric,status_icon,result,score_pct,threshold_pct,reason
Contextual Precision,‚úÖ,PASS,100.00%,50.00%,"The score is 1.00 because the top-ranked node directly states 'Tilak Varma hit 3 sixes', perfectly matching the input. Great job!"
Contextual Recall,‚úÖ,PASS,100.00%,50.00%,The score is 1.00 because every detail in the expected output is directly confirmed by node 3 in the retrieval context. Great job!
Context Relevance,‚ùå,FAIL,47.06%,50.00%,"The score is 0.47 because only one statement directly addresses the input ('Nortje was smashed for 49 runs in his first two overs but then took the wicket of Rinku Singh, who was caught by Stubbs.'), while most other statements are about different bowlers or unrelated events, as noted in the irrelevancy reasons."
