# DeepEval Full Metrics (Match Summary)

In [None]:
# If needed, uncomment to install/upgrade DeepEval
# !pip install -U deepeval


In [None]:
import os
import sys
from pathlib import Path
import requests
import pandas as pd

from dotenv import load_dotenv
load_dotenv('./../.env')

# Make project modules importable: ROOT is the parent of the resolved '..'
# directory, and it is appended to the import search path.
ROOT = Path('..').resolve().parent
sys.path.append(os.fspath(ROOT))

# Runtime settings, each overridable through an environment variable.
BASE_URL = os.environ.get('BASE_URL', 'http://localhost:8000')
FILE_PATH = Path(
    os.environ.get('SAMPLE_FILE', '../sample_docs/Match_Summary.pdf')
).resolve()
# Only the exact (case-insensitive) value 'true' enables publishing.
PUBLISH = os.environ.get('DEEPEVAL_PUBLISH', 'false').lower() == 'true'

# Echo the effective configuration so a run is easy to audit.
print('Backend:', BASE_URL)
print('File:', FILE_PATH)
print('Publish to Confident AI:', PUBLISH)


In [None]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.evaluate import evaluate, AsyncConfig
from deepeval.metrics import (
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    AnswerRelevancyMetric,
    FaithfulnessMetric,
)

# Optional metrics: not every DeepEval release exports these names. Each one
# falls back to None so build_metrics() can substitute a GEval-based metric or
# skip it. Only ImportError is caught so that unrelated failures (e.g. a bug
# raised while importing) are not silently hidden.
try:
    from deepeval.metrics import ContextualRelevancyMetric
except ImportError:
    ContextualRelevancyMetric = None

try:
    from deepeval.metrics import CompletenessMetric
except ImportError:
    CompletenessMetric = None

try:
    from deepeval.metrics import GEval
except ImportError:
    GEval = None

def build_metrics():
    """Return a freshly constructed list of metric instances.

    The four core metrics are always included. Context relevance and
    completeness use the dedicated metric class when it was importable,
    otherwise a GEval metric with equivalent criteria; if neither is
    available that metric is simply left out.
    """
    core_classes = (
        ContextualPrecisionMetric,
        ContextualRecallMetric,
        AnswerRelevancyMetric,
        FaithfulnessMetric,
    )
    selected = [metric_cls() for metric_cls in core_classes]

    # Context relevance: dedicated metric first, GEval as the fallback.
    if ContextualRelevancyMetric is None:
        if GEval is not None:
            selected.append(
                GEval(
                    name='Context Relevance',
                    criteria='Evaluate how relevant the retrieval context is to the question. Score 0 to 1.',
                    evaluation_params=[
                        LLMTestCaseParams.INPUT,
                        LLMTestCaseParams.RETRIEVAL_CONTEXT,
                    ],
                )
            )
    else:
        selected.append(ContextualRelevancyMetric())

    # Completeness: same preference order as above.
    if CompletenessMetric is None:
        if GEval is not None:
            selected.append(
                GEval(
                    name='Completeness',
                    criteria='Assess if the answer is complete given the context. Score 0 to 1.',
                    evaluation_params=[
                        LLMTestCaseParams.INPUT,
                        LLMTestCaseParams.ACTUAL_OUTPUT,
                        LLMTestCaseParams.RETRIEVAL_CONTEXT,
                    ],
                )
            )
    else:
        selected.append(CompletenessMetric())

    return selected


## Test Cases

In [None]:
# Evaluation questions about the match summary document. Ids are assigned
# sequentially (Q1, Q2, ...) in the order the questions are listed.
test_cases = [
    {'id': f'Q{number}', 'question': text}
    for number, text in enumerate(
        (
            'How many runs did Hardik Pandya score?',
            'How many balls did Hardik Pandya face?',
            'How many sixes did Hardik Pandya hit?',
            'How many fours did Hardik Pandya hit?',
            'How many runs did Tilak Varma score?',
            'How many sixes did Tilak Varma hit?',
            'How many fours did Tilak Varma hit?',
            'How many runs did Rinku Singh score?',
            'Who dismissed Suryakumar Yadav?',
            'Who caught Hardik Pandya?',
        ),
        start=1,
    )
]


## Upload Document

In [None]:
# Seconds to wait for the backend during upload. Without a timeout,
# requests waits indefinitely if the server never answers. Override with
# the UPLOAD_TIMEOUT environment variable.
UPLOAD_TIMEOUT = float(os.getenv('UPLOAD_TIMEOUT', '300'))

# Send the sample document as a multipart upload; the file handle only needs
# to stay open for the duration of the request.
with open(FILE_PATH, 'rb') as f:
    files = {'file': (FILE_PATH.name, f)}
    upload_res = requests.post(
        f'{BASE_URL}/upload',
        files=files,
        timeout=UPLOAD_TIMEOUT,
    )

upload_res.raise_for_status()
session_id = upload_res.json().get('session_id')
if not session_id:
    # Fail here with a clear message rather than sending session_id=None to
    # /ask in the next cell.
    raise RuntimeError(
        f"Upload response from {BASE_URL}/upload did not contain a 'session_id'"
    )
print('Session:', session_id)


## Ask + Evaluate

In [None]:
# Seconds to wait for each /ask call. Without a timeout, requests waits
# indefinitely if the server never answers. Override with ASK_TIMEOUT.
ASK_TIMEOUT = float(os.getenv('ASK_TIMEOUT', '120'))

cases = []  # LLMTestCase objects, passed to evaluate() for the aggregate report
rows = []   # per-question dicts holding answer + metric scores/reasons

for case in test_cases:
    q = case['question']
    payload = {'session_id': session_id, 'question': q}
    ask_res = requests.post(f'{BASE_URL}/ask', json=payload, timeout=ASK_TIMEOUT)
    ask_res.raise_for_status()
    ask_data = ask_res.json()
    # `or` also covers keys that are present but null in the JSON response.
    answer = ask_data.get('answer') or ''
    retrieval_context = ask_data.get('retrieval_context') or []

    test_case = LLMTestCase(
        input=q,
        actual_output=answer,
        # A case may carry its own reference answer under 'expected_output'.
        # Without one, the model's answer is reused, so any metric that
        # compares against expected_output is judged against the model's
        # own answer.
        expected_output=case.get('expected_output', answer),
        retrieval_context=retrieval_context,
    )
    cases.append(test_case)

    # Measure each metric on this case with fresh metric instances and keep
    # the score and reason for the flattened table built later.
    metric_scores = {}
    for metric in build_metrics():
        metric.measure(test_case)
        name = getattr(metric, 'name', metric.__class__.__name__)
        metric_scores[name] = {
            'score': getattr(metric, 'score', None),
            'reason': getattr(metric, 'reason', None),
        }

    rows.append({
        'id': case['id'],
        'question': q,
        'answer': answer,
        'metrics': metric_scores,
    })

# NOTE: this runs the same metrics over the same cases a second time, on top
# of the per-case measure() calls above.
evaluate(
    test_cases=cases,
    metrics=build_metrics(),
    async_config=AsyncConfig(run_async=False),
)


## Flattened Metrics Table

In [None]:
def flatten_metrics(metrics_dict):
    """Flatten a ``{metric_name: {'score': ..., 'reason': ...}}`` mapping.

    Each metric contributes two keys, ``<name>_score`` followed by
    ``<name>_reason``, in the iteration order of ``metrics_dict``. A field
    that is absent from a metric's payload is reported as ``None``.
    """
    columns = {}
    for metric_name in metrics_dict:
        entry = metrics_dict[metric_name]
        for field in ('score', 'reason'):
            columns[f'{metric_name}_{field}'] = entry.get(field)
    return columns

# One flat record per evaluated question: the identifying fields first,
# followed by the <metric>_score / <metric>_reason columns.
flat_rows = [
    {
        'id': row['id'],
        'question': row['question'],
        'answer': row['answer'],
        **flatten_metrics(row['metrics']),
    }
    for row in rows
]

# Last expression of the cell, so the notebook renders the table.
pd.DataFrame(flat_rows)
