# Deepeval's RAGAS evaluation

In [1]:
import pandas as pd
import os
from deepeval import evaluate
from deepeval.metrics.ragas import RagasMetric
from deepeval.test_case import LLMTestCase

# Set the API key (ensure it's set securely in practice)
os.environ["OPENAI_API_KEY"] = ""

# Load CSV files
annotations_file = 'Annotations.csv'
new_rels_file = 'NewRels_Skip2.csv'

try:
    annotations_df = pd.read_csv(annotations_file)
    newrels_df = pd.read_csv(new_rels_file)
except FileNotFoundError as e:
    raise FileNotFoundError(f"Ensure both {annotations_file} and {new_rels_file} are in the working directory.") from e

# Generate test cases
test_cases = [
    LLMTestCase(
        input="Extract the relationship between the subject and object.",
        actual_output=f"Subject: {rel['subj']}, Relationship: {rel['rel']}, Object: {rel['obj']}",
        expected_output=f"Subject {gt['subj']}, Relationship: {gt['rel']}, Object: {gt['obj']}",
        retrieval_context=[rel['ref']]
    )
    for _, gt in annotations_df.iterrows()
    for _, rel in newrels_df[
        (newrels_df['subj'] == gt['subj']) | (newrels_df['subjSummary'] == gt['subj'])
    ].iterrows()
]

# Create evaluation metric
metric = RagasMetric(threshold=0.5, model="gpt-4-turbo")

# Evaluate test cases
results = evaluate(test_cases, [metric])

# Calculate average score
average_score = sum(tc.metrics['RagasMetric'].score for tc in test_cases) / len(test_cases)

# Display results
print(f"Average Score: {average_score:.2f}")

for i, test_case in enumerate(test_cases, start=1):
    print(f"\nTest Case {i}:")
    print(f"Expected: {test_case.expected_output}")
    print(f"Actual: {test_case.actual_output}")
    print(f"Score: {test_case.metrics['RagasMetric'].score:.2f}")
    print(f"Context: {test_case.retrieval_context}")




Event loop is already running. Applying nest_asyncio patch to allow async execution...


  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.llms.prompt import PromptValue

Evaluating:   0%|                                         | 0/1 [00:00<?, ?it/s][A

Evaluating:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A


Evaluating:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A



Evaluating:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A




Evaluating:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A





Evaluating:   0%|                                         | 0/1 [00:00<?, ?it/s][A[A[A[A[A[A






Evaluating:   0%|                                    



Metrics Summary

  - ❌ RAGAS (score: nan, threshold: 0.5, strict: False, evaluation model: gpt-4-turbo, reason: None, error: None)

For test case:

  - input: Extract the relationship between the subject and object.
  - actual output: Subject: Physical Exercise, Relationship: recommended, Object: adjunct to other anti-depressive treatment
  - expected output: Subject Physical Exercise, Relationship: Finding, Object: an adjunct to other Anti-depressive treatments
  - context: None
  - retrieval context: ['exercise is considered an adjunct to other anti-depressive treatment']


Metrics Summary

  - ❌ RAGAS (score: nan, threshold: 0.5, strict: False, evaluation model: gpt-4-turbo, reason: None, error: None)

For test case:

  - input: Extract the relationship between the subject and object.
  - actual output: Subject: Physical Exercise, Relationship: definition, Object: recommended non-pharmacological treatment to alleviate symptoms and prevent relapse of depression
  - expected output:




AttributeError: 'LLMTestCase' object has no attribute 'metrics'