In [None]:
!pip3 install deepeval
!pip3 install pandas

In [5]:
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, ContextualPrecisionMetric, ContextualRecallMetric, FaithfulnessMetric, HallucinationMetric
from deepeval.test_case import LLMTestCase
import openai
import pandas as pd

In [None]:
openai.api_key = ""

In [8]:
# Load the data from the CSV file
df = pd.read_csv('deepeval_evaluation.csv', encoding='latin1')

# Extract the relevant columns
inputs = df['input'].tolist()
actual_outputs = df['actual_output'].tolist()
expected_outputs = df['expected_output'].tolist()

In [9]:
# number of rows
n = 3  

In [10]:
# Answer Relevancy Metric

# Initialize the metric
metric = AnswerRelevancyMetric(
    threshold=0.7,
    model="gpt-4",  # Ensure you have access to this model
    include_reason=True
)

# Create a list to hold test cases
test_cases = []

# Loop through the data to create test cases
for i in range(n):
    test_case = LLMTestCase(
        input=inputs[i],
        actual_output=actual_outputs[i]
    )
    test_cases.append(test_case)

# Evaluate test cases in bulk
evaluate(test_cases, [metric])

# Optionally, print the scores and reasons
for test_case in test_cases:
    if hasattr(test_case, 'metrics') and test_case.metrics:  # Ensure metrics are available
        print(f"Test case input: {test_case.input}")
        print(f"Answer Relevancy Score: {test_case.metrics[0].score}")
        print(f"Reason: {test_case.metrics[0].reason}")
    else:
        print(f"Metrics not available for input: {test_case.input}")
    print("="*50)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
# Contextual Precision Metric
metric = ContextualPrecisionMetric(
    threshold=0.7,
    model="gpt-4",  # Ensure you have access to this model
    include_reason=True
)

# Create a list to hold test cases
test_cases = []

# Loop through the data to create 10 test cases
for i in range(n):
    test_case = LLMTestCase(
        input=inputs[i],
        actual_output=actual_outputs[i],
        expected_output=expected_outputs[i],
        retrieval_context=retrieval_context[i],
        context=provided_context[i]
    )
    test_cases.append(test_case)

# Evaluate test cases in bulk
evaluate(test_cases, [metric])

# Optionally, print the scores and reasons
for test_case in test_cases:
    if hasattr(test_case, 'metrics') and test_case.metrics:  # Ensure metrics are available
        print(f"Test case input: {test_case.input}")
        print(f"Score: {test_case.metrics[0].score}")
        print(f"Reason: {test_case.metrics[0].reason}")
    else:
        print(f"Metrics not available for input: {test_case.input}")
    print("="*50)

In [None]:
# Contextual Recall Metric
metric = ContextualRecallMetric(
    threshold=0.7,
    model="gpt-4",  # Ensure you have access to this model
    include_reason=True
)

# Create a list to hold test cases
test_cases = []

# Loop through the data to create test cases
for i in range(n):
    test_case = LLMTestCase(
        input=inputs[i],
        actual_output=actual_outputs[i],
        expected_output=expected_outputs[i],
        retrieval_context=retrieval_context[i],
        context=provided_context[i]
    )
    test_cases.append(test_case)

# Evaluate test cases in bulk
evaluate(test_cases, [metric])

# Optionally, print the scores and reasons
for test_case in test_cases:
    if hasattr(test_case, 'metrics') and test_case.metrics:  # Ensure metrics are available
        print(f"Test case input: {test_case.input}")
        print(f"Recall Score: {test_case.metrics[0].score}")
        print(f"Reason: {test_case.metrics[0].reason}")
    else:
        print(f"Metrics not available for input: {test_case.input}")
    print("="*50)

In [None]:
# FaithfulnessMetric
metric = FaithfulnessMetric(
    threshold=0.7,
    model="gpt-4",  # Ensure you have access to this model
    include_reason=True
)

# Create a list to hold test cases
test_cases = []

# Loop through the data to create test cases
for i in range(n):
    test_case = LLMTestCase(
        input=inputs[i],
        actual_output=actual_outputs[i],
        retrieval_context=retrieval_context[i]
    )
    test_cases.append(test_case)

# Evaluate test cases in bulk
evaluate(test_cases, [metric])

# Optionally, print the scores and reasons
for test_case in test_cases:
    if hasattr(test_case, 'metrics') and test_case.metrics:  # Ensure metrics are available
        print(f"Test case input: {test_case.input}")
        print(f"Faithfulness Score: {test_case.metrics[0].score}")
        print(f"Reason: {test_case.metrics[0].reason}")
    else:
        print(f"Metrics not available for input: {test_case.input}")
    print("="*50)


In [None]:
# Hallucination Metric
metric = HallucinationMetric(threshold=0.5)

# Create a list to hold test cases
test_cases = []

# Loop through the data to create test cases
for i in range(n):
    test_case = LLMTestCase(
        input=inputs[i],
        actual_output=actual_outputs[i],
        expected_output=expected_outputs[i],
        context=provided_context[i]
    )
    test_cases.append(test_case)

# Evaluate test cases in bulk
evaluate(test_cases, [metric])

# Optionally, print the scores and reasons
for test_case in test_cases:
    if hasattr(test_case, 'metrics') and test_case.metrics:  # Ensure metrics are available
        print(f"Test case input: {test_case.input}")
        print(f"Hallucination Score: {test_case.metrics[0].score}")
        print(f"Reason: {test_case.metrics[0].reason}")
    else:
        print(f"Metrics not available for input: {test_case.input}")
    print("="*50)