In [1]:
import os
import pandas as pd
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import SummarizationMetric
from deepeval.models import GeminiModel

def batch_summarization_score(
    input_output_pairs,
    api_key,
    batch_size=100,
    # sleep_time=60,
    # max_retries=3,
    # retry_delay=10
):
    """
    Evaluates summarization quality in batches using the Gemini model with deepeval,
    including automatic retries for failed evaluations.

    Args:
        input_output_pairs: List of tuples (input, actual_output)
        api_key: Gemini API key string
        batch_size: Number of evaluations per batch to avoid hitting rate limits
        sleep_time: Sleep time in seconds between batches
        max_retries: Number of retry attempts per batch on failure
        retry_delay: Delay between retries in seconds

    Returns:
        List of dicts containing input, actual_output, score, reason
    """
    model = GeminiModel(
        model_name="gemini-2.0-flash",
        api_key=api_key,
        temperature=0,
    )

    metric = SummarizationMetric(
        threshold=0.5,
        model=model,
        # assessment_questions=[
        #     "Is the coverage score based on a percentage of 'yes' answers?",
        #     "Does the score ensure the summary's accuracy with the source?",
        #     "Does a higher score mean a more comprehensive summary?"
        # ]
        n=10
    )

    results = []

    for i in range(0, len(input_output_pairs), batch_size):
        batch = input_output_pairs[i:i + batch_size]
        test_cases = [LLMTestCase(input=inp, actual_output=out) for inp, out in batch]

        try:
            batch_result = evaluate(test_cases=test_cases, metrics=[metric])
            for test_case, res in zip(batch, batch_result.test_results):
                results.append(res)
        except Exception as e:
            print(f"Batch {i // batch_size + 1} failed with error: {e}")
            for _ in batch:
                results.append(None)  # Preserve alignment with input

    return results

In [2]:
API_KEY = os.getenv("GEMINI_API_KEY")

In [3]:
df = pd.read_csv("/Users/pupipatsingkhorn/Developer/repositories/NanoLLaDA/data/gemini_summaries.csv")

In [4]:
results = batch_summarization_score(
    input_output_pairs=list(
        df[["body", "generated"]].itertuples(index=False, name=None)
    ),
    api_key=API_KEY,
)

Evaluating 100 test case(s) in parallel: |█████▏    | 52% (52/100) [Time Taken: 00:27,  1.91test case/s]

Batch 1 failed with error: 'NoneType' object has no attribute 'truths'





In [5]:
scores = []
for r in results:
    try:
        if r is None:
            raise ValueError("Result is None")
        scores.append(r.metrics_data[0].score)
    except Exception as e:
        print(f"An error occurred: {e}")
        scores.append(0)

An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occurred: Result is None
An error occur

In [6]:
df2 = pd.DataFrame({'body': df['body'], 'score': scores})
df2.to_csv('score-gemini.csv',index=False)