In [1]:
# import os
# import pandas as pd
# from deepeval import evaluate
# from deepeval.test_case import LLMTestCase
# from deepeval.metrics import SummarizationMetric
# from deepeval.models import GeminiModel

# def batch_summarization_score(
#     input_output_pairs,
#     api_key,
#     batch_size=100,
#     # sleep_time=60,
#     # max_retries=3,
#     # retry_delay=10
# ):
#     """
#     Evaluates summarization quality in batches using the Gemini model with deepeval.
#     A batch that raises is reported and skipped; it is not retried.

#     Args:
#         input_output_pairs: List of tuples (input, actual_output)
#         api_key: Gemini API key string
#         batch_size: Number of evaluations per batch to avoid hitting rate limits
#         sleep_time, max_retries, retry_delay: Not implemented (commented out in the signature)

#     Returns:
#         List with one entry per input pair: the deepeval test result, or None
#         for every pair of a batch that failed
#     """
#     model = GeminiModel(
#         model_name="gemini-2.0-flash",
#         api_key=api_key,
#         temperature=0,
#     )

#     metric = SummarizationMetric(
#         threshold=0.5,
#         model=model,
#         # assessment_questions=[
#         #     "Is the coverage score based on a percentage of 'yes' answers?",
#         #     "Does the score ensure the summary's accuracy with the source?",
#         #     "Does a higher score mean a more comprehensive summary?"
#         # ]
#         n=10
#     )

#     results = []

#     for i in range(0, len(input_output_pairs), batch_size):
#         batch = input_output_pairs[i:i + batch_size]
#         test_cases = [LLMTestCase(input=inp, actual_output=out) for inp, out in batch]

#         try:
#             batch_result = evaluate(test_cases=test_cases, metrics=[metric])
#             for test_case, res in zip(batch, batch_result.test_results):
#                 results.append(res)
#         except Exception as e:
#             print(f"Batch {i // batch_size + 1} failed with error: {e}")
#             for _ in batch:
#                 results.append(None)  # Preserve alignment with input

#     return results

In [2]:
# import asyncio
# import os
# import pandas as pd
# from deepeval import evaluate
# from deepeval.test_case import LLMTestCase
# from deepeval.metrics import SummarizationMetric
# from deepeval.models import GeminiModel

# async def async_summarization_score(
#     input_output_pairs,
#     api_key,
# ):
#     """
#     Evaluates summarization quality using async individual test case calls with the Gemini model,
#     processing all test cases concurrently with asyncio.gather, returning the metric score or 0 on failure.

#     Args:
#         input_output_pairs: List of tuples (input, actual_output)
#         api_key: Gemini API key string

#     Returns:
#         List of floats containing the summarization score for each test case, or 0 if evaluation fails
#     """
#     model = GeminiModel(
#         model_name="gemini-2.0-flash",
#         api_key=api_key,
#         temperature=0,
#     )

#     metric = SummarizationMetric(
#         threshold=0.5,
#         model=model,
#         n=10
#     )

#     async def evaluate_test_case(test_case):
#         try:
#             result = await evaluate(test_cases=[test_case], metrics=[metric])
#             return result.test_results[0].metrics_data[0].score
#         except Exception as e:
#             print(f"Test case failed with error: {e}")
#             return 0

#     test_cases = [LLMTestCase(input=inp, actual_output=out) for inp, out in input_output_pairs]

#     # Process all test cases concurrently using asyncio.gather
#     results = await asyncio.gather(
#         *[evaluate_test_case(test_case) for test_case in test_cases],
#         return_exceptions=True
#     )

#     # Handle any exceptions returned by gather
#     final_results = [result if isinstance(result, float) else 0 for result in results]

#     return final_results

In [3]:

from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import SummarizationMetric
from deepeval.models import GeminiModel

def summarize_100_pairs(
    input_output_pairs,
    api_key,
    model_name="gemini-2.0-flash",
    threshold=0.5,
    n=10,
    failure_score=0,
):
    """
    Score summaries one pair at a time with DeepEval's SummarizationMetric,
    using a Gemini model as the judge.

    Despite the name, the number of pairs is not enforced; any iterable of
    (input, output) pairs is accepted and scored sequentially.

    Args:
        input_output_pairs: Iterable of (input, actual_output) tuples, where
            ``input`` is the source text and ``actual_output`` its summary.
        api_key: Gemini API key, passed through to ``GeminiModel``.
        model_name: Gemini model used as the judge.
        threshold: Threshold passed to ``SummarizationMetric``.
        n: Value passed as ``n`` to ``SummarizationMetric``.
        failure_score: Value recorded for a pair whose evaluation raised.
            Defaults to 0 for backward compatibility; pass ``float("nan")``
            or ``None`` to keep failures distinguishable from a real 0 score.

    Returns:
        List of scores, one per input pair and in the same order. Pairs whose
        evaluation raised hold ``failure_score``.
    """
    # Materialize once so generators work and emptiness can be checked.
    pairs = list(input_output_pairs)
    if not pairs:
        # Nothing to score: skip building the model client altogether.
        return []

    model = GeminiModel(
        model_name=model_name,
        api_key=api_key,
        temperature=0,
    )
    metric = SummarizationMetric(
        threshold=threshold,
        model=model,
        n=n,
    )

    results = []
    for position, (inp, out) in enumerate(pairs, start=1):
        try:
            test_case = LLMTestCase(input=inp, actual_output=out)
            result = evaluate(test_cases=[test_case], metrics=[metric])
            score = result.test_results[0].metrics_data[0].score
        except Exception as e:
            # Best effort: report the failure and keep the output aligned
            # with the input by recording a placeholder score.
            print(f"Error evaluating pair {position}: {e}")
            score = failure_score
        results.append(score)
    return results


In [4]:
import os
# Gemini API key taken from the environment; None when GEMINI_API_KEY is unset.
API_KEY = os.environ.get("GEMINI_API_KEY")

In [5]:
import pandas as pd
# Score a small random subset of the generated summaries. The fixed seed keeps
# the sampled rows (and therefore the scores written later) reproducible
# across kernel restarts.
# NOTE(review): machine-specific absolute path — consider a configurable,
# repository-relative data directory.
df = pd.read_csv(
    "/Users/pupipatsingkhorn/Developer/repositories/NanoLLaDA/data/gemini_summaries.csv"
).sample(10, random_state=42)

In [6]:
# Score every (article body, generated summary) pair of the sampled frame,
# in row order.
scores = summarize_100_pairs(
    input_output_pairs=list(zip(df["body"], df["generated"])),
    api_key=API_KEY,
)

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.09s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.6, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.60 because the summary omits answers to several questions that the original text could answer, such as details about Manning's fines, a suicide attempt, hospitalization, and beliefs about the grand jury., error: None)

For test case:

  - input: ‡πÄ‡∏°‡∏∑‡πà‡∏≠ 12 ‡∏°‡∏µ.‡∏Ñ. ‡∏ô‡∏≤‡∏¢‡πÅ‡∏≠‡∏ô‡πÇ‡∏ò‡∏ô‡∏µ ‡πÄ‡∏ó‡∏£‡∏ô‡∏Å‡∏≤ ‡∏ú‡∏π‡πâ‡∏û‡∏¥‡∏û‡∏≤‡∏Å‡∏©‡∏≤‡∏®‡∏≤‡∏•‡∏Å‡∏•‡∏≤‡∏á‡πÉ‡∏ô‡πÄ‡∏°‡∏∑‡∏≠‡∏á‡∏≠‡πÄ‡∏•‡∏Å‡∏ã‡∏≤‡∏ô‡πÄ‡∏î‡∏£‡∏µ‡∏¢ ‡∏£‡∏±‡∏ê‡πÄ‡∏ß‡∏≠‡∏£‡πå‡∏à‡∏¥‡πÄ‡∏ô‡∏µ‡∏¢ ‡πÉ‡∏ô‡∏™‡∏´‡∏£‡∏±‡∏ê‡∏Ø ‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡πÄ‡∏ä‡∏•‡∏ã‡∏µ ‡πÅ‡∏°‡∏ô‡∏ô‡∏¥‡∏á ‡∏≠‡∏î‡∏µ‡∏ï‡∏ô‡∏±‡∏Å‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ç‡πà‡∏≤‡∏ß‡∏Å‡∏£‡∏≠‡∏á‡∏Ç‡∏≠‡∏á‡∏Å‡∏≠‡∏á‡∏ó‡∏±‡∏û‡∏™‡∏´‡∏£‡∏±‡∏ê‡∏Ø‡∏ó‡∏µ‡πà‡∏Å‡∏•‡∏≤‡∏¢‡πÄ‡∏õ‡πá‡∏ô‡∏™‡∏ï‡∏£‡∏µ‡∏Ç‡πâ‡∏≤‡∏°‡πÄ‡∏û‡∏® ‡∏ú‡∏π‡πâ‡∏™‡πà‡∏á‡∏ï‡πà‡∏≠‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡∏ß‡∏≤‡∏°‡∏•‡∏±‡∏ö‡∏Ç‡∏≠‡∏á‡∏Å‡∏≠‡∏á‡∏ó‡∏±‡




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.31s/test case]



Metrics Summary

  - ❌ Summarization (score: 0.4, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.40 because the summary includes extra information not present in the original text. Additionally, the summary fails to answer several questions that the original text addresses, indicating a lack of comprehensive coverage., error: None)

For test case:

  - input: ‡∏õ‡∏±‡∏ï‡∏ï‡∏≤‡∏ô‡∏µ-12 ‡∏û.‡∏Ñ.48‡∏û.‡∏≠.‡∏™‡∏°‡∏Ñ‡∏ß‡∏£ ‡πÅ‡∏™‡∏á‡∏†‡∏±‡∏ó‡∏£‡πÄ‡∏ô‡∏ï‡∏£ ‡πÇ‡∏Ü‡∏©‡∏Å‡∏Å‡∏≠‡∏á‡∏≠‡∏≥‡∏ô‡∏ß‡∏¢‡∏Å‡∏≤‡∏£‡πÄ‡∏™‡∏£‡∏¥‡∏°‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏™‡∏±‡∏ô‡∏ï‡∏¥‡∏™‡∏∏‡∏Ç‡∏à‡∏±‡∏á‡∏´‡∏ß‡∏±‡∏î‡∏ä‡∏≤‡∏¢‡πÅ‡∏î‡∏ô‡∏†‡∏≤‡∏Ñ‡πÉ‡∏ï‡πâ (‡∏Å‡∏≠.‡∏™‡∏™‡∏™.‡∏à‡∏ä‡∏ï.) ‡∏¢‡∏∑‡∏ô‡∏¢‡∏±‡∏ô‡∏ß‡πà‡∏≤ ‡∏ú‡∏π‡πâ‡∏õ‡∏Å‡∏Ñ‡∏£‡∏≠‡∏á‡πÅ‡∏•‡∏∞‡∏ô‡∏±‡∏Å‡∏®‡∏∂‡∏Å‡∏©‡∏≤‡∏°‡∏´‡∏≤‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏•‡∏±‡∏¢‡∏™‡∏á‡∏Ç‡∏•‡∏≤‡∏ô‡∏Ñ‡∏£‡∏¥‡∏ô‡∏ó‡∏£‡πå ‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡πÄ‡∏Ç‡∏ï‡∏õ‡∏±‡∏ï‡∏ï‡∏≤‡∏ô‡∏µ ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢‡∏Ç‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏Å‡πà‡∏≠‡∏Å‡∏≤‡∏£‡∏£‡πâ‡∏≤‡∏¢‡πÉ‡∏ô‡∏û‡∏∑‡π




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.28s/test case]



Metrics Summary

  - ❌ Summarization (score: 0.2, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.20 because the summary includes extra information not present in the original text, such as defining ตุลาการ as a promoter of justice, protector of rights and freedoms, and guarantor of democracy, stating that ตุลาการ are employees of the people, stating that ตุลาการ must be aware of their duties, serve the people with justice, transparency, and independence, and stating that ตุลาการ should not serve orders or be greedy for bribes., error: None)

For test case:

  - input:  ‡∏Ñ‡∏∑‡∏≠‡∏ï‡∏£‡∏≤‡∏ä‡∏π ‡∏ú‡∏π‡πâ‡∏ä‡∏µ‡πâ ‡πÄ‡∏™‡∏£‡∏µ‡∏™‡∏¥‡∏ó‡∏ò‡∏¥ ‡∏Ñ‡∏∑‡∏≠‡∏®‡∏≤‡∏•‡∏™‡∏ñ‡∏¥‡∏ï ‡∏¢‡∏∏‡∏ï‡∏¥‡∏ò‡∏£‡∏£‡∏° ‡∏ô‡∏≥‡∏™‡∏°‡∏±‡∏¢ ‡∏Ñ‡∏∑‡∏≠‡∏´‡∏•‡∏±‡∏Å ‡∏õ‡∏£‡∏∞‡∏Å‡∏±‡∏ô ‡∏õ‡∏£‡∏∞‡∏ä‡∏≤‡∏ò‡∏¥‡∏õ‡πÑ‡∏ï‡∏¢ ‡∏°‡∏¥‡πÉ‡∏ä‡πà‡∏≠‡∏†‡∏¥‡∏ä‡∏ô‡∏Ñ‡∏ô‡∏ä‡∏±‡πâ‡∏ô‡∏ü‡πâ‡∏≤ ‡∏Ñ‡∏£‡∏∏‡∏¢‡∏ó‡∏µ‡πà‡∏™‡∏ß‡∏° ‡∏ô‡∏±‡πâ‡




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.72s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.7, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.70 because the summary omits answers to several questions that the original text could answer. However, the summary does not contain any contradictions or extra information., error: None)

For test case:

  - input: ‡∏Å‡∏≠‡∏•‡πå‡∏ü‡∏¢‡∏π‡πÇ‡∏£‡πÄ‡∏õ‡∏µ‡πâ‡∏¢‡∏ô‡∏ó‡∏±‡∏ß‡∏£‡πå‡∏£‡∏≤‡∏¢‡∏Å‡∏≤‡∏£‡∏ã‡∏¥‡∏ã‡∏¥‡πÄ‡∏•‡∏µ‡πà‡∏¢‡∏ô ‡πÇ‡∏≠‡πÄ‡∏û‡πà‡∏ô ‡∏ó‡∏µ‡πà‡πÄ‡∏ß‡∏≠‡∏£‡πå‡∏î‡∏π‡∏£‡πà‡∏≤ ‡∏Å‡∏≠‡∏•‡πå‡∏ü ‡πÅ‡∏≠‡∏ô‡∏î‡πå ‡∏™‡∏õ‡∏≤ ‡∏£‡∏µ‡∏™‡∏≠‡∏£‡πå‡∏ó ‡∏ö‡∏ô‡πÄ‡∏Å‡∏≤‡∏∞‡∏ã‡∏¥‡∏ã‡∏¥‡∏•‡∏µ ‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏≠‡∏¥‡∏ï‡∏≤‡∏•‡∏µ ‡∏ß‡∏±‡∏ô‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡∏õ‡∏£‡∏≤‡∏Å‡∏è‡∏ß‡πà‡∏≤ ‡∏ò‡∏≠‡∏£‡πå‡∏õ‡∏¢‡∏≠‡∏ô ‡πÇ‡∏≠‡πÄ‡∏•‡πÄ‡∏ã‡πà‡∏ô ‡πÇ‡∏õ‡∏£‡πÄ‡∏î‡∏ô‡∏°‡∏≤‡∏£‡πå‡∏Å‡∏ß‡∏±‡∏¢ 22 ‡∏õ‡∏µ ‡∏ó‡∏≥‡∏ú‡∏•‡∏á‡∏≤‡∏ô‡∏ï‡∏µ‡πÄ‡∏û‡∏¥‡πà‡∏°‡πÄ‡∏Ç‡πâ‡∏≤‡∏°‡∏≤‡∏≠‡∏µ‡∏Å 3 ‡∏≠‡∏±‡∏ô‡πÄ‡∏î‡∏≠‡∏£‡πå ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏£‡∏ß‡∏° 4 ‡∏ß‡∏±‡∏ô 15 ‡∏≠‡∏±‡∏ô‡πÄ‡∏î‡∏≠‡∏£‡πå ‡∏Ñ‡∏ß‡πâ‡∏≤‡π




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.74s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.9, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.90 because the summary contains no contradictions or extra information. However, the summary omits the answer to the question of whether the rescued girl denied having COVID-19 symptoms., error: None)

For test case:

  - input: 3 ‡∏™‡∏≤‡∏ß‡∏ß‡∏±‡∏¢‡∏£‡∏∏‡πà‡∏ô‡∏ä‡∏ß‡∏ô‡∏Å‡∏±‡∏ô‡∏°‡∏≤‡∏Å‡∏£‡∏∞‡πÇ‡∏î‡∏î‡πÅ‡∏°‡πà‡∏ô‡πâ‡∏≥‡πÄ‡∏à‡πâ‡∏≤‡∏û‡∏£‡∏∞‡∏¢‡∏≤ ‡πÇ‡∏î‡∏¢‡∏Å‡∏£‡∏∞‡πÇ‡∏î‡∏î‡∏•‡∏á‡∏à‡∏≤‡∏Å‡∏™‡∏∞‡∏û‡∏≤‡∏ô‡∏û‡∏£‡∏∞‡∏£‡∏≤‡∏° 8 ‡∏ù‡∏±‡πà‡∏á‡∏ò‡∏ô‡∏ö‡∏∏‡∏£‡∏µ ‡∏ä‡πà‡∏ß‡∏¢‡πÑ‡∏î‡πâ‡∏ó‡∏±‡∏ô 1 ‡∏Ñ‡∏ô ‡∏≠‡∏µ‡∏Å‡∏Ñ‡∏ô‡∏ô‡∏≥‡∏Ç‡∏∂‡πâ‡∏ô‡∏°‡∏≤‡πÑ‡∏î‡πâ‡πÅ‡∏ï‡πà‡πÄ‡∏™‡∏µ‡∏¢‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï‡πÅ‡∏•‡πâ‡∏ß ‡∏™‡πà‡∏ß‡∏ô‡∏≠‡∏µ‡∏Å 1 ‡∏Ñ‡∏ô ‡∏à‡∏°‡∏´‡∏≤‡∏¢‡πÑ‡∏õ ‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏ó‡∏£‡∏≤‡∏ö‡∏™‡∏≤‡πÄ‡∏´‡∏ï‡∏∏ ‡πÄ‡∏ö‡∏∑‡πâ‡∏≠‡∏á‡∏ï‡πâ‡∏ô‡∏¢‡∏∑‡∏ô‡∏¢‡∏±‡∏ô‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Å‡∏±‡∏ö‡πÇ‡∏£‡∏Ñ‡∏£‡∏∞‡∏ö‡∏≤‡∏î‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà 4 ‡πÄ‡∏°.‡∏¢. ‡∏ú‡∏π‡πâ‡∏™‡∏∑‡πà‡∏≠




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.36s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.75, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.75 because the summary introduces extra information not present in the original text, such as the grades 7-9 and the specific percentage of students feeling resentful and wanting revenge., error: None)

For test case:

  - input: ‡∏õ‡∏£‡∏∞‡∏ä‡∏≤‡πÑ‡∏ó-20 ‡∏°.‡∏Ñ.48 ‡∏£‡∏®.‡∏î‡∏£.‡∏™‡∏°‡∏û‡∏£ ‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏ä‡∏±‡∏¢‡∏®‡∏£‡∏µ ‡∏Ñ‡∏ì‡∏∞‡∏™‡∏≤‡∏ò‡∏≤‡∏£‡∏ì‡∏™‡∏∏‡∏Ç‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå ‡∏°‡∏´‡∏¥‡∏î‡∏• ‡∏ô‡∏≥‡πÄ‡∏™‡∏ô‡∏≠‡∏ú‡∏•‡∏á‡∏≤‡∏ô‡∏ß‡∏¥‡∏à‡∏±‡∏¢‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á‡πÉ‡∏ô‡πÄ‡∏î‡πá‡∏Å ‡∏™‡∏≤‡∏¢‡πÇ‡∏ã‡πà‡∏ß‡∏á‡∏à‡∏£‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á ‡πÇ‡∏î‡∏¢‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏®‡∏∂‡∏Å‡∏©‡∏≤‡∏ß‡∏¥‡∏à‡∏±‡∏¢‡∏ä‡∏µ‡πâ‡∏ß‡πà‡∏≤ ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á‡∏ó‡∏µ‡πà‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡∏∂‡πâ‡∏ô‡∏†‡∏≤‡∏¢‡πÉ‡∏ô‡∏Ñ‡∏£‡∏≠‡∏ö‡∏Ñ‡∏£‡∏±‡∏ß ‡∏à‡∏∞‡∏™‡πà‡∏á‡∏ú‡∏•‡∏ñ‡∏∂‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏Å‡πâ‡∏≤‡∏ß‡∏£‡πâ‡∏≤‡∏ß‡∏Ç‡∏≠‡∏á‡πÄ‡∏î‡πá‡∏Å ‡πÄ‡∏




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.95s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.7, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.70 because the summary fails to address some questions that the original text answers, but it does not contain any contradictions or extra information., error: None)

For test case:

  - input: ‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏´‡∏ï‡∏∏‡∏õ‡∏£‡∏∞‡πÄ‡∏û‡∏ó‡πÑ‡∏ó‡∏¢‡∏™‡∏±‡∏õ‡∏î‡∏≤‡∏´‡πå‡∏ô‡∏µ‡πâ ‡∏û‡∏ö‡∏Å‡∏±‡∏ö ‡∏Ñ‡∏≥ ‡∏ú‡∏Å‡∏≤ ‡πÅ‡∏•‡∏∞ ‡∏≠‡∏£‡∏£‡∏ñ ‡∏ö‡∏∏‡∏ô‡∏ô‡∏≤‡∏Ñ ‡∏°‡∏≤‡∏Ñ‡∏∏‡∏¢‡∏Å‡∏±‡∏ô‡∏ñ‡∏∂‡∏á‡πÄ‡∏´‡∏ï‡∏∏‡∏Å‡∏≤‡∏£‡∏ì‡πå‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏Å‡∏±‡∏ö ‡∏™‡∏ï‡∏£‡∏µ‡∏ô‡∏¥‡∏¢‡∏° ‡πÉ‡∏ô‡∏ï‡πà‡∏≤‡∏á‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏ó‡∏µ‡πà‡πÄ‡∏û‡∏¥‡πà‡∏á‡∏°‡∏µ‡∏Å‡∏≤‡∏£‡∏ï‡πà‡∏≠‡∏ï‡πâ‡∏≤‡∏ô‡∏à‡∏≤‡∏Å‡∏™‡∏ï‡∏£‡∏µ‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏±‡∏ô ‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏≤‡∏£‡∏£‡∏ì‡∏£‡∏á‡∏Ñ‡πå‡∏õ‡∏è‡∏¥‡πÄ‡∏™‡∏ò‡∏™‡∏ï‡∏£‡∏µ‡∏ô‡∏¥‡∏¢‡∏° ‡∏Å‡∏≤‡∏£‡∏£‡∏ì‡∏£‡∏á‡∏Ñ‡πå‡∏î‡∏±‡∏á‡∏Å‡∏•‡πà‡∏≤‡∏ß‡∏ô‡∏µ‡πâ‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡∏∂‡πâ‡∏ô‡πÉ‡∏ô‡∏ä‡πà‡∏ß‡∏á‡∏£‡∏≤‡∏ß 2 ‡πÄ‡∏î‡∏∑‡∏≠‡∏ô‡πÄ‡∏®‡∏©‡∏ó‡∏µ‡πà‡∏ú‡πà‡∏≤‡∏ô‡∏°‡∏≤‡∏ö‡∏ô




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:05,  5.63s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.7, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.70 because the summary omits answers to several questions that the original text could answer. However, the summary does not contain any contradicting or extra information., error: None)

For test case:

  - input: ‡∏ß‡∏á‡∏Å‡∏≤‡∏£‡πÄ‡∏ó‡∏Ñ‡πÇ‡∏ô‡πÇ‡∏•‡∏¢‡∏µ‡∏Ç‡∏≠‡∏á‡∏à‡∏µ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏Ñ‡πâ‡∏ô‡πÅ‡∏•‡∏∞‡∏û‡∏±‡∏í‡∏ô‡∏≤‡πÑ‡∏°‡πà‡∏´‡∏¢‡∏∏‡∏î ‡∏•‡πà‡∏≤‡∏™‡∏∏‡∏î‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ï‡πã‡∏≤‡πÇ‡∏°‡πã (DAMO Academy) ‡∏™‡∏ñ‡∏≤‡∏ö‡∏±‡∏ô‡∏ß‡∏¥‡∏à‡∏±‡∏¢‡∏ó‡∏≤‡∏á‡∏ß‡∏¥‡∏ó‡∏¢‡∏≤‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå‡∏Ç‡∏≠‡∏á‡∏≠‡∏≤‡∏•‡∏µ‡∏ö‡∏≤‡∏ö‡∏≤ ‡∏Ç‡∏≠‡∏á‡∏°‡∏´‡∏≤‡πÄ‡∏®‡∏£‡∏©‡∏ê‡∏µ‡∏à‡∏µ‡∏ô ‡πÅ‡∏à‡πá‡∏Ñ ‡∏´‡∏°‡πà‡∏≤ ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏£‡∏ñ‡∏¢‡∏ô‡∏ï‡πå‡πÑ‡∏£‡πâ‡∏Ñ‡∏ô‡∏Ç‡∏±‡∏ö‡∏ö‡∏ô‡∏ó‡πâ‡∏≠‡∏á‡∏ñ‡∏ô‡∏ô‡πÅ‡∏ö‡∏ö‡∏à‡∏≥‡∏•‡∏≠‡∏á‡πÅ‡∏•‡πâ‡∏ß‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏Ç‡πà‡∏≤‡∏ß‡∏ã‡∏¥‡∏ô‡∏´‡∏±‡∏ß ‡πÄ‡∏ú‡∏¢‡πÅ‡∏û‡∏£‡πà‡∏Ç‡πà‡∏≤‡∏ß‡∏ô‡∏µ‡πâ ‡∏û‡∏£‡πâ‡∏≠‡∏°‡∏Ñ‡∏•‡∏¥‡∏õ‡∏ß‡∏¥‡∏î‡∏µ‡πÇ




Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:06,  6.15s/test case]



Metrics Summary

  - ✅ Summarization (score: 0.6, threshold: 0.5, strict: False, evaluation model: gemini-2.0-flash, reason: The score is 0.60 because the summary fails to address several questions that the original text answers, indicating a loss of key information., error: None)

For test case:

  - input: ‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà 28 ‡∏°‡∏Å‡∏£‡∏≤‡∏Ñ‡∏° 2559 ‡∏ú‡∏π‡πâ‡∏™‡∏∑‡πà‡∏≠‡∏Ç‡πà‡∏≤‡∏ß‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏ß‡πà‡∏≤ ‡∏´‡∏•‡∏±‡∏á‡∏à‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏Å‡∏£‡∏∞‡πÅ‡∏™‡∏ï‡∏∏‡πä‡∏Å‡∏ï‡∏≤‡∏•‡∏π‡∏Å‡πÄ‡∏ó‡∏û‡∏Å‡∏≥‡∏•‡∏±‡∏á‡∏°‡∏≤‡πÅ‡∏£‡∏á ‡∏ô‡∏≠‡∏Å‡∏à‡∏≤‡∏Å‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°‡∏±‡πà‡∏ô‡∏Å‡∏±‡∏ö‡∏ú‡∏π‡πâ‡∏ó‡∏µ‡πà‡∏ô‡∏¥‡∏¢‡∏°‡πÅ‡∏•‡πâ‡∏ß ‡∏¢‡∏±‡∏á‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏´‡∏ô‡∏±‡∏Å‡πÉ‡∏à‡πÉ‡∏´‡πâ‡∏Å‡∏±‡∏ö‡∏ù‡πà‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏°‡∏±‡πà‡∏ô‡∏Ñ‡∏á ‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏•‡πà‡∏≤‡∏™‡∏∏‡∏î‡∏ñ‡∏π‡∏Å‡∏ô‡∏≥‡πÑ‡∏õ‡∏ã‡∏∏‡∏Å‡∏ã‡πà‡∏≠‡∏ô‡∏¢‡∏≤‡πÄ‡∏™‡∏û‡∏ï‡∏¥‡∏î‡∏à‡∏ô‡∏Å‡∏•‡∏≤‡∏¢‡πÄ‡∏õ‡πá‡∏ô‡∏Ç‡πà‡∏≤‡∏ß‡∏î‡∏±‡∏á‡πÑ‡∏õ‡∏ñ‡∏∂‡∏á‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡πÄ‡∏°‡∏µ‡∏¢‡∏ô‡∏°‡∏≤,‡∏ô‡∏≤‡∏




Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:02, ?test case/s]

Error evaluating pair 10: 'NoneType' object has no attribute 'truths'





In [7]:
# scores = []
# for r in results:
#     try:
#         scores.append(r.test_results[0].metrics_data[0].score)
#     except:
#         scores.append(0)

In [8]:
# scores = []
# for r in results:
#     try:
#         if r is None:
#             raise ValueError("Result is None")
#         scores.append(r.metrics_data[0].score)
#     except Exception as e:
#         print(f"An error occurred: {e}")
#         scores.append(0)

In [9]:
# Attach each score to its article body (row order matches the order in which
# the pairs were scored) and persist the result as CSV without the index.
df2 = df[["body"]].assign(score=scores)
df2.to_csv("score-gemini.csv", index=False)