## First experiment

In [None]:
from typing import Literal, Optional
from typing import Literal, Optional
from concurrent.futures import ThreadPoolExecutor
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate


# Pydantic schema describing a structured translation verdict: a numeric
# score, a categorical label and free-text reasoning.
# NOTE(review): nothing in the visible code references this model — confirm
# whether it is still needed.
class TranslationEvaluation(BaseModel):
    """The final evaluation of the English-to-Filipino translation."""
    # Integer score; the 1-5 range is only stated in the description text,
    # no validator enforces it here.
    score: int = Field(..., description="Numerical score from 1 (poor) to 5 (perfect).")
    # Restricted to exactly these five label strings by the Literal type.
    label: Literal["Incomprehensible", "Poor", "Good", "Excellent", "Perfect"] = Field(
        ..., description="Categorical label for the translation quality."
    )
    # Required free-text justification for the score.
    reasoning: str = Field(
        ..., description="Detailed, point-by-point reasoning for the score, citing specific examples from the text."
    )

# Web-search tool the agent may call to look up individual words or phrases.
# `max_results` is the TavilySearchResults field that caps the number of hits;
# the previous `k=1` keyword is not a field of the tool, so the cap of one
# result was never applied.
search_tool = TavilySearchResults(max_results=1)
# The description is what the agent reads when deciding how to call the tool,
# so the sample query must be spelled correctly ("Filipino", not "Filino").
search_tool.description = (
    "Use this to search for definitions, synonyms, or cultural context of specific English or Filipino words and phrases. Mention the word TRANSLATE so that Tavily knows that you're looking for the counterpart of that word, like \"Translate 'food' in Filipino\"."
)

@tool
def opinion_pooling_tool(source_text: str, translated_text: str, reference_text: Optional[str] = None) -> str:
    """
    Use this ONLY as a last resort if the search tool did not clarify your uncertainty. 
    This tool consults other expert AI models (Gemini and GPT-4) for their evaluations. It is very expensive.
    """
    # NOTE: the docstring above is deliberately left unchanged — the `@tool`
    # decorator exposes it to the agent as this tool's description.
    #
    # Parameters:
    #   source_text     - English source sentence being judged.
    #   translated_text - Filipino translation under evaluation.
    #   reference_text  - optional reference translation; "N/A" is sent to the
    #                     judges when it is missing.
    # Returns:
    #   A text report containing both judges' raw replies, or an error string
    #   if either call raises.
    print("\n--- CONSULTING EXPENSIVE OPINION POOLING TOOL ---")

    # judge models
    gemini_judge = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0.2)
    chatgpt_judge = ChatOpenAI(model="gpt-4o", temperature=0.2) # Using a different GPT model

    # The template ends with {source}/{translation}/{reference} placeholders.
    # Without them the judges only received the rubric and never saw the text
    # they were asked to score, even though `input_data` supplied those keys.
    judge_prompt = ChatPromptTemplate.from_template(
        """You are a rigorous, impartial English→Filipino translation judge with deep expertise in Filipino grammar, style, and cultural nuance. Evaluate ONLY the given source/translation using the criteria below. Favor idiomatic Filipino that preserves meaning. Penalize omissions/additions, mistranslations (polarity/negation, tense/aspect, quantities, named entities), awkward calques, unjustified Taglish, and register mismatches. Do NOT rewrite the translation—only judge it. When uncertain, choose the lower score and justify briefly with evidence. Assume formal register unless stated otherwise. If no domain/style guide is provided, use general editorial norms as the guideline.

SCORED REFERENCE EXAMPLES (for patterning; do NOT output these):

- Example A — Excellent
  Source: "The meeting was postponed because of the storm."
  Translation: "Naantala ang pagpupulong dahil sa bagyo."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 1}},
    "total_points": 6, "overall_score": 5, "label": "excellent",
    "explanation": "Idiomatic and precise; preserves cause and entities; no omissions/additions."}}

- Example B — Very good (minor style issue)
  Source: "Please submit the report by Friday."
  Translation: "Pakiusap na isumite ang ulat pagsapit ng Biyernes."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 5, "overall_score": 4, "label": "very_good",
    "explanation": "Meaning preserved; minor completeness/style nuance (tone/softener not fully mirrored)."}}

- Example C — Good (loss of specificity)
  Source: "Do not turn off the main power switch."
  Translation: "Huwag patayin ang switch."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Omits 'main power' → specificity lost (accuracy/completeness↓); grammar/flow are fine."}}

- Example D — Fair (noticeable errors, mostly understandable)
  Source: "Store the medicine in a cool, dry place."
  Translation: "Itago ang gamot sa malamig na lugar."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Misses 'dry' and guidance nuance; otherwise natural. (If policy requires both conditions, consider Completeness=0 and Guideline=0.)"}}

- Example E — Poor (wrong meaning)
  Source: "Keep out of reach of children."
  Translation: "Maganda ang bata."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 0, "cultural_appropriateness": 0, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 1, "overall_score": 1, "label": "poor",
    "explanation": "Unrelated meaning; safety directive lost; incoherent to instruction context."}}

Additionally, you will also be given a few examples of English→Filipino pairs—each with a correct translation, a flawed translation, and a short remark explaining the flaw. Study those examples, then evaluate a new pair according to six binary criteria.

Examples (from CSV columns “source”, “correct”, “flawed”, separated by |):
	1.	The Philippines is an archipelago made up of over 7,640 islands, though only about 2,000 are inhabited. | Ang Pilipinas ay isang kapulaang binubuo ng 7,640 na isla, ngunit 2,000 lamang ang tinitirahan | Ang Pilipinas ay isang puno na binubuo ng mahigit 7,640 manok, bagaman halos 2,000 lamang ang tumira.
	2.	Philippines was also a U.S. territory from 1898 to 1946. | Ang Pilipinas ay naging isang teritoryo rin ng Estados Unidos mula 1898 hanggang 1946 | Ang Estados Unidos ay naging isang teritoryo ng Pilipinas mula 1946 hanggang 1898
	3.	The national hero of the Philippines is Dr. Jose Rizal. | Si Dr. Jose Rizal ang pambansang bayani ng Pilipinas | Ang Pilipinas ang bansang bayani ni Dr. Jose Rizal
	4.	The national animal of the Philippines is the Carabao. | Ang pambansang hayop ng Pilipinas ay ang kalabaw | Ang pambansang hayop ng Pilipinas ay ang aso
	5.	The national bird is the Philippine Eagle, one of the largest and most powerful eagles in the world. | Ang pambansang ibon ay ang Philippine Eagle, isa sa pinakamalaki at pinakamalakas na agila sa mundo | Ang karaniwang ibon na Philippine Eagle ay isang maliit na Agila
    

Scoring rubric (binary 0/1 for each):
1) Accuracy — Meaning preserved (entities, polarity, tense/aspect, quantities, conditions).
2) Fluency — Natural, grammatical Filipino (orthography, morphology, agreement).
3) Coherence — Logical flow; clear referents/connectors; consistent register.
4) Cultural Appropriateness — Idiomatic usage; avoids unjustified Taglish/calques; suitable register.
5) Guideline Adherence — Follows stated domain/style rules (or general editorial norms if none provided).
6) Completeness — No omissions/additions; all content rendered faithfully.

Hard rules:
- Critical meaning error (e.g., negation flip, wrong entity) → Accuracy=0.
- Major omission/addition → Completeness=0 (and Accuracy=0 if meaning affected).
- Pervasive unjustified Taglish/calques in formal context → Fluency=0 (and possibly Cultural=0).

Scoring aggregation:
- Compute total_points = sum of the six criteria (0–6).
- Map to overall_score (integer 1–5):
  0–1 → 1 (“poor”)
  2   → 2 (“fair”)
  3–4 → 3 (“good”)
  5   → 4 (“very_good”)
  6   → 5 (“excellent”)
- Label must match overall_score exactly:
  1→"poor", 2→"fair", 3→"good", 4→"very_good", 5→"excellent".

VALIDATION CHECKS (must hold):
- total_points == accuracy+fluency+coherence+cultural_appropriateness+guideline_adherence+completeness
- overall_score and label match the mapping above.
- Use integers only (0/1 for criteria; 1–5 for overall_score). No extra keys.

OUTPUT FORMAT — return JSON ONLY (no prose/backticks). Exactly this schema:
{{"criteria": {{"accuracy": 0 or 1, "fluency": 0 or 1, "coherence": 0 or 1, "cultural_appropriateness": 0 or 1, "guideline_adherence": 0 or 1, "completeness": 0 or 1}},
  "total_points": integer 0-6,
  "overall_score": integer 1-5,
  "label": "poor"|"fair"|"good"|"very_good"|"excellent",
  "explanation": "≤120 words; brief evidence for each criterion"}}

NOW evaluate the following translation.

**Source (English):**
{source}

**Translation (Filipino):**
{translation}

**Reference translation (may be N/A):**
{reference}
        """
    )

    parser = StrOutputParser()
    gemini_chain = judge_prompt | gemini_judge | parser
    chatgpt_chain = judge_prompt | chatgpt_judge | parser

    # Keys must match the placeholder names used at the end of the template.
    input_data = {
        "source": source_text,
        "translation": translated_text,
        "reference": reference_text or "N/A"
    }

    gemini_opinion = ""
    chatgpt_opinion = ""

    # Both judges are queried concurrently; each call is network-bound.
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_gemini = executor.submit(gemini_chain.invoke, input_data)
        future_chatgpt = executor.submit(chatgpt_chain.invoke, input_data)

        try:
            print("...getting opinion from Gemini...")
            gemini_opinion = future_gemini.result()
            print("...getting opinion from GPT-4...")
            chatgpt_opinion = future_chatgpt.result()
        except Exception as e:
            # Report the failure to the agent as text instead of raising.
            return f"An error occurred while consulting models: {e}"

    return f"""Consultation results:
- Opinion from Gemini-2.5-Pro:
{gemini_opinion}

- Opinion from GPT-4:
{chatgpt_opinion}
"""

# Tools the agent is allowed to call: web search first, opinion pooling as a
# last resort (see each tool's description).
tools = [search_tool, opinion_pooling_tool]

# Imported here so this cell stays self-contained; needed for the scratchpad
# slot at the end of the prompt.
from langchain_core.prompts import MessagesPlaceholder

# Agent prompt: system rubric, the human turn carrying the pair to evaluate,
# and a message placeholder where the executor injects prior tool calls and
# their results.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a rigorous, impartial English→Filipino translation judge with deep expertise in Filipino grammar, style, and cultural nuance. Evaluate ONLY the given source/translation using the criteria below. Favor idiomatic Filipino that preserves meaning. Penalize omissions/additions, mistranslations (polarity/negation, tense/aspect, quantities, named entities), awkward calques, unjustified Taglish, and register mismatches. Do NOT rewrite the translation—only judge it. When uncertain, choose the lower score and justify briefly with evidence. Assume formal register unless stated otherwise. If no domain/style guide is provided, use general editorial norms as the guideline.

SCORED REFERENCE EXAMPLES (for patterning; do NOT output these):

- Example A — Excellent
  Source: "The meeting was postponed because of the storm."
  Translation: "Naantala ang pagpupulong dahil sa bagyo."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 1}},
    "total_points": 6, "overall_score": 5, "label": "excellent",
    "explanation": "Idiomatic and precise; preserves cause and entities; no omissions/additions."}}

- Example B — Very good (minor style issue)
  Source: "Please submit the report by Friday."
  Translation: "Pakiusap na isumite ang ulat pagsapit ng Biyernes."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 5, "overall_score": 4, "label": "very_good",
    "explanation": "Meaning preserved; minor completeness/style nuance (tone/softener not fully mirrored)."}}

- Example C — Good (loss of specificity)
  Source: "Do not turn off the main power switch."
  Translation: "Huwag patayin ang switch."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Omits 'main power' → specificity lost (accuracy/completeness↓); grammar/flow are fine."}}

- Example D — Fair (noticeable errors, mostly understandable)
  Source: "Store the medicine in a cool, dry place."
  Translation: "Itago ang gamot sa malamig na lugar."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Misses 'dry' and guidance nuance; otherwise natural. (If policy requires both conditions, consider Completeness=0 and Guideline=0.)"}}

- Example E — Poor (wrong meaning)
  Source: "Keep out of reach of children."
  Translation: "Maganda ang bata."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 0, "cultural_appropriateness": 0, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 1, "overall_score": 1, "label": "poor",
    "explanation": "Unrelated meaning; safety directive lost; incoherent to instruction context."}}

Additionally, you will also be given a few examples of English→Filipino pairs—each with a correct translation, a flawed translation, and a short remark explaining the flaw. Study those examples, then evaluate a new pair according to six binary criteria.

Examples (from CSV columns “source”, “correct”, “flawed”, separated by |):
	1.	The Philippines is an archipelago made up of over 7,640 islands, though only about 2,000 are inhabited. | Ang Pilipinas ay isang kapulaang binubuo ng 7,640 na isla, ngunit 2,000 lamang ang tinitirahan | Ang Pilipinas ay isang puno na binubuo ng mahigit 7,640 manok, bagaman halos 2,000 lamang ang tumira.
	2.	Philippines was also a U.S. territory from 1898 to 1946. | Ang Pilipinas ay naging isang teritoryo rin ng Estados Unidos mula 1898 hanggang 1946 | Ang Estados Unidos ay naging isang teritoryo ng Pilipinas mula 1946 hanggang 1898
	3.	The national hero of the Philippines is Dr. Jose Rizal. | Si Dr. Jose Rizal ang pambansang bayani ng Pilipinas | Ang Pilipinas ang bansang bayani ni Dr. Jose Rizal
	4.	The national animal of the Philippines is the Carabao. | Ang pambansang hayop ng Pilipinas ay ang kalabaw | Ang pambansang hayop ng Pilipinas ay ang aso
	5.	The national bird is the Philippine Eagle, one of the largest and most powerful eagles in the world. | Ang pambansang ibon ay ang Philippine Eagle, isa sa pinakamalaki at pinakamalakas na agila sa mundo | Ang karaniwang ibon na Philippine Eagle ay isang maliit na Agila
    

Scoring rubric (binary 0/1 for each):
1) Accuracy — Meaning preserved (entities, polarity, tense/aspect, quantities, conditions).
2) Fluency — Natural, grammatical Filipino (orthography, morphology, agreement).
3) Coherence — Logical flow; clear referents/connectors; consistent register.
4) Cultural Appropriateness — Idiomatic usage; avoids unjustified Taglish/calques; suitable register.
5) Guideline Adherence — Follows stated domain/style rules (or general editorial norms if none provided).
6) Completeness — No omissions/additions; all content rendered faithfully.

Hard rules:
- Critical meaning error (e.g., negation flip, wrong entity) → Accuracy=0.
- Major omission/addition → Completeness=0 (and Accuracy=0 if meaning affected).
- Pervasive unjustified Taglish/calques in formal context → Fluency=0 (and possibly Cultural=0).


VALIDATION CHECKS (must hold):
- total_points == accuracy+fluency+coherence+cultural_appropriateness+guideline_adherence+completeness
- Use integers only (0/1 for criteria; 1–5 for overall_score). No extra keys.

You can use tools like Tavily Search to clarify uncertainties about specific words or phrases, but do not use them for general translation help. If you are still uncertain after using the search tool, you can use the opinion pooling tool to consult other AI models for their evaluations, Youre final answer must be the average of the two models' scores if ever you use the opinion pooling tool.

Please reason before answering like why thats your score for the criteria. After your done type your final answer by typing 'FINAL:' followed by your answer in the following JSON format schema:{{"criteria": {{"accuracy": 0 or 1, "fluency": 0 or 1, "coherence": 0 or 1, "cultural_appropriateness": 0 or 1, "guideline_adherence": 0 or 1, "completeness": 0 or 1}},
  "explanation": "≤120 words; brief evidence for each criterion"}}
""",
        ),
        (
            "human",
            """NOW Please evaluate the following translation.

**Source:**
{source_text}

**Translation (Filipino):**
{translated_text}
""",
        ),
        # The agent passes `agent_scratchpad` as a list of messages (tool calls
        # and tool results). A MessagesPlaceholder splices them into the
        # conversation as real messages; the previous ("ai", "{agent_scratchpad}")
        # entry formatted that list into the text of a single AI message.
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)


# Model that drives the agent loop (low temperature for steadier scoring).
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0.1)

# Bind the model, the tool list and the judging prompt into a tool-calling agent.
agent = create_openai_tools_agent(llm=llm, tools=tools, prompt=prompt_template)

# The executor runs the agent/tool loop; verbose=True echoes every step.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


if __name__ == "__main__":
    # Smoke test: run the agent once on a jargon-heavy sentence and print its answer.
    print("--- Example 2: A trickier translation with business jargon ---")
    source_tricky = "To maximize synergy, we must leverage our core competencies and streamline our workflow."
    translation_tricky = "Para ma-maximize ang synergy, dapat gamitin natin ang ating core competencies at i-streamline ang workflow."

    # Keys must match the {source_text}/{translated_text} slots of the prompt.
    agent_inputs = dict(source_text=source_tricky, translated_text=translation_tricky)
    response_tricky = agent_executor.invoke(agent_inputs)

    print("\n--- FINAL OUTPUT ---")
    print(response_tricky["output"])

--- Example 2: A trickier translation with business jargon ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Translate synergy core competencies workflow in Filipino'}`
responded: 
I will evaluate the translation based on accuracy, fluency, coherence, cultural appropriateness, guideline adherence, and completeness.

*   **Accuracy:** The translation uses English terms like "maximize," "synergy," "core competencies," and "streamline" directly in the Filipino sentence. While understandable, it doesn't translate these concepts into Filipino, potentially affecting accuracy if a full Filipino equivalent was expected. However, the core meaning of leveraging strengths and improving processes is conveyed. I'll lean towards a 1 here, assuming the intent was to keep some technical terms.
*   **Fluency:** The sentence structure is grammatically sound in Filipino, even with the embedded English terms. The use of "Para," "ma-ma

In [None]:
import pandas as pd
import json
import re
import time

# Maximum number of attempts made for one row before it is recorded as failed.
MAX_RETRIES = 3
# Pause, in seconds, between two attempts on the same row.
RETRY_DELAY_SECONDS = 2

def extract_final_json(text: str):
    """
    Parse the JSON object that follows the 'FINAL:' marker in an agent reply.

    Decoding starts at the first '{' after the marker and uses
    ``json.JSONDecoder.raw_decode``, which consumes one complete JSON value.
    Nested objects (such as the ``criteria`` dict) are therefore read in full
    and any text after the closing brace is ignored. The previous non-greedy
    regex stopped at the first '}', truncating every nested answer.

    If there is no marker, or no '{' after it, the first '{' anywhere in the
    text is used instead.

    Returns the parsed JSON value, or raises ValueError if no object is found
    or the text at that position is not valid JSON.
    """
    marker = re.search(r'FINAL:', text)
    start = text.find('{', marker.end()) if marker else -1
    if start == -1:
        # No marker (or nothing after it): fall back to the first object in the text.
        start = text.find('{')
    if start == -1:
        raise ValueError("No 'FINAL:' marker or JSON object found in the input string.")

    try:
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON found. Error: {e}. String was: {text[start:]}") from e
    return parsed

def sum_criteria_points(data: dict) -> int:
    """
    Add up the integer entries stored under the 'criteria' key of ``data``.

    Returns 0 when 'criteria' is missing or is not a dict; non-integer values
    inside it are skipped.
    """
    criteria = data.get("criteria", {})
    if isinstance(criteria, dict):
        total = 0
        for value in criteria.values():
            if isinstance(value, int):
                total += value
        return total
    return 0

# Path of the human-labelled validation set; also used in the error message so
# the message always names the file that was actually opened.
VALIDATION_CSV = "validation.csv"

try:
    df = pd.read_csv(VALIDATION_CSV)
except FileNotFoundError:
    print(f"Error: The file '{VALIDATION_CSV}' was not found.")
    # Stop here: nothing below can run without the input data. SystemExit is
    # raised directly (with a non-zero status) instead of calling the
    # interactive-only `exit()` helper.
    raise SystemExit(1)


print(f"Found columns: {df.columns.tolist()}")
print(f"Processing {len(df)} rows...")

# One record per validation row: the parsed verdict, or a failure marker.
results = []

for index, row in df.iterrows():
    source_text = row['Source Text (English)']
    translated_text = row['Target Text (Filipino)']

    # Start from the failure record and overwrite it on the first success, so
    # the record layout is defined in one place.
    record = {
        "row_index": index,
        "source_text": source_text,
        "target_text": translated_text,
        "final_json": None,
        "total_points": 0,
        "status": f"Failed after {MAX_RETRIES} attempts"
    }

    for attempt in range(1, MAX_RETRIES + 1):
        print(f"\n--- Processing Row {index} (Attempt {attempt}/{MAX_RETRIES}) ---")
        try:
            response_tricky = agent_executor.invoke({
                "source_text": source_text,
                "translated_text": translated_text,
            })
            final_json = extract_final_json(response_tricky["output"])
            total_points = sum_criteria_points(final_json)
        except Exception as e:
            # Retry on any failure of the call or of parsing (not only
            # ValueError): a transient API/network error would otherwise abort
            # the whole run and discard the results gathered so far.
            # KeyboardInterrupt is not an Exception and still stops the loop.
            print(f"An error occurred on row {index}: {e}")
            if attempt < MAX_RETRIES:
                print(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            continue

        print("Successfully extracted JSON:", final_json)
        print("Total points:", total_points)
        record.update(final_json=final_json, total_points=total_points, status="Success")
        break
    else:
        # Reached only when no attempt hit `break`.
        print(f"Failed to process row {index} after {MAX_RETRIES} attempts. Skipping.")

    results.append(record)

results_df = pd.DataFrame(results)
print("\n--- Processing Complete ---")
print(results_df)

Found columns: ['Source Text (English)', 'Target Text (Filipino)', 'Final Score', 'Rater 1 Explanation', 'Rater 2 Explanation']
Processing 40 rows...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'afternoon sun in Filipino'}`
responded: 


The search results indicate that "hapon" means "afternoon" and "araw" means "sun." The phrase "hapon na araw" literally translates to "afternoon sun."

Here's the evaluation:

*   **Accuracy:** 1. The translation accurately conveys the meaning of the source text. "Nagtawanan" (laughed) and "naglaro" (played) are correct verbs, and "sa ilalim ng hapon na araw" correctly translates to "under the afternoon sun."
*   **Fluency:** 1. The sentence structure and word choices are natural and grammatically correct in Filipino.
*   **Coherence:** 1. The sentence flows logically and the meaning is clear.
*   **Cultural Appropriateness:** 1. The translation uses appropriate Filipino terms and

KeyboardInterrupt: 

In [None]:
comparison = []
def map_to_overall_score(num):
    """
    Convert a criteria total into an ``(overall_score, label)`` pair.

    Bands: 0-1 -> (1, "poor"), 2 -> (2, "fair"), 3-4 -> (3, "good"),
    5 -> (4, "very_good"), 6 -> (5, "excellent").
    Raises ValueError when ``num`` falls in none of the bands.
    """
    # (lowest total, highest total, result) for each band, checked in order.
    bands = (
        (0, 1, (1, "poor")),
        (2, 2, (2, "fair")),
        (3, 4, (3, "good")),
        (5, 5, (4, "very_good")),
        (6, 6, (5, "excellent")),
    )
    for low, high, verdict in bands:
        if low <= num <= high:
            return verdict
    raise ValueError("Input number is out of the valid range 0 to 6")
    
# Pair every model result with the human score of the same validation row.
# Rows are iterated directly instead of re-filtering results_df per index.
for _, model_row in results_df.iterrows():
    idx = model_row['row_index']
    model_score = model_row['total_points']
    # Get human score from df
    human_score = df.loc[idx, 'Final Score'] if idx in df.index else None

    # Halved squared error for this row. Left as None when there is no human
    # score, so the later `.dropna()` skips the row instead of the subtraction
    # raising TypeError on None.
    # NOTE(review): the error uses the raw criteria total, while the stored
    # 'model_score' is the mapped (score, label) pair. If 'Final Score' is on
    # the mapped scale, the error should probably use the mapped score —
    # confirm against the dataset.
    if human_score is None:
        MSE = None
    else:
        MSE = ((model_score - human_score) ** 2)/2

    comparison.append({
        'row_index': idx,
        'source_text': model_row['source_text'],
        'target_text': model_row['target_text'],
        'model_score': map_to_overall_score(model_score),
        'human_score': human_score,
        'MSE': MSE,
    })

comparison_df = pd.DataFrame(comparison)
print(comparison_df.columns)

Index(['row_index', 'source_text', 'target_text', 'model_score', 'human_score',
       'MSE'],
      dtype='object')


In [None]:
# Persist the per-row evaluation records; the DataFrame's own index is not written.
results_df.to_csv('results.csv', index=False)

In [None]:
# Average the per-row error values, ignoring rows where none could be computed.
mse_values = comparison_df['MSE'].dropna()
if mse_values.empty:
    average_mse = None
else:
    average_mse = mse_values.mean()
print(f"\nAverage MSE of results: {average_mse}")



Average MSE of results: 3.95


In [None]:
import pandas as pd
import json
import re
import time
import numpy as np # Using numpy for easy statistical calculations

# Number of leading rows of the validation set used for the consistency test.
NUM_ROWS_TO_TEST = 3
# Number of agent calls made for each tested row.
REPETITIONS_PER_ROW = 5
RETRY_DELAY_SECONDS = 1 # Pause in seconds after each agent call in the repetition loop

def simulate_agent_executor(input_data):
    """
    Offline stand-in for ``agent_executor.invoke``.

    Ignores ``input_data`` and returns ``{"output": <text>}`` where the text is
    a randomly chosen 'FINAL:' answer: with or without a leading sentence, or
    a deliberately malformed variant with a trailing comma. The criteria are
    shuffled and the "tone" value is picked at random, so repeated calls
    differ. Swap in the real ``agent_executor.invoke`` call for actual runs.
    """
    import random

    tone_flag = random.choice([0, 1])
    entries = [
        '"accuracy": 1',
        '"fluency": 1',
        '"style": 1',
        '"clarity": 1',
        f'"tone": {tone_flag}',
    ]
    random.shuffle(entries)
    body = ', '.join(entries)

    candidates = [
        'Here is the final analysis. FINAL: { "criteria": { ' + body + ' } }',
        'FINAL: { "criteria": { ' + body + ' } }',
        # Malformed on purpose: extra trailing comma before the closing brace
        'FINAL: { "criteria": { ' + body + ', } }',
    ]
    return {"output": random.choice(candidates)}


def extract_final_json(text: str):
    """
    Parse the JSON object that follows the 'FINAL:' marker in an agent reply.

    The text from the first '{' after the marker is cleaned of trailing commas
    (", }" and ", ]") and decoded with ``json.JSONDecoder.raw_decode``, which
    consumes one complete JSON value. Nested objects such as the ``criteria``
    dict are therefore read in full and any text after the closing brace is
    ignored. The previous non-greedy regex stopped at the first '}',
    truncating every nested answer.

    If there is no marker, or no '{' after it, the first '{' anywhere in the
    text is used instead.

    Returns the parsed JSON value, or raises ValueError if not found or invalid.
    """
    marker = re.search(r'FINAL:', text)
    start = text.find('{', marker.end()) if marker else -1
    if start == -1:
        # No marker (or nothing after it): fall back to the first object in the text.
        start = text.find('{')
    if start == -1:
        raise ValueError("No 'FINAL:' marker or JSON object found.")

    json_str = text[start:]
    # Tolerate the common LLM mistake of a trailing comma before '}' or ']'.
    json_str_fixed = re.sub(r',\s*}', '}', json_str)
    json_str_fixed = re.sub(r',\s*]', ']', json_str_fixed)
    try:
        parsed, _end = json.JSONDecoder().raw_decode(json_str_fixed)
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON found. Error: {e}. String was: {json_str}") from e
    return parsed

def sum_criteria_points(data: dict) -> int:
    """
    Return the sum of the integer values found under ``data['criteria']``.

    A missing or non-dict 'criteria' entry yields 0; values that are not
    integers are left out of the sum.
    """
    scores = data.get("criteria", {})
    if not isinstance(scores, dict):
        return 0
    int_values = [entry for entry in scores.values() if isinstance(entry, int)]
    return sum(int_values)


# Path of the human-labelled validation set; also used in the error message so
# the message always names the file that was actually opened.
VALIDATION_CSV = "validation.csv"

try:
    df = pd.read_csv(VALIDATION_CSV)
except FileNotFoundError:
    print(f"Error: The file '{VALIDATION_CSV}' was not found.")
    # Stop here: the consistency test cannot run without the input data.
    # SystemExit is raised directly (with a non-zero status) instead of
    # calling the interactive-only `exit()` helper.
    raise SystemExit(1)

# Per-row statistics gathered by the consistency test
consistency_results = []

print("--- Starting Consistency Test ---")
print(f"Testing {NUM_ROWS_TO_TEST} rows, with {REPETITIONS_PER_ROW} calls per row.")

for index, row in df.head(NUM_ROWS_TO_TEST).iterrows():
    print(f"\n--- Testing Row {index} ---")
    source_text = row.get('Source Text (English)', 'N/A')
    snippet = source_text[:80]  # only the first 80 characters are shown
    print(f"Source Text: \"{snippet}...\"")

    # One entry per repetition: the criteria total, or None when parsing failed.
    scores = []
    for run_number in range(1, REPETITIONS_PER_ROW + 1):
        try:
            response_tricky = agent_executor.invoke({
                "source_text": row['Source Text (English)'],
                "translated_text": row['Target Text (Filipino)'],
            })
            final_json = extract_final_json(response_tricky["output"])
            total_points = sum_criteria_points(final_json)
        except ValueError as e:
            scores.append(None)
            print(f"  Repetition {run_number}: FAILED to parse output. Error: {e}")
        else:
            scores.append(total_points)
            print(f"  Repetition {run_number}: Success, Score = {total_points}")

        time.sleep(RETRY_DELAY_SECONDS)

    valid_scores = list(filter(lambda s: s is not None, scores))

    if valid_scores:
        mean_score = np.mean(valid_scores)
        std_dev = np.std(valid_scores)
        min_score = np.min(valid_scores)
        max_score = np.max(valid_scores)

        # "Consistency" is defined here as 1 minus the standard deviation of
        # the successful scores: 1.0 means every run agreed.
        consistency_score = 1.0 - std_dev

        print(f"\n  -> Consistency Analysis for Row {index}:")
        print(f"     Scores Obtained: {scores}")
        print(f"     Successful Runs: {len(valid_scores)}/{REPETITIONS_PER_ROW}")
        print(f"     Average Score: {mean_score:.2f}")
        print(f"     Standard Deviation: {std_dev:.2f}")
        print(f"     Min/Max Scores: {min_score}/{max_score}")
        print(f"     Consistency Score (1.0 - std_dev): {consistency_score:.2f}")

        stats = {
            "row_index": index,
            "scores": scores,
            "successful_runs": len(valid_scores),
            "average_score": mean_score,
            "std_dev": std_dev,
            "min_score": min_score,
            "max_score": max_score,
            "consistency_score": consistency_score
        }
    else:
        # Every repetition failed to parse: record the row with a zero score.
        print("  -> RESULT: No successful runs for this row.")
        stats = {
            "row_index": index,
            "scores": scores,
            "successful_runs": 0,
            "consistency_score": 0
        }

    consistency_results.append(stats)


print("\n\n Overall Consistency Report")
# Index the report by the validation-set row number.
results_df = pd.DataFrame(consistency_results).set_index('row_index')
print(results_df)


--- Starting Consistency Test ---
Testing 3 rows, with 5 calls per row.

--- Testing Row 0 ---
Source Text: "The children laughed and played under the afternoon sun...."


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': "Translate 'afternoon sun' in Filipino"}`
responded: 


The search results indicate that "hapon" is the Filipino word for "afternoon." The phrase "hapon na araw" literally translates to "afternoon sun." While grammatically correct, a more natural and idiomatic way to express this in Filipino would be "araw ng hapon" or "araw sa hapon." However, the current translation is understandable.

Here's the evaluation:

*   **Accuracy:** 1 - The meaning is preserved. "Hapon na araw" directly translates to "afternoon sun."
*   **Fluency:** 1 - The sentence is grammatically correct and understandable, though not the most idiomatic phrasing.
*   **Coherence:** 1 - The sentence flows logically and the meaning is clea

In [None]:
# The consistency report is indexed by 'row_index' (set via set_index in the
# consistency test), so the index must be written: with index=False the CSV
# would lose the row identifiers.
results_df.to_csv('results2.csv', index=True)

In [None]:
# Show the current contents of results_df (the per-row report table).
print(results_df)

                                scores  successful_runs  average_score  \
row_index                                                                
0                   [6, 6, None, 6, 6]                4           6.00   
1          [None, None, 6, None, None]                1           6.00   
2                   [0, 2, 3, None, 2]                4           1.75   

            std_dev  min_score  max_score  consistency_score  
row_index                                                     
0          0.000000          6          6           1.000000  
1          0.000000          6          6           1.000000  
2          1.089725          0          3          -0.089725  


In [None]:
# Mean of the per-row consistency scores over all tested rows.
consistency_column = results_df['consistency_score']
average_consistency_score = consistency_column.mean()
print(f"\nAverage Consistency Score across all tested rows: {average_consistency_score:.2f}")


Average Consistency Score across all tested rows: 0.64


## Only one tool use

In [None]:
from typing import Literal, Optional
from typing import Literal, Optional
from concurrent.futures import ThreadPoolExecutor
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate


# Pydantic schema for a structured translation verdict (score, label, reasoning).
# NOTE(review): the prompts in this cell ask for a differently shaped JSON
# ("criteria" / "explanation"); confirm whether this model is still used anywhere.
class TranslationEvaluation(BaseModel):
    """The final evaluation of the English-to-Filipino translation."""
    # Required integer; the 1-5 range is stated only in the description, not validated.
    score: int = Field(..., description="Numerical score from 1 (poor) to 5 (perfect).")
    # Required; must be exactly one of the five listed strings.
    label: Literal["Incomprehensible", "Poor", "Good", "Excellent", "Perfect"] = Field(
        ..., description="Categorical label for the translation quality."
    )
    # Required free-text justification for the score.
    reasoning: str = Field(
        ..., description="Detailed, point-by-point reasoning for the score, citing specific examples from the text."
    )


# Web-search tool the agent may call to look up individual words or phrases.
# `max_results` is the TavilySearchResults field that caps the number of hits;
# the former `k=1` was not honoured (the logged tool output contains several results).
search_tool = TavilySearchResults(max_results=1)
# The description is what the model reads when deciding whether/how to call the tool,
# so the example query is spelled correctly.
search_tool.description = (
    "Use this to search for definitions, synonyms, or cultural context of specific English or Filipino words and phrases. Mention the word TRANSLATE so that Tavily knows that you're looking for the counterpart of that word like \"Translate 'food' in Filipino\"."
)


@tool
def opinion_pooling_tool(source_text: str, translated_text: str, reference_text: Optional[str] = None) -> str:
    """
    Use this ONLY as a last resort if the search tool did not clarify your uncertainty. 
    This tool consults other expert AI models (Gemini and GPT-4) for their evaluations. It is very expensive.
    """
    # NOTE: the `@tool` decorator uses the docstring above as the tool description that
    # the agent sees, so it is kept verbatim; implementation notes live in comments.
    #
    # Parameters:
    #   source_text     - English source sentence.
    #   translated_text - Filipino translation to be judged.
    #   reference_text  - optional reference translation; "N/A" is sent when omitted.
    # Returns: one string containing both judges' raw answers (or a per-judge
    #   "unavailable" note); if every judge fails, a single error message.
    print("\n--- CONSULTING EXPENSIVE OPINION POOLING TOOL ---")

    # Two independent judge models queried with the same rubric.
    gemini_judge = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0.2)
    chatgpt_judge = ChatOpenAI(model="gpt-4o", temperature=0.2)  # Using a different GPT model

    # Literal JSON braces are doubled ({{ }}) so that only {source}, {translation} and
    # {reference} at the end of the template are treated as input variables.
    judge_prompt = ChatPromptTemplate.from_template(
        """You are a rigorous, impartial English→Filipino translation judge with deep expertise in Filipino grammar, style, and cultural nuance. Evaluate ONLY the given source/translation using the criteria below. Favor idiomatic Filipino that preserves meaning. Penalize omissions/additions, mistranslations (polarity/negation, tense/aspect, quantities, named entities), awkward calques, unjustified Taglish, and register mismatches. Do NOT rewrite the translation—only judge it. When uncertain, choose the lower score and justify briefly with evidence. Assume formal register unless stated otherwise. If no domain/style guide is provided, use general editorial norms as the guideline.

SCORED REFERENCE EXAMPLES (for patterning; do NOT output these):

- Example A — Excellent
  Source: "The meeting was postponed because of the storm."
  Translation: "Naantala ang pagpupulong dahil sa bagyo."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 1}},
    "total_points": 6, "overall_score": 5, "label": "excellent",
    "explanation": "Idiomatic and precise; preserves cause and entities; no omissions/additions."}}

- Example B — Very good (minor style issue)
  Source: "Please submit the report by Friday."
  Translation: "Pakiusap na isumite ang ulat pagsapit ng Biyernes."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 5, "overall_score": 4, "label": "very_good",
    "explanation": "Meaning preserved; minor completeness/style nuance (tone/softener not fully mirrored)."}}

- Example C — Good (loss of specificity)
  Source: "Do not turn off the main power switch."
  Translation: "Huwag patayin ang switch."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Omits 'main power' → specificity lost (accuracy/completeness↓); grammar/flow are fine."}}

- Example D — Fair (noticeable errors, mostly understandable)
  Source: "Store the medicine in a cool, dry place."
  Translation: "Itago ang gamot sa malamig na lugar."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Misses 'dry' and guidance nuance; otherwise natural. (If policy requires both conditions, consider Completeness=0 and Guideline=0.)"}}

- Example E — Poor (wrong meaning)
  Source: "Keep out of reach of children."
  Translation: "Maganda ang bata."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 0, "cultural_appropriateness": 0, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 1, "overall_score": 1, "label": "poor",
    "explanation": "Unrelated meaning; safety directive lost; incoherent to instruction context."}}

Additionally, you will also be given a few examples of English→Filipino pairs—each with a correct translation, a flawed translation, and a short remark explaining the flaw. Study those examples, then evaluate a new pair according to six binary criteria.

Examples (from CSV columns “source”, “correct”, “flawed”, separated by |):
	1.	The Philippines is an archipelago made up of over 7,640 islands, though only about 2,000 are inhabited. | Ang Pilipinas ay isang kapulaang binubuo ng 7,640 na isla, ngunit 2,000 lamang ang tinitirahan | Ang Pilipinas ay isang puno na binubuo ng mahigit 7,640 manok, bagaman halos 2,000 lamang ang tumira.
	2.	Philippines was also a U.S. territory from 1898 to 1946. | Ang Pilipinas ay naging isang teritoryo rin ng Estados Unidos mula 1898 hanggang 1946 | Ang Estados Unidos ay naging isang teritoryo ng Pilipinas mula 1946 hanggang 1898
	3.	The national hero of the Philippines is Dr. Jose Rizal. | Si Dr. Jose Rizal ang pambansang bayani ng Pilipinas | Ang Pilipinas ang bansang bayani ni Dr. Jose Rizal
	4.	The national animal of the Philippines is the Carabao. | Ang pambansang hayop ng Pilipinas ay ang kalabaw | Ang pambansang hayop ng Pilipinas ay ang aso
	5.	The national bird is the Philippine Eagle, one of the largest and most powerful eagles in the world. | Ang pambansang ibon ay ang Philippine Eagle, isa sa pinakamalaki at pinakamalakas na agila sa mundo | Ang karaniwang ibon na Philippine Eagle ay isang maliit na Agila
    

Scoring rubric (binary 0/1 for each):
1) Accuracy — Meaning preserved (entities, polarity, tense/aspect, quantities, conditions).
2) Fluency — Natural, grammatical Filipino (orthography, morphology, agreement).
3) Coherence — Logical flow; clear referents/connectors; consistent register.
4) Cultural Appropriateness — Idiomatic usage; avoids unjustified Taglish/calques; suitable register.
5) Guideline Adherence — Follows stated domain/style rules (or general editorial norms if none provided).
6) Completeness — No omissions/additions; all content rendered faithfully.

Hard rules:
- Critical meaning error (e.g., negation flip, wrong entity) → Accuracy=0.
- Major omission/addition → Completeness=0 (and Accuracy=0 if meaning affected).
- Pervasive unjustified Taglish/calques in formal context → Fluency=0 (and possibly Cultural=0).

Scoring aggregation:
- Compute total_points = sum of the six criteria (0–6).
- Map to overall_score (integer 1–5):
  0–1 → 1 (“poor”)
  2   → 2 (“fair”)
  3–4 → 3 (“good”)
  5   → 4 (“very_good”)
  6   → 5 (“excellent”)
- Label must match overall_score exactly:
  1→"poor", 2→"fair", 3→"good", 4→"very_good", 5→"excellent".

VALIDATION CHECKS (must hold):
- total_points == accuracy+fluency+coherence+cultural_appropriateness+guideline_adherence+completeness
- overall_score and label match the mapping above.
- Use integers only (0/1 for criteria; 1–5 for overall_score). No extra keys.

OUTPUT FORMAT — return JSON ONLY (no prose/backticks). Exactly this schema:
{{"criteria": {{"accuracy": 0 or 1, "fluency": 0 or 1, "coherence": 0 or 1, "cultural_appropriateness": 0 or 1, "guideline_adherence": 0 or 1, "completeness": 0 or 1}},
  "total_points": integer 0-6,
  "overall_score": integer 1-5,
  "label": "poor"|"fair"|"good"|"very_good"|"excellent",
  "explanation": "≤120 words; brief evidence for each criterion"}}

NOW evaluate the following translation.

Source (English):
{source}

Translation (Filipino):
{translation}

Reference translation (N/A if none was provided):
{reference}
        """
    )

    # One prompt -> model -> plain-string chain per judge.
    parser = StrOutputParser()
    gemini_chain = judge_prompt | gemini_judge | parser
    chatgpt_chain = judge_prompt | chatgpt_judge | parser

    # Keys must match the template's input variables.
    input_data = {
        "source": source_text,
        "translation": translated_text,
        "reference": reference_text or "N/A"
    }

    opinions = {}
    failures = {}

    # Run API calls in parallel to save time
    with ThreadPoolExecutor(max_workers=2) as executor:
        pending = {
            "Gemini": executor.submit(gemini_chain.invoke, input_data),
            "GPT-4": executor.submit(chatgpt_chain.invoke, input_data),
        }
        for judge_name, future in pending.items():
            print(f"...getting opinion from {judge_name}...")
            try:
                opinions[judge_name] = future.result()
            except Exception as e:
                # Tool boundary: report the failure to the agent as text rather than
                # raising, and keep whatever the other judge returned.
                failures[judge_name] = e
                opinions[judge_name] = f"(unavailable: {e})"

    if len(failures) == len(pending):
        details = "; ".join(f"{name}: {err}" for name, err in failures.items())
        return f"An error occurred while consulting models: {details}"

    gemini_opinion = opinions["Gemini"]
    chatgpt_opinion = opinions["GPT-4"]

    return f"""Consultation results:
- Opinion from Gemini-2.5-Pro:
{gemini_opinion}

- Opinion from GPT-4:
{chatgpt_opinion}
"""

# Imported here so the cell stays self-contained: the agent scratchpad must be a
# message placeholder, not a formatted string.
from langchain_core.prompts import MessagesPlaceholder

# Tools exposed to the agent; only the search tool is registered in this experiment.
tools = [search_tool]

# Chat prompt: system rubric, the human turn carrying the pair to judge, and a slot
# for the agent's intermediate tool calls/results.
# Literal JSON braces in the system text are doubled ({{ }}) so they are not parsed
# as template variables.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a rigorous, impartial English→Filipino translation judge with deep expertise in Filipino grammar, style, and cultural nuance. Evaluate ONLY the given source/translation using the criteria below. Favor idiomatic Filipino that preserves meaning. Penalize omissions/additions, mistranslations (polarity/negation, tense/aspect, quantities, named entities), awkward calques, unjustified Taglish, and register mismatches. Do NOT rewrite the translation—only judge it. When uncertain, choose the lower score and justify briefly with evidence. Assume formal register unless stated otherwise. If no domain/style guide is provided, use general editorial norms as the guideline.

SCORED REFERENCE EXAMPLES (for patterning; do NOT output these):

- Example A — Excellent
  Source: "The meeting was postponed because of the storm."
  Translation: "Naantala ang pagpupulong dahil sa bagyo."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 1}},
    "total_points": 6, "overall_score": 5, "label": "excellent",
    "explanation": "Idiomatic and precise; preserves cause and entities; no omissions/additions."}}

- Example B — Very good (minor style issue)
  Source: "Please submit the report by Friday."
  Translation: "Pakiusap na isumite ang ulat pagsapit ng Biyernes."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 5, "overall_score": 4, "label": "very_good",
    "explanation": "Meaning preserved; minor completeness/style nuance (tone/softener not fully mirrored)."}}

- Example C — Good (loss of specificity)
  Source: "Do not turn off the main power switch."
  Translation: "Huwag patayin ang switch."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Omits 'main power' → specificity lost (accuracy/completeness↓); grammar/flow are fine."}}

- Example D — Fair (noticeable errors, mostly understandable)
  Source: "Store the medicine in a cool, dry place."
  Translation: "Itago ang gamot sa malamig na lugar."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Misses 'dry' and guidance nuance; otherwise natural. (If policy requires both conditions, consider Completeness=0 and Guideline=0.)"}}

- Example E — Poor (wrong meaning)
  Source: "Keep out of reach of children."
  Translation: "Maganda ang bata."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 0, "cultural_appropriateness": 0, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 1, "overall_score": 1, "label": "poor",
    "explanation": "Unrelated meaning; safety directive lost; incoherent to instruction context."}}

Additionally, you will also be given a few examples of English→Filipino pairs—each with a correct translation, a flawed translation, and a short remark explaining the flaw. Study those examples, then evaluate a new pair according to six binary criteria.

Examples (from CSV columns “source”, “correct”, “flawed”, separated by |):
	1.	The Philippines is an archipelago made up of over 7,640 islands, though only about 2,000 are inhabited. | Ang Pilipinas ay isang kapulaang binubuo ng 7,640 na isla, ngunit 2,000 lamang ang tinitirahan | Ang Pilipinas ay isang puno na binubuo ng mahigit 7,640 manok, bagaman halos 2,000 lamang ang tumira.
	2.	Philippines was also a U.S. territory from 1898 to 1946. | Ang Pilipinas ay naging isang teritoryo rin ng Estados Unidos mula 1898 hanggang 1946 | Ang Estados Unidos ay naging isang teritoryo ng Pilipinas mula 1946 hanggang 1898
	3.	The national hero of the Philippines is Dr. Jose Rizal. | Si Dr. Jose Rizal ang pambansang bayani ng Pilipinas | Ang Pilipinas ang bansang bayani ni Dr. Jose Rizal
	4.	The national animal of the Philippines is the Carabao. | Ang pambansang hayop ng Pilipinas ay ang kalabaw | Ang pambansang hayop ng Pilipinas ay ang aso
	5.	The national bird is the Philippine Eagle, one of the largest and most powerful eagles in the world. | Ang pambansang ibon ay ang Philippine Eagle, isa sa pinakamalaki at pinakamalakas na agila sa mundo | Ang karaniwang ibon na Philippine Eagle ay isang maliit na Agila
    

Scoring rubric (binary 0/1 for each):
1) Accuracy — Meaning preserved (entities, polarity, tense/aspect, quantities, conditions).
2) Fluency — Natural, grammatical Filipino (orthography, morphology, agreement).
3) Coherence — Logical flow; clear referents/connectors; consistent register.
4) Cultural Appropriateness — Idiomatic usage; avoids unjustified Taglish/calques; suitable register.
5) Guideline Adherence — Follows stated domain/style rules (or general editorial norms if none provided).
6) Completeness — No omissions/additions; all content rendered faithfully.

Hard rules:
- Critical meaning error (e.g., negation flip, wrong entity) → Accuracy=0.
- Major omission/addition → Completeness=0 (and Accuracy=0 if meaning affected).
- Pervasive unjustified Taglish/calques in formal context → Fluency=0 (and possibly Cultural=0).


VALIDATION CHECKS (must hold):
- total_points == accuracy+fluency+coherence+cultural_appropriateness+guideline_adherence+completeness
- Use integers only (0/1 for criteria; 1–5 for overall_score). No extra keys.

You can use tools like Tavily Search to clarify uncertainties about specific words or phrases, but do not use them for general translation help.

Please reason before answering like why thats your score for the criteria. After your done type your final answer by typing 'FINAL:' followed by your answer in the following JSON format schema:{{"criteria": {{"accuracy": 0 or 1, "fluency": 0 or 1, "coherence": 0 or 1, "cultural_appropriateness": 0 or 1, "guideline_adherence": 0 or 1, "completeness": 0 or 1}},
  "explanation": "≤120 words; brief evidence for each criterion"}}
""",
        ),
        (
            "human",
            """NOW Please evaluate the following translation.

**Source:**
{source_text}

**Translation (Filipino):**
{translated_text}
""",
        ),
        # Where the agent keeps its intermediate work (tool calls and tool results).
        # A MessagesPlaceholder inserts those messages as real chat turns; the earlier
        # ("ai", "{agent_scratchpad}") form stringified the message list into one AI turn.
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# Judge model driving the agent; low temperature to reduce run-to-run variation.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0.1)

# Bind the model, tools and prompt into a tool-calling agent.
agent = create_openai_tools_agent(llm, tools, prompt_template)

agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True, # Set to True to see the agent's thought process
)

# Single demo run: invoke the agent on one hand-written pair and print its final answer.
if __name__ == "__main__":
    print("--- Example 2: A trickier translation with business jargon ---")
    # English source and a candidate translation that keeps the English jargon terms.
    source_tricky = "To maximize synergy, we must leverage our core competencies and streamline our workflow."
    translation_tricky = "Para ma-maximize ang synergy, dapat gamitin natin ang ating core competencies at i-streamline ang workflow."
    
    # Keys correspond to the {source_text} / {translated_text} placeholders in the human message.
    response_tricky = agent_executor.invoke({
        "source_text": source_tricky,
        "translated_text": translation_tricky,
    })
    print("\n--- FINAL OUTPUT ---")
    # The executor's result is indexed by "output" to get the agent's final text.
    print(response_tricky["output"])

--- Example 2: A trickier translation with business jargon ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Filipino translation for maximize synergy'}`
responded: 


[0m[36;1m[1;3m[{'title': 'Maximize in Filipino | English to Filipino Dictionary', 'url': 'https://www.translate.com/dictionary/english-filipino/maximize-18206178', 'content': 'Filipino translation of maximize is i-maximize. Tap once to copy the translated word. Translate.com.', 'score': 0.81630427}, {'title': '[PDF] Philippines-Human-Rights-2020-2022.pdf - ohchr', 'url': 'https://www.ohchr.org/sites/default/files/documents/countries/ph/2022-09-14/Philippines-Human-Rights-2020-2022.pdf', 'content': 'The UNJP offers a new mode of human rights cooperation that is more responsive to the complexities of issues confronting nations, harnessing opportunities for partnerships, optimizing synergies and strengths in the UN system, and anchoring programs for 

In [None]:
import pandas as pd
import json
import re
import time
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 2

def extract_final_json(text: str):
    """Extract and parse the JSON verdict from an agent response.

    The JSON object is looked for after the first ``FINAL:`` marker. If there is no
    marker, or no ``{`` follows it, the first ``{`` anywhere in the text is used instead.
    Parsing uses ``json.JSONDecoder.raw_decode``, so nested objects (such as the
    ``"criteria"`` dict) are handled correctly and any text after the object
    (code fences, closing remarks) is ignored.

    Args:
        text: Raw model output.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        ValueError: If no JSON object start is found, or the object is not valid JSON.
    """
    marker = re.search(r'FINAL:\s*', text)

    # Prefer the object that follows the marker; otherwise fall back to the whole text.
    start = text.find('{', marker.end()) if marker else -1
    if start == -1:
        start = text.find('{')
    if start == -1:
        raise ValueError("No 'FINAL:' marker or JSON object found in the input string.")

    try:
        # raw_decode parses exactly one JSON value starting at `start` and stops at its
        # matching closing brace, which a regex cannot do for nested objects.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError as e:
        # Only the candidate at `start` is tried: scanning on to later braces could
        # silently return an inner fragment of a truncated answer.
        # The error message includes the problematic string for easier debugging.
        raise ValueError(f"Invalid JSON found. Error: {e}. String was: {text[start:]}") from e
    return parsed

def sum_criteria_points(data: dict) -> int:
    """Add up the integer values stored under ``data["criteria"]``.

    A missing ``"criteria"`` key is treated as an empty mapping, and a value that is
    not a dict yields 0 rather than an error. Entries whose value is not an ``int``
    are skipped.
    """
    section = data.get("criteria", {})
    if isinstance(section, dict):
        running_total = 0
        for points in section.values():
            if isinstance(points, int):
                running_total += points
        return running_total
    # Malformed 'criteria' payload: nothing to count.
    return 0


# Load the validation set. The path lives in one place so the error message
# always names the file that was actually requested.
VALIDATION_CSV_PATH = "validation.csv"

try:
    df = pd.read_csv(VALIDATION_CSV_PATH)
except FileNotFoundError:
    print(f"Error: The file '{VALIDATION_CSV_PATH}' was not found.")
    # Stop here if the essential input file is missing; a non-zero status marks the
    # run as failed, and SystemExit does not depend on the interactive `exit` helper.
    raise SystemExit(1) from None


print(f"Found columns: {df.columns.tolist()}")
print(f"Processing {len(df)} rows...")

# One record per input row, in input order.
results = []

for index, row in df.iterrows():
    source_text = row['Source Text (English)']
    target_text = row['Target Text (Filipino)']

    # Start from the failure record; it is overwritten only when an attempt succeeds.
    # NOTE(review): a failed row keeps total_points == 0, which downstream code cannot
    # tell apart from a genuine zero score — filter on "status" when analysing.
    record = {
        "row_index": index,
        "source_text": source_text,
        "target_text": target_text,
        "final_json": None,
        "total_points": 0,
        "status": f"Failed after {MAX_RETRIES} attempts",
    }

    for attempt in range(1, MAX_RETRIES + 1):
        # Printed before the call so the agent's verbose trace appears under its header.
        print(f"\n--- Processing Row {index} (Attempt {attempt}/{MAX_RETRIES}) ---")
        try:
            response = agent_executor.invoke({
                "source_text": source_text,
                "translated_text": target_text,
            })
            final_json = extract_final_json(response["output"])
        except ValueError as e:
            # Unparseable answer: retry after a short pause, unless attempts are used up.
            print(f"An error occurred on row {index}: {e}")
            if attempt < MAX_RETRIES:
                print(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            continue

        total_points = sum_criteria_points(final_json)
        print("Successfully extracted JSON:", final_json)
        print("Total points:", total_points)

        record.update(final_json=final_json, total_points=total_points, status="Success")
        # If the processing was successful, break out of the retry loop.
        break
    else:
        # Reached only when no attempt hit `break`.
        print(f"Failed to process row {index} after {MAX_RETRIES} attempts. Skipping.")

    results.append(record)

results_df = pd.DataFrame(results)
print("\n--- Processing Complete ---")
print(results_df)

Found columns: ['Source Text (English)', 'Target Text (Filipino)', 'Final Score', 'Rater 1 Explanation', 'Rater 2 Explanation']
Processing 40 rows...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Filipino word for afternoon sun'}`
responded: 


The source sentence is "The children laughed and played under the afternoon sun."

The Filipino translation is "Ang mga bata ay nagtawanan at naglaro sa ilalim ng hapon na araw."

Let's break down the evaluation:

*   **Accuracy:** The translation is accurate. "Ang mga bata" means "The children," "ay nagtawanan" means "laughed," and "naglaro" means "played." "Sa ilalim ng" means "under." The phrase "hapon na araw" translates to "afternoon sun." All core components are correctly represented.
*   **Fluency:** The sentence is fluent and grammatically correct in Filipino. The word order and verb conjugations are appropriate.
*   **Coherence:** The sentence is coherent and makes 

In [None]:
comparison = []
def map_to_overall_score(num):
    """Translate a criteria total into an ``(overall_score, label)`` tuple.

    Bands (inclusive): 0-1 -> (1, "poor"), 2 -> (2, "fair"), 3-4 -> (3, "good"),
    5 -> (4, "very_good"), 6 -> (5, "excellent").

    Raises:
        ValueError: If ``num`` does not fall inside any band.
    """
    # (lowest, highest, verdict) triples, checked in ascending order.
    bands = (
        (0, 1, (1, "poor")),
        (2, 2, (2, "fair")),
        (3, 4, (3, "good")),
        (5, 5, (4, "very_good")),
        (6, 6, (5, "excellent")),
    )
    for lowest, highest, verdict in bands:
        if lowest <= num <= highest:
            return verdict
    raise ValueError("Input number is out of the valid range 0 to 6")
    
# Build one comparison record per model result, pairing it with the human score.
# Iterating the rows directly avoids re-filtering results_df for every index.
for _, model_row in results_df.iterrows():
    idx = model_row['row_index']
    # Raw criteria total produced by the model for this row.
    model_score = model_row['total_points']
    # Get human score from df (None when the row has no counterpart there).
    human_score = df.loc[idx, 'Final Score'] if idx in df.index else None

    # Without a human score there is nothing to compare; store None (dropped later by
    # .dropna()) instead of attempting arithmetic on None, which raises TypeError.
    # NOTE(review): the stored value is the squared difference divided by 2, computed
    # on the raw total_points rather than on the mapped overall score saved under
    # 'model_score' — confirm both choices are intended.
    if human_score is None:
        half_squared_error = None
    else:
        half_squared_error = ((model_score - human_score) ** 2)/2

    comparison.append({
        'row_index': idx,
        'source_text': model_row['source_text'],
        'target_text': model_row['target_text'],
        'model_score': map_to_overall_score(model_score),
        'human_score': human_score,
        'MSE': half_squared_error,
    })

comparison_df = pd.DataFrame(comparison)
print(comparison_df.columns)

Index(['row_index', 'source_text', 'target_text', 'model_score', 'human_score',
       'MSE'],
      dtype='object')


In [None]:
# Save comparison_df to 1Atoolresults.csv, leaving out the DataFrame index column.
comparison_df.to_csv("1Atoolresults.csv", index=False)

In [None]:
# Take the per-row 'MSE' column, discarding rows where it is missing.
mse_values = comparison_df['MSE'].dropna()
# Report None rather than the mean of an empty column when no row has a value.
average_mse = mse_values.mean() if not mse_values.empty else None
print(f"\nAverage MSE of results: {average_mse}")


Average MSE of results: 5.225


## Pooling tool use

In [None]:
from typing import Literal, Optional
from typing import Literal, Optional
from concurrent.futures import ThreadPoolExecutor
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools import tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate


# Pydantic schema for a structured translation verdict (score, label, reasoning).
# NOTE(review): the judge prompt in this cell asks for a differently shaped JSON
# ("criteria" / "explanation"); confirm whether this model is still used anywhere.
class TranslationEvaluation(BaseModel):
    """The final evaluation of the English-to-Filipino translation."""
    # Required integer; the 1-5 range is stated only in the description, not validated.
    score: int = Field(..., description="Numerical score from 1 (poor) to 5 (perfect).")
    # Required; must be exactly one of the five listed strings.
    label: Literal["Incomprehensible", "Poor", "Good", "Excellent", "Perfect"] = Field(
        ..., description="Categorical label for the translation quality."
    )
    # Required free-text justification for the score.
    reasoning: str = Field(
        ..., description="Detailed, point-by-point reasoning for the score, citing specific examples from the text."
    )


# Web-search tool the agent may call to look up individual words or phrases.
# `max_results` is the TavilySearchResults field that caps the number of hits;
# the former `k=1` was not honoured (earlier logged tool output contains several results).
search_tool = TavilySearchResults(max_results=1)
# The description is what the model reads when deciding whether/how to call the tool,
# so the example query is spelled correctly.
search_tool.description = (
    "Use this to search for definitions, synonyms, or cultural context of specific English or Filipino words and phrases. Mention the word TRANSLATE so that Tavily knows that you're looking for the counterpart of that word like \"Translate 'food' in Filipino\"."
)

@tool
def opinion_pooling_tool(source_text: str, translated_text: str, reference_text: Optional[str] = None) -> str:
    """
    Use this ONLY as a last resort if the search tool did not clarify your uncertainty. 
    This tool consults other expert AI models (Gemini and GPT-4) for their evaluations. It is very expensive.
    """
    # NOTE: the docstring above is left untouched on purpose - the @tool decorator
    # exposes it to the agent as the tool description.
    #
    # Parameters:
    #   source_text      - English source sentence to be judged.
    #   translated_text  - Filipino translation to be judged.
    #   reference_text   - optional reference translation; "N/A" is sent when absent.
    # Returns a text block containing both judges' raw answers, or an error
    # message string if either judge call raises.
    print("\n--- CONSULTING EXPENSIVE OPINION POOLING TOOL ---")

    # Two independent judge models, both at low temperature.
    gemini_judge = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0.2)
    chatgpt_judge = ChatOpenAI(model="gpt-4o", temperature=0.2) # Using a different GPT model

    # The template ends with {source}/{translation}/{reference} placeholders so
    # the judges actually receive the pair they are asked to grade. Without them
    # the prompt contained only the rubric and examples, and the judges graded
    # text that was never supplied.
    judge_prompt = ChatPromptTemplate.from_template(
        """You are a rigorous, impartial English→Filipino translation judge with deep expertise in Filipino grammar, style, and cultural nuance. Evaluate ONLY the given source/translation using the criteria below. Favor idiomatic Filipino that preserves meaning. Penalize omissions/additions, mistranslations (polarity/negation, tense/aspect, quantities, named entities), awkward calques, unjustified Taglish, and register mismatches. Do NOT rewrite the translation—only judge it. When uncertain, choose the lower score and justify briefly with evidence. Assume formal register unless stated otherwise. If no domain/style guide is provided, use general editorial norms as the guideline.

SCORED REFERENCE EXAMPLES (for patterning; do NOT output these):

- Example A — Excellent
  Source: "The meeting was postponed because of the storm."
  Translation: "Naantala ang pagpupulong dahil sa bagyo."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 1}},
    "total_points": 6, "overall_score": 5, "label": "excellent",
    "explanation": "Idiomatic and precise; preserves cause and entities; no omissions/additions."}}

- Example B — Very good (minor style issue)
  Source: "Please submit the report by Friday."
  Translation: "Pakiusap na isumite ang ulat pagsapit ng Biyernes."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 5, "overall_score": 4, "label": "very_good",
    "explanation": "Meaning preserved; minor completeness/style nuance (tone/softener not fully mirrored)."}}

- Example C — Good (loss of specificity)
  Source: "Do not turn off the main power switch."
  Translation: "Huwag patayin ang switch."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Omits 'main power' → specificity lost (accuracy/completeness↓); grammar/flow are fine."}}

- Example D — Fair (noticeable errors, mostly understandable)
  Source: "Store the medicine in a cool, dry place."
  Translation: "Itago ang gamot sa malamig na lugar."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Misses 'dry' and guidance nuance; otherwise natural. (If policy requires both conditions, consider Completeness=0 and Guideline=0.)"}}

- Example E — Poor (wrong meaning)
  Source: "Keep out of reach of children."
  Translation: "Maganda ang bata."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 0, "cultural_appropriateness": 0, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 1, "overall_score": 1, "label": "poor",
    "explanation": "Unrelated meaning; safety directive lost; incoherent to instruction context."}}

Additionally, you will also be given a few examples of English→Filipino pairs—each with a correct translation, a flawed translation, and a short remark explaining the flaw. Study those examples, then evaluate a new pair according to six binary criteria.

Examples (from CSV columns “source”, “correct”, “flawed”, separated by |):
	1.	The Philippines is an archipelago made up of over 7,640 islands, though only about 2,000 are inhabited. | Ang Pilipinas ay isang kapulaang binubuo ng 7,640 na isla, ngunit 2,000 lamang ang tinitirahan | Ang Pilipinas ay isang puno na binubuo ng mahigit 7,640 manok, bagaman halos 2,000 lamang ang tumira.
	2.	Philippines was also a U.S. territory from 1898 to 1946. | Ang Pilipinas ay naging isang teritoryo rin ng Estados Unidos mula 1898 hanggang 1946 | Ang Estados Unidos ay naging isang teritoryo ng Pilipinas mula 1946 hanggang 1898
	3.	The national hero of the Philippines is Dr. Jose Rizal. | Si Dr. Jose Rizal ang pambansang bayani ng Pilipinas | Ang Pilipinas ang bansang bayani ni Dr. Jose Rizal
	4.	The national animal of the Philippines is the Carabao. | Ang pambansang hayop ng Pilipinas ay ang kalabaw | Ang pambansang hayop ng Pilipinas ay ang aso
	5.	The national bird is the Philippine Eagle, one of the largest and most powerful eagles in the world. | Ang pambansang ibon ay ang Philippine Eagle, isa sa pinakamalaki at pinakamalakas na agila sa mundo | Ang karaniwang ibon na Philippine Eagle ay isang maliit na Agila
    

Scoring rubric (binary 0/1 for each):
1) Accuracy — Meaning preserved (entities, polarity, tense/aspect, quantities, conditions).
2) Fluency — Natural, grammatical Filipino (orthography, morphology, agreement).
3) Coherence — Logical flow; clear referents/connectors; consistent register.
4) Cultural Appropriateness — Idiomatic usage; avoids unjustified Taglish/calques; suitable register.
5) Guideline Adherence — Follows stated domain/style rules (or general editorial norms if none provided).
6) Completeness — No omissions/additions; all content rendered faithfully.

Hard rules:
- Critical meaning error (e.g., negation flip, wrong entity) → Accuracy=0.
- Major omission/addition → Completeness=0 (and Accuracy=0 if meaning affected).
- Pervasive unjustified Taglish/calques in formal context → Fluency=0 (and possibly Cultural=0).

Scoring aggregation:
- Compute total_points = sum of the six criteria (0–6).
- Map to overall_score (integer 1–5):
  0–1 → 1 (“poor”)
  2   → 2 (“fair”)
  3–4 → 3 (“good”)
  5   → 4 (“very_good”)
  6   → 5 (“excellent”)
- Label must match overall_score exactly:
  1→"poor", 2→"fair", 3→"good", 4→"very_good", 5→"excellent".

VALIDATION CHECKS (must hold):
- total_points == accuracy+fluency+coherence+cultural_appropriateness+guideline_adherence+completeness
- overall_score and label match the mapping above.
- Use integers only (0/1 for criteria; 1–5 for overall_score). No extra keys.

OUTPUT FORMAT — return JSON ONLY (no prose/backticks). Exactly this schema:
{{"criteria": {{"accuracy": 0 or 1, "fluency": 0 or 1, "coherence": 0 or 1, "cultural_appropriateness": 0 or 1, "guideline_adherence": 0 or 1, "completeness": 0 or 1}},
  "total_points": integer 0-6,
  "overall_score": integer 1-5,
  "label": "poor"|"fair"|"good"|"very_good"|"excellent",
  "explanation": "≤120 words; brief evidence for each criterion"}}

NOW evaluate the following pair.

Source (English):
{source}

Translation (Filipino):
{translation}

Reference translation (optional; "N/A" if none was provided):
{reference}
        """
    )
    

    # Each chain renders the prompt, calls one judge, and returns its reply as plain text.
    parser = StrOutputParser()
    gemini_chain = judge_prompt | gemini_judge | parser
    chatgpt_chain = judge_prompt | chatgpt_judge | parser
    
    # Keys must match the placeholders at the end of the judge prompt.
    input_data = {
        "source": source_text,
        "translation": translated_text,
        "reference": reference_text or "N/A"
    }

    gemini_opinion = ""
    chatgpt_opinion = ""

    # Submit both judge calls before waiting on either, so they run concurrently.
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_gemini = executor.submit(gemini_chain.invoke, input_data)
        future_chatgpt = executor.submit(chatgpt_chain.invoke, input_data)
        
        try:
            print("...getting opinion from Gemini...")
            gemini_opinion = future_gemini.result()
            print("...getting opinion from GPT-4...")
            chatgpt_opinion = future_chatgpt.result()
        except Exception as e:
            # Report the failure to the agent as text instead of raising.
            return f"An error occurred while consulting models: {e}"

    return f"""Consultation results:
- Opinion from Gemini-2.5-Pro:
{gemini_opinion}

- Opinion from GPT-4:
{chatgpt_opinion}
"""

# Imported here so this cell section brings its own dependency into scope.
from langchain_core.prompts import MessagesPlaceholder

# Tools the agent is allowed to call.
tools = [opinion_pooling_tool]

# Agent prompt: a system message holding the rubric, a human message holding the
# pair to grade, and a scratchpad slot for the agent's intermediate tool traffic.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a rigorous, impartial English→Filipino translation judge with deep expertise in Filipino grammar, style, and cultural nuance. Evaluate ONLY the given source/translation using the criteria below. Favor idiomatic Filipino that preserves meaning. Penalize omissions/additions, mistranslations (polarity/negation, tense/aspect, quantities, named entities), awkward calques, unjustified Taglish, and register mismatches. Do NOT rewrite the translation—only judge it. When uncertain, choose the lower score and justify briefly with evidence. Assume formal register unless stated otherwise. If no domain/style guide is provided, use general editorial norms as the guideline.

SCORED REFERENCE EXAMPLES (for patterning; do NOT output these):

- Example A — Excellent
  Source: "The meeting was postponed because of the storm."
  Translation: "Naantala ang pagpupulong dahil sa bagyo."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 1}},
    "total_points": 6, "overall_score": 5, "label": "excellent",
    "explanation": "Idiomatic and precise; preserves cause and entities; no omissions/additions."}}

- Example B — Very good (minor style issue)
  Source: "Please submit the report by Friday."
  Translation: "Pakiusap na isumite ang ulat pagsapit ng Biyernes."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 5, "overall_score": 4, "label": "very_good",
    "explanation": "Meaning preserved; minor completeness/style nuance (tone/softener not fully mirrored)."}}

- Example C — Good (loss of specificity)
  Source: "Do not turn off the main power switch."
  Translation: "Huwag patayin ang switch."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 1, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Omits 'main power' → specificity lost (accuracy/completeness↓); grammar/flow are fine."}}

- Example D — Fair (noticeable errors, mostly understandable)
  Source: "Store the medicine in a cool, dry place."
  Translation: "Itago ang gamot sa malamig na lugar."
  Expected JSON:
  {{"criteria": {{"accuracy": 1, "fluency": 1, "coherence": 1, "cultural_appropriateness": 1, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 4, "overall_score": 3, "label": "good",
    "explanation": "Misses 'dry' and guidance nuance; otherwise natural. (If policy requires both conditions, consider Completeness=0 and Guideline=0.)"}}

- Example E — Poor (wrong meaning)
  Source: "Keep out of reach of children."
  Translation: "Maganda ang bata."
  Expected JSON:
  {{"criteria": {{"accuracy": 0, "fluency": 1, "coherence": 0, "cultural_appropriateness": 0, "guideline_adherence": 0, "completeness": 0}},
    "total_points": 1, "overall_score": 1, "label": "poor",
    "explanation": "Unrelated meaning; safety directive lost; incoherent to instruction context."}}

Additionally, you will also be given a few examples of English→Filipino pairs—each with a correct translation, a flawed translation, and a short remark explaining the flaw. Study those examples, then evaluate a new pair according to six binary criteria.

Examples (from CSV columns “source”, “correct”, “flawed”, separated by |):
	1.	The Philippines is an archipelago made up of over 7,640 islands, though only about 2,000 are inhabited. | Ang Pilipinas ay isang kapulaang binubuo ng 7,640 na isla, ngunit 2,000 lamang ang tinitirahan | Ang Pilipinas ay isang puno na binubuo ng mahigit 7,640 manok, bagaman halos 2,000 lamang ang tumira.
	2.	Philippines was also a U.S. territory from 1898 to 1946. | Ang Pilipinas ay naging isang teritoryo rin ng Estados Unidos mula 1898 hanggang 1946 | Ang Estados Unidos ay naging isang teritoryo ng Pilipinas mula 1946 hanggang 1898
	3.	The national hero of the Philippines is Dr. Jose Rizal. | Si Dr. Jose Rizal ang pambansang bayani ng Pilipinas | Ang Pilipinas ang bansang bayani ni Dr. Jose Rizal
	4.	The national animal of the Philippines is the Carabao. | Ang pambansang hayop ng Pilipinas ay ang kalabaw | Ang pambansang hayop ng Pilipinas ay ang aso
	5.	The national bird is the Philippine Eagle, one of the largest and most powerful eagles in the world. | Ang pambansang ibon ay ang Philippine Eagle, isa sa pinakamalaki at pinakamalakas na agila sa mundo | Ang karaniwang ibon na Philippine Eagle ay isang maliit na Agila
    

Scoring rubric (binary 0/1 for each):
1) Accuracy — Meaning preserved (entities, polarity, tense/aspect, quantities, conditions).
2) Fluency — Natural, grammatical Filipino (orthography, morphology, agreement).
3) Coherence — Logical flow; clear referents/connectors; consistent register.
4) Cultural Appropriateness — Idiomatic usage; avoids unjustified Taglish/calques; suitable register.
5) Guideline Adherence — Follows stated domain/style rules (or general editorial norms if none provided).
6) Completeness — No omissions/additions; all content rendered faithfully.

Hard rules:
- Critical meaning error (e.g., negation flip, wrong entity) → Accuracy=0.
- Major omission/addition → Completeness=0 (and Accuracy=0 if meaning affected).
- Pervasive unjustified Taglish/calques in formal context → Fluency=0 (and possibly Cultural=0).


VALIDATION CHECKS (must hold):
- total_points == accuracy+fluency+coherence+cultural_appropriateness+guideline_adherence+completeness
- Use integers only (0/1 for criteria; 1–5 for overall_score). No extra keys.

You can use tools like opinion_pooling_tool to clarify uncertainties about specific words or phrases, but do not use them all the time since they are expensive.

Please reason before answering like why thats your score for the criteria. After your done type your final answer by typing 'FINAL:' followed by your answer in the following JSON format schema:{{"criteria": {{"accuracy": 0 or 1, "fluency": 0 or 1, "coherence": 0 or 1, "cultural_appropriateness": 0 or 1, "guideline_adherence": 0 or 1, "completeness": 0 or 1}},
  "explanation": "≤120 words; brief evidence for each criterion"}}
""",
        ),
        (
            "human",
            """NOW Please evaluate the following translation.

**Source:**
{source_text}

**Translation (Filipino):**
{translated_text}
""",
        ),
        # Where the agent keeps its intermediate work (thoughts, tool calls).
        # The scratchpad is a list of messages, so it needs a MessagesPlaceholder;
        # an ("ai", "{agent_scratchpad}") string slot would collapse that list
        # into the text of a single AI message.
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)


# Model that drives the agent loop (low temperature).
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0.1)


# Combine the model, the tool list and the prompt into an agent.
# NOTE(review): an OpenAI-tools agent constructor is used with a Gemini chat
# model - confirm this pairing is intended.
agent = create_openai_tools_agent(llm, tools, prompt_template)


# Executor that runs the agent with the same tool list.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True, # Set to True to see the agent's thought process
)

if __name__ == "__main__":
    # Smoke test: run the agent once on a jargon-heavy sentence and print its answer.
    print("--- Example 2: A trickier translation with business jargon ---")
    source_tricky = "To maximize synergy, we must leverage our core competencies and streamline our workflow."
    translation_tricky = "Para ma-maximize ang synergy, dapat gamitin natin ang ating core competencies at i-streamline ang workflow."

    # Keys match the {source_text} / {translated_text} slots of the human message.
    response_tricky = agent_executor.invoke(
        dict(source_text=source_tricky, translated_text=translation_tricky)
    )
    print("\n--- FINAL OUTPUT ---", response_tricky["output"], sep="\n")

--- Example 2: A trickier translation with business jargon ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
I will evaluate the translation based on accuracy, fluency, coherence, cultural appropriateness, guideline adherence, and completeness.

*   **Accuracy:** The translation uses Taglish terms like "maximize," "synergy," "core competencies," and "streamline." While the meaning is generally conveyed, these are direct borrowings without Filipino equivalents, which impacts accuracy in a formal context. The core meaning of leveraging strengths and improving processes is present, but the *way* it's expressed is not fully accurate to formal Filipino.
*   **Fluency:** The sentence structure is understandable, but the heavy reliance on English terms makes it sound like a direct, unadapted translation rather than fluent Filipino. The grammar itself isn't incorrect, but the word choices are unnatural for formal Filipino.
*   **Coherence:** The sentence is coherent and the mea

In [None]:
import pandas as pd
import json
import re
import time


# Maximum number of attempts per row before the row is recorded as failed.
MAX_RETRIES = 3

# Pause, in seconds, between consecutive attempts on the same row.
RETRY_DELAY_SECONDS = 2

def extract_final_json(text: str):
    """
    Extract the JSON object the model emitted as its final answer.

    The object is first looked for after the first 'FINAL:' marker; whitespace or a
    Markdown code fence between the marker and the opening brace is tolerated.
    If there is no marker, or what follows it does not decode, the first '{'
    anywhere in the text is tried instead.

    Decoding uses a real JSON decoder from the opening brace, so nested objects
    (such as the "criteria" dict inside the answer) are read in full and any text
    after the closing brace is ignored. A regex that stops at the first '}' would
    cut a nested object short.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object (a dict).

    Raises:
        ValueError: If the text contains no '{' at all, or if no candidate
            position decodes as valid JSON.
    """
    marker = "FINAL:"
    marker_pos = text.find(marker)

    # Candidate start offsets in priority order, without duplicates.
    candidates = []
    if marker_pos != -1:
        after_marker = text.find("{", marker_pos + len(marker))
        if after_marker != -1:
            candidates.append(after_marker)
    first_brace = text.find("{")
    if first_brace != -1 and first_brace not in candidates:
        candidates.append(first_brace)

    if not candidates:
        raise ValueError("No 'FINAL:' marker or JSON object found in the input string.")

    decoder = json.JSONDecoder()
    first_failure = None
    for start in candidates:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError as e:
            # Remember the highest-priority failure for the error message.
            if first_failure is None:
                first_failure = (e, start)
            continue
        return parsed

    error, start = first_failure
    # The message includes the problematic string for easier debugging.
    raise ValueError(
        f"Invalid JSON found. Error: {error}. String was: {text[start:]}"
    ) from error

def sum_criteria_points(data: dict) -> int:
    """
    Add up the integer-valued entries of ``data["criteria"]``.

    Entries that are not integers are skipped. A missing 'criteria' field, or one
    that is not a dict, yields 0.
    """
    scores = data.get("criteria", {})
    if not isinstance(scores, dict):
        return 0

    total = 0
    for score in scores.values():
        if isinstance(score, int):
            total += score
    return total

# Path of the validation CSV. Kept in one place so the error message below
# always names the file that was actually requested.
validation_csv_path = "validation.csv"

try:
    df = pd.read_csv(validation_csv_path)
except FileNotFoundError:
    # The previous message named a different file than the one being read.
    print(f"Error: The file '{validation_csv_path}' was not found.")

    exit()




# Quick sanity check of what was loaded before the evaluation starts.
print(f"Found columns: {df.columns.tolist()}")
print(f"Processing {len(df)} rows...")

# A list to store the results for each row.
results = []

for index, row in df.iterrows():
    # Read the inputs once, outside the retry loop: a missing column is a setup
    # error that retrying cannot fix, so it is allowed to propagate.
    source_text = row['Source Text (English)']
    target_text = row['Target Text (Filipino)']

    for attempt in range(1, MAX_RETRIES + 1):
        print(f"\n--- Processing Row {index} (Attempt {attempt}/{MAX_RETRIES}) ---")
        try:
            response_tricky = agent_executor.invoke({
                "source_text": source_text,
                "translated_text": target_text,
            })
            final_json = extract_final_json(response_tricky["output"])
            total_points = sum_criteria_points(final_json)
        except Exception as e:
            # Per-row boundary: besides unparseable answers (ValueError), the
            # model/tool calls can fail with transient API errors such as an
            # HTTP 500. Catching only ValueError let one such error abort the
            # whole batch; now the row is retried and, if it keeps failing,
            # recorded as failed so the remaining rows still run.
            print(f"An error occurred on row {index}: {e}")
            if attempt < MAX_RETRIES:
                print(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                print(f"Failed to process row {index} after {MAX_RETRIES} attempts. Skipping.")
                results.append({
                    "row_index": index,
                    "source_text": source_text,
                    "target_text": target_text,
                    "final_json": None,
                    "total_points": 0,
                    "status": f"Failed after {MAX_RETRIES} attempts"
                })
        else:
            print("Successfully extracted JSON:", final_json)
            print("Total points:", total_points)

            results.append({
                "row_index": index,
                "source_text": source_text,
                "target_text": target_text,
                "final_json": final_json,
                "total_points": total_points,
                "status": "Success"
            })
            # Done with this row; no further attempts needed.
            break

results_df = pd.DataFrame(results)
print("\n--- Processing Complete ---")
print(results_df)


Found columns: ['Source Text (English)', 'Target Text (Filipino)', 'Final Score', 'Rater 1 Explanation', 'Rater 2 Explanation']
Processing 40 rows...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `opinion_pooling_tool` with `{'translated_text': 'Ang mga bata ay nagtawanan at naglaro sa ilalim ng hapon na araw.', 'source_text': 'The children laughed and played under the afternoon sun.'}`
responded: 


[0m
--- CONSULTING EXPENSIVE OPINION POOLING TOOL ---
...getting opinion from Gemini...
...getting opinion from GPT-4...
[36;1m[1;3mConsultation results:
- Opinion from Gemini-2.5-Pro:
```json
{
  "criteria": {
    "accuracy": 1,
    "fluency": 1,
    "coherence": 1,
    "cultural_appropriateness": 1,
    "guideline_adherence": 1,
    "completeness": 1
  },
  "total_points": 6,
  "overall_score": 5,
  "label": "excellent",
  "explanation": "The translation is a perfect, one-to-one rendering of the source. It accurately identifies 'Sampaguita' as the national fl

InternalServerError: 500 An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting

In [None]:
# Build the results table from the `results` list as it currently stands and
# print it (this also works if the processing cell stopped before finishing).
results_df = pd.DataFrame(results)
print("\n--- Processing Complete ---")
print(results_df)


--- Processing Complete ---
    row_index                                        source_text  \
0           0  The children laughed and played under the afte...   
1           1           She took a break to gather her thoughts.   
2           2                        It's raining cats and dogs.   
3           3                 Thank you for coming to the event.   
4           4          That designer bag costs an arm and a leg.   
5           5  Mark accidentally let the cat out of the bag a...   
6           6  This would be easier if blood came in more col...   
7           7  Stealth is not your thing. But it's adorable y...   
8           8  Which of the following tools is LEAST likely t...   
9           9             I can't stop smiling when you're there   
10         10                 I just want to be your everything.   
11         11  If I give you all my life, would you still be ...   
12         12  A stack overflow happens when there is too muc...   
13         13  The 

In [None]:
# Collects one dict per evaluated row; converted to comparison_df further down.
comparison = []
def map_to_overall_score(num):
    """
    Convert a criteria point total into an ``(overall_score, label)`` pair.

    Bands: 0-1 -> (1, "poor"), 2 -> (2, "fair"), 3-4 -> (3, "good"),
    5 -> (4, "very_good"), 6 -> (5, "excellent").

    Raises:
        ValueError: If ``num`` does not fall inside any band.
    """
    # (lowest value, highest value, result) for each band, checked in order.
    bands = (
        (0, 1, (1, "poor")),
        (2, 2, (2, "fair")),
        (3, 4, (3, "good")),
        (5, 5, (4, "very_good")),
        (6, 6, (5, "excellent")),
    )
    for low, high, verdict in bands:
        if low <= num <= high:
            return verdict
    raise ValueError("Input number is out of the valid range 0 to 6")
    
# Build one comparison record per evaluated row.
#
# Changes from the previous version:
#   * The error is computed from the mapped overall score, the same value that is
#     stored as 'model_score', instead of the raw 0-6 criteria total.
#     NOTE(review): assumes 'Final Score' uses that same 1-5 scale - confirm.
#   * The per-row value is the plain squared error (no division by 2), so the
#     mean of the 'MSE' column really is the mean squared error.
#   * Rows with no human score, or whose evaluation failed, get MSE=None and are
#     dropped by the later `.dropna()`, instead of raising TypeError or being
#     scored as if the model had answered 0 points.
#   * 'model_score' holds the numeric score; the label goes to 'model_label'.
for _, model_row in results_df.iterrows():
    idx = model_row['row_index']

    # Failed rows carry a placeholder total of 0 that is not a real verdict.
    evaluated = model_row.get('status', 'Success') == 'Success'
    if evaluated:
        model_score, model_label = map_to_overall_score(model_row['total_points'])
    else:
        model_score, model_label = None, None

    # Human score for the same row of the validation set, if there is one.
    human_score = df.loc[idx, 'Final Score'] if idx in df.index else None

    if model_score is None or human_score is None or pd.isna(human_score):
        MSE = None
    else:
        MSE = (model_score - human_score) ** 2

    comparison.append({
        'row_index': idx,
        'source_text': model_row['source_text'],
        'target_text': model_row['target_text'],
        'model_score': model_score,
        'model_label': model_label,
        'human_score': human_score,
        'MSE': MSE,
    })

comparison_df = pd.DataFrame(comparison)
print(comparison_df.columns)

Index(['row_index', 'source_text', 'target_text', 'model_score', 'human_score',
       'MSE'],
      dtype='object')


In [None]:
# Write the per-row comparison table for this run to CSV; index=False leaves the DataFrame index out of the file.
comparison_df.to_csv("1Btoolresults.csv", index=False)

In [None]:
# Average the per-row 'MSE' values, ignoring rows where the value is missing.
mse_values = comparison_df['MSE'].dropna()
if mse_values.empty:
    # Nothing left to average, so report None instead of NaN.
    average_mse = None
else:
    average_mse = mse_values.mean()
print(f"\nAverage MSE of results: {average_mse}")


Average MSE of results: 3.5789473684210527
