In [None]:
# === MODEL AND PROMPT CONFIGURATION ===
# Configuration constants for the LLM judge: the Ollama model tag and the
# prompt template used to score one candidate translation.
# Input: none (constants only)
# Output: MODEL_NAME and LLM_PROMPT_TEMPLATE, used by the judge call later on

# MODEL_NAME: Ollama model tag used as the judge.
#   "qwen3:4b" = Qwen3 with 4 billion parameters (smaller, faster) - active
#   "qwen3:8b" = Qwen3 with 8 billion parameters (larger, potentially better
#                quality) - alternative, swap the tag below to use it
# The tag must match a model that is available in the local Ollama instance.
MODEL_NAME = "qwen3:4b"

# LLM_PROMPT_TEMPLATE: instructions given to the judge model.
# The four {placeholders} ({source_language}, {target_language},
# {source_text}, {target_text}) are filled in with str.format() per
# translation pair. No other curly braces may appear in the template,
# otherwise str.format() would treat them as placeholders too.
# NOTE: the template contains typographic characters (’ and –); keep this
# file saved and read as UTF-8 so they reach the model intact.
LLM_PROMPT_TEMPLATE = """
You are an expert multilingual linguist evaluating the QUALITY of a candidate translation.

Given:

Source language: {source_language}
Target language: {target_language}

Source text: {source_text}
Target text: {target_text}

Evaluate how well the TARGET TEXT translates the SOURCE TEXT.

Evaluation Criteria

Adequacy (Meaning Preservation)
Is the full meaning accurately conveyed?
Are details, nuances, logic preserved?
Any mistranslations or factual distortions?

Completeness
Any omissions or unjustified additions?

Fluency (Target Language Quality)
Is it grammatical, natural, and idiomatic?

Terminology & Entities
Are domain terms correct and consistent?

Style & Tone
Does it match the source’s register and intent?

Structure & Formatting
Is structure coherent?

Faithfulness
No invented facts or reinterpretations.

Scoring (1–10)

1–2 (Unusable)
3–4 (Poor)
5–6 (Fair)
7–8 (Good)
9–10 (Excellent)

Choose ONE overall score (1–10).
Do NOT reward fluency if meaning is wrong.

Output Format (Strict)

Total rating: <single integer from 1 to 10>

Rules:
Integer only.
No extra fields.
Be concise but concrete.
"""

In [None]:
#!/usr/bin/env python
# coding: utf-8

# ==========================================================
# LLM JUDGE CONFIG
# ==========================================================

#

# === IMPORT STATEMENTS ===
# What this does: Loads all necessary libraries for LLM-based evaluation
# Why we need it: Each library provides specific functionality for evaluating translations

# pandas: Data manipulation library - works with tables/dataframes
# Why: We need to load CSV files with translations and save evaluation results
import pandas as pd

# pathlib.Path: Modern way to handle file/folder paths (works on Windows/Mac/Linux)
# Why: We need to read input files and write output files safely
from pathlib import Path

# ollama: Library to interact with Ollama (local LLM server)
# Why: We use Ollama to run the Qwen model locally for translation evaluation
import ollama

# re: Regular expressions library (pattern matching in text)
# Why: We need to extract numeric scores from LLM text responses
import re

# tqdm: Progress bar library - shows visual progress during long operations
# Why: Evaluation takes time, so we show progress bars so users know it's working
from tqdm.auto import tqdm

# === CONFIGURATION ===
# Filesystem locations used by this script.
# Input: none (constants only)
# Output: two Path objects - the folder holding the translation CSVs and the
#         CSV file that receives the judge scores

# input_dir: folder that holds the translation CSV files.
# It is created when absent; an already existing folder is left untouched.
input_dir = Path("results")
input_dir.mkdir(parents=False, exist_ok=True)

# output_path: CSV file, inside input_dir, that collects the evaluation scores.
# NOTE: the file name says "8b" although the active MODEL_NAME is the 4b
# model (possibly a copy-paste leftover); the name is kept unchanged here.
output_path = input_dir.joinpath("qwen3_8b_llm_judge_results.csv")

# === SCORE EXTRACTION FUNCTION ===
# What this does: Extracts numeric score (1-10) from LLM text response
# Why: LLMs return text, but we need a number to calculate statistics
# Input: Text string (LLM response)
# Output: Integer score (1-10) or None if no score found

# Closed <think>...</think> sections; numbers inside them are not the verdict.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.IGNORECASE | re.DOTALL)

# "Total rating: 8" as requested by the prompt's output format; tolerates
# markdown emphasis and flexible spacing/colon between label and number.
_LABELED_SCORE_RE = re.compile(r"total\s+rating[\s:*_]*(10|[1-9])\b", re.IGNORECASE)

# Any standalone integer from 1 to 10 (word boundaries reject e.g. "85", "100").
_BARE_SCORE_RE = re.compile(r"\b(10|[1-9])\b")


def extract_score(text):
    """Extract the 1-10 rating from an LLM reply.

    Lookup order:
      1. Remove any closed ``<think>...</think>`` sections so numbers that
         appear in intermediate reasoning are ignored.
      2. Prefer the value that follows a "Total rating" label (the format the
         prompt asks for). If the label occurs more than once, the last
         occurrence wins.
      3. Otherwise fall back to the first standalone integer 1-10 in the
         remaining text (e.g. a bare "8" or "Score: 9").

    Args:
        text: Reply text produced by the model.

    Returns:
        The rating as an ``int`` in the range 1-10, or ``None`` when no
        rating can be found or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None

    visible = _THINK_BLOCK_RE.sub(" ", text)

    # Without this step "2 errors found. Total rating: 7" would yield 2.
    labeled = _LABELED_SCORE_RE.findall(visible)
    if labeled:
        return int(labeled[-1])

    match = _BARE_SCORE_RE.search(visible)
    if match:
        return int(match.group(1))

    # The reply did not follow the requested format.
    return None


# === LLM JUDGE FUNCTION ===
# What this does: Calls the LLM to evaluate a single translation pair
# Why: This is the core evaluation logic - asks AI to score translation quality
# Business reason: Automated evaluation scales better than human evaluation
# Input:
#   - source_lang: Source language code (e.g., "eng")
#   - target_lang: Target language code (e.g., "jpn")
#   - source_text: Original text in source language
#   - target_text: Translated text in target language
# Output: Integer score (1-10) or None if extraction failed

def call_llm_judge(source_lang, target_lang, source_text, target_text):
    """Ask the configured Ollama model to rate one translation pair.

    The four arguments are substituted into LLM_PROMPT_TEMPLATE, the filled
    prompt is sent as a single user message to MODEL_NAME, and the stripped
    reply text is handed to extract_score.

    Args:
        source_lang: Source language code (e.g. "eng").
        target_lang: Target language code (e.g. "jpn").
        source_text: Original text in the source language.
        target_text: Candidate translation in the target language.

    Returns:
        Whatever extract_score yields for the reply: an integer score, or
        None when no score could be extracted.
    """
    # Template with placeholders -> concrete prompt for this pair.
    filled_prompt = LLM_PROMPT_TEMPLATE.format(
        source_language=source_lang,
        target_language=target_lang,
        source_text=source_text,
        target_text=target_text,
    )

    # Single-turn conversation: one user message carrying the whole prompt.
    user_message = {"role": "user", "content": filled_prompt}

    # temperature 0 is requested so repeated runs score as consistently as
    # the model allows.
    sampling_options = {"temperature": 0}

    reply = ollama.chat(
        model=MODEL_NAME,
        messages=[user_message],
        options=sampling_options,
    )

    # Trim surrounding whitespace before parsing the score out of the text.
    reply_text = reply["message"]["content"].strip()
    return extract_score(reply_text)


# === LOAD CSV FILES ===
# Loads the translation CSVs from input_dir and stacks them into one table.
# Input: english_translations.csv and french_translations.csv in input_dir
# Output: eng_df, fra_df and the combined full_df

print("📂 Loading translation files...")

# Translations whose source texts are English.
eng_df = pd.read_csv(input_dir / "english_translations.csv")

# Translations whose source texts are French (same column layout expected).
fra_df = pd.read_csv(input_dir / "french_translations.csv")

# Fail fast, before any model call is made, when a file lacks a column that
# the evaluation step reads (grouping keys and the two text columns).
required_columns = (
    "source_language",
    "target_language",
    "original_text",
    "translated_text",
)
for csv_name, frame in (
    ("english_translations.csv", eng_df),
    ("french_translations.csv", fra_df),
):
    missing_columns = [col for col in required_columns if col not in frame.columns]
    if missing_columns:
        raise ValueError(
            f"{csv_name} is missing required column(s): {missing_columns}"
        )

# Stack both tables vertically; ignore_index=True renumbers rows 0..n-1 so
# the combined frame has no duplicate index labels.
full_df = pd.concat([eng_df, fra_df], ignore_index=True)

print(f"✅ Loaded {len(full_df)} total rows")

  from .autonotebook import tqdm as notebook_tqdm


📂 Loading translation files...
✅ Loaded 27916 total rows


In [None]:


# === EVALUATION PHASE ===
# Scores a sample of translations for every (source, target) language pair
# with the LLM judge and writes the per-pair mean scores to output_path.
# Input: full_df (all translations)
# Output: CSV at output_path with one row per language pair that produced
#         at least one valid score

# One dict per language pair: languages, number of scored samples, mean score.
evaluation_results = []

# Iterating the grouped object yields ((src_lang, tgt_lang), sub-DataFrame).
grouped = full_df.groupby(["source_language", "target_language"])

print("\n" + "=" * 60)
print("STARTING LLM-AS-JUDGE EVALUATION")
print("=" * 60)

# Outer progress bar: one step per language pair (e.g. eng→jpn, fra→jpn).
for (src_lang, tgt_lang), group in tqdm(grouped, desc="Language Pairs"):

    print(f"\n📊 Evaluating {src_lang} → {tgt_lang}")

    # Valid scores collected for this language pair.
    scores = []

    # Judge at most 20 translations per pair to bound the run time; smaller
    # groups are evaluated in full.
    sample_size = min(20, len(group))

    # Fixed seed so the same rows are drawn on every run.
    sampled_rows = group.sample(sample_size, random_state=42)

    # Inner progress bar: one step per sampled translation; leave=False
    # removes the bar once the pair is finished.
    for _, row in tqdm(
        sampled_rows.iterrows(),
        total=sample_size,
        desc=f"{src_lang}->{tgt_lang}",
        leave=False
    ):

        score = call_llm_judge(
            row["source_language"],
            row["target_language"],
            row["original_text"],
            row["translated_text"]
        )

        # None means no score could be extracted from the reply; skip it so
        # it does not distort the mean.
        if score is not None:
            scores.append(score)

    # A mean cannot be computed without at least one valid score.
    if not scores:
        print("⚠ No valid scores for this pair")
        continue

    # Mean of the valid scores, rounded to 3 decimals (e.g. 8.333).
    avg_score = round(sum(scores) / len(scores), 3)

    result_row = {
        "source_language": src_lang,        # Which source language
        "target_language": tgt_lang,        # Which target language
        "num_samples": len(scores),         # How many translations were scored
        "mean_llm_judge_score": avg_score   # Average quality score
    }

    evaluation_results.append(result_row)

    # Report the pair immediately so progress is visible during long runs.
    print("\n✅ RESULT")
    print(f"Language Pair : {src_lang} → {tgt_lang}")
    print(f"Samples Used  : {len(scores)}")
    print(f"Mean Score    : {avg_score}")
    print("-" * 50)

    # Save after each language pair: the file is rewritten with everything
    # collected so far, so an interrupted run keeps the finished pairs.
    results_df = pd.DataFrame(evaluation_results)
    results_df.to_csv(output_path, index=False)

# === FINAL SUMMARY ===
# Prints the completion banner and where the results are. The "saved"
# message is only shown when this run actually wrote the file.

print("\n" + "=" * 60)
print("🎉 EVALUATION COMPLETE")
if evaluation_results:
    print(f"📄 Results saved to: {output_path}")
else:
    print("⚠ No language pair produced a valid score; no results file was written during this run")
print("=" * 60)


STARTING LLM-AS-JUDGE EVALUATION


Language Pairs:   0%|                                                                           | 0/28 [00:00<?, ?it/s]


📊 Evaluating eng → dan



eng->dan:   0%|                                                                                 | 0/20 [00:00<?, ?it/s][A

In [None]:
# Bind the name `a` to the integer 5.
a = 5

In [None]:
# NOTE(review): bare expression that only evaluates `a`; looks like a leftover
# scratch cell unrelated to the evaluation - confirm whether it can be removed.
a