In [None]:
%pip install pymupdf together openai

In [None]:
from together import Together
from openai import OpenAI
import fitz
import json
import re
import os
import base64
import io
from PIL import Image
from dotenv import load_dotenv
load_dotenv()


# API clients. os.environ.get returns None when a variable is unset, so a missing
# key is not detected on these lines.
# `client` (Together) serves the miner call and every council juror call.
client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
# `openai_client` is used only for the Meta-Jury synthesis call in run_council.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Model that receives the batched page images in run_miner_batch.
MINER_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"  

# Juror models queried one after another by run_council (through analyze_text).
COUNCIL_MODELS = [
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "openai/gpt-oss-120b",
    "deepseek-ai/DeepSeek-V3.1",
    "deepcogito/cogito-v2-1-671b",
    "moonshotai/Kimi-K2-Thinking",

]

# Model that merges the juror verdicts into one final verdict (called via openai_client).
META_JURY_MODEL = "gpt-5.2"  

# JSON Lines file that run_pipeline appends one record to per evaluated issue.
OUTPUT_FILE = "results_corvinul.jsonl"


In [32]:
def get_total_pages(pdf_path: str) -> int:
    """Return the number of pages in the PDF at ``pdf_path``.

    The document is opened as a context manager so it is closed even when
    reading the page count raises.
    """
    with fitz.open(pdf_path) as doc:
        return len(doc)

In [None]:
def pdf_page_to_base64(pdf_path: str, page_num: int, dpi: int = 200, max_size: int = 1280) -> str:
    """
    Render one PDF page to a PNG and return it as a Base64 string.

    Args:
        pdf_path: Path of the PDF file.
        page_num: Zero-based index of the page to render.
        dpi: Resolution passed to the page renderer.
        max_size: Upper bound, in pixels, for the longer image side. Larger
            renders are scaled down proportionally (default 1280, chosen for
            text readability).

    Returns:
        The PNG bytes encoded as a UTF-8 Base64 string.
    """
    # Open the document in a context manager: this function is called once per
    # page, so leaving the handle open would leak one document per call.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_num].get_pixmap(dpi=dpi)
        mode = "RGBA" if pix.alpha else "RGB"
        # frombytes copies the sample buffer, so the image stays valid after
        # the document is closed.
        img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)

    # The image can only be RGB or RGBA here; flatten transparency onto white.
    if img.mode == "RGBA":
        background = Image.new("RGB", img.size, (255, 255, 255))
        background.paste(img, mask=img.split()[-1])
        img = background

    longest_side = max(img.size)
    if longest_side > max_size:
        ratio = max_size / longest_side
        # Guard against a zero-length side on extreme aspect ratios.
        new_size = (max(1, int(img.width * ratio)), max(1, int(img.height * ratio)))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    buffered = io.BytesIO()
    img.save(buffered, format="PNG", optimize=True)

    return base64.b64encode(buffered.getvalue()).decode('utf-8')

In [None]:
def extract_clean_json(response_text: str) -> dict:
    """
    Extracts the first valid JSON object from a string, ignoring
    <think> blocks, markdown wrappers, and conversational text.

    Args:
        response_text: Raw model output.

    Returns:
        The decoded JSON object, or an error dict of the form
        ``{"error": "No JSON found" | "JSON Decode Error", "raw_content": ...}``.
    """
    if not isinstance(response_text, str):
        return {"error": "No JSON found", "raw_content": response_text}

    clean_text = re.sub(r"```json\s*", "", response_text, flags=re.IGNORECASE)
    clean_text = re.sub(r"```", "", clean_text)
    clean_text = re.sub(r"<think>.*?</think>", "", clean_text, flags=re.DOTALL)

    # Same "is there anything brace-delimited at all" check as before, so the
    # two error labels keep their meaning.
    if not re.search(r'\{.*\}', clean_text, re.DOTALL):
        return {"error": "No JSON found", "raw_content": response_text}

    # A greedy first-"{"-to-last-"}" slice fails as soon as any brace appears in
    # the surrounding chatter. Instead, decode an object starting at each
    # candidate "{" and let the decoder find where that object ends.
    decoder = json.JSONDecoder(strict=False)
    start = clean_text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(clean_text, start)
            return parsed
        except json.JSONDecodeError as exc:
            # Resume after the point where decoding broke, so a fragment nested
            # inside a malformed outer object is never returned as the answer.
            start = clean_text.find("{", max(exc.pos, start + 1))

    return {"error": "JSON Decode Error", "raw_content": response_text}

In [35]:
def analyze_image(prompt: str, image_b64: str, model: str) -> dict:
    """
    Calls a vision model with one page image and returns its JSON response.

    Args:
        prompt: System prompt.
        image_b64: Base64-encoded PNG, embedded as a data URL.
        model: Model identifier passed to the Together client.

    Returns:
        The decoded JSON object. When the model returns no content or no
        decodable JSON, an ``{"error": ...}`` dict is returned instead, matching
        analyze_text. API errors still propagate to the caller.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": [
                {"type": "text", "text": "Analyze this textbook page:"},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
            ]}
        ],
        response_format={"type": "json_object"},
    )

    # message.content can be None; guard before calling .strip().
    result_text = (response.choices[0].message.content or "").strip()
    if not result_text:
        return {"error": "Empty response from model", "raw_response": ""}

    # Reuse the shared extractor so fenced or chatty output is handled the same
    # way as in analyze_text.
    return extract_clean_json(result_text)

In [None]:
def analyze_text(prompt: str, content: str, model: str, **kwargs) -> dict:
    """
    Calls text model and extracts JSON, handling thinking models.

    The request is first sent in JSON mode. If that call raises or yields no
    text, it is retried once without ``response_format``.

    Args:
        prompt: System prompt.
        content: User message content.
        model: Model identifier passed to the Together client.
        **kwargs: Extra arguments forwarded to ``chat.completions.create``.
            ``max_tokens`` defaults to 16000 for model names containing
            "thinking" or "r1", and to 4096 otherwise.

    Returns:
        The decoded JSON object with the raw model text added under
        ``"_raw_response"``, or an ``{"error": ...}`` dict when the API call
        fails, the response is empty, or no JSON can be extracted.
    """
    if "max_tokens" not in kwargs:
        is_reasoning_model = "thinking" in model.lower() or "r1" in model.lower()
        kwargs["max_tokens"] = 16000 if is_reasoning_model else 4096

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": content},
    ]

    result_text = ""
    last_error = None

    for json_mode in (True, False):
        params = dict(kwargs)
        if json_mode:
            # setdefault keeps a caller-supplied response_format intact.
            params.setdefault("response_format", {"type": "json_object"})
        try:
            response = client.chat.completions.create(model=model, messages=messages, **params)
            # message.content can be None; treat that as empty text rather than
            # letting .strip() raise and be misreported as an API error.
            result_text = (response.choices[0].message.content or "").strip()
            last_error = None
        except Exception as e:
            last_error = e
            result_text = ""
            continue
        if result_text:
            break

    if not result_text:
        if last_error is not None:
            return {"error": f"API Error: {str(last_error)}"}
        return {"error": "Empty response from model", "raw_response": ""}

    data = extract_clean_json(result_text)
    if isinstance(data, dict):
        data["_raw_response"] = result_text
    return data

In [37]:
# System prompt for the miner stage (run_miner_batch). It is sent as-is, without
# str.format, so the braces in the JSON example are written singly.
MINER_PROMPT = """You are an expert Educational Auditor performing a preliminary bias scan of a Romanian history textbook for controversial content.

You will receive a BATCH of pages. Analyze them TOGETHER.

### YOUR CORE MISSION
Identify **ANY passage that could influence a student's historical interpretation** in a non-neutral way.

This includes:
- Subtle framing choices
- Value-laden adjectives
- Selective emphasis or silence
- Normalization of one perspective
- Assumptions presented as facts
- National, ethnic, political, or moral alignment cues

You are NOT deciding whether the content is definitively biased or wrong.
You are identifying **POTENTIAL POINTS OF INTERPRETIVE INFLUENCE** that merit further review.

### CRITICAL RULE: VERBATIM QUOTE EXTRACTION
When you fill the "quotes" field in the JSON, you must act as a strict OCR engine:
1. **EXACT MATCH:** Copy the text **exactly** as it appears in the image. Do not paraphrase, summarize, or fix typos.
2. **NO TRANSLATION:** If the text is in Romanian, quote it in Romanian.
3. **PRESERVE STYLE:** Keep the original punctuation and diacritics if visible.
4. **CONTEXT:** If the biased text is part of a longer sentence, include enough context so the meaning is clear.

### EVIDENTIARY COMPLETENESS REQUIREMENT

All interpretive concerns must be supported by direct quotations.

In the "quotes" field:
- Include EVERY passage that materially contributed to identifying the concern.
- If the issue depends on multiple sentences, sections, or contrasting passages, include each relevant excerpt as a separate quote entry.
- Do NOT rely on unquoted text to justify your explanation.
- Do NOT quote only a fragment if the broader sentence or paragraph materially affects meaning.
- The explanation must be fully supported by the quoted text.
- Copy the text **exactly** as it appears in the image. Do not paraphrase, summarize, or fix typos.
- If the text is in Romanian, quote it in Romanian.
- Keep the original punctuation and diacritics if visible.

If an idea influenced your concern but is not directly quoted, you must add the relevant text to the quotes field.


### AUTHOR VS. SOURCE AWARENESS

History textbooks often include quotations from earlier periods that reflect the language, values, or prejudices of their time.

IMPORTANT:
- Offensive or exclusionary language appearing in a **Primary Source** does NOT automatically constitute textbook bias.
- Do NOT treat historical quotations as problematic merely for reflecting past attitudes.

Your task is to:
- Correctly identify whether the language comes from the **Textbook Author** or from a **Historical Source**
- Flag the passage ONLY if the way the quote is used (selection, emphasis, framing, or lack of context) could influence interpretation

### OUTPUT FORMAT (JSON ONLY)
You may use multi-step reasoning internally.
Return a JSON object with a list of issues.
{
  "issues": [
    {
      "controversy": "Short title",
      "source_type": "<TEXTBOOK_NARRATIVE|PRIMARY_SOURCE_USAGE>",
      "quotes": [
        {"text": "PASTE EXACT TEXT FROM IMAGE HERE", "page_offset": 0}
      ],
      "explanation": "Explain why this passage could influence interpretation or raise a potential concern. Avoid definitive judgments."
    }
  ]
}

If no issues found, return:
{"issues": []}
"""

# Prompt template for each council juror. run_council fills it with str.format,
# so the literal braces of the JSON example are doubled ({{ }}). The mis-encoded
# em dash in Step 2 has been repaired so the models receive clean text.
COUNCIL_PROMPT = """You are an expert Educational Historian auditing a Romanian high school history textbook.

### INPUT DATA
- **CLAIM:** {controversy}
- **CONTEXT TYPE:** {source_type} (Is this the Author's Narrative or a Primary Source?)
- **EVIDENCE:** {quotes}
- **MINER'S NOTE:** {explanation}

You are encouraged to assign low severity or dismiss concerns when appropriate.

### STEP 1: Attribution Assessment
Determine whether the interpretive concern arises from:
- The **Textbook Author's Narrative**, or
- The **Use of a Primary Source**

If the passage is a Primary Source:
- Assess whether it is contextualized or neutrally presented
- Neutral presentation generally corresponds to low severity

### STEP 2: Category Classification
Select ONE category from the list below that best describes the primary interpretive concern.
You MUST use one of these exact strings — do not invent new category names.

**Language & Framing**
- Linguistic Tone
- Narrative Framing
- Moral Loading

**Perspective & Representation**
- Perspective Limitation
- Group Representation
- Agency Attribution

**Structure & Emphasis**
- Selection Bias
- Omission / Underdevelopment
- Asymmetrical Detail

**Source Handling**
- Primary Source Framing
- Attribution Ambiguity
- Source Selection Bias

**Interpretation & Pedagogy**
- Teleological Narrative
- National or Cultural Centering
- Presentism

### STEP 3: Severity Assessment (7-Point Scale)
- **1: Neutral / Pedagogically Sound**
- **2: Negligible Framing**
- **3: Minor Imbalance**
- **4: Moderate Bias**
- **5: Significant Distortion**
- **6: Severe Misrepresentation**
- **7: Educational Harm / Propaganda**

### STEP 4: Confidence
Rate your confidence from 0.0 (uncertain) to 1.0 (certain).

### OUTPUT FORMAT (JSON ONLY)
{{
  "attribution": "<TEXTBOOK_NARRATIVE|PRIMARY_SOURCE_USAGE>",
  "category": "<one exact category string from the Step 2 list>",
  "severity": <integer 1-7>,
  "confidence": <number 0.0-1.0>,
  "reasoning": "Concise, outcome-focused justification explaining the assessment."
}}

"""

# Prompt template for the Meta-Jury synthesis call. run_council fills it with
# str.format, so the literal braces of the JSON example are doubled ({{ }}).
META_JURY_PROMPT = """
You are the Chief Juror of an Educational Audit Council.

Your role is to synthesize multiple expert evaluations into a single authoritative final verdict.
You must exercise independent judgment: weigh the juror inputs, identify consensus or disagreement, and produce your own reasoned conclusion.

### YOUR GOALS
1. Identify consensus or meaningful disagreement across jurors
2. Weight evaluations by juror confidence
3. Produce a final severity score and system-level confidence
4. Flag cases requiring human review when appropriate

### INPUT DATA
**Original Controversy:** {controversy}  
**Source Type:** {source_type}  
**Evidence Quotes:** {quotes}  
**Miner's Note:** {explanation}  
**Individual Juror Evaluations:** {juror_evaluations}

### ALLOWED TAXONOMY
The final_category MUST be one of the following exact strings:
[Linguistic Tone, Narrative Framing, Moral Loading, Perspective Limitation, Group Representation, Agency Attribution, Selection Bias, Omission / Underdevelopment, Asymmetrical Detail, Primary Source Framing, Attribution Ambiguity, Source Selection Bias, Teleological Narrative, National or Cultural Centering, Presentism, INVALID_INPUT]

### DECISION LOGIC
- **High-Confidence Consensus:** If jurors converge on a similar severity and category with confidence > 0.7, adopt their consensus.

- **Resolving Disagreements:** If jurors disagree, do NOT just calculate a blind average. Give more weight to high-confidence jurors. Read their reasoning and select the category and severity that is best supported by the evidence. 

- **Human Review Flag:** Flag for human review when high-confidence jurors differ by more than 1.5 severity points

### OUTPUT FORMAT (JSON ONLY)
{{
  "final_attribution": "<TEXTBOOK_NARRATIVE|PRIMARY_SOURCE_USAGE>",
  "final_category": "<one exact category string from the ALLOWED TAXONOMY>",
  "final_severity": <integer 1-7>,
  "system_confidence": <number 0.0-1.0>,
  "flag_for_human_review": <true|false>,
  "synthesis_summary": "Concise summary of consensus or disagreement and the rationale for the final verdict.",
  "variance_note": "If flagged for review, briefly describe the nature of the disagreement. If not flagged, set to empty string."
}}
"""


In [None]:
def run_miner_batch(pdf_path: str, start_page: int, num_pages: int = 10, debug: bool = False) -> list:
    """
    Analyzes multiple PDF pages together for controversial content.

    Args:
        pdf_path: Path of the PDF file.
        start_page: Zero-based index of the first page in the batch.
        num_pages: Number of consecutive pages to send.
        debug: When True, print progress details.

    Returns:
        A list of issue dicts. Each issue has a ``source_type`` (defaulting to
        "TEXTBOOK_NARRATIVE"), and each dict-shaped quote gets a 1-based ``page``
        number derived from its ``page_offset``. An empty list is returned for
        an empty batch or when the model output cannot be decoded. API errors
        still propagate to the caller.
    """
    if num_pages <= 0:
        # Nothing to render; also avoids dividing by zero in the debug summary.
        return []

    if debug:
        print(f"  [miner] pages {start_page+1}–{start_page+num_pages} | model: {MINER_MODEL.split('/')[-1]}")

    message_content = [{"type": "text", "text": "Analyze these textbook pages together as a batch:"}]
    total_size = 0

    for offset in range(num_pages):
        image_b64 = pdf_page_to_base64(pdf_path, start_page + offset)
        total_size += len(image_b64)
        message_content.append(
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}}
        )

    if debug:
        avg_size = total_size // num_pages
        print(f"  [miner] {num_pages} pages encoded | avg size: {avg_size:,} chars")

    response = client.chat.completions.create(
        model=MINER_MODEL,
        messages=[
            {"role": "system", "content": MINER_PROMPT},
            {"role": "user", "content": message_content},
        ],
        response_format={"type": "json_object"},
    )

    # message.content can be None; guard before calling .strip().
    result_text = (response.choices[0].message.content or "").strip()

    try:
        result = json.loads(result_text, strict=False)
    except json.JSONDecodeError:
        # Fall back to the tolerant extractor for fenced or chatty output. If
        # that fails too, report it and skip the batch instead of aborting.
        result = extract_clean_json(result_text)
        if isinstance(result, dict) and "error" in result and result.get("raw_content") == result_text:
            print(f"  [miner] unusable model response ({result['error']}); no issues recorded for this batch")
            return []

    if isinstance(result, dict) and "issues" in result:
        candidates = result["issues"]
    elif isinstance(result, list):
        candidates = result
    else:
        candidates = [result]

    if candidates is None:
        candidates = []
    elif not isinstance(candidates, list):
        candidates = [candidates]

    issues = []
    for item in candidates:
        if not isinstance(item, dict):
            # Downstream code reads issues with .get(), so drop anything else.
            continue
        item.setdefault("source_type", "TEXTBOOK_NARRATIVE")

        quotes = item.get("quotes")
        if isinstance(quotes, list):
            for quote in quotes:
                if not isinstance(quote, dict):
                    continue
                try:
                    page_offset = int(quote.get("page_offset", 0))
                except (TypeError, ValueError):
                    page_offset = 0
                # Keep the derived page inside the batch that was actually sent.
                page_offset = max(0, min(num_pages - 1, page_offset))
                quote["page"] = start_page + page_offset + 1
        issues.append(item)

    if debug:
        print(f"  [miner]: {len(issues)} issue(s) found")

    return issues


In [None]:
def _coerce_score(value, lower: float, upper: float):
    """Return ``value`` as a float when it is numeric and within [lower, upper], else None."""
    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        return None
    return number if lower <= number <= upper else None


def run_council(controversy: str, quotes: list, explanation: str, source_type: str = "Unknown", debug: bool = False) -> dict:
    """
    Runs council of models sequentially, then Meta-Jury synthesis using OpenAI.

    Each model in COUNCIL_MODELS gets up to three attempts to return a verdict
    with a severity in [1, 7], a confidence in [0, 1], a category, and a
    non-empty reasoning string. Jurors that never pass are dropped.

    Args:
        controversy: Short title of the issue.
        quotes: Evidence quotes (dicts with a "text" key, or plain values).
        explanation: The miner's note on the issue.
        source_type: Attribution label supplied by the miner.
        debug: When True, print per-juror progress.

    Returns:
        A dict with ``final_severity``, ``system_confidence``,
        ``final_category``, ``final_attribution``, ``flag_for_human_review``,
        ``synthesis_reasoning``, ``variance_analysis`` and ``individual_jurors``.
        If every juror is dropped, a flagged fallback verdict is returned. If
        the Meta-Jury call fails, the verdict fields are None and it is flagged.
    """
    council_results = []

    formatted_quotes = []
    for q in quotes or []:
        text = q.get('text', '') if isinstance(q, dict) else str(q)
        formatted_quotes.append(f"- {text}")
    quotes_block = "\n".join(formatted_quotes)

    prompt = COUNCIL_PROMPT.format(
        controversy=controversy,
        source_type=source_type,
        quotes=quotes_block,
        explanation=explanation
    )

    MAX_RETRIES = 3

    for model in COUNCIL_MODELS:
        model_name = model.split('/')[-1]
        if debug:
            print(f"  [council] querying {model_name}...")

        verdict = None

        for attempt in range(MAX_RETRIES):
            result = analyze_text(prompt, "", model)
            if not isinstance(result, dict):
                result = {}

            # Validate and convert in one step so the stored value is exactly
            # the one that passed validation (".5" stays 0.5, not 5 clamped to 1).
            severity = _coerce_score(result.get("severity"), 1, 7)
            confidence = _coerce_score(result.get("confidence"), 0, 1)
            has_category = "category" in result
            reasoning = result.get("reasoning")
            # A non-string reasoning is a validation failure, not a crash.
            has_reasoning = isinstance(reasoning, str) and bool(reasoning.strip())

            if severity is not None and confidence is not None and has_category and has_reasoning:
                verdict = {
                    "model": model_name,
                    "attribution": result.get("attribution", "UNKNOWN"),
                    "category": result.get("category", "Unknown"),
                    "severity": severity,
                    "confidence": confidence,
                    "reasoning": reasoning
                }
                if debug and attempt > 0:
                    print(f"  [council] {model_name} passed on attempt {attempt + 1}")
                break

            if attempt < MAX_RETRIES - 1:
                if debug:
                    print(f"  [council] {model_name} retry {attempt + 1}/{MAX_RETRIES} "
                          f"(severity={severity is not None}, confidence={confidence is not None}, "
                          f"category={has_category}, reasoning={has_reasoning})")
            else:
                print(f"  {model_name} dropped (failed validation after {MAX_RETRIES} attempts)")

        if verdict is None:
            continue

        if debug:
            print(f"  [council]: {model_name}: {verdict['severity']}/7, "
                  f"conf={verdict['confidence']:.2f}, cat={verdict['category']}")

        council_results.append(verdict)

    if not council_results:
        print(f"  All jurors dropped: returning fallback result")
        return {
            "final_severity": 1,
            "system_confidence": 0.0,
            "final_category": "Unknown",
            "final_attribution": "UNKNOWN",
            "flag_for_human_review": True,
            "synthesis_reasoning": "All jurors failed validation. Manual review required.",
            "variance_analysis": "No valid juror responses.",
            "individual_jurors": []
        }

    juror_evaluations = []
    for idx, jr in enumerate(council_results, 1):
        juror_evaluations.append(
            f"**Juror {idx} ({jr['model']}):**\n"
            f"- Category: {jr['category']}\n"
            f"- Severity: {jr['severity']}/7\n"
            f"- Confidence: {jr['confidence']:.2f}\n"
            f"- Attribution: {jr['attribution']}\n"
            f"- Reasoning: {jr['reasoning']}\n"
        )

    meta_prompt = META_JURY_PROMPT.format(
        controversy=controversy,
        source_type=source_type,
        quotes=quotes_block,
        explanation=explanation,
        juror_evaluations="\n".join(juror_evaluations)
    )

    if debug:
        print(f"  [meta-jury] synthesizing {len(council_results)} juror(s) via {META_JURY_MODEL}...")

    try:
        meta_response = openai_client.chat.completions.create(
            model=META_JURY_MODEL,
            messages=[
                {"role": "system", "content": meta_prompt},
                {"role": "user", "content": "Synthesize the juror evaluations and provide your final verdict."}
            ],
            response_format={"type": "json_object"}
        )

        meta_text = meta_response.choices[0].message.content.strip()
        meta_result = json.loads(meta_text, strict=False)
        if not isinstance(meta_result, dict):
            # The code below reads the verdict with .get(); anything else is a failure.
            raise ValueError("Meta-Jury response is not a JSON object")

    except Exception as e:
        error_msg = f"Meta-Jury unavailable: {str(e)}"
        print(f"  ✗ Meta-Jury failed: {str(e)}")
        return {
            "final_severity": None,
            "system_confidence": None,
            "final_category": None,
            "final_attribution": None,
            "flag_for_human_review": True,
            "synthesis_reasoning": error_msg,
            "variance_analysis": "Meta-Jury call failed. No verdict produced.",
            "individual_jurors": council_results
        }

    if debug:
        flagged = meta_result.get('flag_for_human_review')
        # Format defensively: the model may return null or a non-numeric value.
        meta_conf = _coerce_score(meta_result.get('system_confidence'), 0, 1)
        conf_str = f"{meta_conf:.2f}" if meta_conf is not None else "N/A"
        print(f"  [meta-jury]: {meta_result.get('final_severity')}/7, "
              f"conf={conf_str}"
              + (" flagged" if flagged else ""))

    return {
        "final_severity": meta_result.get("final_severity"),
        "system_confidence": meta_result.get("system_confidence"),
        "final_category": meta_result.get("final_category"),
        "final_attribution": meta_result.get("final_attribution"),
        "flag_for_human_review": meta_result.get("flag_for_human_review", False),
        "synthesis_reasoning": meta_result.get("synthesis_summary", meta_result.get("synthesis_reasoning", "")),
        "variance_analysis": meta_result.get("variance_note", meta_result.get("variance_analysis", "")),
        "individual_jurors": council_results
    }


In [None]:
def run_pipeline(pdf_path: str, pages_per_batch: int = 10, debug: bool = False):
    """
    Main execution loop: Process PDF in batches → Council evaluation.

    Each batch of pages goes through run_miner_batch; every issue it returns is
    evaluated by run_council and appended to OUTPUT_FILE as one JSON line.

    Args:
        pdf_path: Path of the PDF to audit.
        pages_per_batch: Number of pages sent to the miner per request.
        debug: Forwarded to the miner and council for verbose logging.

    Raises:
        ValueError: If ``pages_per_batch`` is smaller than 1.
    """
    if pages_per_batch < 1:
        # range() rejects a zero step and silently does nothing for a negative one.
        raise ValueError("pages_per_batch must be a positive integer")

    print(f"Starting analysis: {pdf_path}")
    print(f"Batch size: {pages_per_batch} pages")

    total_pages = get_total_pages(pdf_path)
    print(f"Total pages: {total_pages}\n")

    total_controversies = 0

    for batch_num, start_page in enumerate(range(0, total_pages, pages_per_batch), 1):
        end_page = min(start_page + pages_per_batch, total_pages)
        actual_pages = end_page - start_page

        print(f"\nBatch {batch_num}  [pages {start_page+1}–{end_page}]")

        controversies = run_miner_batch(pdf_path, start_page, actual_pages, debug=debug)
        # Issues are read with .get() below, so ignore anything that is not a dict.
        controversies = [item for item in (controversies or []) if isinstance(item, dict)]

        if not controversies:
            print(f"  No issues found")
            continue

        print(f"  Found {len(controversies)} potential issue(s)")

        for idx, item in enumerate(controversies, 1):
            controversy = item.get("controversy", "Untitled")
            quotes = item.get("quotes", [])
            explanation = item.get("explanation", "")
            source_type = item.get("source_type", "TEXTBOOK_NARRATIVE")

            print(f"\n  Issue {idx}/{len(controversies)}: {controversy}")

            council_result = run_council(controversy, quotes, explanation, source_type, debug=debug)
            final_severity = council_result["final_severity"]
            confidence = council_result["system_confidence"]
            category = council_result["final_category"]
            flagged = council_result["flag_for_human_review"]

            result = {
                "batch": batch_num,
                "pages": f"{start_page+1}-{end_page}",
                "controversy": controversy,
                "quotes": quotes,
                "explanation": explanation,
                "source_type": source_type,
                "final_severity": final_severity,
                "system_confidence": confidence,
                "final_category": category,
                "final_attribution": council_result["final_attribution"],
                "flagged_for_review": flagged,
                "synthesis_reasoning": council_result["synthesis_reasoning"],
                "variance_analysis": council_result.get("variance_analysis", ""),
                "individual_jurors": council_result["individual_jurors"]
            }

            # Append per result so finished work survives a later failure.
            with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
                f.write(json.dumps(result, ensure_ascii=False) + '\n')

            severity_str = f"{final_severity}/7" if final_severity is not None else "N/A"
            if confidence is None:
                confidence_str = "N/A"
            elif isinstance(confidence, (int, float)):
                confidence_str = f"{confidence:.2f}"
            else:
                # The meta-jury value is not validated upstream; a string here
                # must not crash the run after the record has been written.
                confidence_str = str(confidence)
            category_str = category if category is not None else "N/A"
            flag_str = " FLAGGED" if flagged else ""

            print(f"  Severity: {severity_str}  Category: {category_str}  Conf: {confidence_str}{flag_str}")

            total_controversies += 1

    print(f"Total: {total_controversies} controversies. Saved to {OUTPUT_FILE}")


# Run the pipeline on this PDF with 5 pages per miner batch and debug logging off.
# run_pipeline appends its results to OUTPUT_FILE.
run_pipeline("./Manuale Istorie/Clasa a 11 a CORVINUL.pdf", pages_per_batch=5, debug=False)


Starting chapter-based analysis of: ./Manuale Istorie/Clasa a 11 a CORVINUL.pdf
Processing 5 pages per batch
Total pages: 144


BATCH 1: Pages 1-5
Analyzing 5 pages together...

[DEBUG] PDF: ./Manuale Istorie/Clasa a 11 a CORVINUL.pdf
[DEBUG] Processing pages 1 to 5

[DEBUG] Added page 1:
[DEBUG]   - Base64 length: 1,075,396 chars
[DEBUG] Added page 2:
[DEBUG]   - Base64 length: 7,412 chars
[DEBUG] Added page 3:
[DEBUG]   - Base64 length: 124,264 chars
[DEBUG] Added page 4:
[DEBUG]   - Base64 length: 390,108 chars
[DEBUG] Added page 5:
[DEBUG]   - Base64 length: 717,664 chars
[DEBUG] Average image size: 462,968 chars
[DEBUG] System prompt length: 3300 chars
[DEBUG] Expected JSON format: {"issues": [...]}


[DEBUG] Response text length: 1020 chars
[DEBUG] Full raw response text:
{
  "issues": [
    {
      "controversy": "Nationalistic Lyrics",
      "source_type": "PRIMARY_SOURCE_USAGE",
      "quotes": [
        {
          "text": "De»ôteaptƒÉ-te, rom√¢ne!",
          "page_offset": 

In [None]:
def generate_html_report(jsonl_file: str, output_html: str = "controversy_report.html"):
    """Generate a clean, readable HTML report from results.jsonl."""
    
    results = []
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            results.append(json.loads(line, strict=False))
    
    results.sort(key=lambda x: x.get('final_severity', 0), reverse=True)
    
    all_categories = sorted(set(r.get('final_category', 'Unknown') for r in results))

    total_severity = sum(r.get('final_severity', 0) for r in results)
    avg_severity = total_severity / len(results) if results else 0
    high_severity_count = sum(1 for r in results if r.get('final_severity', 0) >= 5)
    flagged_count = sum(1 for r in results if r.get('flagged_for_review', False))
    
    html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Textbook Analysis Report</title>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }}
        
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            color: #1a1a1a;
            background-color: #f8f9fa;
            padding: 40px 20px;
        }}
        
        .container {{
            max-width: 1000px;
            margin: 0 auto;
        }}
        
        h1 {{
            color: #2c3e50;
            font-size: 2em;
            font-weight: 600;
            margin-bottom: 30px;
            letter-spacing: -0.5px;
        }}
        
        .summary {{
            background: white;
            padding: 20px;
            margin-bottom: 20px;
            border-left: 4px solid #2c3e50;
            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
        }}
        
        .summary p {{
            margin: 8px 0;
            color: #555;
        }}
        
        .summary strong {{
            color: #2c3e50;
        }}

        /* ‚îÄ‚îÄ Category Filter Bar ‚îÄ‚îÄ */
        .filter-bar {{
            background: white;
            padding: 16px 20px;
            margin-bottom: 24px;
            box-shadow: 0 1px 3px rgba(0,0,0,0.08);
            display: flex;
            align-items: center;
            gap: 12px;
            flex-wrap: wrap;
        }}

        .filter-bar label {{
            font-weight: 600;
            color: #2c3e50;
            font-size: 0.9em;
            white-space: nowrap;
        }}

        .filter-btn {{
            padding: 5px 14px;
            border: 1px solid #d0d0d0;
            border-radius: 20px;
            background: #f8f9fa;
            color: #444;
            font-size: 0.85em;
            cursor: pointer;
            transition: all 0.15s;
            white-space: nowrap;
        }}

        .filter-btn:hover {{
            border-color: #2c3e50;
            color: #2c3e50;
        }}

        .filter-btn.active {{
            background: #2c3e50;
            border-color: #2c3e50;
            color: white;
        }}

        .filter-count {{
            margin-left: auto;
            font-size: 0.85em;
            color: #888;
        }}

        .finding {{
            background: white;
            margin-bottom: 1px;
            box-shadow: 0 1px 3px rgba(0,0,0,0.05);
            transition: opacity 0.2s;
        }}

        .finding.hidden {{
            display: none;
        }}
        
        .finding-header {{
            padding: 20px;
            cursor: pointer;
            display: flex;
            justify-content: space-between;
            align-items: center;
            user-select: none;
            transition: background-color 0.2s;
        }}
        
        .finding-header:hover {{
            background-color: #f8f9fa;
        }}
        
        .finding-title {{
            color: #2c3e50;
            font-weight: 500;
            flex: 1;
            margin-right: 20px;
        }}
        
        .finding-meta {{
            display: flex;
            gap: 15px;
            align-items: center;
        }}
        
        .finding-score {{
            color: #2c3e50;
            font-weight: 600;
            font-size: 1.1em;
        }}
        
        .finding-category {{
            color: #666;
            font-size: 0.9em;
            background: #f0f0f0;
            padding: 4px 8px;
            border-radius: 3px;
        }}
        
        .flagged-indicator {{
            color: #e74c3c;
            font-size: 0.9em;
        }}
        
        .finding-content {{
            display: none;
            padding: 0 20px 20px 20px;
            border-top: 1px solid #f0f0f0;
        }}
        
        .finding.expanded .finding-content {{
            display: block;
        }}
        
        .metadata {{
            color: #666;
            font-size: 0.9em;
            margin-bottom: 20px;
            padding: 10px;
            background: #f8f9fa;
            border-radius: 3px;
        }}
        
        .section {{
            margin: 20px 0;
        }}
        
        .section-title {{
            color: #2c3e50;
            font-weight: 600;
            margin-bottom: 10px;
            font-size: 1em;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }}
        
        .explanation {{
            background: #f8f9fa;
            padding: 15px;
            margin: 10px 0;
            line-height: 1.7;
            border-left: 3px solid #3498db;
        }}
        
        .quote {{
            background: #fff9e6;
            padding: 12px;
            margin: 8px 0;
            border-left: 3px solid #f39c12;
        }}
        
        .quote-text {{
            color: #1a1a1a;
            margin-bottom: 5px;
            font-style: italic;
        }}
        
        .quote-page {{
            color: #999;
            font-size: 0.85em;
        }}
        
        .meta-jury {{
            background: #e8f5e9;
            padding: 15px;
            margin: 15px 0;
            border-left: 3px solid #4caf50;
        }}
        
        .meta-jury-title {{
            font-weight: 600;
            color: #2e7d32;
            margin-bottom: 10px;
        }}
        
        .meta-jury-stats {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 10px;
            margin: 10px 0;
        }}
        
        .meta-stat {{
            background: white;
            padding: 10px;
            border-radius: 3px;
        }}
        
        .meta-stat-label {{
            font-size: 0.85em;
            color: #666;
        }}
        
        .meta-stat-value {{
            font-size: 1.2em;
            font-weight: 600;
            color: #2e7d32;
        }}
        
        table {{
            width: 100%;
            border-collapse: collapse;
            margin-top: 10px;
        }}
        
        th, td {{
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #f0f0f0;
            font-size: 0.9em;
        }}
        
        th {{
            background: #2c3e50;
            color: white;
            font-weight: 500;
        }}
        
        .model-name {{
            color: #2c3e50;
            font-weight: 500;
        }}
        
        .severity-badge {{
            display: inline-block;
            padding: 3px 8px;
            border-radius: 3px;
            font-weight: 600;
            font-size: 0.9em;
        }}
        
        .severity-1, .severity-2 {{
            background: #d4edda;
            color: #155724;
        }}
        
        .severity-3, .severity-4 {{
            background: #fff3cd;
            color: #856404;
        }}
        
        .severity-5, .severity-6 {{
            background: #f8d7da;
            color: #721c24;
        }}
        
        .severity-7 {{
            background: #721c24;
            color: white;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>Textbook Analysis Report ‚Äî Multi-Dimensional Audit</h1>
        
        <div class="summary">
            <p><strong>Total findings:</strong> {len(results)}</p>
            <p><strong>Average severity:</strong> {avg_severity:.2f}/7 (7-point Likert scale)</p>
            <p><strong>High severity findings (‚â•5):</strong> {high_severity_count}</p>
            <p><strong>Flagged for human review:</strong> {flagged_count}</p>
            <p><strong>Evaluation method:</strong> Multi-dimensional audit with Meta-Jury synthesis</p>
        </div>

        <!-- Category Filter Bar -->
        <div class="filter-bar">
            <label>Filter by category:</label>
            <button class="filter-btn active" data-filter="all" onclick="filterFindings(this)">All</button>
"""
    for cat in all_categories:
        cat_js = cat.replace("'", "\\'")
        html += f'            <button class="filter-btn" data-filter="{cat_js}" onclick="filterFindings(this)">{cat}</button>\n'

    html += f"""            <span class="filter-count" id="filter-count">{len(results)} of {len(results)} shown</span>
        </div>
"""
    
    for i, result in enumerate(results, 1):
        severity = result.get('final_severity', 0)
        confidence = result.get('system_confidence', 0)
        category = result.get('final_category', 'Unknown')
        flagged = result.get('flagged_for_review', False)
        flag_icon = ' ‚ö†Ô∏è' if flagged else ''
        
        severity_class = f"severity-{int(severity)}"
        cat_attr = category.replace('"', '&quot;')
        
        html += f"""
        <div class="finding" data-category="{cat_attr}" onclick="this.classList.toggle('expanded')">
            <div class="finding-header">
                <div class="finding-title">{result['controversy']}{flag_icon}</div>
                <div class="finding-meta">
                    <div class="finding-category">{category}</div>
                    <div class="finding-score"><span class="severity-badge {severity_class}">{severity}/7</span></div>
                </div>
            </div>
            
            <div class="finding-content">
                <div class="metadata">
                    Pages {result['pages']} | Batch {result['batch']} | System Confidence: {confidence:.2f}
                    {' | <span style="color: #e74c3c; font-weight: 500;">‚ö†Ô∏è Flagged for Review</span>' if flagged else ''}
                </div>
                
                <div class="section">
                    <div class="section-title">Miner's Analysis</div>
                    <div class="explanation">{result['explanation']}</div>
                </div>
                
                <div class="section">
                    <div class="section-title">Evidence</div>
"""
        
        for quote in result['quotes']:
            if isinstance(quote, dict):
                quote_text = quote.get('text', '')
                page_num = quote.get('page', quote.get('page_offset', 'Unknown'))
            else:
                quote_text = quote
                page_num = 'Unknown'
            
            html += f"""
                    <div class="quote">
                        <div class="quote-text">"{quote_text}"</div>
                        <div class="quote-page">Page: {page_num}</div>
                    </div>
"""
        
        synthesis = result.get('synthesis_reasoning', 'N/A')
        variance = result.get('variance_analysis', '')
        
        html += f"""
                </div>
                
                <div class="section">
                    <div class="meta-jury">
                        <div class="meta-jury-title">üèõÔ∏è Meta-Jury Final Verdict</div>
                        <div class="meta-jury-stats">
                            <div class="meta-stat">
                                <div class="meta-stat-label">Final Severity</div>
                                <div class="meta-stat-value">{severity}/7</div>
                            </div>
                            <div class="meta-stat">
                                <div class="meta-stat-label">System Confidence</div>
                                <div class="meta-stat-value">{confidence:.2f}</div>
                            </div>
                            <div class="meta-stat">
                                <div class="meta-stat-label">Category</div>
                                <div class="meta-stat-value" style="font-size: 1em;">{category}</div>
                            </div>
                        </div>
                        <p style="margin-top: 10px;"><strong>Synthesis:</strong> {synthesis}</p>
                        {f'<p style="margin-top: 10px; color: #e74c3c;"><strong>Variance Analysis:</strong> {variance}</p>' if variance else ''}
                    </div>
                </div>
                
                <div class="section">
                    <div class="section-title">Individual Juror Evaluations</div>
                    <table>
                        <thead>
                            <tr>
                                <th>Juror Model</th>
                                <th>Category</th>
                                <th>Severity</th>
                                <th>Confidence</th>
                                <th>Reasoning</th>
                            </tr>
                        </thead>
                        <tbody>
"""
        
        for juror in result.get('individual_jurors', []):
            model_name = juror.get('model', 'Unknown')
            juror_severity = juror.get('severity', 'N/A')
            juror_confidence = juror.get('confidence', 'N/A')
            juror_category = juror.get('category', 'Unknown')
            reasoning = juror.get('reasoning', juror.get('error', 'No reasoning'))
            
            juror_severity_class = f"severity-{int(float(juror_severity))}" if isinstance(juror_severity, (int, float)) else ""
            
            html += f"""
                            <tr>
                                <td class="model-name">{model_name}</td>
                                <td>{juror_category}</td>
                                <td><span class="severity-badge {juror_severity_class}">{juror_severity}/7</span></td>
                                <td>{juror_confidence if isinstance(juror_confidence, str) else f'{juror_confidence:.2f}'}</td>
                                <td>{reasoning}</td>
                            </tr>
"""
        
        html += """
                        </tbody>
                    </table>
                </div>
            </div>
        </div>
"""
    
    html += f"""
    </div>

    <script>
        function filterFindings(btn) {{
            const filter = btn.getAttribute('data-filter');

            // Update active button
            document.querySelectorAll('.filter-btn').forEach(b => b.classList.remove('active'));
            btn.classList.add('active');

            // Show/hide findings
            const findings = document.querySelectorAll('.finding');
            let visibleCount = 0;
            findings.forEach(f => {{
                if (filter === 'all' || f.getAttribute('data-category') === filter) {{
                    f.classList.remove('hidden');
                    visibleCount++;
                }} else {{
                    f.classList.add('hidden');
                    // Collapse hidden findings to avoid stale open state
                    f.classList.remove('expanded');
                }}
            }});

            document.getElementById('filter-count').textContent =
                visibleCount + ' of {len(results)} shown';
        }}
    </script>
</body>
</html>
"""
    
    with open(output_html, 'w', encoding='utf-8') as f:
        f.write(html)
    
    print(f"Report generated: {output_html}")
    print(f"{len(results)} findings analyzed")
    print(f"Average severity: {avg_severity:.2f}/7")
    print(f"High severity (>=5): {high_severity_count}")
    print(f"Flagged for review: {flagged_count}")
    
    return output_html

# Build the HTML report for OUTPUT_FILE. The function writes the report to disk,
# prints a short summary, and returns the output path; leaving the call as the
# cell's last expression displays that path as the cell result.
generate_html_report(OUTPUT_FILE)


Report generated: controversy_report.html
94 findings analyzed
Average severity: 2.95/7
High severity (>=5): 3
Flagged for review: 11


'controversy_report.html'