In [1]:
# Import required libraries
import os
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision
# from ragas.metrics.critique import harmfulness
from ragas import evaluate
from datasets import Dataset
import pandas as pd
from llama_index.core import Document
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core.query_engine import RetrieverQueryEngine


In [2]:
# Load environment variables from .env
load_dotenv()

# Access environment variables
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")

os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['LLAMA_CLOUD_API_KEY'] = llama_cloud_api_key

In [3]:
print("\nStep 1: Setting up LlamaIndex...")

# Initialize the LLM
llm = OpenAI(
    model="gpt-4.1-mini",  # or "gpt-3.5-turbo" for cost efficiency
    temperature=0.1,
    api_key=openai_api_key  # Make sure openai_api_key is defined
)

# Initialize settings
Settings.llm = OpenAI(
    model="gpt-4.1-mini", 
    temperature=0.1, 
    api_key=openai_api_key
    )

Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-large",
    api_key=openai_api_key
)
    
# Configure chunking settings
# Settings.chunk_size = 1024
# Settings.chunk_overlap = 200

print("✅ LlamaIndex setup complete")


Step 1: Setting up LlamaIndex...
✅ LlamaIndex setup complete


In [None]:
# Step 2: Document Parsing - Choose Your Method
print("Step 2: Document Parsing")
print("=" * 50)

def parse_document(method="baseline"):
    """
    Parse document using the specified method.
    
    Args:
        method (str): 'baseline', 'llamaparse', or 'docling'
    
    Returns:
        tuple: (markdown_content, markdown_nodes, method_used)
    """
    from llama_index.core import Document
    from llama_index.core.node_parser import MarkdownNodeParser
    from llama_index.core import SimpleDirectoryReader
    from llama_index.core.node_parser import SentenceSplitter
    
    file_path = "./data/apple_2021_10k.pdf"
    cache_dir = "./parsed_docs"
    os.makedirs(cache_dir, exist_ok=True)
    
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    
    if method == "baseline":
        print("🔧 Using Baseline parsing (Simple text extraction)...")
        output_md = os.path.join(cache_dir, f"{file_stem}_baseline.md")
        
        if os.path.exists(output_md):
            print(f"📁 Found cached baseline markdown at: {output_md}")
            with open(output_md, "r", encoding="utf-8") as f:
                markdown_content = f.read()
            
            document = Document(text=markdown_content, metadata={"source": file_path})
            splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=100)
            markdown_nodes = splitter.get_nodes_from_documents([document])
        else:
            print("🔄 Parsing document with baseline method...")
            docs = SimpleDirectoryReader(input_files=[file_path]).load_data()
            raw_text = "\n\n".join([d.text or "" for d in docs])
            markdown_content = raw_text
            
            with open(output_md, "w", encoding="utf-8") as f:
                f.write(markdown_content)
            
            document = Document(text=markdown_content, metadata={"source": file_path})
            splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=100)
            markdown_nodes = splitter.get_nodes_from_documents([document])
        
        print(f"✅ Baseline parsing complete. Nodes: {len(markdown_nodes)}")
        
    elif method == "llamaparse":
        print("🤖 Using LlamaParse (AI-powered parsing)...")
        output_md = os.path.join(cache_dir, f"{file_stem}_llama.md")
        
        if os.path.exists(output_md):
            print(f"📁 Found cached LlamaParse markdown at: {output_md}")
            with open(output_md, 'r', encoding='utf-8') as f:
                markdown_content = f.read()
            
            document = Document(text=markdown_content, metadata={"source": file_path})
            markdown_parser = MarkdownNodeParser()
            markdown_nodes = markdown_parser.get_nodes_from_documents([document])
        else:
            print("🔄 Parsing document with LlamaParse...")
            parser = LlamaParse(
                api_key=llama_cloud_api_key,
                parse_mode="parse_page_with_llm",
                high_res_ocr=True,
                adaptive_long_table=True,
                outlined_table_extraction=True,
                output_tables_as_HTML=True,
            )
            
            # Use synchronous parsing for LlamaParse
            result = parser.parse(file_path)
            markdown_nodes = result.get_markdown_nodes(split_by_page=True)
            markdown_content = "\n\n".join([node.text for node in markdown_nodes])
            
            with open(output_md, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
        
        print(f"✅ LlamaParse complete. Nodes: {len(markdown_nodes)}")
        
    elif method == "docling":
        print("📄 Using Docling (Advanced document parsing)...")
        output_md = os.path.join(cache_dir, f"{file_stem}_docling.md")
        
        if os.path.exists(output_md):
            print(f"📁 Found cached Docling markdown at: {output_md}")
            with open(output_md, "r", encoding="utf-8") as f:
                markdown_content = f.read()
        else:
            print("🔄 Parsing document with Docling...")
            try:
                from docling.document_converter import DocumentConverter, DocumentConversionInput
                converter = DocumentConverter()
                result = converter.convert(DocumentConversionInput.from_path(file_path))
            except ImportError:
                from docling.document_converter import DocumentConverter
                converter = DocumentConverter()
                result = converter.convert(file_path)
            
            if hasattr(result.document, "export_to_markdown"):
                markdown_content = result.document.export_to_markdown()
            else:
                markdown_content = result.document.export_to_text()
            
            with open(output_md, "w", encoding="utf-8") as f:
                f.write(markdown_content)
        
        document = Document(text=markdown_content, metadata={"source": file_path})
        markdown_nodes = MarkdownNodeParser().get_nodes_from_documents([document])
        print(f"✅ Docling complete. Nodes: {len(markdown_nodes)}")
    
    else:
        raise ValueError(f"Unknown method: {method}. Choose from: 'baseline', 'llamaparse', 'docling'")
    
    return markdown_content, markdown_nodes, method

# Interactive method selection
print("\n📋 Available parsing methods:")
print("1. baseline  - Simple text extraction (fastest)")
print("2. llamaparse - AI-powered parsing (most accurate)")
print("3. docling   - Advanced document parsing (balanced)")

# Get user input
method_choice = input("\n🎯 Choose parsing method (1, 2, or 3): ").strip()

method_map = {
    "1": "baseline",
    "2": "llamaparse", 
    "3": "docling"
}

if method_choice not in method_map:
    print("⚠️  Invalid choice. Defaulting to baseline method.")
    method_choice = "1"

selected_method = method_map[method_choice]
print(f"\n🚀 Selected method: {selected_method.upper()}")
print("-" * 50)

# Parse document with selected method
markdown_content, markdown_nodes, method_used = parse_document(selected_method)

# Store the selected method globally for use in evaluation saving
PARSING_METHOD = method_used

print(f"\n🎉 Document parsing complete!")
print(f"📊 Method used: {method_used.upper()}")
print(f"📄 Content length: {len(markdown_content):,} characters")
print(f"🔗 Number of nodes: {len(markdown_nodes)}")
print(f"💾 Cached for future use")
print(f"🏷️  Method '{method_used}' will be used for evaluation file naming")

Step 2 (Baseline): Parsing document without LLM...
Found cached baseline markdown at: ./parsed_docs/apple_2021_10k_baseline.md
✅ Baseline parsed. Nodes: 77 | Markdown saved: ./parsed_docs/apple_2021_10k_baseline.md


In [5]:
# Setup our LlamaParse Indexes

from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine


markdown_index = VectorStoreIndex(
    nodes=markdown_nodes, 
    show_progress=True
    )

# Configure retriever with better settings
retriever = VectorIndexRetriever(
    index=markdown_index,
    similarity_top_k=5,  # Retrieve more candidates
    vector_store_query_mode="default",  # Combines semantic and keyword search
    alpha=0.3  # Weight between semantic (0) and keyword (1) search
)

markdown_query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.6)  # Filter out low similarity results
    ],
    response_mode="compact"  # Better for multi-document responses
)

Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens, however you requested 64951 tokens (64951 in your prompt; 0 for the completion). Please reduce your prompt; or completion length.", 'type': 'invalid_request_error', 'param': None, 'code': None}}

In [None]:
query_1 = "Where in the 10-K does Apple incorporate the 2022 Proxy Statement by reference?"

response_1 = await markdown_query_engine.aquery(query_1)
print("\n***********Markdown Query Engine***********")
print(response_1)

# for node in response_1.source_nodes:
#     print("\n***********Source Node***********")
#     print(node.node.text)      # chunk text
#     print(node.node.metadata)  # chunk metadata
#     print(node.score)          # similarity score


***********Markdown Query Engine***********
Apple incorporates the 2022 Proxy Statement by reference in Part III of the Annual Report on Form 10-K, specifically for the information required in Items 10, 11, 12, and 13.


In [20]:
# === Load QA pairs (question, reference) ===
# Expects a JSON list of objects: {"question": str, "reference": str}
# If qa_pairs.json is missing, falls back to a small built-in sample.

import os, json
from typing import List, Dict

qa_path = "/Users/rivyesch/Dev/tutorials/RAG/qa_pairs.json"

def load_qa_pairs(path: str) -> List[Dict[str, str]]:
    if os.path.exists(path):
        with open(path, "r") as f:
            data = json.load(f)
        # basic validation
        cleaned = []
        for i, item in enumerate(data):
            q = item.get("question")
            r = item.get("reference")
            if isinstance(q, str) and isinstance(r, str):
                cleaned.append({"question": q.strip(), "reference": r.strip()})
        if not cleaned:
            raise ValueError("qa_pairs.json is present but empty or invalid. Expected list of {question, reference}.")
        return cleaned
    # Fallback sample (replace by providing qa_pairs.json)
    print("qa_pairs.json not found. Using a small built-in sample.")
    return [
        {"question": "What is Apple’s fiscal year-end date in the 2021 Form 10-K?", "reference": "September 25, 2021."},
        {"question": "How many shares of common stock were outstanding as of October 15, 2021?", "reference": "16,406,397,000 shares."},
        {"question": "Which iPhone models were released in October and November 2020?", "reference": "iPhone 12, iPhone 12 mini, iPhone 12 Pro, and iPhone 12 Pro Max."},
    ]

qa_pairs = load_qa_pairs(qa_path)
print(f"Loaded {len(qa_pairs)} QA pairs")


Loaded 35 QA pairs


In [21]:
# === LlamaIndex retrieval evaluation using QA references (robust mapping) ===
from llama_index.core.evaluation import RetrieverEvaluator
from typing import List, Dict
import re

try:
    from rapidfuzz import fuzz
    _HAS_RAPIDFUZZ = True
except Exception:
    _HAS_RAPIDFUZZ = False

# Normalize to reduce false negatives (quotes, dashes, punctuation, whitespace)
_def_dash = re.compile(r"[\u2012\u2013\u2014\u2015]")
_def_quotes = str.maketrans({"“": '"', "”": '"', "‘": "'", "’": "'"})
_whitespace = re.compile(r"\s+")
_punct = re.compile(r"[^\w\s.$%+-]")

def normalize_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    t = text.translate(_def_quotes)
    t = _def_dash.sub("-", t)
    t = t.lower().strip()
    t = _punct.sub(" ", t)
    t = _whitespace.sub(" ", t)
    return t

def map_reference_to_node_ids(reference: str, nodes: List, top_k_fallback: int = 3) -> List[str]:
    # 1) normalized substring
    ref_norm = normalize_text(reference)
    matched: List[str] = []
    if ref_norm:
        for n in nodes:
            try:
                if ref_norm in normalize_text(n.text or ""):
                    matched.append(n.node_id)
            except Exception:
                continue
        if matched:
            return list(dict.fromkeys(matched))

    # 2) fuzzy partial match
    if _HAS_RAPIDFUZZ and ref_norm:
        scored = []
        for n in nodes:
            try:
                s = fuzz.partial_ratio(ref_norm, normalize_text(n.text or ""))
                scored.append((s, n.node_id))
            except Exception:
                continue
        scored.sort(reverse=True)
        fuzzy_ids = [nid for s, nid in scored[:top_k_fallback] if s >= 70]
        if fuzzy_ids:
            return list(dict.fromkeys(fuzzy_ids))

    # 3) vector similarity fallback using retriever
    try:
        cands = retriever.retrieve(reference)
        vs_ids = [c.node.node_id for c in (cands or [])][:top_k_fallback]
        if vs_ids:
            return list(dict.fromkeys(vs_ids))
    except Exception:
        pass

    return []

# Build expected ids and report coverage
qa_expected_ids: List[List[str]] = []
unmatched = 0
for item in qa_pairs:
    ids = map_reference_to_node_ids(item["reference"], markdown_nodes)
    if not ids:
        unmatched += 1
    qa_expected_ids.append(ids)
print(f"Reference→node mapping: matched={len(qa_pairs)-unmatched}, unmatched={unmatched} of {len(qa_pairs)}")

# Evaluate retrieval
metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]
retriever_evaluator = RetrieverEvaluator.from_metric_names(metrics, retriever=retriever)

li_results: List[Dict] = []
print(f"Evaluating {len(qa_pairs)} QA pairs with LlamaIndex metrics...")
for i, item in enumerate(qa_pairs, start=1):
    expected_ids = qa_expected_ids[i-1]
    if not expected_ids:
        print(f"⚠️  QA {i} has no matched expected_ids. Skipping.")
        continue
    res = retriever_evaluator.evaluate(query=item["question"], expected_ids=expected_ids)
    li_results.append({
        "index": i,
        "question": item["question"],
        "reference": item["reference"],
        "expected_ids": expected_ids,
        "retrieved_ids": res.retrieved_ids,
        "metrics": res.metric_vals_dict,
    })
    m = res.metric_vals_dict
    print(f"#{i} HR={m.get('hit_rate',0):.2f} MRR={m.get('mrr',0):.2f} nDCG={m.get('ndcg',0):.2f}")

if li_results:
    n = len(li_results)
    avg_hr = sum(r["metrics"].get("hit_rate", 0.0) for r in li_results) / n
    avg_mrr = sum(r["metrics"].get("mrr", 0.0) for r in li_results) / n
    avg_ndcg = sum(r["metrics"].get("ndcg", 0.0) for r in li_results) / n
    print(f"\nLlamaIndex averages over {n} evaluated QA pairs:")
    print(f"- Hit Rate: {avg_hr:.3f}")
    print(f"- MRR:      {avg_mrr:.3f}")
    print(f"- nDCG:     {avg_ndcg:.3f}")
else:
    print("No LlamaIndex results to aggregate.")

Reference→node mapping: matched=35, unmatched=0 of 35
Evaluating 35 QA pairs with LlamaIndex metrics...
#1 HR=0.00 MRR=0.00 nDCG=0.00
#2 HR=1.00 MRR=1.00 nDCG=0.91
#3 HR=1.00 MRR=1.00 nDCG=1.00
#4 HR=1.00 MRR=1.00 nDCG=1.00
#5 HR=1.00 MRR=1.00 nDCG=1.00
#6 HR=1.00 MRR=0.50 nDCG=0.30
#7 HR=0.00 MRR=0.00 nDCG=0.00
#8 HR=1.00 MRR=0.50 nDCG=0.30
#9 HR=0.00 MRR=0.00 nDCG=0.00
#10 HR=1.00 MRR=0.50 nDCG=0.73
#11 HR=1.00 MRR=0.50 nDCG=0.48
#12 HR=1.00 MRR=1.00 nDCG=1.00
#13 HR=1.00 MRR=1.00 nDCG=0.77
#14 HR=1.00 MRR=1.00 nDCG=0.91
#15 HR=1.00 MRR=1.00 nDCG=1.00
#16 HR=1.00 MRR=1.00 nDCG=1.00
#17 HR=1.00 MRR=1.00 nDCG=0.47
#18 HR=1.00 MRR=1.00 nDCG=0.47
#19 HR=1.00 MRR=0.25 nDCG=0.20
#20 HR=1.00 MRR=1.00 nDCG=0.67
#21 HR=1.00 MRR=1.00 nDCG=0.77
#22 HR=1.00 MRR=0.50 nDCG=0.30
#23 HR=1.00 MRR=0.50 nDCG=0.53
#24 HR=1.00 MRR=1.00 nDCG=0.47
#25 HR=1.00 MRR=0.50 nDCG=0.30
#26 HR=1.00 MRR=0.25 nDCG=0.20
#27 HR=1.00 MRR=1.00 nDCG=0.70
#28 HR=1.00 MRR=1.00 nDCG=0.70
#29 HR=1.00 MRR=0.50 nDCG=0.48
#30 HR

In [22]:
# === RAGAS evaluation (robust, safe printing) ===
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision

questions = [q["question"] for q in qa_pairs]
references = [q["reference"] for q in qa_pairs]

answers = []
all_contexts = []
for q in questions:
    resp = markdown_query_engine.query(q)
    answers.append(str(resp))
    ctx = [sn.node.text for sn in (resp.source_nodes or [])]
    # Ensure non-empty contexts; if empty, fall back to retriever top-k to avoid downstream errors
    if not ctx:
        try:
            cands = retriever.retrieve(q)
            ctx = [c.node.text for c in (cands or [])]
        except Exception:
            ctx = []
    all_contexts.append(ctx)

ragas_ds = Dataset.from_dict({
    "question": questions,
    "answer": answers,
    "contexts": all_contexts,
    "reference": references,  # can also be a list[str] per row if you have multiple gold references
})

# Smaller batch_size can avoid worker IndexErrors; disable progress for cleaner logs if desired
ragas_result = evaluate(
    dataset=ragas_ds,
    metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
    batch_size=16,
    show_progress=True,
)

print("RAGAS:")
try:
    df = ragas_result.to_pandas()
    cols = [c for c in ["context_precision","context_recall","faithfulness","answer_relevancy"] if c in df.columns]
    agg = {c: float(df[c].mean()) for c in cols}
    for k, v in agg.items():
        print(f"- {k}: {v:.4f}")
except Exception as e:
    # Fallback printing if API differs
    print("Could not summarize via DataFrame:", e)
    print(ragas_result)


Evaluating:   0%|          | 0/140 [00:00<?, ?it/s]

Batch 1/9:   0%|          | 0/16 [00:00<?, ?it/s]

RAGAS:
- context_precision: 0.7554
- context_recall: 0.8286
- faithfulness: 0.6888
- answer_relevancy: 0.7022


In [None]:
# === Save evaluation artifacts organized by parsing method ===
import os, json
from datetime import datetime

# Create organized directory structure
base_dir = "./evaluation"
method_dir = os.path.join(base_dir, PARSING_METHOD.title())  # e.g., "./evaluation/Baseline", "./evaluation/Llamaparse", "./evaluation/Docling"
os.makedirs(method_dir, exist_ok=True)

ts = datetime.now().strftime("%Y%m%d_%H%M%S")

print(f"📁 Saving evaluation results to: {method_dir}")
print(f"🏷️  Parsing method: {PARSING_METHOD.upper()}")
print(f"⏰ Timestamp: {ts}")
print("=" * 60)

# Save LlamaIndex results (per-QA JSON + aggregates JSON)
try:
    li_full_path = os.path.join(method_dir, f"llamaindex_metrics_{PARSING_METHOD}_{ts}.json")
    with open(li_full_path, "w") as f:
        json.dump(li_results, f, indent=2)

    # compute aggregates from li_results
    def _avg(key: str) -> float:
        vals = [r.get("metrics", {}).get(key) for r in li_results]
        vals = [v for v in vals if isinstance(v, (int, float))]
        return float(sum(vals) / len(vals)) if vals else 0.0

    li_aggregates = {
        "parsing_method": PARSING_METHOD,
        "hit_rate": _avg("hit_rate"),
        "mrr": _avg("mrr"),
        "ndcg": _avg("ndcg"),
        "precision": _avg("precision"),
        "recall": _avg("recall"),
        "ap": _avg("ap"),
        "evaluated_pairs": len(li_results),
        "timestamp": ts
    }

    li_agg_path = os.path.join(method_dir, f"llamaindex_aggregate_metrics_{PARSING_METHOD}_{ts}.json")
    with open(li_agg_path, "w") as f:
        json.dump({"aggregates": li_aggregates}, f, indent=2)

    print(f"✅ LlamaIndex results -> {li_full_path}")
    print(f"✅ LlamaIndex aggregates -> {li_agg_path}")
except Exception as e:
    print("❌ Could not save LlamaIndex artifacts:", e)

# Save RAGAS metrics (aggregates + per-row CSV)
try:
    ragas_json_path = os.path.join(method_dir, f"ragas_aggregate_metrics_{PARSING_METHOD}_{ts}.json")
    ragas_csv_path = os.path.join(method_dir, f"ragas_metrics_{PARSING_METHOD}_{ts}.csv")

    # Prefer DataFrame output from EvaluationResult
    df = ragas_result.to_pandas()
    cols = [c for c in ["context_precision","context_recall","faithfulness","answer_relevancy"] if c in df.columns]
    aggregates = {
        "parsing_method": PARSING_METHOD,
        "timestamp": ts
    }
    aggregates.update({c: float(df[c].mean()) for c in cols})

    # Save aggregates JSON
    with open(ragas_json_path, "w") as f:
        json.dump({"aggregates": aggregates}, f, indent=2)

    # Save full per-row metrics CSV
    try:
        df.to_csv(ragas_csv_path, index=False)
    except Exception:
        pass

    print(f"✅ RAGAS aggregates -> {ragas_json_path}")
    print(f"✅ RAGAS per-row metrics -> {ragas_csv_path}")

except Exception as e:
    # Fallback: try to_dict; else repr
    try:
        ragas_fallback_path = os.path.join(method_dir, f"ragas_metrics_{PARSING_METHOD}_{ts}.json")
        payload = getattr(ragas_result, "to_dict", lambda: {"result": repr(ragas_result)})()
        with open(ragas_fallback_path, "w") as f:
            json.dump(payload, f, indent=2, default=str)
        print(f"✅ RAGAS fallback -> {ragas_fallback_path}")
    except Exception as e2:
        print("❌ Could not save RAGAS metrics:", e2)

print("\n" + "=" * 60)
print(f"🎉 All evaluation results saved for {PARSING_METHOD.upper()} parsing method!")
print(f"📁 Location: {method_dir}")
print("=" * 60)

Saved LlamaIndex results -> /Users/rivyesch/Dev/tutorials/RAG/llamaindex_metrics_20250922_235056.json
Saved LlamaIndex aggregates -> /Users/rivyesch/Dev/tutorials/RAG/llamaindex_aggregate_metrics_20250922_235056.json
Saved RAGAS aggregates -> /Users/rivyesch/Dev/tutorials/RAG/ragas_aggregate_metrics_20250922_235056.json
Saved RAGAS per-row metrics -> /Users/rivyesch/Dev/tutorials/RAG/ragas_metrics_20250922_235056.csv


In [None]:
# === Optional: Compare results across parsing methods ===
import json
import os
from pathlib import Path

def compare_parsing_methods():
    """
    Compare evaluation results across different parsing methods.
    This function reads all saved evaluation results and creates a comparison.
    """
    base_dir = Path("./evaluation")
    
    if not base_dir.exists():
        print("❌ No evaluation directory found. Run evaluations first.")
        return
    
    comparison_data = {}
    
    # Find all method directories
    for method_dir in base_dir.iterdir():
        if method_dir.is_dir():
            method_name = method_dir.name
            print(f"\n🔍 Analyzing {method_name} results...")
            
            # Find the most recent aggregate files
            llama_files = list(method_dir.glob(f"llamaindex_aggregate_metrics_{method_name.lower()}*.json"))
            ragas_files = list(method_dir.glob(f"ragas_aggregate_metrics_{method_name.lower()}*.json"))
            
            method_data = {"method": method_name}
            
            # Load LlamaIndex results
            if llama_files:
                latest_llama = max(llama_files, key=lambda x: x.stat().st_mtime)
                try:
                    with open(latest_llama, 'r') as f:
                        llama_data = json.load(f)
                        method_data["llamaindex"] = llama_data.get("aggregates", {})
                        print(f"  ✅ LlamaIndex: HR={method_data['llamaindex'].get('hit_rate', 0):.3f}, MRR={method_data['llamaindex'].get('mrr', 0):.3f}")
                except Exception as e:
                    print(f"  ❌ Error reading LlamaIndex results: {e}")
            
            # Load RAGAS results
            if ragas_files:
                latest_ragas = max(ragas_files, key=lambda x: x.stat().st_mtime)
                try:
                    with open(latest_ragas, 'r') as f:
                        ragas_data = json.load(f)
                        method_data["ragas"] = ragas_data.get("aggregates", {})
                        print(f"  ✅ RAGAS: Faith={method_data['ragas'].get('faithfulness', 0):.3f}, Relev={method_data['ragas'].get('answer_relevancy', 0):.3f}")
                except Exception as e:
                    print(f"  ❌ Error reading RAGAS results: {e}")
            
            comparison_data[method_name] = method_data
    
    if comparison_data:
        # Save comparison summary
        comparison_path = base_dir / "parsing_method_comparison.json"
        with open(comparison_path, 'w') as f:
            json.dump(comparison_data, f, indent=2)
        
        print(f"\n📊 Comparison summary saved to: {comparison_path}")
        print("\n🏆 Method Rankings:")
        
        # Rank methods by key metrics
        if len(comparison_data) > 1:
            # LlamaIndex Hit Rate ranking
            llama_hr_ranking = sorted(
                [(name, data["llamaindex"].get("hit_rate", 0)) for name, data in comparison_data.items()],
                key=lambda x: x[1], reverse=True
            )
            print(f"\n📈 Hit Rate Ranking:")
            for i, (method, score) in enumerate(llama_hr_ranking, 1):
                print(f"  {i}. {method}: {score:.3f}")
            
            # RAGAS Faithfulness ranking
            ragas_faith_ranking = sorted(
                [(name, data["ragas"].get("faithfulness", 0)) for name, data in comparison_data.items()],
                key=lambda x: x[1], reverse=True
            )
            print(f"\n🎯 Faithfulness Ranking:")
            for i, (method, score) in enumerate(ragas_faith_ranking, 1):
                print(f"  {i}. {method}: {score:.3f}")
    else:
        print("❌ No evaluation data found to compare.")

# Uncomment the line below to run the comparison
# compare_parsing_methods()
