In [1]:
import sys
from pathlib import Path
import json
import pandas as pd
import torch
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

# Add the "src" directory next to the current working directory's parent to the Python path
sys.path.append(str(Path().resolve().parent / "src"))

# Import pipeline components
from retrievers.vectorrag.index_faiss import build_faiss_index_from_json
from retrievers.vectorrag.document_loader import load_json_documents
from retrievers.vectorrag.chunker import chunk_documents
from retrievers.vectorrag.embedder import init_embedder
from retrievers.vectorrag.index_faiss import build_faiss_index, load_faiss_index
from retrievers.vectorrag.retriever import rerank_search

from generator.generator import ChatGPTGenerator 
import numpy as np
import json
from retrievers.vectorrag.reranker import CrossEncoderReranker
import numpy as np


In [2]:
import json

# Load enriched corpus.
# The location is written relative to the notebook's working directory, like the
# other data paths in this notebook, so the cell is not tied to one user's home
# directory.
DATA_DIR = Path("../data/data_processed/Train_Val_Test")
path = str(DATA_DIR / "context_with_metadata_dedup_enriched.jsonl")
docs = []

with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines so json.loads never sees an empty string.
        if line.strip():
            docs.append(json.loads(line))

# Quick check (guarded so an empty file reports 0 documents instead of raising)
print(f"Loaded {len(docs)} documents")
if docs:
    print(json.dumps(docs[0], indent=2))

Loaded 7696 documents
{
  "context": "['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses .', 'relative to foreign currency exposures existing at october 31 , 2009 and november 

In [3]:
import json
import ast
import re


docs = []

# Lazily matches from the first "[" to the first "]" after it; used to locate
# where a stringified list starts inside a raw context string.
LIST_RE = re.compile(r"\[.*?\]", re.S)

def parse_context(raw):
    """Turn a raw ``context`` value into a list of context parts.

    Args:
        raw: Either an already-parsed list, or a string that may contain a
            Python-literal list (e.g. ``"['sentence one', 'sentence two'] ..."``).

    Returns:
        * ``raw`` itself when it is already a list;
        * the list literal found in the string when one can be parsed;
        * ``[raw]`` otherwise (no bracket pair, unparsable text, non-string input).
    """
    if isinstance(raw, list):
        return raw
    if isinstance(raw, str):
        m = LIST_RE.search(raw)
        if m:
            start, end = m.start(), m.end()
            # The lazy regex stops at the first "]", which may sit inside one of
            # the quoted strings. Keep extending the candidate to the next "]"
            # until it parses as a list or no closing bracket is left.
            while True:
                try:
                    parsed = ast.literal_eval(raw[start:end])
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    parsed = None
                if isinstance(parsed, list):
                    return parsed
                next_close = raw.find("]", end)
                if next_close == -1:
                    break
                end = next_close + 1
    return [raw]

def build_text_and_meta(entry):
    """Build the indexable text and metadata record for one corpus entry.

    The text is a bracketed header line followed by the entry's context. The
    header carries TITLE/TICKER/YEAR/PAGE when ticker, year and page are all
    truthy, and TITLE/SOURCE_ID otherwise.

    Args:
        entry: Dict for one corpus line. Reads the keys ``context``, ``title``,
            ``ticker``, ``year``, ``page``, ``source_id`` and ``source``; all of
            them are optional.

    Returns:
        ``{"text": str, "metadata": dict}`` where metadata holds title,
        source_id, ticker, year, page and source.
    """
    # Read the context once with a default so a missing key no longer raises
    # KeyError (the original mixed entry.get(...) with entry["context"]).
    raw_context = entry.get("context", "")
    context_list = parse_context(raw_context)

    # Whatever follows the last "]" in a string context is kept as a trailing
    # part (for a string without "]" this is the whole string).
    tabular_tail = raw_context
    if isinstance(tabular_tail, str):
        tabular_tail = tabular_tail.split("]")[-1].strip()

    # Parts are coerced to str (None is skipped) so a list with non-string
    # items does not make str.join raise TypeError.
    full_context = " ".join(str(part) for part in context_list if part is not None)
    if isinstance(tabular_tail, str) and tabular_tail not in full_context:
        full_context += "\n" + tabular_tail

    title = entry.get("title", "")
    ticker = entry.get("ticker")
    year = entry.get("year")
    page = entry.get("page")

    # Build metadata-aware title
    if ticker and year and page:
        header = f"[TITLE: {title}, TICKER: {ticker}, YEAR: {year}, PAGE: {page}]"
    else:
        header = f"[TITLE: {title}, SOURCE_ID: {entry.get('source_id')}]"

    return {
        "text": f"{header}\n{full_context}",
        "metadata": {
            "title": title,
            "source_id": entry.get("source_id"),
            "ticker": ticker,
            "year": year,
            "page": page,
            "source": entry.get("source")
        }
    }

# Re-read the corpus and append one text/metadata record per non-blank JSONL line.
with open(path, 'r', encoding='utf-8') as corpus_file:
    for raw_line in corpus_file:
        if not raw_line.strip():
            continue
        docs.append(build_text_and_meta(json.loads(raw_line)))

print(f"Loaded {len(docs)} documents.")
print(json.dumps(docs[0], indent=2))

Loaded 7696 documents.
{
  "text": "[TITLE: FinQA, TICKER: ADI, YEAR: 2009, PAGE: 49]\ninterest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency exchange contracts . the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months . currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses . relative to foreign currency exposures existing at octob

In [4]:
import json

# Show full JSON for first FinDER document.
# The metadata "source" value may be None, so it is replaced by "" before
# lowercasing; next() gets a default so a corpus without FinDER entries prints
# a message instead of raising StopIteration.
finder_doc = next(
    (doc for doc in docs if str(doc['metadata'].get('source') or '').lower() == 'finder'),
    None,
)
if finder_doc is None:
    print("No FinDER document found.")
else:
    print(json.dumps(finder_doc, indent=2))

{
  "text": "[TITLE: FinDER, SOURCE_ID: b33fcee7]\nCboe Global Markets, Inc. and Subsidiaries\n\nConsolidated Statements of Income\n\nYears ended December 31, 2023, 2022, and 2021\n\n(In millions, except per share data)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n    \n\n2023\n\n    \n\n2022\n\n    \n\n2021\n\n \n\nRevenues:\n\n\n\n\n\n\n\n\n\n\n\nCash and spot markets\n\n\n$\n\n1,445.1\n\n\n$\n\n1,777.6\n\n\n$\n\n1,660.5\n\n\nData and access solutions\n\n\n\n539.2\n\n\n\n497.0\n\n\n\n427.7\n\n\nDerivatives markets\n\n\n \n\n1,789.2\n\n\n \n\n1,683.9\n\n\n \n\n1,406.6\n\n\nTotal revenues\n\n\n \n\n3,773.5\n\n\n \n\n3,958.5\n\n\n \n\n3,494.8\n\n\nCost of revenues:\n\n\n\n\n\n\n\n\n\n\n\n  Liquidity payments\n\n\n \n\n1,385.8\n\n\n \n\n1,670.2\n\n\n \n\n1,650.7\n\n\n  Routing and clearing\n\n\n\n79.1\n\n\n\n83.2\n\n\n\n87.8\n\n\n  Section 31 fees\n\n\n\n185.7\n\n\n\n329.8\n\n\n\n179.6\n\n\n  Royalty fees and other cost of revenues\n\n\n \n\n204.9\n\n\n \n\n133.6\n\n\n \n\n100.6\n\n\nTotal cost of reven

In [5]:
# Compare the number of document texts with the number of distinct texts.
texts = [record["text"] for record in docs]
unique_texts = set(texts)

print(f"Total documents: {len(texts)}")
print(f"Unique texts:    {len(unique_texts)}")

if len(texts) != len(unique_texts):
    print(f"⚠️ {len(texts) - len(unique_texts)} duplicate contexts found.")
else:
    print("✅ All contexts are unique.")

Total documents: 7696
Unique texts:    7696
✅ All contexts are unique.


In [6]:
from langchain.docstore.document import Document

# Turn every record into a LangChain Document; "row_index" stores the record's
# position in `docs` alongside its original metadata.
doc_objs = [
    Document(
        page_content=record["text"],
        metadata={**record["metadata"], "row_index": position},
    )
    for position, record in enumerate(docs)
]

chunked_docs = chunk_documents(doc_objs, chunk_size=1500, chunk_overlap=200)

print(f"Created {len(chunked_docs)} chunks.")

Created a chunk of size 1568, which is longer than the specified 1500
Created a chunk of size 1591, which is longer than the specified 1500
Created a chunk of size 2034, which is longer than the specified 1500
Created a chunk of size 1610, which is longer than the specified 1500
Created a chunk of size 1631, which is longer than the specified 1500
Created a chunk of size 1754, which is longer than the specified 1500
Created a chunk of size 4738, which is longer than the specified 1500
Created a chunk of size 3774, which is longer than the specified 1500
Created a chunk of size 3970, which is longer than the specified 1500
Created a chunk of size 7408, which is longer than the specified 1500
Created a chunk of size 2037, which is longer than the specified 1500
Created a chunk of size 1546, which is longer than the specified 1500
Created a chunk of size 4015, which is longer than the specified 1500
Created a chunk of size 4624, which is longer than the specified 1500
Created a chunk of s

Created 12059 chunks.


In [9]:
import os
from datetime import date

# Create output directory with today's date.
# OUT_DIR is a Path rather than a str because later cells build file names with
# the "/" operator (e.g. OUT_DIR / "embeddings.npy"), which a str does not support.
today_str = date.today().isoformat()  # e.g., '2025-08-01'
OUT_DIR = Path("outputs") / today_str
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [12]:
import tiktoken
from tqdm import tqdm
# Tokenizer for OpenAI's embedding model; count_tokens uses it to measure texts.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")
MAX_TOKENS_PER_BATCH = 290_000
embedder = init_embedder()


def count_tokens(text: str) -> int:
    """Return the number of tokens `enc` produces for `text`."""
    token_ids = enc.encode(text)
    return len(token_ids)

# Token-bounded batching that keeps each chunk's text and metadata together
batches = []
current_batch = []
current_tokens = 0

for doc in chunked_docs:
    text = doc.page_content
    meta = doc.metadata
    tokens = count_tokens(text)
    # Close the running batch only when it already holds something; otherwise a
    # first chunk that exceeds the budget on its own would emit an empty batch.
    if current_batch and current_tokens + tokens > MAX_TOKENS_PER_BATCH:
        batches.append(current_batch)
        current_batch = []
        current_tokens = 0
    current_batch.append((text, meta))
    current_tokens += tokens

if current_batch:
    batches.append(current_batch)

# Embed and preserve alignment: texts / metas / embeddings only grow together,
# and only for batches that were embedded successfully.
embeddings = []
texts = []
metas = []
failed_batches = []

for i, batch in enumerate(tqdm(batches, desc="Embedding chunks")):
    batch_texts = [text for text, _ in batch]
    batch_metas = [meta for _, meta in batch]

    try:
        batch_embeds = embedder.embed_documents(batch_texts)
    except Exception as e:
        # Keep going so the remaining batches are still embedded, but remember
        # the failure: it must not pass unnoticed.
        print(f"Batch {i} failed: {e}")
        failed_batches.append(i)
        continue

    embeddings.extend(batch_embeds)
    texts.extend(batch_texts)
    metas.extend(batch_metas)

# Later cells rebuild texts/metas from chunked_docs and pair them with the saved
# embeddings by position, so a skipped batch would silently shift every pairing
# after it. Fail loudly instead; the partial results stay in `embeddings`,
# `texts` and `metas` for inspection.
if failed_batches:
    raise RuntimeError(
        f"{len(failed_batches)} embedding batch(es) failed (indices {failed_batches}); "
        f"got {len(embeddings)} embeddings for {len(chunked_docs)} chunks."
    )

Embedding chunks: 100%|██████████| 16/16 [03:44<00:00, 14.05s/it]


In [10]:
from pathlib import Path
OUT_DIR = Path("outputs") / today_str  # or your actual folder

In [15]:
import numpy as np

# Store the collected embeddings as a float32 matrix under OUT_DIR.
embeddings_np = np.array(embeddings, dtype="float32")
embeddings_path = OUT_DIR / "embeddings.npy"
np.save(embeddings_path, embeddings_np)
print(f"Saved embeddings to {embeddings_path}")

Saved embeddings to outputs/2025-08-01/embeddings.npy


In [4]:
from pathlib import Path
from datetime import datetime
import numpy as np

# Today's date (YYYY-MM-DD) names the run folder the embeddings are read from.
today_str = datetime.now().date().isoformat()

OUT_DIR = Path("outputs") / today_str
embeddings_file = OUT_DIR / "embeddings.npy"
embeddings_np = np.load(embeddings_file)
print(f"✅ Loaded embeddings with shape: {embeddings_np.shape}")

✅ Loaded embeddings with shape: (12059, 1536)


In [7]:
import numpy as np
import faiss

# Convert to float32 and normalize for cosine similarity.
# faiss.normalize_L2 works in place and expects a float32, C-contiguous matrix;
# np.ascontiguousarray returns the input unchanged when it already qualifies,
# so the usual case costs no copy.
embeddings_np = np.ascontiguousarray(embeddings_np, dtype="float32")
faiss.normalize_L2(embeddings_np)

# Create cosine similarity index (inner product over unit-length vectors)
index = faiss.IndexFlatIP(embeddings_np.shape[1])
index.add(embeddings_np)
faiss.write_index(index, str(OUT_DIR / "index.faiss"))

# Optional sanity check
print(f"Index dimension: {index.d}, total vectors: {index.ntotal}")

Index dimension: 1536, total vectors: 12059


In [11]:
import pandas as pd

# Rebuild metas and texts from chunked_docs.
# Each metadata dict is copied before "text" is added, so the metadata of the
# Document objects in chunked_docs is left untouched and the cell gives the same
# result when re-run.
texts = [doc.page_content for doc in chunked_docs]
metas = [
    {**doc.metadata, "text": doc.page_content}  # "text" is optional, but useful for inspection
    for doc in chunked_docs
]

df_meta = pd.DataFrame(metas)
df_meta.to_parquet(f"{OUT_DIR}/chunk_meta.parquet", index=False)

print(f"Metadata saved to: {OUT_DIR}/chunk_meta.parquet")

Metadata saved to: outputs/2025-08-01/chunk_meta.parquet


In [21]:
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Reload embeddings from file
import numpy as np
embeddings_np = np.load("outputs/2025-08-01/embeddings.npy")

# zip() stops at the shorter input without complaint, which would pair texts
# with the wrong vectors or drop some of them; check the lengths up front.
if not (len(texts) == len(metas) == embeddings_np.shape[0]):
    raise ValueError(
        f"Length mismatch: {len(texts)} texts, {len(metas)} metadata records, "
        f"{embeddings_np.shape[0]} embeddings."
    )

# Create text+vector tuples
text_embedding_pairs = list(zip(texts, embeddings_np.tolist()))

# Create Document objects
chunked_docs = [Document(page_content=text, metadata=meta) for text, meta in zip(texts, metas)]

# Build LangChain FAISS index
faiss_index = FAISS.from_embeddings(
    text_embeddings=text_embedding_pairs,
    embedding=embedder,
    metadatas=metas
)

# Save for load_local()
# NOTE(review): save_local presumably writes "index.faiss" into this folder, i.e.
# the same file name the earlier faiss.write_index cell uses. If so, the cosine
# index written there is replaced by this one. Confirm and, if needed, give one
# of the two a different name together with its loader.
faiss_index.save_local("outputs/2025-08-01")
print("✅ FAISS index saved with .faiss and .pkl for LangChain")

✅ FAISS index saved with .faiss and .pkl for LangChain


# Evaluation

In [12]:
import faiss
import numpy as np
import pandas as pd
from retrievers.vectorrag.embedder import init_embedder

embedder = init_embedder()

# Load index
index = faiss.read_index("outputs/2025-08-01/index.faiss")

# Load metadata
df_meta = pd.read_parquet("outputs/2025-08-01/chunk_meta.parquet")

# Embed your query
query = "How did foreign currency fluctuations affect ADI’s financials in 2009?"
query_embedding = embedder.embed_query(query)
query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)
# The stored vectors were L2-normalized before being added to the inner-product
# index, so the query gets the same treatment for the score to be a cosine.
faiss.normalize_L2(query_embedding)

# Search top 5
D, I = index.search(query_embedding, k=5)

# Show results
for idx, score in zip(I[0], D[0]):
    # FAISS fills missing neighbours with id -1; iloc[-1] would silently show
    # the last row instead, so skip those slots.
    if idx < 0:
        continue
    print(f"Score: {score:.4f}")
    print(df_meta.iloc[idx]["text"][:700])
    print("-" * 80)

Score: 0.8712
[TITLE: ConvFinQA, TICKER: ADI, YEAR: 2010, PAGE: 50]
the following table illustrates the effect that a 10% ( 10 % ) unfavorable or favorable movement in foreign currency exchange rates , relative to the u.s . dollar , would have on the fair value of our forward exchange contracts as of october 30 , 2010 and october 31 , 2009: .
--------------------------------------------------------------------------------
Score: 0.8549
[TITLE: FinQA, TICKER: ADI, YEAR: 2009, PAGE: 49]
interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) . if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million . foreign currency exposure as more fully described in note 2i . in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s . dollar-based exposures by entering into forward foreign currency e

In [32]:
import asyncio
from typing import List, Dict, Any, Optional

from ragas import EvaluationDataset, evaluate
import os
from dotenv import load_dotenv

async def evaluate_ragas_dataset(
    dataset: List[Dict[str, Any]],
    metrics_list: Optional[List[str]] = None,
    llm_model: str = "gpt-4o-2024-11-20",
    llm_type: str = "openai",  # 'openai' or 'vllm'
    vllm_base_url: str = "http://localhost:8000/v1",
    run_config: Optional[Any] = None,
):
    """
    Evaluate a dataset using RAGAS with the new API.

    Args:
        dataset: List of dicts with keys: user_input, retrieved_contexts, response, reference
        metrics_list: List of metric names to compute (see available_metrics below). If None, all are used.
            Names that are not in available_metrics are skipped with a warning.
        llm_model: Model name (e.g., 'gpt-4o', 'gpt-3.5-turbo', 'Fin-R1')
        llm_type: 'openai' (default) or 'vllm'
        vllm_base_url: Base URL for vllm server (if using vllm)
        run_config: Optional ragas run configuration forwarded to evaluate(), e.g. to
            raise the timeout or limit concurrency when jobs time out or hit rate
            limits. When None, evaluate() is called exactly as before.
    Returns:
        Whatever ragas' evaluate() returns for the selected metrics.
    Raises:
        ValueError: If no valid metric is selected, OPENAI_API_KEY is missing
            (openai mode), or llm_type is unknown.
    """
    # Metric mapping
    from ragas.metrics import (
        LLMContextPrecisionWithReference,
        NonLLMContextPrecisionWithReference,
        LLMContextRecall,
        NonLLMContextRecall,
        ContextEntityRecall,
        Faithfulness,
        AnswerAccuracy,
        StringPresence,
    )
    available_metrics = {
        "context_precision_llm": LLMContextPrecisionWithReference,
        "context_precision_nonllm": NonLLMContextPrecisionWithReference,
        "context_recall_llm": LLMContextRecall,
        "context_recall_nonllm": NonLLMContextRecall,
        "context_entity_recall": ContextEntityRecall,
        "faithfulness": Faithfulness,
        "answer_accuracy": AnswerAccuracy,
        "string_presence": StringPresence,
    }
    # If no metrics_list, use all
    if metrics_list is None:
        metrics_list = list(available_metrics.keys())

    # A misspelled metric used to vanish without a trace; still skip it (so
    # existing calls keep working) but tell the caller.
    unknown_names = [name for name in metrics_list if name not in available_metrics]
    if unknown_names:
        import warnings
        warnings.warn(
            f"Ignoring unknown metric names: {unknown_names}. "
            f"Available: {sorted(available_metrics)}"
        )

    # Instantiate metrics
    metrics = [available_metrics[name]() for name in metrics_list if name in available_metrics]
    if not metrics:
        raise ValueError("No valid metrics selected.")

    load_dotenv()
    if llm_type == "openai":
        from langchain_openai import ChatOpenAI
        from ragas.llms import LangchainLLMWrapper
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if not OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in .env file.")
        llm = ChatOpenAI(model=llm_model, api_key=OPENAI_API_KEY)
    elif llm_type == "vllm":
        from langchain_community.llms import VLLMOpenAI
        from ragas.llms import LangchainLLMWrapper
        llm = VLLMOpenAI(model=llm_model, base_url=vllm_base_url)
    else:
        raise ValueError(f"Unknown llm_type: {llm_type}")
    # Both backends are wrapped the same way for ragas.
    evaluator_llm = LangchainLLMWrapper(llm)

    evaluation_dataset = EvaluationDataset.from_list(dataset)
    evaluate_kwargs = {
        "dataset": evaluation_dataset,
        "metrics": metrics,
        "llm": evaluator_llm,
    }
    # Only pass run_config when the caller supplied one, so the default call is unchanged.
    if run_config is not None:
        evaluate_kwargs["run_config"] = run_config
    return evaluate(**evaluate_kwargs)


In [36]:

generator = ChatGPTGenerator()

# Load gold test data
with open("../data/data_processed/Train_Val_Test/gold_test_data_updated.json") as f:
    gold_data = json.load(f)

# Point OUT_DIR at the saved run *before* anything is read from it, so the
# metadata and the FAISS index always come from the same folder.
OUT_DIR = Path("outputs/2025-08-01")

df_meta = pd.read_parquet(OUT_DIR / "chunk_meta.parquet")
all_docs = df_meta.to_dict(orient="records")  # becomes list of dicts with 'text' key

# Load FAISS index
# allow_dangerous_deserialization is needed to read the pickled docstore; only
# use it on index folders produced by this notebook.
faiss_index = FAISS.load_local(
    folder_path=OUT_DIR,
    embeddings=embedder,
    allow_dangerous_deserialization=True
)

# Built once and reused for every question instead of once per sample.
# NOTE(review): assumes CrossEncoderReranker keeps no per-query state — confirm.
reranker = CrossEncoderReranker()

# Separate datasets
finqa_eval_dataset = []
finder_eval_dataset = []

finqa_count = 0
finder_count = 0

for sample in gold_data:
    if finqa_count >= 10 and finder_count >= 10:
        break

    question = sample["question"]
    reference = sample["answer"]
    gold_context = sample.get("gold_context", {})

    # Determine reference contexts
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    # Samples with gold context go to the FinQA subset, the rest to FinDER.
    # Decide this first: when the target subset is already full the sample
    # would be discarded anyway, so skip the embedding, reranking and
    # generation calls for it.
    has_gold_context = bool(reference_contexts)
    if has_gold_context and finqa_count >= 10:
        continue
    if not has_gold_context and finder_count >= 10:
        continue

    # Embed and search
    query_embedding = np.array(embedder.embed_query(question), dtype="float32").reshape(1, -1)
    D, I = faiss_index.index.search(query_embedding, 10)
    # FAISS marks missing neighbours with -1, which would otherwise pick the last record.
    candidate_texts = [all_docs[i] for i in I[0] if i >= 0 and "text" in all_docs[i]]

    # Rerank
    reranked_docs = reranker.rerank(question, candidate_texts, top_k=5)
    retrieved_contexts = [doc["text"] for doc in reranked_docs]

    # Generate
    response = generator.generate(question, retrieved_contexts)
    print("Question:", question)
    print("Generated Answer:", response)
    print("-" * 80)

    record = {
        "user_input": question,
        "retrieved_contexts": retrieved_contexts,
        "response": response,
        "reference": reference,
    }

    # Append based on presence of gold context
    if has_gold_context:
        record["reference_contexts"] = reference_contexts
        finqa_eval_dataset.append(record)
        finqa_count += 1
    else:
        finder_eval_dataset.append(record)
        finder_count += 1

# Evaluate
import asyncio

print("\n🔍 Evaluating FinQA subset...")
finqa_results = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
print(finqa_results)

print("\n🔍 Evaluating FinDER subset...")
finder_results = asyncio.run(evaluate_ragas_dataset(
    finder_eval_dataset,
    metrics_list=["faithfulness", "answer_accuracy"]
))
print(finder_results)

Question: GM operating margin 2023 vs 2022, GM.
Generated Answer: 2023: 5.4 % vs 2022: 6.6 %
--------------------------------------------------------------------------------
Question: In the financial filing of Citigroup, what percentage of incremental risk-weighted assets are student loans at january 1 , 2010?
Generated Answer: 24%
--------------------------------------------------------------------------------
Question: what is the growth rate in net revenue in 2003 for entergy corporation?
Generated Answer: I don’t know
--------------------------------------------------------------------------------
Question: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.
Generated Answer: NGC’s ongoing spending on cyber defenses, industry‐wide information‐sharing memberships, and third-party maturity assessments demonstrate proactive risk management, which reassures investors, helps preserve firm value, and supports the company’s operational and financi

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Exception raised in Job[0]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[34]: TimeoutError()
Exception raised in Job[40]: TimeoutError()
Exception raised in Job[42]: TimeoutError()
Exception raised in Job[44]: TimeoutError()
Exception raised in Job[45]: TimeoutError()
Exception raised in Job[52]: TimeoutError()


{'llm_context_precision_with_reference': 0.5500, 'non_llm_context_precision_with_reference': 0.0333, 'context_recall': 0.1250, 'non_llm_context_recall': 0.0500, 'context_entity_recall': 0.0000, 'faithfulness': 0.3056, 'nv_accuracy': 0.3000, 'string_present': 0.3000}

🔍 Evaluating FinDER subset...


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

An error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-b9EC73CxcpNUA7w9Zh1KhBEH on tokens per min (TPM): Limit 30000, Used 30000, Requested 571. Please try again in 1.142s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}. Skipping a sample by assigning it nan score.
An error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-b9EC73CxcpNUA7w9Zh1KhBEH on tokens per min (TPM): Limit 30000, Used 30000, Requested 771. Please try again in 1.542s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}. Skipping a sample by assigning it nan score.


{'faithfulness': 0.5358, 'nv_accuracy': 0.4062}


In [35]:
# Re-display the FinDER evaluation scores held in `finder_results`.
# The output recorded under this cell belongs to execution In [35], so it need
# not match the scores printed by the In [36] evaluation cell.
print(finder_results)

{'faithfulness': 0.7500, 'nv_accuracy': 0.7500}


In [None]:
async def test_ceiling_performance(
    num_samples=10,
    # NOTE(review): the original signature had its path parameter commented out,
    # leaving `train_path` undefined in the body. This default is the file named
    # in that commented-out hint, written relative to the notebook like the other
    # data paths here — confirm it is the intended dataset.
    train_path="../data/data_processed/Train_Val_Test/gold_test_data.json",
):
    """
    Test ceiling performance using gold context, limited to FinQA and ConvFinQA samples.

    Each question is answered with its gold context as the only retrieved
    document; the answers are then scored with RAGAS (answer_accuracy,
    string_presence) and an exact-match rate is printed.

    Args:
        num_samples: Maximum number of (filtered) samples to run. A falsy value
            (None / 0) disables the limit.
        train_path: Location of the dataset handed to load_train_data.

    Returns:
        None. Results are printed.

    Note:
        load_train_data and get_gold_context are not defined in the visible
        notebook cells; they must be in scope before this coroutine runs.
    """
    # Load and filter dataset
    train_data = load_train_data(train_path)
    train_data = [s for s in train_data if s.get("source") in ("FinQA", "ConvFinQA")]

    # Apply sample limit *after* filtering: keep the first num_samples items.
    # (The earlier slice [num_samples:num_samples+10] skipped that many samples
    # and always kept ten, so the argument never limited anything.)
    if num_samples:
        train_data = train_data[:num_samples]

    # Instantiate generator (prompts now handled internally)
    generator = ChatGPTGenerator()

    dataset = []
    print("\nProcessing samples...")

    for i, sample in enumerate(train_data, 1):
        question = sample["question"]
        true_answer = sample["answer"]
        source = sample.get("source", "Unknown")

        relevant_context = get_gold_context(sample)

        generated_answer = generator.generate(
            question=question,
            retrieved_docs=[relevant_context],
        )

        dataset.append({
            "user_input": question,
            "retrieved_contexts": [relevant_context],
            "response": generated_answer,
            "reference": true_answer
        })

        # Print progress
        print(f"\nSample {i}:")
        print(f"Source: {source}")
        print(f"Question: {question}")
        print(f"Gold Context: {relevant_context}")
        print(f"Generated Answer: {generated_answer}")
        print(f"True Answer: {true_answer}")
        print("-" * 80)

    # Nothing to score: stop before calling the evaluator or dividing by zero.
    if not dataset:
        print("\nNo samples to evaluate.")
        return

    # Evaluate with RAGAS
    metrics_list = ["answer_accuracy", "string_presence"]
    print("\nEvaluating ceiling performance...")
    results = await evaluate_ragas_dataset(dataset, metrics_list=metrics_list)

    print("\nCeiling Performance Results:")
    print(results)

    # Additional stats
    total_samples = len(dataset)
    # str() guards against answers stored as numbers rather than strings.
    exact_matches = sum(
        1 for sample in dataset
        if str(sample['response']).strip() == str(sample['reference']).strip()
    )

    print("\nAdditional Statistics:")
    print(f"Total Samples: {total_samples}")
    print(f"Exact Matches: {exact_matches}")
    print(f"Exact Match Rate: {exact_matches / total_samples:.2%}")

# Start the ceiling-performance run when this code executes as the main module.
if __name__ == "__main__":
    ceiling_run = test_ceiling_performance()
    asyncio.run(ceiling_run)

In [None]:
# TODO: latency logging still needs to be added to the retriever.