In [33]:
import sys
from pathlib import Path
import json
import pandas as pd
import torch
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

# Append retrievers to the Python path (not just src)
sys.path.append(str(Path().resolve().parent / "src"))

# Import pipeline components
from retrievers.vectorrag.index_faiss import build_faiss_index_from_json
from retrievers.vectorrag.document_loader import load_json_documents
from retrievers.vectorrag.chunker import chunk_documents
from retrievers.vectorrag.embedder import init_embedder
from retrievers.vectorrag.index_faiss import build_faiss_index, load_faiss_index
from retrievers.vectorrag.retriever import rerank_search

from generator.generator import ChatGPTGenerator 
import numpy as np
import json
from retrievers.vectorrag.reranker import CrossEncoderReranker
import numpy as np


In [None]:
import json

# Load enriched corpus
path = "/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/existing_embeddings_with_meta_data.jsonl"
docs = []

with open(path, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            docs.append(json.loads(line))

# Quick check
print(f"Loaded {len(docs)} documents")
print(json.dumps(docs[0], indent=2))

Loaded 10130 documents
{
  "chunk_id": "row_0_chunk_0",
  "text": "Cboe Global Markets, Inc. and Subsidiaries\n\nConsolidated Statements of Income\n\nYears ended December 31, 2023, 2022, and 2021\n\n(In millions, except per share data)\n\n\n    \n\n2023\n\n    \n\n2022\n\n    \n\n2021\n\n \n\nRevenues:\n\nCash and spot markets\n\n\n$\n\n1,445.1\n\n\n$\n\n1,777.6\n\n\n$\n\n1,660.5\n\n\nData and access solutions\n\n539.2\n\n497.0\n\n427.7\n\n\nDerivatives markets\n\n\n \n\n1,789.2\n\n\n \n\n1,683.9\n\n\n \n\n1,406.6\n\n\nTotal revenues\n\n\n \n\n3,773.5\n\n\n \n\n3,958.5\n\n\n \n\n3,494.8\n\n\nCost of revenues:\n\n  Liquidity payments\n\n\n \n\n1,385.8\n\n\n \n\n1,670.2\n\n\n \n\n1,650.7\n\n\n  Routing and clearing\n\n79.1\n\n83.2\n\n87.8\n\n\n  Section 31 fees\n\n185.7\n\n329.8\n\n179.6\n\n\n  Royalty fees and other cost of revenues\n\n\n \n\n204.9\n\n\n \n\n133.6\n\n\n \n\n100.6\n\n\nTotal cost of revenues\n\n\n \n\n1,855.5\n\n\n \n\n2,216.8\n\n\n \n\n2,018.7\n\n\nRevenues less cost of

In [None]:
import json

# Paths to your files
file1 = '/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/embedded_chunks.json'
file2 = '/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/existing_embeddings_with_meta_data.jsonl'

# Inspect first few entries from embedded_chunks.json
print("📁 embedded_chunks.json")
with open(file1, 'r') as f:
    data_json = json.load(f)

print(f"Type: {type(data_json)}")
if isinstance(data_json, list):
    print(f"Loaded {len(data_json)} entries")
    print("Sample entry:")
    print(json.dumps(data_json[0], indent=2))
else:
    print("Top-level keys:", list(data_json.keys()))
    first_key = next(iter(data_json))
    print("Sample entry:")
    print(json.dumps(data_json[first_key], indent=2))

print("\n" + "-"*60 + "\n")

# Inspect first few entries from existing_embeddings_with_meta_data.jsonl
print("📁 existing_embeddings_with_meta_data.jsonl")
with open(file2, 'r') as f:
    for i, line in enumerate(f):
        entry = json.loads(line)
        print(f"Entry {i+1}:")
        print(json.dumps(entry, indent=2))
        if i >= 2:
            break  # just show 3 entries max

📁 embedded_chunks.json
Type: <class 'list'>
Loaded 10130 entries
Sample entry:
{
  "chunk_id": "row_0_chunk_0",
  "row_index": 0,
  "text": "Cboe Global Markets, Inc. and Subsidiaries\n\nConsolidated Statements of Income\n\nYears ended December 31, 2023, 2022, and 2021\n\n(In millions, except per share data)\n\n\n    \n\n2023\n\n    \n\n2022\n\n    \n\n2021\n\n \n\nRevenues:\n\nCash and spot markets\n\n\n$\n\n1,445.1\n\n\n$\n\n1,777.6\n\n\n$\n\n1,660.5\n\n\nData and access solutions\n\n539.2\n\n497.0\n\n427.7\n\n\nDerivatives markets\n\n\n \n\n1,789.2\n\n\n \n\n1,683.9\n\n\n \n\n1,406.6\n\n\nTotal revenues\n\n\n \n\n3,773.5\n\n\n \n\n3,958.5\n\n\n \n\n3,494.8\n\n\nCost of revenues:\n\n  Liquidity payments\n\n\n \n\n1,385.8\n\n\n \n\n1,670.2\n\n\n \n\n1,650.7\n\n\n  Routing and clearing\n\n79.1\n\n83.2\n\n87.8\n\n\n  Section 31 fees\n\n185.7\n\n329.8\n\n179.6\n\n\n  Royalty fees and other cost of revenues\n\n\n \n\n204.9\n\n\n \n\n133.6\n\n\n \n\n100.6\n\n\nTotal cost of revenues\n\n\n 

In [None]:
import json
import faiss
import numpy as np
import pickle

# Load embeddings from embedded_chunks.json
with open('/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/embedded_chunks.json', 'r') as f:
    embedded_data = json.load(f)

# Create lists for vectors and IDs
vectors = []
ids = []
texts = []

for item in embedded_data:
    vectors.append(item['embedding'])
    ids.append(item['chunk_id'])
    texts.append(item['text'])

# Convert to float32 numpy array
embedding_matrix = np.array(vectors).astype('float32')

# 🔄 Normalize vectors to unit length (for cosine similarity)
faiss.normalize_L2(embedding_matrix)

# Use Inner Product index (cosine similarity after normalization)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embedding_matrix)

# Save FAISS index
faiss.write_index(index, 'faiss_index.idx')

# Load metadata from JSONL
metadata_dict = {}
with open('/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/existing_embeddings_with_meta_data.jsonl', 'r') as f:
    for line in f:
        meta = json.loads(line)
        chunk_id = meta['chunk_id']
        metadata_dict[chunk_id] = meta

# Save chunk_id order and metadata
with open('retriever_metadata.pkl', 'wb') as f:
    pickle.dump({'chunk_ids': ids, 'metadata': metadata_dict}, f)

print("✅ FAISS cosine index and metadata saved successfully.")

✅ FAISS cosine index and metadata saved successfully.


In [None]:
import faiss
import numpy as np
import pandas as pd

# Initialize your embedder
embedder = init_embedder()

# Load FAISS index
index = faiss.read_index("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/faiss_index.idx")  # or your full path

# Load metadata
with open("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/retriever_metadata.pkl", "rb") as f:
    import pickle
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

# Embed your query
query = "What was the operating income for Cboe Global Markets in 2023?"
query_embedding = embedder.embed_query(query)
query_embedding = np.array(query_embedding, dtype="float32").reshape(1, -1)
faiss.normalize_L2(query_embedding)  # Ensure cosine norm

# Search top-k
k = 5
D, I = index.search(query_embedding, k)

# Display results
print(f"\nTop {k} results for query: \"{query}\"\n")
for idx, score in zip(I[0], D[0]):
    chunk_id = chunk_ids[idx]
    metadata = metadata_dict[chunk_id]
    
    print(f"--- Rank {idx+1} ---")
    print(f"Score (cosine sim): {score:.4f}")
    print(f"Source: {metadata.get('source', 'N/A')}")
    print(f"Text:\n{metadata.get('text', '')[:700]}")
    print("-" * 80)


Top 5 results for query: "What was the operating income for Cboe Global Markets in 2023?"

--- Rank 1 ---
Score (cosine sim): 0.8828
Source: Finder
Text:
Cboe Global Markets, Inc. and Subsidiaries

Consolidated Statements of Income

Years ended December 31, 2023, 2022, and 2021

(In millions, except per share data)


    

2023

    

2022

    

2021

 

Revenues:

Cash and spot markets


$

1,445.1


$

1,777.6


$

1,660.5


Data and access solutions

539.2

497.0

427.7


Derivatives markets


 

1,789.2


 

1,683.9


 

1,406.6


Total revenues


 

3,773.5


 

3,958.5


 

3,494.8


Cost of revenues:

  Liquidity payments


 

1,385.8


 

1,670.2


 

1,650.7


  Routing and clearing

79.1

83.2

87.8


  Section 31 fees

185.7

329.8

179.6


  Royalty fees and other cost of revenues


 

204.9


 

133.6


 

100.6


Total cost of 
--------------------------------------------------------------------------------
--- Rank 881 ---
Score (cosine sim): 0.8424
Source: None
Text:


# Evaluation

In [7]:
import json
import faiss
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
import asyncio


# === Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index.idx"
META_PATH = BASE_DIR / "retriever_metadata.pkl"
GOLD_PATH = Path("../data/data_processed/Train_Val_Test/gold_test_data_updated.json")

# === Load Metadata and FAISS Index ===
with open(META_PATH, "rb") as f:
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

index = faiss.read_index(str(FAISS_PATH))

# === Load Embedder and Generator ===
embedder = init_embedder()
generator = ChatGPTGenerator()
reranker = CrossEncoderReranker()

# === Load Gold Test Data ===
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Generate Evaluation Dataset ===
finqa_eval_dataset = []
finder_eval_dataset = []
finqa_count = 0
finder_count = 0

for sample in gold_data:
    if finqa_count >= 2 and finder_count >= 2:
        break

    question = sample["question"]
    reference = sample["answer"]
    gold_context = sample.get("gold_context", {})

    # Embed and retrieve
    query_embedding = np.array(embedder.embed_query(question), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(query_embedding)
    D, I = index.search(query_embedding, 50)

    candidate_docs = [
        metadata_dict[chunk_ids[i]]
        for i in I[0]
        if chunk_ids[i] in metadata_dict and "text" in metadata_dict[chunk_ids[i]]
    ]
    reranked_docs = reranker.rerank(question, candidate_docs, top_k=10)

    # Generate answer
    response = generator.generate(question, [doc["text"] for doc in reranked_docs])

    # Prepare record for RAGAS
    record = {
        "user_input": question,
        "retrieved_contexts": [doc["text"] for doc in reranked_docs],
        "response": response,
        "reference": reference,
    }

    # === Check if reference context exists in indexed corpus ===
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    if reference_contexts:
        record["reference_contexts"] = reference_contexts
        found = False
        for ref in reference_contexts:
            ref_clean = ref.lower().strip()
            for chunk_data in metadata_dict.values():
                if ref_clean in chunk_data.get("text", "").lower():
                    found = True
                    break
            if found:
                break
        if found:
            print("✅ Reference context FOUND in indexed chunks.")
        else:
            print("❌ Reference context NOT found in indexed chunks.")

    # === Append to dataset ===
    if reference_contexts and finqa_count < 2:
        finqa_eval_dataset.append(record)
        finqa_count += 1
    elif not reference_contexts and finder_count < 2:
        finder_eval_dataset.append(record)
        finder_count += 1

# === Evaluate FinQA Subset ===
print("\n🔍 Evaluating FinQA subset...")
finqa_result = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
finqa_df = finqa_result.to_pandas()

print(f"\n🎯 FinQA Metrics per Sample:")
for i, row in finqa_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finqa_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Evaluate FinDER Subset ===
print("\n🔍 Evaluating FinDER subset...")
finder_result = asyncio.run(evaluate_ragas_dataset(
    finder_eval_dataset,
    metrics_list=["faithfulness", "answer_accuracy"]
))
finder_df = finder_result.to_pandas()

print(f"\n🎯 FinDER Metrics per Sample:")
for i, row in finder_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finder_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Save to CSV ===
finqa_df.to_csv("finqa_eval_detailed.csv", index=False)
finder_df.to_csv("finder_eval_detailed.csv", index=False)
print("\n✅ Evaluation complete. Results saved to CSV.")

❌ Reference context NOT found in indexed chunks.
❌ Reference context NOT found in indexed chunks.

🔍 Evaluating FinQA subset...


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]


🎯 FinQA Metrics per Sample:

--- Sample 1 ---
Question: In the financial filing of Citigroup, what percentage of incremental risk-weighted assets are student loans at january 1 , 2010?
Response: 3.54%
Reference Answer: 4%
Context Sample:
['commitments .', 'for a further description of the loan loss reserve and related accounts , see 201cmanaging global risk 201d and notes 1 and 18 to the consolidated financial statements on pages 51 , 122 and 165 , respectively .', 'securitizations the company securitizes a number of different asset classes as a means of strengthening its balance sheet and accessing competitive financing rates in the market .', 'under these securitization programs , assets are sold into a trust and used as colla...

reference_contexts: ['in billions of dollars the student loans of incremental gaap assets is 14.4 ; the student loans of incremental risk- weighted assets is 3.5 ;', 'in billions of dollars the total of incremental gaap assets is $ 179.0 ; the total of inc

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]


🎯 FinDER Metrics per Sample:

--- Sample 1 ---
Question: GM operating margin 2023 vs 2022, GM.
Response: 2023 = 5.4 % vs. 2022 = 6.6 %
Reference Answer: To calculate the operating profit margin, we divide Operating Income by Total Net Sales and Revenue. For 2023, the calculation is as follows:

• 2023 Operating Profit Margin = 9,298 / 171,842 ≈ 0.0541, or about 5.41%.

For 2022, using the same method:

• 2022 Operating Profit Margin = 10,315 / 156,735 ≈ 0.0658, or about 6.58%.

This comparison shows that the operating margin declined from approximately 6.58% in 2022 to about 5.41% in 2023.
Context Sample:
Engineering
(Dollar amounts in millions)		 		Variance
Year Ended December 31,		2023		2022		2023 vs. 2022
Sales		$	2,160 			$	2,762 			(22)	%
Operating profit		$	491 			$	555 			(12)	%
As a percent of sales		22.7 	%		20.1 	%		
 
2023 vs. 2022
 		% Change
Factors Contributing to Changes - Sales		
Currency		1 	%
Other		(23)	%
(22)	%
 
Sales
Engineering segment sales decreased $602 milli

In [8]:
from difflib import SequenceMatcher

def fuzzy_match(a, b, threshold=0.85):
    return SequenceMatcher(None, a, b).ratio() >= threshold

def check_gold_context_coverage(reference_contexts, metadata_dict):
    for ref in reference_contexts:
        ref = ref.lower().strip()
        for chunk in metadata_dict.values():
            text = chunk.get("text", "").lower()
            if fuzzy_match(ref, text):
                return True
    return False

# Count how many reference contexts are covered
found_count = 0
total = 0

for sample in gold_data:
    gold_context = sample.get("gold_context", {})
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]
    
    if reference_contexts:
        total += 1
        if check_gold_context_coverage(reference_contexts, metadata_dict):
            found_count += 1

print(f"\n📊 Gold context coverage in index: {found_count}/{total} ({found_count / total:.2%})")


📊 Gold context coverage in index: 0/133 (0.00%)


In [9]:
# Flatten all chunk texts for brute-force search
all_texts = [chunk["text"].lower() for chunk in metadata_dict.values()]

# Check if each gold context is substring of any chunk
def is_in_raw_chunks(context: str) -> bool:
    return any(context.lower() in text for text in all_texts)

count_found = 0
for sample in gold_data:
    gold_context = sample.get("gold_context", {})
    references = list(gold_context.values()) if isinstance(gold_context, dict) else [gold_context]
    for ref in references:
        if is_in_raw_chunks(ref):
            count_found += 1
            break  # only need one match to count the sample

print(f"✅ Raw chunk match found for {count_found}/{len(gold_data)} samples")

✅ Raw chunk match found for 118/201 samples


Problem: Gold context not covered in FAISS. 

In [10]:
from tqdm import tqdm

# Extract and embed all texts
all_texts = [metadata_dict[cid]["text"] for cid in chunk_ids if "text" in metadata_dict[cid]]
all_embeddings = [embedder.embed_query(text) for text in tqdm(all_texts)]

# Convert to float32 array
embedding_matrix = np.array(all_embeddings, dtype="float32")
faiss.normalize_L2(embedding_matrix)

# Create FAISS index
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embedding_matrix)

# Save new index (overwrite or rename)
faiss.write_index(index, str(BASE_DIR / "faiss_index_complete.idx"))

# Save updated mapping order
with open(BASE_DIR / "retriever_metadata_complete.pkl", "wb") as f:
    pickle.dump({'chunk_ids': chunk_ids, 'metadata': metadata_dict}, f)

print("✅ Re-indexing complete with full corpus.")

100%|██████████| 10130/10130 [51:49<00:00,  3.26it/s]  


: 

In [1]:
import json
from difflib import SequenceMatcher

# Load gold contexts
with open("../data/data_processed/Train_Val_Test/gold_test_data_updated.json") as f:
    gold_data = json.load(f)

# Load existing embedded text chunks
with open("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/embedded_chunks.json") as f:
    embedded_chunks = json.load(f)

embedded_texts = [chunk["text"].lower().strip() for chunk in embedded_chunks]

# Fuzzy match gold context against embedded texts
def fuzzy_match(a, b, threshold=0.85):
    return SequenceMatcher(None, a.strip().lower(), b.strip().lower()).ratio() >= threshold

def context_is_embedded(reference_contexts):
    for ref in reference_contexts:
        for text in embedded_texts:
            if fuzzy_match(ref, text):
                return True
    return False

found = 0
total = 0

for sample in gold_data:
    gold_context = sample.get("gold_context", {})
    reference_contexts = []

    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    if reference_contexts:
        total += 1
        if context_is_embedded(reference_contexts):
            found += 1

print(f"\n📊 Gold context found in embedded chunks: {found}/{total} ({found/total:.2%})")


📊 Gold context found in embedded chunks: 0/133 (0.00%)


In [2]:
import json
from pathlib import Path
from difflib import SequenceMatcher

# === Load existing embeddings file ===
jsonl_path = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/existing_embeddings_with_meta_data.jsonl")
with open(jsonl_path) as f:
    existing_chunks = [json.loads(line) for line in f]

# Build fast search index
all_texts = [chunk["text"].lower() for chunk in existing_chunks]

def fuzzy_match(a, b, threshold=0.85):
    return SequenceMatcher(None, a, b).ratio() >= threshold

# === Load gold data ===
with open("../data/data_processed/Train_Val_Test/gold_test_data_updated.json") as f:
    gold_data = json.load(f)

# === Match gold context against existing chunks ===
found = 0
total = 0

for sample in gold_data:
    gold_context = sample.get("gold_context", {})
    reference_contexts = []

    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    if reference_contexts:
        total += 1
        for ref in reference_contexts:
            ref = ref.lower().strip()
            if any(fuzzy_match(ref, chunk) for chunk in all_texts):
                found += 1
                break

print(f"\n📦 Gold contexts in JSONL: {found}/{total} ({found / total:.2%})")


📦 Gold contexts in JSONL: 0/133 (0.00%)


In [5]:
from tqdm import tqdm
import numpy as np
import faiss
import pickle
import json
import sys
from pathlib import Path
import time
sys.path.append(str(Path().resolve().parent / "src"))
from retrievers.vectorrag.embedder import init_embedder

embedder = init_embedder()

# Load raw chunk metadata
with open("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/existing_embeddings_with_meta_data.jsonl") as f:
    metadata_dict = {json.loads(line)["chunk_id"]: json.loads(line) for line in f}

chunk_ids = list(metadata_dict.keys())
all_texts = [metadata_dict[cid]["text"] for cid in chunk_ids]

# === Parameters ===
BATCH_SIZE = 100
dimension = 1536  # for ada-002

# === Init empty index
index = faiss.IndexFlatIP(dimension)
chunk_id_order = []

# === Stream embedding and indexing
for i in tqdm(range(0, len(all_texts), BATCH_SIZE), desc="Embedding + Indexing"):
    batch_ids = chunk_ids[i:i+BATCH_SIZE]
    batch_texts = all_texts[i:i+BATCH_SIZE]

    try:
        embeddings = embedder.embed_documents(batch_texts)
    except Exception as e:
        print(f"❌ Batch {i}-{i+BATCH_SIZE} failed: {e}")
        time.sleep(5)
        embeddings = embedder.embed_documents(batch_texts)

    emb_np = np.array(embeddings).astype("float32")
    faiss.normalize_L2(emb_np)

    index.add(emb_np)
    chunk_id_order.extend(batch_ids)

# ✅ Save final FAISS index and metadata
faiss.write_index(index, "faiss_index_full.idx")
with open("retriever_metadata_full.pkl", "wb") as f:
    pickle.dump({"chunk_ids": chunk_id_order, "metadata": metadata_dict}, f)

print("✅ FAISS + metadata written safely with streaming.")

Embedding + Indexing: 100%|██████████| 102/102 [02:45<00:00,  1.63s/it]

✅ FAISS + metadata written safely with streaming.





In [6]:
import pickle
import faiss

# Load FAISS index
index = faiss.read_index("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/faiss_index_full.idx")

# Load metadata
with open("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)/retriever_metadata_full.pkl", "rb") as f:
    meta = pickle.load(f)

chunk_ids = meta["chunk_ids"]
metadata_dict = meta["metadata"]

# Sanity check
print(f"🔢 Index contains {index.ntotal} vectors")
print(f"🧠 Metadata contains {len(metadata_dict)} chunks")

🔢 Index contains 10130 vectors
🧠 Metadata contains 10130 chunks


In [11]:
import json

# Load gold data
with open("../data/data_processed/Train_Val_Test/gold_test_data_updated.json") as f:
    gold_data = json.load(f)

# Lowercased all chunk texts
all_texts = [chunk["text"].lower() for chunk in metadata_dict.values()]

# Match function
def is_in_indexed_chunks(context):
    return any(context.lower() in chunk_text for chunk_text in all_texts)

# Run test
count_found = 0
for sample in gold_data:
    gold_context = sample.get("gold_context", {})
    references = list(gold_context.values()) if isinstance(gold_context, dict) else [gold_context]
    for ref in references:
        if is_in_indexed_chunks(ref):
            count_found += 1
            break  # only one match needed per sample

print(f"✅ Gold context match found for {count_found}/{len(gold_data)} samples")

✅ Gold context match found for 118/201 samples


In [20]:
import asyncio
from typing import List, Dict, Any, Optional

from ragas import EvaluationDataset, evaluate
import os
from dotenv import load_dotenv

async def evaluate_ragas_dataset(
    dataset: List[Dict[str, Any]],
    metrics_list: Optional[List[str]] = None,
    llm_model: str = "gpt-4o-2024-11-20",
    llm_type: str = "openai",  # 'openai' or 'vllm'
    vllm_base_url: str = "http://localhost:8000/v1"
):
    """
    Evaluate a dataset using RAGAS with the new API.

    Args:
        dataset: List of dicts with keys: user_input, retrieved_contexts, response, reference
        metrics_list: List of metric names to compute (see available_metrics below). If None, all are used.
        llm_model: Model name (e.g., 'gpt-4o', 'gpt-3.5-turbo', 'Fin-R1')
        llm_type: 'openai' (default) or 'vllm'
        vllm_base_url: Base URL for vllm server (if using vllm)
    Returns:
        Dictionary of metric results
    """
    # Metric mapping
    from ragas.metrics import (
        LLMContextPrecisionWithReference,
        NonLLMContextPrecisionWithReference,
        LLMContextRecall,
        NonLLMContextRecall,
        ContextEntityRecall,
        Faithfulness,
        AnswerAccuracy,
        StringPresence,
    )
    available_metrics = {
        "context_precision_llm": LLMContextPrecisionWithReference,
        "context_precision_nonllm": NonLLMContextPrecisionWithReference,
        "context_recall_llm": LLMContextRecall,
        "context_recall_nonllm": NonLLMContextRecall,
        "context_entity_recall": ContextEntityRecall,
        "faithfulness": Faithfulness,
        "answer_accuracy": AnswerAccuracy,
        "string_presence": StringPresence,
    }
    # If no metrics_list, use all
    if metrics_list is None:
        metrics_list = list(available_metrics.keys())
    # Instantiate metrics
    metrics = [available_metrics[name]() for name in metrics_list if name in available_metrics]
    if not metrics:
        raise ValueError("No valid metrics selected.")

    load_dotenv()
    if llm_type == "openai":
        from langchain_openai import ChatOpenAI
        from ragas.llms import LangchainLLMWrapper
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if not OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in .env file.")
        llm = ChatOpenAI(
            model=llm_model,
            api_key=OPENAI_API_KEY,
            max_tokens=1024,         # or more, depending on context length
            timeout=60               # increase timeout to 60s
        )
        evaluator_llm = LangchainLLMWrapper(llm)
    elif llm_type == "vllm":
        from langchain_community.llms import VLLMOpenAI
        from ragas.llms import LangchainLLMWrapper
        llm = VLLMOpenAI(model=llm_model, base_url=vllm_base_url)
        evaluator_llm = LangchainLLMWrapper(llm)
    else:
        raise ValueError(f"Unknown llm_type: {llm_type}")

    evaluation_dataset = EvaluationDataset.from_list(dataset)
    return evaluate(
        dataset=evaluation_dataset,
        metrics=metrics,
        llm=evaluator_llm
    )


In [10]:
import json
import faiss
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
import asyncio


# === Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index_full.idx"
META_PATH = BASE_DIR / "retriever_metadata_full.pkl"
GOLD_PATH = Path("../data/data_processed/Train_Val_Test/gold_test_data_updated.json")

# === Load Metadata and FAISS Index ===
with open(META_PATH, "rb") as f:
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

index = faiss.read_index(str(FAISS_PATH))

# === Load Embedder and Generator ===
embedder = init_embedder()
generator = ChatGPTGenerator()
reranker = CrossEncoderReranker()

# === Load Gold Test Data ===
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Generate Evaluation Dataset ===
finqa_eval_dataset = []
finder_eval_dataset = []
finqa_count = 0
finder_count = 0

for sample in gold_data:
    if finqa_count >= 2 and finder_count >= 2:
        break

    question = sample["question"]
    reference = sample["answer"]
    gold_context = sample.get("gold_context", {})

    # Embed and retrieve
    query_embedding = np.array(embedder.embed_query(question), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(query_embedding)
    D, I = index.search(query_embedding, 50)

    candidate_docs = [
        metadata_dict[chunk_ids[i]]
        for i in I[0]
        if chunk_ids[i] in metadata_dict and "text" in metadata_dict[chunk_ids[i]]
    ]
    reranked_docs = reranker.rerank(question, candidate_docs, top_k=10)

    # Generate answer
    response = generator.generate(question, [doc["text"] for doc in reranked_docs])

    # Prepare record for RAGAS
    record = {
        "user_input": question,
        "retrieved_contexts": [doc["text"] for doc in reranked_docs],
        "response": response,
        "reference": reference,
    }

    # === Check if reference context exists in indexed corpus ===
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    if reference_contexts:
        record["reference_contexts"] = reference_contexts
        found = False
        for ref in reference_contexts:
            ref_clean = ref.lower().strip()
            for chunk_data in metadata_dict.values():
                if ref_clean in chunk_data.get("text", "").lower():
                    found = True
                    break
            if found:
                break
        if found:
            print("✅ Reference context FOUND in indexed chunks.")
        else:
            print("❌ Reference context NOT found in indexed chunks.")

    # === Append to dataset ===
    if reference_contexts and finqa_count < 2:
        finqa_eval_dataset.append(record)
        finqa_count += 1
    elif not reference_contexts and finder_count < 2:
        finder_eval_dataset.append(record)
        finder_count += 1

# === Evaluate FinQA Subset ===
print("\n🔍 Evaluating FinQA subset...")
finqa_result = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
finqa_df = finqa_result.to_pandas()

print(f"\n🎯 FinQA Metrics per Sample:")
for i, row in finqa_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finqa_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Evaluate FinDER Subset ===
print("\n🔍 Evaluating FinDER subset...")
finder_result = asyncio.run(evaluate_ragas_dataset(
    finder_eval_dataset,
    metrics_list=["faithfulness", "answer_accuracy"]
))
finder_df = finder_result.to_pandas()

print(f"\n🎯 FinDER Metrics per Sample:")
for i, row in finder_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finder_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Save to CSV ===
finqa_df.to_csv("finqa_eval_detailed.csv", index=False)
finder_df.to_csv("finder_eval_detailed.csv", index=False)
print("\n✅ Evaluation complete. Results saved to CSV.")

❌ Reference context NOT found in indexed chunks.
❌ Reference context NOT found in indexed chunks.

🔍 Evaluating FinQA subset...


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]


🎯 FinQA Metrics per Sample:

--- Sample 1 ---
Question: In the financial filing of Citigroup, what percentage of incremental risk-weighted assets are student loans at january 1 , 2010?
Response: 3.54%
Reference Answer: 4%
Context Sample:
['commitments .', 'for a further description of the loan loss reserve and related accounts , see 201cmanaging global risk 201d and notes 1 and 18 to the consolidated financial statements on pages 51 , 122 and 165 , respectively .', 'securitizations the company securitizes a number of different asset classes as a means of strengthening its balance sheet and accessing competitive financing rates in the market .', 'under these securitization programs , assets are sold into a trust and used as colla...

reference_contexts: ['in billions of dollars the student loans of incremental gaap assets is 14.4 ; the student loans of incremental risk- weighted assets is 3.5 ;', 'in billions of dollars the total of incremental gaap assets is $ 179.0 ; the total of inc

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]


🎯 FinDER Metrics per Sample:

--- Sample 1 ---
Question: GM operating margin 2023 vs 2022, GM.
Response: 2023: 5.4 % vs 2022: 6.6 %
Reference Answer: To calculate the operating profit margin, we divide Operating Income by Total Net Sales and Revenue. For 2023, the calculation is as follows:

• 2023 Operating Profit Margin = 9,298 / 171,842 ≈ 0.0541, or about 5.41%.

For 2022, using the same method:

• 2022 Operating Profit Margin = 10,315 / 156,735 ≈ 0.0658, or about 6.58%.

This comparison shows that the operating margin declined from approximately 6.58% in 2022 to about 5.41% in 2023.
Context Sample:
Engineering
(Dollar amounts in millions)		 		Variance
Year Ended December 31,		2023		2022		2023 vs. 2022
Sales		$	2,160 			$	2,762 			(22)	%
Operating profit		$	491 			$	555 			(12)	%
As a percent of sales		22.7 	%		20.1 	%		
 
2023 vs. 2022
 		% Change
Factors Contributing to Changes - Sales		
Currency		1 	%
Other		(23)	%
(22)	%
 
Sales
Engineering segment sales decreased $602 million,

In [None]:
finqa_eval_detailed

In [12]:
import json
from pathlib import Path

# === Load gold test data ===
GOLD_PATH = Path("../data/data_processed/Train_Val_Test/gold_test_data_updated.json")
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Lowercase all chunk texts for matching ===
all_texts = [chunk["text"].lower() for chunk in metadata_dict.values()]

# === Helper: match function ===
def is_in_indexed_chunks(context):
    return any(context.lower() in chunk_text for chunk_text in all_texts)

# === Filter matching samples ===
filtered_eval_dataset = []
count_found = 0

for sample in gold_data:
    gold_context = sample.get("gold_context", {})
    references = list(gold_context.values()) if isinstance(gold_context, dict) else [gold_context]

    for ref in references:
        if is_in_indexed_chunks(ref):
            sample["reference_contexts"] = references
            filtered_eval_dataset.append(sample)
            count_found += 1
            break  # only one match needed per sample

# === Report + Save ===
print(f"✅ Gold context match found for {count_found}/{len(gold_data)} samples")

with open("filtered_gold_eval_dataset.json", "w") as f:
    json.dump(filtered_eval_dataset, f, indent=2)

print("✅ Saved filtered dataset to filtered_gold_eval_dataset.json")

✅ Gold context match found for 118/201 samples
✅ Saved filtered dataset to filtered_gold_eval_dataset.json


In [None]:
import json
import faiss
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
import asyncio


# === Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index_full.idx"
META_PATH = BASE_DIR / "retriever_metadata_full.pkl"
GOLD_PATH = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/notebooks/filtered_gold_eval_dataset.json")

# === Load Metadata and FAISS Index ===
with open(META_PATH, "rb") as f:
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

index = faiss.read_index(str(FAISS_PATH))

# === Load Embedder and Generator ===
embedder = init_embedder()
generator = ChatGPTGenerator()
reranker = CrossEncoderReranker()

# === Load Gold Test Data ===
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Generate Evaluation Dataset ===
finqa_eval_dataset = []
finder_eval_dataset = []
finqa_count = 0
finder_count = 0

for sample in gold_data:
    if finqa_count >= 2 and finder_count >= 2:
        break

    question = sample["question"]
    reference = sample["answer"]
    gold_context = sample.get("gold_context", {})

    # Embed and retrieve
    query_embedding = np.array(embedder.embed_query(question), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(query_embedding)
        # === FAISS retrieval ===
    D, I = index.search(query_embedding, 50)

    retrieved_chunks = [metadata_dict[chunk_ids[i]]["text"].lower() for i in I[0]]
    candidate_docs = [
        metadata_dict[chunk_ids[i]]
        for i in I[0]
        if chunk_ids[i] in metadata_dict and "text" in metadata_dict[chunk_ids[i]]
    ]

    # === Check if gold_context is in top-50 retrieved chunks ===
    gold_contexts = list(gold_context.values()) if isinstance(gold_context, dict) else [gold_context]
    gold_contexts = [ctx.lower().strip() for ctx in gold_contexts if isinstance(ctx, str)]

    match_found = False
    for gold_ctx in gold_contexts:
        for chunk in retrieved_chunks:
            if gold_ctx in chunk:
                match_found = True
                break
        if match_found:
            break

    # Only show logs if we’re about to evaluate this question
    if (reference_contexts and finqa_count < 2) or (not reference_contexts and finder_count < 2):
        if not match_found:
            print(f"\nGold context NOT retrieved for:\nQ: {question}\nGold: {gold_contexts[0][:200]}...\n")
        else:
            print(f"\nGold context retrieved for:\nQ: {question}")
    reranked_docs = reranker.rerank(question, candidate_docs, top_k=20)

    # === Check if gold_context is in reranked_docs ===
    reranked_texts = [doc["text"].lower() for doc in reranked_docs]
    gold_in_reranked = any(
        gold_ctx in doc_text
        for gold_ctx in gold_contexts
        for doc_text in reranked_texts
    )

    if (reference_contexts and finqa_count < 2) or (not reference_contexts and finder_count < 2):
        if not gold_in_reranked:
            print(f"Gold context LOST after reranking for:\nQ: {question}")
        else:
            print(f"Gold context RETAINED in reranked docs for:\nQ: {question}")

    # Generate answer
    response = generator.generate(question, [doc["text"] for doc in reranked_docs])

    # Prepare record for RAGAS
    record = {
        "user_input": question,
        "retrieved_contexts": [doc["text"] for doc in reranked_docs],
        "response": response,
        "reference": reference,
    }

    # === Check if reference context exists in indexed corpus ===
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    if reference_contexts:
        record["reference_contexts"] = reference_contexts
        found = False
        for ref in reference_contexts:
            ref_clean = ref.lower().strip()
            for chunk_data in metadata_dict.values():
                if ref_clean in chunk_data.get("text", "").lower():
                    found = True
                    break
            if found:
                break
        if found:
            print("Reference context FOUND in indexed chunks.")
        else:
            print("Reference context NOT found in indexed chunks.")

    # === Append to dataset ===
    if reference_contexts and finqa_count < 2:
        finqa_eval_dataset.append(record)
        finqa_count += 1
    elif not reference_contexts and finder_count < 2:
        finder_eval_dataset.append(record)
        finder_count += 1

# === Evaluate FinQA Subset ===
print("\n🔍 Evaluating FinQA subset...")
finqa_result = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
finqa_df = finqa_result.to_pandas()

print(f"\n🎯 FinQA Metrics per Sample:")
for i, row in finqa_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finqa_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Evaluate FinDER Subset ===
print("\n🔍 Evaluating FinDER subset...")
finder_result = asyncio.run(evaluate_ragas_dataset(
    finder_eval_dataset,
    metrics_list=["faithfulness", "answer_accuracy"]
))
finder_df = finder_result.to_pandas()

print(f"\n🎯 FinDER Metrics per Sample:")
for i, row in finder_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finder_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Save to CSV ===
finqa_df.to_csv("finqa_eval_detailed.csv", index=False)
finder_df.to_csv("finder_eval_detailed.csv", index=False)
print("\n✅ Evaluation complete. Results saved to CSV.")


✅ Gold context retrieved for:
Q: GM operating margin 2023 vs 2022, GM.
✅ Gold context RETAINED in reranked docs for:
Q: GM operating margin 2023 vs 2022, GM.

✅ Gold context retrieved for:
Q: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.
✅ Gold context RETAINED in reranked docs for:
Q: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.
✅ Reference context FOUND in indexed chunks.

✅ Gold context retrieved for:
Q: what was the cost per tower in American Tower’s colombia movil acquisition?
✅ Gold context RETAINED in reranked docs for:
Q: what was the cost per tower in American Tower’s colombia movil acquisition?
✅ Reference context FOUND in indexed chunks.

🔍 Evaluating FinQA subset...


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Exception raised in Job[4]: TimeoutError()
Exception raised in Job[5]: TimeoutError()



🎯 FinQA Metrics per Sample:

--- Sample 1 ---
Question: Q: For GPN, what was the fair value of share awards vested in 2009?
A: 6.2
Q: what was the value in 2007?
A: 1.7
Q: what was the net change in value?
A: A0
Q: what is the net change divided by the 2007 value?
Response: Simulated answer from: ['notes to consolidated financial statements 2014 ( continued ) the following table summarizes the c...
Reference Answer: 265% increase
Context Sample:
['notes to consolidated financial statements 2014 ( continued ) the following table summarizes the changes in non-vested restricted stock awards for the year ended may 31 , 2009 ( share awards in thousands ) : share awards weighted average grant-date fair value .'] share awards weighted average grant-date fair value non-vested at may 31 2007 278 $ 37 granted 400 38 vested -136 ( 136 ) 30 forfeited -24 ( 24 ) 40 non-vested at may 31 2008 518 39 granted 430 43 vested -159 ( 159 ) 39 forfeited -27 ...

reference_contexts: ['the total fair value o

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]


🎯 FinDER Metrics per Sample:

--- Sample 1 ---
Question: GM operating margin 2023 vs 2022, GM.
Response: Simulated answer from: Engineering
(Dollar amounts in millions)		 		Variance
Year Ended December 31,		2023		2022		2023 vs. ...
Reference Answer: To calculate the operating profit margin, we divide Operating Income by Total Net Sales and Revenue. For 2023, the calculation is as follows:

• 2023 Operating Profit Margin = 9,298 / 171,842 ≈ 0.0541, or about 5.41%.

For 2022, using the same method:

• 2022 Operating Profit Margin = 10,315 / 156,735 ≈ 0.0658, or about 6.58%.

This comparison shows that the operating margin declined from approximately 6.58% in 2022 to about 5.41% in 2023.
Context Sample:
Engineering
(Dollar amounts in millions)		 		Variance
Year Ended December 31,		2023		2022		2023 vs. 2022
Sales		$	2,160 			$	2,762 			(22)	%
Operating profit		$	491 			$	555 			(12)	%
As a percent of sales		22.7 	%		20.1 	%		
 
2023 vs. 2022
 		% Change
Factors Contributing to Changes - S

AttributeError: 'list' object has no attribute 'head'

In [None]:
import json
import numpy as np
import faiss
import pickle
from pathlib import Path
import pandas as pd
from tqdm import tqdm


# === Setup Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index_full.idx"
META_PATH = BASE_DIR / "retriever_metadata_full.pkl"
GOLD_PATH = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/notebooks/filtered_gold_eval_dataset.json")

# === Load FAISS Index and Metadata ===
with open(META_PATH, "rb") as f:
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

index = faiss.read_index(str(FAISS_PATH))

# === Load your OpenAI embedder ===
embedder = init_embedder()

# === Load Gold QA Dataset ===
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Evaluation Loop ===
def evaluate_retriever(gold_data, embedder, top_k=50):
    results = []

    for sample in tqdm(gold_data):
        question = sample["question"]
        gold_context_raw = sample.get("gold_context", {})

        # Normalize gold context
        gold_contexts = list(gold_context_raw.values()) if isinstance(gold_context_raw, dict) else [gold_context_raw]
        gold_contexts = [g.lower().strip() for g in gold_contexts if isinstance(g, str)]

        # Embed and search
        try:
            q_embed = np.array(embedder.embed_query(question)).astype("float32").reshape(1, -1)
            faiss.normalize_L2(q_embed)
            D, I = index.search(q_embed, top_k)
        except Exception as e:
            print(f"Embedding failed for question: {question}\nError: {e}")
            continue

        retrieved_chunks = [metadata_dict[chunk_ids[i]]["text"].lower() for i in I[0]]
        found = any(g in r for g in gold_contexts for r in retrieved_chunks)

        results.append({
            "question": question,
            "gold_found_in_top_k": found,
            "gold_context": gold_contexts[0][:120] if gold_contexts else None,
            "first_hit_index": next((i for i, r in enumerate(retrieved_chunks) if any(g in r for g in gold_contexts)), -1)
        })

    return pd.DataFrame(results)

# === Run Evaluation ===
df = evaluate_retriever(gold_data, embedder=embedder, top_k=50)
df.to_csv("retriever_eval.csv", index=False)

# === Print Summary ===
total = len(df)
hits = (df["gold_found_in_top_k"] == True).sum()
print(f"\nRetriever Recall@50: {hits}/{total} → {100 * hits / total:.2f}%")

100%|██████████| 118/118 [00:40<00:00,  2.94it/s]



Retriever Recall@50: 106/118 → 89.83%


In [36]:
def evaluate_reranker(gold_data, top_k_faiss=50, top_k_reranked=10):
    reranker = CrossEncoderReranker()  # your existing reranker class
    results = []

    for sample in tqdm(gold_data):
        question = sample["question"]
        gold_contexts = sample.get("gold_context", {})
        gold_contexts = list(gold_contexts.values()) if isinstance(gold_contexts, dict) else [gold_contexts]
        gold_contexts = [g.lower().strip() for g in gold_contexts if isinstance(g, str)]

        # FAISS
        q_embed = np.array(init_embedder().embed_query(question)).astype("float32").reshape(1, -1)
        faiss.normalize_L2(q_embed)
        D, I = index.search(q_embed, top_k_faiss)

        candidate_docs = [
            metadata_dict[chunk_ids[i]]
            for i in I[0]
            if chunk_ids[i] in metadata_dict and "text" in metadata_dict[chunk_ids[i]]
        ]
        reranked = reranker.rerank(question, candidate_docs, top_k=top_k_reranked)
        reranked_texts = [doc["text"].lower() for doc in reranked]

        found = any(g in r for g in gold_contexts for r in reranked_texts)

        results.append({
            "question": question,
            "gold_found_after_rerank": found,
            "gold_context": gold_contexts[0][:120] if gold_contexts else None,
            "gold_context_rank_post_rerank": next((i for i, r in enumerate(reranked_texts) if any(g in r for g in gold_contexts)), -1)
        })

    return pd.DataFrame(results)

In [37]:
df_reranked = evaluate_reranker(gold_data)
df_reranked.to_csv("reranker_eval_r10.csv", index=False)

hits = (df_reranked["gold_found_after_rerank"] == True).sum()
print(f"\nReranker Recall@10: {hits}/{len(df_reranked)} → {100 * hits / len(df_reranked):.2f}%")

100%|██████████| 118/118 [01:45<00:00,  1.12it/s]


Reranker Recall@10: 104/118 → 88.14%





In [38]:
import json
import numpy as np
import faiss
import pickle
from pathlib import Path
import pandas as pd
from tqdm import tqdm


# === Setup Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index_full.idx"
META_PATH = BASE_DIR / "retriever_metadata_full.pkl"
GOLD_PATH = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/data_processed/Train_Val_Test/gold_test_data_updated.json")

# === Load FAISS Index and Metadata ===
with open(META_PATH, "rb") as f:
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

index = faiss.read_index(str(FAISS_PATH))

# === Load your OpenAI embedder ===
embedder = init_embedder()

# === Load Gold QA Dataset ===
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Evaluation Loop ===
def evaluate_retriever(gold_data, embedder, top_k=50):
    results = []

    for sample in tqdm(gold_data):
        question = sample["question"]
        gold_context_raw = sample.get("gold_context", {})

        # Normalize gold context
        gold_contexts = list(gold_context_raw.values()) if isinstance(gold_context_raw, dict) else [gold_context_raw]
        gold_contexts = [g.lower().strip() for g in gold_contexts if isinstance(g, str)]

        # Embed and search
        try:
            q_embed = np.array(embedder.embed_query(question)).astype("float32").reshape(1, -1)
            faiss.normalize_L2(q_embed)
            D, I = index.search(q_embed, top_k)
        except Exception as e:
            print(f"Embedding failed for question: {question}\nError: {e}")
            continue

        retrieved_chunks = [metadata_dict[chunk_ids[i]]["text"].lower() for i in I[0]]
        found = any(g in r for g in gold_contexts for r in retrieved_chunks)

        results.append({
            "question": question,
            "gold_found_in_top_k": found,
            "gold_context": gold_contexts[0][:120] if gold_contexts else None,
            "first_hit_index": next((i for i, r in enumerate(retrieved_chunks) if any(g in r for g in gold_contexts)), -1)
        })

    return pd.DataFrame(results)

# === Run Evaluation ===
df = evaluate_retriever(gold_data, embedder=embedder, top_k=50)
df.to_csv("retriever_eval.csv", index=False)

# === Print Summary ===
total = len(df)
hits = (df["gold_found_in_top_k"] == True).sum()
print(f"\nRetriever Recall@50: {hits}/{total} → {100 * hits / total:.2f}%")

100%|██████████| 201/201 [01:19<00:00,  2.52it/s]


Retriever Recall@50: 106/201 → 52.74%





In [40]:
df_reranked = evaluate_reranker(gold_data)
df_reranked.to_csv("reranker_eval_r10.csv", index=False)

hits = (df_reranked["gold_found_after_rerank"] == True).sum()
print(f"\nReranker Recall@10: {hits}/{len(df_reranked)} → {100 * hits / len(df_reranked):.2f}%")

100%|██████████| 201/201 [02:45<00:00,  1.21it/s]


Reranker Recall@10: 104/201 → 51.74%





In [45]:
import json
import numpy as np
import faiss
import pickle
from pathlib import Path
import pandas as pd
from tqdm import tqdm


# === Setup Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index.idx"
META_PATH = BASE_DIR / "retriever_metadata.pkl"
GOLD_PATH = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/notebooks/filtered_gold_eval_dataset.json")

# === Load FAISS Index and Metadata ===
with open(META_PATH, "rb") as f:
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

index = faiss.read_index(str(FAISS_PATH))

# === Load your OpenAI embedder ===
embedder = init_embedder()

# === Load Gold QA Dataset ===
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Evaluation Loop ===
def evaluate_retriever(gold_data, embedder, top_k=50):
    results = []

    for sample in tqdm(gold_data):
        question = sample["question"]
        gold_context_raw = sample.get("gold_context", {})

        # Normalize gold context
        gold_contexts = list(gold_context_raw.values()) if isinstance(gold_context_raw, dict) else [gold_context_raw]
        gold_contexts = [g.lower().strip() for g in gold_contexts if isinstance(g, str)]

        # Embed and search
        try:
            q_embed = np.array(embedder.embed_query(question)).astype("float32").reshape(1, -1)
            faiss.normalize_L2(q_embed)
            D, I = index.search(q_embed, top_k)
        except Exception as e:
            print(f"Embedding failed for question: {question}\nError: {e}")
            continue

        retrieved_chunks = [metadata_dict[chunk_ids[i]]["text"].lower() for i in I[0]]
        found = any(g in r for g in gold_contexts for r in retrieved_chunks)

        results.append({
            "question": question,
            "gold_found_in_top_k": found,
            "gold_context": gold_contexts[0][:120] if gold_contexts else None,
            "first_hit_index": next((i for i, r in enumerate(retrieved_chunks) if any(g in r for g in gold_contexts)), -1)
        })

    return pd.DataFrame(results)

# === Run Evaluation ===
df = evaluate_retriever(gold_data, embedder=embedder, top_k=50)
df.to_csv("retriever_eval.csv", index=False)

# === Print Summary ===
total = len(df)
hits = (df["gold_found_in_top_k"] == True).sum()
print(f"\nRetriever Recall@50: {hits}/{total} → {100 * hits / total:.2f}%")

100%|██████████| 118/118 [00:55<00:00,  2.14it/s]


Retriever Recall@50: 106/118 → 89.83%





In [46]:
from typing import List, Dict
import pandas as pd
from tqdm import tqdm

def evaluate_context_recall(
    dataset: List[Dict],
    top_k_context_key: str = "retrieved_contexts",
    gold_context_key: str = "reference_contexts"
) -> pd.DataFrame:
    """
    Evaluates context recall for each QA sample in a dataset.
    
    Args:
        dataset: List of dicts. Each dict should include:
            - "user_input": question
            - top_k_context_key: list of retrieved context strings
            - gold_context_key: list of gold/reference context strings
        top_k_context_key: name of field containing retrieved contexts
        gold_context_key: name of field containing gold/reference contexts
        
    Returns:
        DataFrame with question, context recall, and detailed hits
    """
    results = []

    for sample in tqdm(dataset):
        question = sample.get("user_input", "N/A")
        retrieved = sample.get(top_k_context_key, [])
        gold = sample.get(gold_context_key, [])

        # Normalize
        retrieved = [r.lower().strip() for r in retrieved]
        gold = [g.lower().strip() for g in gold]

        hits = sum(any(g in r for r in retrieved) for g in gold)
        total = len(gold)
        recall = hits / total if total > 0 else 0.0

        results.append({
            "question": question,
            "context_recall": recall,
            "total_gold": total,
            "hits_found": hits,
            "missing_gold": [g for g in gold if not any(g in r for r in retrieved)]
        })

    return pd.DataFrame(results)

In [51]:
context_recall_df = evaluate_context_recall(finqa_eval_dataset)

100%|██████████| 2/2 [00:00<00:00, 295.99it/s]


In [52]:
print(context_recall_df["context_recall"].mean())

1.0


In [None]:
from typing import List, Dict
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

def evaluate_context_recall(
    dataset: List[Dict],
    embedder,
    top_k_context_key: str = "retrieved_contexts",
    gold_context_key: str = "reference_contexts",
    method: str = "embedding",  # "exact", "embedding", or "both"
    sim_threshold: float = 0.85
) -> pd.DataFrame:
    """
    Evaluates context recall via exact match or embedding-based similarity.

    Args:
        dataset: List of QA samples with user_input, retrieved_contexts, reference_contexts
        embedder: an object with .embed_documents() method (e.g., OpenAIEmbedder or SentenceTransformer)
        method: "exact", "embedding", or "both"
        sim_threshold: similarity threshold for embedding-based match

    Returns:
        pd.DataFrame with recall scores and detailed diagnostics
    """
    results = []

    for sample in tqdm(dataset):
        question = sample.get("user_input", "N/A")
        retrieved = sample.get(top_k_context_key, [])
        gold = sample.get(gold_context_key, [])

        retrieved = [r.lower().strip() for r in retrieved if isinstance(r, str)]
        gold = [g.lower().strip() for g in gold if isinstance(g, str)]

        em_hits = 0
        emb_hits = 0

        # === Exact Match ===
        if method in {"exact", "both"}:
            em_hits = sum(any(g in r for r in retrieved) for g in gold)

        # === Embedding Similarity ===
        if method in {"embedding", "both"} and gold and retrieved:
            try:
                gold_emb = np.array(embedder.embed_documents(gold)).astype("float32")
                retrieved_emb = np.array(embedder.embed_documents(retrieved)).astype("float32")
                sim_matrix = cosine_similarity(gold_emb, retrieved_emb)
                emb_hits = sum(np.any(sim_row >= sim_threshold) for sim_row in sim_matrix)
            except Exception as e:
                print(f"Embedding error for question: {question}\n{e}")

        # Pick which metric to use for recall
        total = len(gold)
        recall_em = em_hits / total if total else 0
        recall_emb = emb_hits / total if total else 0

        results.append({
            "question": question,
            "total_gold": total,
            "recall_em": recall_em,
            "recall_emb": recall_emb,
            "missing_em": [g for g in gold if not any(g in r for r in retrieved)],
            "missing_emb": [g for idx, g in enumerate(gold)
                            if method in {"embedding", "both"} and
                            (idx >= len(sim_matrix) or not np.any(sim_matrix[idx] >= sim_threshold))]
        })

    return pd.DataFrame(results)

In [54]:
recall_df = evaluate_context_recall(
    dataset=finqa_eval_dataset,
    embedder=embedder,
    method="both",
    sim_threshold=0.85
)
print("Avg EM Recall:", recall_df["recall_em"].mean())
print("Avg Embedding Recall:", recall_df["recall_emb"].mean())

100%|██████████| 2/2 [00:02<00:00,  1.30s/it]

Avg EM Recall: 1.0
Avg Embedding Recall: 1.0





In [56]:
def context_precision_with_reference(
    dataset: List[Dict],
    embedder,
    top_k_context_key: str = "retrieved_contexts",
    gold_context_key: str = "reference_contexts",
    method: str = "embedding",  # "exact", "embedding", or "both"
    sim_threshold: float = 0.85
) -> pd.DataFrame:
    """
    Evaluates context precision via exact match or embedding-based similarity.
 
    Args:
        dataset: List of QA samples with user_input, retrieved_contexts, reference_contexts
        embedder: an object with .embed_documents() method
        method: "exact", "embedding", or "both"
        sim_threshold: similarity threshold for embedding-based match
 
    Returns:
        pd.DataFrame with precision scores and detailed diagnostics
    """
    results = []
 
    for sample in tqdm(dataset):
        question = sample.get("user_input", "N/A")
        retrieved = sample.get(top_k_context_key, [])
        gold = sample.get(gold_context_key, [])
 
        retrieved = [r.lower().strip() for r in retrieved if isinstance(r, str)]
        gold = [g.lower().strip() for g in gold if isinstance(g, str)]
 
        em_hits = 0
        emb_hits = 0
 
        # === Exact Match ===
        if method in {"exact", "both"}:
            em_hits = sum(any(g in r for g in gold) for r in retrieved)
 
        # === Embedding Similarity ===
        if method in {"embedding", "both"} and gold and retrieved:
            try:
                gold_emb = np.array(embedder.embed_documents(gold)).astype("float32")
                retrieved_emb = np.array(embedder.embed_documents(retrieved)).astype("float32")
                sim_matrix = cosine_similarity(retrieved_emb, gold_emb)
                emb_hits = sum(np.any(sim_row >= sim_threshold) for sim_row in sim_matrix)
            except Exception as e:
                print(f"Embedding error for question: {question}\n{e}")
 
        # Pick which metric to use for precision
        total = len(retrieved)
        precision_em = em_hits / total if total else 0
        precision_emb = emb_hits / total if total else 0
 
        results.append({
            "question": question,
            "total_retrieved": total,
            "precision_em": precision_em,
            "precision_emb": precision_emb,
            "irrelevant_em": [r for r in retrieved if not any(g in r for g in gold)],
            "irrelevant_emb": [r for idx, r in enumerate(retrieved)
                               if method in {"embedding", "both"} and
                               (idx >= len(sim_matrix) or not np.any(sim_matrix[idx] >= sim_threshold))]
        })
 
    return pd.DataFrame(results)

In [57]:
# Evaluate context precision
precision_results = context_precision_with_reference(
    dataset=finqa_eval_dataset,
    embedder=embedder,
    method="both",  # or "embedding" or "exact"
    sim_threshold=0.85
)

# Summary
print("\n📊 Context Precision Summary:")
print(f"Average Exact Match Precision:     {precision_results['precision_em'].mean():.4f}")
print(f"Average Embedding-Based Precision: {precision_results['precision_emb'].mean():.4f}")

# Optional: Save detailed results
precision_results.to_csv("finqa_context_precision_diagnostics.csv", index=False)

100%|██████████| 2/2 [00:03<00:00,  1.54s/it]


📊 Context Precision Summary:
Average Exact Match Precision:     0.0500
Average Embedding-Based Precision: 0.1500





In [None]:
import asyncio
from typing import List, Dict, Any, Optional
import os
from dotenv import load_dotenv
from ragas import EvaluationDataset, evaluate
import re

def normalize(text: str) -> str:
    """
    Normalize text for evaluation by:
    - lowercasing
    - removing excess whitespace
    - optional: removing punctuation or %/$
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

async def evaluate_ragas_dataset(
    dataset: List[Dict[str, Any]],
    metrics_list: Optional[List[str]] = None,
    llm_model: str = "gpt-4o-mini",
    llm_type: str = "openai",  # 'openai' or 'vllm'
    vllm_base_url: str = "http://localhost:8000/v1"
):
    """
    Evaluate a dataset using RAGAS with normalized inputs.

    Args:
        dataset: List of dicts with keys: user_input, retrieved_contexts, response, reference
        metrics_list: Which metrics to use
        llm_model: LLM name (e.g., 'gpt-4o')
        llm_type: 'openai' or 'vllm'
        vllm_base_url: if using vllm locally
    """
    from ragas.metrics import (
        LLMContextPrecisionWithReference,
        NonLLMContextPrecisionWithReference,
        LLMContextRecall,
        NonLLMContextRecall,
        ContextEntityRecall,
        Faithfulness,
        AnswerAccuracy,
        StringPresence,
    )
    available_metrics = {
        "context_precision_llm": LLMContextPrecisionWithReference,
        "context_precision_nonllm": NonLLMContextPrecisionWithReference,
        "context_recall_llm": LLMContextRecall,
        "context_recall_nonllm": NonLLMContextRecall,
        "context_entity_recall": ContextEntityRecall,
        "faithfulness": Faithfulness,
        "answer_accuracy": AnswerAccuracy,
        "string_presence": StringPresence,
    }

    if metrics_list is None:
        metrics_list = list(available_metrics.keys())

    metrics = [available_metrics[name]() for name in metrics_list if name in available_metrics]

    # === Normalize dataset ===
    normalized_dataset = []
    for row in dataset:
        norm_row = {
            "user_input": normalize(row.get("user_input", "")),
            "retrieved_contexts": [normalize(c) for c in row.get("retrieved_contexts", [])],
            "response": normalize(row.get("response", "")),
            "reference": normalize(row.get("reference", "")),
        }

        # Optional: normalize reference_contexts too
        if "reference_contexts" in row:
            norm_row["reference_contexts"] = [normalize(c) for c in row["reference_contexts"]]
        
        normalized_dataset.append(norm_row)

    # === Load LLM ===
    load_dotenv()
    if llm_type == "openai":
        from langchain_openai import ChatOpenAI
        from ragas.llms import LangchainLLMWrapper
        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if not OPENAI_API_KEY:
            raise ValueError("OPENAI_API_KEY not found in .env file.")
        llm = ChatOpenAI(model=llm_model, api_key=OPENAI_API_KEY)
        evaluator_llm = LangchainLLMWrapper(llm)
    elif llm_type == "vllm":
        from langchain_community.llms import VLLMOpenAI
        from ragas.llms import LangchainLLMWrapper
        llm = VLLMOpenAI(model=llm_model, base_url=vllm_base_url)
        evaluator_llm = LangchainLLMWrapper(llm)
    else:
        raise ValueError(f"Unknown llm_type: {llm_type}")

    evaluation_dataset = EvaluationDataset.from_list(normalized_dataset)
    return evaluate(
        dataset=evaluation_dataset,
        metrics=metrics,
        llm=evaluator_llm
    )

In [58]:
import spacy
from typing import List, Dict
import pandas as pd
from tqdm import tqdm
 
def context_entity_recall(
    dataset: List[Dict],
    nlp,
    top_k_context_key: str = "retrieved_contexts",
    gold_context_key: str = "reference_contexts"
) -> pd.DataFrame:
    """
    Computes entity-level recall: proportion of gold named entities found in retrieved contexts.
 
    Args:
        dataset: List of samples with user_input, retrieved_contexts, and reference_contexts
        nlp: spaCy language model for NER
        top_k_context_key: Key for retrieved contexts
        gold_context_key: Key for reference (gold) contexts
 
    Returns:
        pd.DataFrame with entity recall metrics and diagnostics
    """
    results = []
 
    for sample in tqdm(dataset):
        question = sample.get("user_input", "N/A")
        retrieved = sample.get(top_k_context_key, [])
        gold = sample.get(gold_context_key, [])
 
        # Join all contexts into one string each
        retrieved_text = " ".join([r for r in retrieved if isinstance(r, str)])
        gold_text = " ".join([g for g in gold if isinstance(g, str)])
 
        # Run NER
        retrieved_doc = nlp(retrieved_text)
        gold_doc = nlp(gold_text)
 
        # Extract unique entities (as strings)
        retrieved_ents = set(ent.text.strip().lower() for ent in retrieved_doc.ents)
        gold_ents = set(ent.text.strip().lower() for ent in gold_doc.ents)
 
        # Calculate overlap
        common_ents = retrieved_ents & gold_ents
        total_gold_ents = len(gold_ents)
        recall = len(common_ents) / total_gold_ents if total_gold_ents else 0
 
        results.append({
            "question": question,
            "retrieved_entities": list(retrieved_ents),
            "gold_entities": list(gold_ents),
            "common_entities": list(common_ents),
            "num_common_entities": len(common_ents),
            "total_gold_entities": total_gold_ents,
            "entity_recall": recall
        })
 
    return pd.DataFrame(results)

In [59]:
import spacy

# Load spaCy English model (small is fast; medium or large is more accurate)
nlp = spacy.load("en_core_web_sm")

# Run entity-level recall evaluation
entity_results = context_entity_recall(
    dataset=finqa_eval_dataset,
    nlp=nlp,
    top_k_context_key="retrieved_contexts",
    gold_context_key="reference_contexts"
)

# Print summary
print(f"\n📊 Average Entity Recall: {entity_results['entity_recall'].mean():.4f}")

# Optional: Save diagnostics to CSV
entity_results.to_csv("finqa_entity_recall_diagnostics.csv", index=False)

100%|██████████| 2/2 [00:03<00:00,  1.52s/it]


📊 Average Entity Recall: 1.0000





In [60]:
def compute_f1_from_precision_recall_dfs(
    precision_df: pd.DataFrame,
    recall_df: pd.DataFrame,
    precision_col: str = "precision_emb",
    recall_col: str = "recall_emb"
) -> pd.DataFrame:
    """
    Combines precision and recall DataFrames to compute F1 score per sample.
 
    Args:
        precision_df: Output of context_precision_with_reference()
        recall_df: Output of context_recall_with_reference()
        precision_col: Column name for precision score
        recall_col: Column name for recall score
 
    Returns:
        A merged DataFrame with F1 score added
    """
    # Join on 'question'
    merged = pd.merge(precision_df, recall_df, on="question", suffixes=("_prec", "_rec"))
 
    def safe_f1(p, r):
        return (2 * p * r / (p + r)) if (p + r) > 0 else 0.0
 
    merged["f1"] = merged.apply(lambda row: safe_f1(row[precision_col], row[recall_col]), axis=1)
 
    return merged
 

In [62]:
# Step 1: Compute context precision and recall separately
precision_df = context_precision_with_reference(
    dataset=finqa_eval_dataset,
    embedder=embedder,
    method="embedding",   # or "both"
    sim_threshold=0.85
)

recall_df = evaluate_context_recall(
    dataset=finqa_eval_dataset,
    embedder=embedder,
    method="embedding",   # or "both"
    sim_threshold=0.85
)

# Step 2: Compute F1 score from the two
f1_df = compute_f1_from_precision_recall_dfs(
    precision_df=precision_df,
    recall_df=recall_df,
    precision_col="precision_emb",  # use "precision_em" if using exact match
    recall_col="recall_emb"         # or "recall_em"
)

# Step 3: Print summary
print(f"\n🔍 Average F1 Score: {f1_df['f1'].mean():.4f}")

# Step 4 (optional): Save to CSV
f1_df.to_csv("finqa_precision_recall_f1.csv", index=False)

100%|██████████| 2/2 [00:01<00:00,  1.18it/s]
100%|██████████| 2/2 [00:04<00:00,  2.04s/it]


🔍 Average F1 Score: 0.2476





In [63]:
from typing import List, Dict
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
 
def mean_reciprocal_rank(
    dataset: List[Dict],
    embedder = None,
    top_k_context_key: str = "retrieved_contexts",
    gold_context_key: str = "reference_contexts",
    method: str = "embedding",  # "exact" or "embedding"
    sim_threshold: float = 0.85
) -> pd.DataFrame:
    """
    Calculates MRR for retrieved contexts given gold reference contexts.
 
    Returns:
        pd.DataFrame with reciprocal ranks and full MRR score
    """
    results = []
 
    for sample in tqdm(dataset):
        question = sample.get("user_input", "N/A")
        retrieved = sample.get(top_k_context_key, [])
        gold = sample.get(gold_context_key, [])
 
        retrieved = [r.lower().strip() for r in retrieved if isinstance(r, str)]
        gold = [g.lower().strip() for g in gold if isinstance(g, str)]
 
        rr = 0.0  # Reciprocal rank
 
        if method == "exact":
            for rank, ret in enumerate(retrieved, start=1):
                if any(g in ret for g in gold):
                    rr = 1.0 / rank
                    break
 
        elif method == "embedding" and embedder and gold and retrieved:
            try:
                gold_emb = np.array(embedder.embed_documents(gold)).astype("float32")
                retrieved_emb = np.array(embedder.embed_documents(retrieved)).astype("float32")
                sim_matrix = cosine_similarity(retrieved_emb, gold_emb)
                for rank, sim_row in enumerate(sim_matrix, start=1):
                    if np.any(sim_row >= sim_threshold):
                        rr = 1.0 / rank
                        break
            except Exception as e:
                print(f"Embedding error for question: {question}\n{e}")
 
        results.append({
            "question": question,
            "reciprocal_rank": rr
        })
 
    df = pd.DataFrame(results)
    df["mrr"] = df["reciprocal_rank"].mean()
    return df

In [64]:
df = mean_reciprocal_rank(finqa_eval_dataset, embedder=embedder)

print("MRR:", df['mrr'].iloc[0])

df.head()

 

100%|██████████| 2/2 [00:01<00:00,  1.00it/s]

MRR: 0.75





Unnamed: 0,question,reciprocal_rank,mrr
0,"Q: For GPN, what was the fair value of share a...",1.0,0.75
1,what was the cost per tower in American Tower’...,0.5,0.75


In [65]:
import numpy as np
import pandas as pd
from typing import List, Dict
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
 
def compute_ndcg(
    dataset: List[Dict],
    embedder = None,
    top_k_context_key: str = "retrieved_contexts",
    gold_context_key: str = "reference_contexts",
    method: str = "embedding",  # "exact" or "embedding"
    sim_threshold: float = 0.85,
    k: int = 10
) -> pd.DataFrame:
    """
    Compute nDCG@k for retrieved contexts.
 
    Returns:
        DataFrame with nDCG@k per sample and average
    """
    results = []
 
    for sample in tqdm(dataset):
        question = sample.get("user_input", "N/A")
        retrieved = sample.get(top_k_context_key, [])[:k]
        gold = sample.get(gold_context_key, [])
 
        retrieved = [r.lower().strip() for r in retrieved if isinstance(r, str)]
        gold = [g.lower().strip() for g in gold if isinstance(g, str)]
 
        relevance = [0] * len(retrieved)  # Relevance at each rank
 
        if method == "exact":
            for i, r in enumerate(retrieved):
                if any(g in r for g in gold):
                    relevance[i] = 1
 
        elif method == "embedding" and embedder and gold and retrieved:
            try:
                gold_emb = np.array(embedder.embed_documents(gold)).astype("float32")
                retrieved_emb = np.array(embedder.embed_documents(retrieved)).astype("float32")
                sim_matrix = cosine_similarity(retrieved_emb, gold_emb)
                for i, sim_row in enumerate(sim_matrix):
                    if np.any(sim_row >= sim_threshold):
                        relevance[i] = 1
            except Exception as e:
                print(f"Embedding error for question: {question}\n{e}")
 
        # === Compute DCG and nDCG ===
        dcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(relevance))  # +2 because log2(i+1) with 0-indexing
        ideal_relevance = sorted(relevance, reverse=True)
        idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal_relevance))
 
        ndcg = dcg / idcg if idcg > 0 else 0.0
 
        results.append({
            "question": question,
            "dcg": dcg,
            "idcg": idcg,
            "ndcg@{}".format(k): ndcg
        })
 
    df = pd.DataFrame(results)
    df["mean_ndcg"] = df["ndcg@{}".format(k)].mean()
    return df

In [66]:
ndcg_df = compute_ndcg(finqa_eval_dataset, embedder=embedder, k=10)
print("Average nDCG@10:", ndcg_df["mean_ndcg"].iloc[0])

100%|██████████| 2/2 [00:09<00:00,  4.79s/it]

Average nDCG@10: 0.7817254227631407





In [44]:
import json
import faiss
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
import asyncio
import re


# === Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index_full.idx"
META_PATH = BASE_DIR / "retriever_metadata_full.pkl"
GOLD_PATH = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/notebooks/filtered_gold_eval_dataset.json")

# === Load Metadata and FAISS Index ===
with open(META_PATH, "rb") as f:
    data = pickle.load(f)
    chunk_ids = data["chunk_ids"]
    metadata_dict = data["metadata"]

index = faiss.read_index(str(FAISS_PATH))

# === Load Embedder and Generator ===
embedder = init_embedder()
generator = ChatGPTGenerator()
reranker = CrossEncoderReranker()

# === Load Gold Test Data ===
with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Helper: Normalize strings and percentages ===
def normalize_answer(ans: str) -> str:
    ans = ans.strip().lower()
    ans = re.sub(r"[,$]", "", ans)              # Remove commas, dollar signs
    ans = re.sub(r"\s+", " ", ans)              # Collapse whitespace
    ans = ans.replace(" percent", "%")
    ans = re.sub(r"(\d)\s*%", r"\1%", ans)      # "20 %" → "20%"
    return ans

# === Generate Evaluation Dataset ===
finqa_eval_dataset = []
finder_eval_dataset = []
finqa_count = 0
finder_count = 0

for sample in gold_data:
    if finqa_count >= 2 and finder_count >= 2:
        break

    question = sample["question"]
    reference = normalize_answer(sample["answer"])
    gold_context = sample.get("gold_context", {})

    # === Check if reference context exists in indexed corpus ===
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    # Embed and retrieve
    query_embedding = np.array(embedder.embed_query(question), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(query_embedding)
    D, I = index.search(query_embedding, 50)

    retrieved_chunks = [metadata_dict[chunk_ids[i]]["text"].lower() for i in I[0]]
    candidate_docs = [
        metadata_dict[chunk_ids[i]]
        for i in I[0]
        if chunk_ids[i] in metadata_dict and "text" in metadata_dict[chunk_ids[i]]
    ]

    # === Check if gold_context is in top-50 retrieved chunks ===
    gold_contexts = [ctx.lower().strip() for ctx in reference_contexts if isinstance(ctx, str)]
    match_found = any(g in r for g in gold_contexts for r in retrieved_chunks)

    # Only show logs for selected samples
    if (reference_contexts and finqa_count < 2) or (not reference_contexts and finder_count < 2):
        if not match_found:
            gold_preview = gold_contexts[0][:200] if gold_contexts else "(empty)"
            print(f"\nGold context NOT retrieved for:\nQ: {question}\nGold: {gold_preview}\n")
        else:
            print(f"\nGold context retrieved for:\nQ: {question}")

    reranked_docs = reranker.rerank(question, candidate_docs, top_k=20)
    reranked_texts = [doc["text"].lower() for doc in reranked_docs]
    gold_in_reranked = any(g in d for g in gold_contexts for d in reranked_texts)

    if (reference_contexts and finqa_count < 2) or (not reference_contexts and finder_count < 2):
        if not gold_in_reranked:
            print(f"Gold context LOST after reranking for:\nQ: {question}")
        else:
            print(f"Gold context RETAINED in reranked docs for:\nQ: {question}")

    # Generate answer
    raw_response = generator.generate(question, [doc["text"] for doc in reranked_docs])
    response = normalize_answer(raw_response)

    # === Record for RAGAS ===
    record = {
        "user_input": question,
        "retrieved_contexts": [doc["text"] for doc in reranked_docs],
        "response": response,
        "reference": reference,
    }

    if reference_contexts:
        record["reference_contexts"] = reference_contexts
        found = False
        for ref in reference_contexts:
            ref_clean = ref.lower().strip()
            for chunk_data in metadata_dict.values():
                if ref_clean in chunk_data.get("text", "").lower():
                    found = True
                    break
            if found:
                break
        print("Reference context FOUND in indexed chunks." if found else "Reference context NOT found in indexed chunks.")

    # Append to correct eval set
    if reference_contexts and finqa_count < 2:
        finqa_eval_dataset.append(record)
        finqa_count += 1
    elif not reference_contexts and finder_count < 2:
        finder_eval_dataset.append(record)
        finder_count += 1

# === Evaluate FinQA Subset ===
print("\n🔍 Evaluating FinQA subset...")
finqa_result = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
finqa_df = finqa_result.to_pandas()

print(f"\n🎯 FinQA Metrics per Sample:")
for i, row in finqa_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finqa_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Evaluate FinDER Subset ===
print("\n🔍 Evaluating FinDER subset...")
finder_result = asyncio.run(evaluate_ragas_dataset(
    finder_eval_dataset,
    metrics_list=["faithfulness", "answer_accuracy"]
))
finder_df = finder_result.to_pandas()

print(f"\n🎯 FinDER Metrics per Sample:")
for i, row in finder_df.iterrows():
    print(f"\n--- Sample {i+1} ---")
    print(f"Question: {row['user_input']}")
    print(f"Response: {row['response']}")
    print(f"Reference Answer: {row['reference']}")
    print(f"Context Sample:\n{row['retrieved_contexts'][0][:500]}...\n")

    for col in finder_df.columns:
        if col not in ['user_input', 'retrieved_contexts', 'response', 'reference']:
            value = row[col]
            try:
                if isinstance(value, (int, float)):
                    print(f"{col}: {value:.4f}")
                else:
                    print(f"{col}: {value}")
            except Exception as e:
                print(f"{col}: ⚠️ Error printing value ({type(value)}): {e}")

# === Save to CSV ===
finqa_df.to_csv("finqa_eval_detailed.csv", index=False)
finder_df.to_csv("finder_eval_detailed.csv", index=False)
print("\n✅ Evaluation complete. Results saved to CSV.")


Gold context NOT retrieved for:
Q: GM operating margin 2023 vs 2022, GM.
Gold: (empty)

Gold context LOST after reranking for:
Q: GM operating margin 2023 vs 2022, GM.

Gold context NOT retrieved for:
Q: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.
Gold: (empty)

Gold context LOST after reranking for:
Q: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.

Gold context retrieved for:
Q: Q: For GPN, what was the fair value of share awards vested in 2009?
A: 6.2
Q: what was the value in 2007?
A: 1.7
Q: what was the net change in value?
A: A0
Q: what is the net change divided by the 2007 value?
Gold context RETAINED in reranked docs for:
Q: Q: For GPN, what was the fair value of share awards vested in 2009?
A: 6.2
Q: what was the value in 2007?
A: 1.7
Q: what was the net change in value?
A: A0
Q: what is the net change divided by the 2007 value?
Reference context FOUND in indexed chunks.

Gold contex

Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Exception raised in Job[5]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[12]: TimeoutError()



🎯 FinQA Metrics per Sample:

--- Sample 1 ---
Question: q: for gpn, what was the fair value of share awards vested in 2009? a: 6.2 q: what was the value in 2007? a: 1.7 q: what was the net change in value? a: a0 q: what is the net change divided by the 2007 value?
Response: 2.647058823
Reference Answer: 265% increase
Context Sample:
['notes to consolidated financial statements 2014 ( continued ) the following table summarizes the changes in non-vested restricted stock awards for the year ended may 31 , 2009 ( share awards in thousands ) : share awards weighted average grant-date fair value .'] share awards weighted average grant-date fair value non-vested at may 31 2007 278 $ 37 granted 400 38 vested -136 ( 136 ) 30 forfeited -24 ( 24 ) 40 non-vested at may 31 2008 518 39 granted 430 43 vested -159 ( 159 ) 39 forfeited -27 ...

reference_contexts: ['the total fair value of share awards vested during the years ended may 31 , 2009 , 2008 and 2007 was $ 6.2 million , $ 4.1 million and $ 

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

An error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-b9EC73CxcpNUA7w9Zh1KhBEH on tokens per min (TPM): Limit 30000, Used 30000, Requested 334. Please try again in 668ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}. Skipping a sample by assigning it nan score.
An error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-b9EC73CxcpNUA7w9Zh1KhBEH on tokens per min (TPM): Limit 30000, Used 30000, Requested 729. Please try again in 1.458s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}. Skipping a sample by assigning it nan score.



🎯 FinDER Metrics per Sample:

--- Sample 1 ---
Question: gm operating margin 2023 vs 2022, gm.
Response: approximately 5.4% in 2023 versus about 6.6% in 2022.
Reference Answer: to calculate the operating profit margin we divide operating income by total net sales and revenue. for 2023 the calculation is as follows: • 2023 operating profit margin = 9298 / 171842 ≈ 0.0541 or about 5.41%. for 2022 using the same method: • 2022 operating profit margin = 10315 / 156735 ≈ 0.0658 or about 6.58%. this comparison shows that the operating margin declined from approximately 6.58% in 2022 to about 5.41% in 2023.
Context Sample:
engineering (dollar amounts in millions) variance year ended december 31, 2023 2022 2023 vs. 2022 sales $ 2,160 $ 2,762 (22) % operating profit $ 491 $ 555 (12) % as a percent of sales 22.7 % 20.1 % 2023 vs. 2022 % change factors contributing to changes - sales currency 1 % other (23) % (22) % sales engineering segment sales decreased $602 million, or 22%, in 2023 versus 2

In [39]:
import json
import pickle
import numpy as np
import faiss
from pathlib import Path


# === Paths ===
BASE_DIR = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/Retriever_Context (Eval)")
FAISS_PATH = BASE_DIR / "faiss_index_full.idx"
META_PATH = BASE_DIR / "retriever_metadata_full.pkl"
GOLD_PATH = Path("/Users/alex/Documents/Data Science Master/thesis_RAG/data/data_processed/Train_Val_Test/gold_test_data_updated.json")

# === Load ===
with open(META_PATH, "rb") as f:
    metadata = pickle.load(f)
    chunk_ids = metadata["chunk_ids"]
    metadata_dict = metadata["metadata"]

index = faiss.read_index(str(FAISS_PATH))
embedder = init_embedder()
generator = ChatGPTGenerator()

with open(GOLD_PATH) as f:
    gold_data = json.load(f)

# === Evaluation loop ===
results = []
for sample in gold_data[:10]:  # Adjust the slice to test more
    question = sample["question"]
    reference = sample["answer"]
    gold_contexts = sample.get("gold_context", {})
    if isinstance(gold_contexts, dict):
        gold_contexts = list(gold_contexts.values())

    # Retrieve from FAISS
    query_vec = np.array(embedder.embed_query(question)).astype("float32").reshape(1, -1)
    faiss.normalize_L2(query_vec)
    D, I = index.search(query_vec, 50)

    top_chunks = [metadata_dict[chunk_ids[i]]["text"] for i in I[0]]
    retrieved_contexts = top_chunks[:10]

    # Generate response
    answer = generator.generate(question, retrieved_contexts)

    # === Simple Metrics ===
    gold_retrieved = any(ctx.lower().strip() in chunk.lower() for ctx in gold_contexts for chunk in top_chunks)
    exact_match = answer.strip().lower() == reference.strip().lower()
    partial_match = reference.strip().lower() in answer.strip().lower()
    context_used = any(ctx.lower().strip() in answer.lower() for ctx in gold_contexts)

    results.append({
        "question": question,
        "reference": reference,
        "answer": answer,
        "gold_retrieved": gold_retrieved,
        "exact_match": exact_match,
        "partial_match": partial_match,
        "faithful_to_context": context_used,
    })

# === Print summary ===
correct = sum(r["exact_match"] for r in results)
partial = sum(r["partial_match"] for r in results)
retrieval_success = sum(r["gold_retrieved"] for r in results)
context_faithful = sum(r["faithful_to_context"] for r in results)

print(f"\nEvaluation Summary (n={len(results)}):")
print(f"✅ Gold Context Retrieved: {retrieval_success}/{len(results)}")
print(f"✅ Exact Matches: {correct}/{len(results)}")
print(f"✅ Partial Matches: {partial}/{len(results)}")
print(f"✅ Context-Faithful Generations: {context_faithful}/{len(results)}")

# Optional: save results
import pandas as pd
pd.DataFrame(results).to_csv("local_eval_results.csv", index=False)

Task exception was never retrieved
future: <Task finished name='Task-2408' coro=<evaluate_ragas_dataset() done, defined at /var/folders/hp/hkpqf03x0qdg7_8s3s9sy7gw0000gn/T/ipykernel_68125/1842337027.py:8> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/Users/alex/anaconda3/envs/fin_rag_env/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3699, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/hp/hkpqf03x0qdg7_8s3s9sy7gw0000gn/T/ipykernel_68125/3351570119.py", line 139, in <module>
    finqa_result = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/alex/anaconda3/envs/fin_rag_env/lib/python3.11/site-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/alex/anaconda3/envs/fin_rag_env/lib/python3.11/site-packages/nest_asyncio

KeyboardInterrupt: 

In [25]:
import numpy as np
import faiss

query = "sample financial question"
embedding = np.array(embedder.embed_query(query)).astype("float32").reshape(1, -1)

print("FAISS dim:", index.d)
print("Query dim:", embedding.shape[1])
assert index.d == embedding.shape[1], "❌ Embedding dimension mismatch!"

FAISS dim: 1536
Query dim: 1536


In [16]:
# After FAISS search
retrieved_chunks = [metadata_dict[chunk_ids[i]]["text"].lower() for i in I[0]]

# Check if any gold_context is present in retrieved chunks
gold_contexts = list(gold_context.values()) if isinstance(gold_context, dict) else [gold_context]
gold_contexts = [ctx.lower().strip() for ctx in gold_contexts if isinstance(ctx, str)]

# Check match
match_found = False
for gold_ctx in gold_contexts:
    for chunk in retrieved_chunks:
        if gold_ctx in chunk:
            match_found = True
            break
    if match_found:
        break

if not match_found:
    print(f"❌ Gold context NOT retrieved for:\nQ: {question}\nGold: {gold_contexts[0]}")
else:
    print(f"✅ Gold context retrieved for:\nQ: {question}")

✅ Gold context retrieved for:
Q: what was the cost per tower in American Tower’s colombia movil acquisition?


In [None]:

generator = ChatGPTGenerator()

# Load gold test data
with open("../data/data_processed/Train_Val_Test/gold_test_data_updated.json") as f:
    gold_data = json.load(f)

df_meta = pd.read_parquet(OUT_DIR / "chunk_meta.parquet")
all_docs = df_meta.to_dict(orient="records")  # becomes list of dicts with 'text' key

# Load FAISS index#
OUT_DIR = Path("outputs/2025-08-01")   
faiss_index = FAISS.load_local(
    folder_path=OUT_DIR,
    embeddings=embedder,
    allow_dangerous_deserialization=True
)

# Separate datasets
finqa_eval_dataset = []
finder_eval_dataset = []

finqa_count = 0
finder_count = 0

for sample in gold_data:
    if finqa_count >= 10 and finder_count >= 10:
        break

    question = sample["question"]
    reference = sample["answer"]
    gold_context = sample.get("gold_context", {})

    # Embed and search
    query_embedding = np.array(embedder.embed_query(question), dtype="float32").reshape(1, -1)
    D, I = faiss_index.index.search(query_embedding, 10)
    candidate_texts = [all_docs[i] for i in I[0] if "text" in all_docs[i]]

    # Rerank
    reranked_docs = CrossEncoderReranker().rerank(question, candidate_texts, top_k=5)

    # Generate
    response = generator.generate(question, [doc["text"] for doc in reranked_docs])
    print("Question:", question)
    print("Generated Answer:", response)
    print("-" * 80)

    # Determine reference contexts
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    record = {
        "user_input": question,
        "retrieved_contexts": [doc["text"] for doc in reranked_docs],
        "response": response,
        "reference": reference,
    }

    # Append based on presence of gold context
    if reference_contexts and finqa_count < 10:
        record["reference_contexts"] = reference_contexts
        finqa_eval_dataset.append(record)
        finqa_count += 1
    elif not reference_contexts and finder_count < 10:
        finder_eval_dataset.append(record)
        finder_count += 1

# Evaluate
import asyncio

print("\n🔍 Evaluating FinQA subset...")
finqa_results = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
print(finqa_results)

print("\n🔍 Evaluating FinDER subset...")
finder_results = asyncio.run(evaluate_ragas_dataset(
    finder_eval_dataset,
    metrics_list=["faithfulness", "answer_accuracy"]
))
print(finder_results)

Question: GM operating margin 2023 vs 2022, GM.
Generated Answer: 2023: 5.4 % vs 2022: 6.6 %
--------------------------------------------------------------------------------
Question: In the financial filing of Citigroup, what percentage of incremental risk-weighted assets are student loans at january 1 , 2010?
Generated Answer: 24%
--------------------------------------------------------------------------------
Question: what is the growth rate in net revenue in 2003 for entergy corporation?
Generated Answer: I don’t know
--------------------------------------------------------------------------------
Question: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.
Generated Answer: NGC’s ongoing spending on cyber defenses, industry‐wide information‐sharing memberships, and third-party maturity assessments demonstrate proactive risk management, which reassures investors, helps preserve firm value, and supports the company’s operational and financi

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Exception raised in Job[0]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[34]: TimeoutError()
Exception raised in Job[40]: TimeoutError()
Exception raised in Job[42]: TimeoutError()
Exception raised in Job[44]: TimeoutError()
Exception raised in Job[45]: TimeoutError()
Exception raised in Job[52]: TimeoutError()


{'llm_context_precision_with_reference': 0.5500, 'non_llm_context_precision_with_reference': 0.0333, 'context_recall': 0.1250, 'non_llm_context_recall': 0.0500, 'context_entity_recall': 0.0000, 'faithfulness': 0.3056, 'nv_accuracy': 0.3000, 'string_present': 0.3000}

🔍 Evaluating FinDER subset...


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

An error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-b9EC73CxcpNUA7w9Zh1KhBEH on tokens per min (TPM): Limit 30000, Used 30000, Requested 571. Please try again in 1.142s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}. Skipping a sample by assigning it nan score.
An error occurred: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-b9EC73CxcpNUA7w9Zh1KhBEH on tokens per min (TPM): Limit 30000, Used 30000, Requested 771. Please try again in 1.542s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}. Skipping a sample by assigning it nan score.


{'faithfulness': 0.5358, 'nv_accuracy': 0.4062}


In [None]:
print(finder_results)

{'faithfulness': 0.7500, 'nv_accuracy': 0.7500}


In [None]:
async def test_ceiling_performance(
    #train_path="/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Train_Val_Test/df_test.json",
    # data/data_processed/Train_Val_Test/gold_test_data.json
    num_samples=10
):
    """
    Test ceiling performance using gold context, limited to FinQA-only samples.
    """
    # Load and filter dataset
    train_data = load_train_data(train_path)
    train_data = [s for s in train_data if s.get("source") == "FinQA" or s.get("source") == "ConvFinQA"]

    # Apply sample limit *after* filtering
    if num_samples:
        train_data = train_data[num_samples:num_samples+10]

    # Instantiate generator (prompts now handled internally)
    generator = ChatGPTGenerator()

    dataset = []
    print("\nProcessing samples...")

    for i, sample in enumerate(train_data, 1):
        question = sample["question"]
        true_answer = sample["answer"]
        source = sample.get("source", "Unknown")

        relevant_context = get_gold_context(sample)
        # get real dataset results
        

        start = time.time()
        generated_answer = generator.generate(
            question=question,
            retrieved_docs=[relevant_context],
        )
        end = time.time()

        dataset.append({
            "user_input": question,
            "retrieved_contexts": [relevant_context],
            "response": generated_answer,
            "reference": true_answer
        })

        # Print progress
        print(f"\nSample {i}:")
        print(f"Source: {source}")
        print(f"Question: {question}")
        print(f"Gold Context: {relevant_context}")
        print(f"Generated Answer: {generated_answer}")
        print(f"True Answer: {true_answer}")
        print("-" * 80)

    # Evaluate with RAGAS
    metrics_list = ["answer_accuracy", "string_presence"]
    print("\nEvaluating ceiling performance...")
    results = await evaluate_ragas_dataset(dataset, metrics_list=metrics_list)

    print("\nCeiling Performance Results:")
    print(results)

    # Additional stats
    total_samples = len(dataset)
    exact_matches = sum(1 for sample in dataset if sample['response'].strip() == sample['reference'].strip())

    print("\nAdditional Statistics:")
    print(f"Total Samples: {total_samples}")
    print(f"Exact Matches: {exact_matches}")
    print(f"Exact Match Rate: {exact_matches / total_samples:.2%}")

if __name__ == "__main__":
    asyncio.run(test_ceiling_performance())

In [None]:
# latency logging in retriever required