In [8]:
import asyncio
import json
import os
import sys
import time
from pathlib import Path

import pandas as pd
import torch
from dotenv import load_dotenv
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Make the project's `src` directory (a sibling of the notebook's working
# directory) importable so the `retrievers` and `generator` packages resolve.
# Guarded so that re-running this cell does not pile up duplicate entries.
SRC_DIR = str(Path().resolve().parent / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import pipeline components
from retrievers.vectorrag.index_faiss import build_faiss_index_from_json
from retrievers.vectorrag.document_loader import load_json_documents
from retrievers.vectorrag.chunker import chunk_documents
from retrievers.vectorrag.embedder import init_embedder
from retrievers.vectorrag.index_faiss import build_faiss_index, load_faiss_index
from retrievers.vectorrag.retriever import rerank_search

from generator.generator import ChatGPTGenerator 
import numpy as np
import json
from retrievers.vectorrag.reranker import CrossEncoderReranker
import numpy as np


In [2]:
# Locate the corpus relative to the notebook's working directory (the same
# convention the sys.path setup uses) rather than a machine-specific absolute
# path, so the notebook runs on any checkout of the project.
json_path = str(
    Path().resolve().parent
    / "data"
    / "data_processed"
    / "Train_Val_Test"
    / "unique_contexts_filtered_23_7.json"
)
docs_raw = load_json_documents(json_path)
print(f"Loaded {len(docs_raw)} documents.")

Loaded 5823 documents.


In [None]:
# Preview the first ten rows of `df`.
# NOTE(review): `df` is not assigned in any visible cell of this notebook, so
# this only works with leftover kernel state — TODO load it explicitly first.
df.head(10)

Unnamed: 0,0
0,"Cboe Global Markets, Inc. and Subsidiaries\n\n..."
1,"Employees\n\nAs of December 31, 2023, we emplo..."
2,North\n\n\n\n\n\n\n\n\n\n\nCorporate\n\n\n\n\n...
3,"In 2011, the Board of Directors approved an in..."
4,"We maintain policies, procedures and controls ..."
5,Competition\n\nThe industry in which we operat...
6,"Legal Proceedings\n\nAs of December 31, 2023, ..."
7,Information about our Executive Officers\n\nSe...
8,The Company recognizes that operating in a soc...
9,The Company presents three financial statement...


In [None]:
# Count how many entries of each Python type appear in the context column
print("Type breakdown:")
print(df["context"].apply(type).value_counts())

# Show up to three example entries. Iterating over `.head(3)` instead of
# indexing rows 0..2 avoids an IndexError when the frame has fewer than
# three rows; the printed output is unchanged otherwise.
print("\nSample values:")
for row_idx, value in enumerate(df["context"].head(3)):
    print(f"Row {row_idx} type: {type(value)}")
    print(f"Row {row_idx} value: {value}")
    print("─" * 60)

Type breakdown:
context
<class 'str'>     11505
<class 'list'>     5696
Name: count, dtype: int64

Sample values:
Row 0 type: <class 'str'>
Row 0 value: ['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local cu

In [3]:
# Chunking parameters, forwarded unchanged to chunk_documents.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200

# Split the raw documents into chunks and report how many were produced.
chunked_docs = chunk_documents(
    docs_raw,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

print(f"Created {len(chunked_docs)} chunks.")

Created a chunk of size 1568, which is longer than the specified 1500
Created a chunk of size 1591, which is longer than the specified 1500
Created a chunk of size 1997, which is longer than the specified 1500
Created a chunk of size 1754, which is longer than the specified 1500
Created a chunk of size 4701, which is longer than the specified 1500
Created a chunk of size 3737, which is longer than the specified 1500
Created a chunk of size 7371, which is longer than the specified 1500
Created a chunk of size 1546, which is longer than the specified 1500
Created a chunk of size 4624, which is longer than the specified 1500
Created a chunk of size 1739, which is longer than the specified 1500
Created a chunk of size 1747, which is longer than the specified 1500
Created a chunk of size 2219, which is longer than the specified 1500
Created a chunk of size 2879, which is longer than the specified 1500
Created a chunk of size 4490, which is longer than the specified 1500
Created a chunk of s

Created 9997 chunks.


In [None]:
# Build the FAISS index from embedded_chunks.json. `save_path` is passed, so
# the index is presumably persisted under `faiss_path` — confirm against
# build_faiss_index_from_json.
load_dotenv()  # load .env so OPENAI_API_KEY is available via os.getenv

# Name the embedding model explicitly so the index is built with the same
# model the query-time cells request.
embedder = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002",
)

# Resolve paths from the notebook's working directory instead of a
# machine-specific absolute path.
notebook_dir = Path().resolve()
json_path = str(
    notebook_dir.parent
    / "data"
    / "data_processed"
    / "Train_Val_Test"
    / "embedded_chunks.json"
)
faiss_path = str(notebook_dir / "data" / "embeddings")

faiss_index = build_faiss_index_from_json(json_path, embedder=embedder, save_path=faiss_path)

In [12]:
# Load variables from a local .env file into the process environment; the
# embedder cells read OPENAI_API_KEY from the environment via os.getenv.
load_dotenv()

True

In [8]:
# Retrieval smoke test: load the saved FAISS index and run one sample query.

# Load environment variables
load_dotenv()

# Initialize embedder
embedder = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002"
)

# Load FAISS index. The directory is resolved from the notebook's working
# directory instead of a machine-specific absolute path.
faiss_path = str(Path().resolve() / "data" / "embeddings")
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# index metadata, which can execute arbitrary code. Only load index
# directories that you created yourself.
faiss_index = FAISS.load_local(
    faiss_path,
    embedder,
    allow_dangerous_deserialization=True
)

# Sample query
query = "What is the company’s interest expense?"
query_embedding = embedder.embed_query(query)

# Run similarity search
docs_with_scores = faiss_index.similarity_search_with_score_by_vector(query_embedding, k=5)

# Print results.
# NOTE(review): with LangChain's default FAISS configuration the "score" is a
# distance, so lower means more similar — confirm against how the index was built.
for doc, score in docs_with_scores:
    print(f"Score: {score:.4f}")
    print(doc.page_content[:500])
    print("-" * 80)

Score: 0.3061
The 2022 Credit Agreement contains customary representations and warranties, affirmative and negative covenants and events of default. The negative covenants include restrictions on subsidiary indebtedness, liens and fundamental changes. These covenants are subject to a number of important exceptions and qualifications. The principal financial covenant requires a maximum consolidated leverage ratio. There were no outstanding borrowings under the 2022 Credit Agreement as of December 31, 2023. 

I
--------------------------------------------------------------------------------
Score: 0.3176
As of October 31, 2024, our material cash requirements consisted of future payments for debt and related interests, income tax liabilities related to one-time transition tax, purchase obligations, operating lease and Retirement Income Plan.

We incur interest on a revolving loan and a term loan. Using the same interest rate of October 31, 2024, and assuming borrowings as of October 31, 2

# Evaluation

In [9]:
# Sanity check before evaluation: reload the FAISS index and run one query.
# NOTE(review): this cell repeats the retrieval smoke test that appears earlier
# in the notebook; consider keeping only one copy.

# Load environment variables
load_dotenv()

# Initialize embedder
embedder = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002"
)

# Load FAISS index from <working directory>/data/embeddings rather than a
# machine-specific absolute path.
faiss_path = str(Path().resolve() / "data" / "embeddings")
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# index metadata, which can execute arbitrary code. Only use it on index
# directories produced by this project.
faiss_index = FAISS.load_local(
    faiss_path,
    embedder,
    allow_dangerous_deserialization=True
)

# Sample query
query = "What is the company’s interest expense?"
query_embedding = embedder.embed_query(query)

# Run similarity search
docs_with_scores = faiss_index.similarity_search_with_score_by_vector(query_embedding, k=5)

# Print results.
# NOTE(review): the value printed as "Score" is presumably a distance (lower is
# closer) under LangChain's default FAISS settings — confirm.
for doc, score in docs_with_scores:
    print(f"Score: {score:.4f}")
    print(doc.page_content[:500])
    print("-" * 80)

Score: 0.3061
The 2022 Credit Agreement contains customary representations and warranties, affirmative and negative covenants and events of default. The negative covenants include restrictions on subsidiary indebtedness, liens and fundamental changes. These covenants are subject to a number of important exceptions and qualifications. The principal financial covenant requires a maximum consolidated leverage ratio. There were no outstanding borrowings under the 2022 Credit Agreement as of December 31, 2023. 

I
--------------------------------------------------------------------------------
Score: 0.3176
As of October 31, 2024, our material cash requirements consisted of future payments for debt and related interests, income tax liabilities related to one-time transition tax, purchase obligations, operating lease and Retirement Income Plan.

We incur interest on a revolving loan and a term loan. Using the same interest rate of October 31, 2024, and assuming borrowings as of October 31, 2

In [10]:
# End-to-end RAG evaluation on a small sample of the gold test set:
# retrieve -> rerank -> generate, then score two subsets with RAGAS.
# Samples that carry a gold context go to the FinQA subset, the rest to FinDER.

MAX_PER_SUBSET = 2    # samples collected for each subset
NUM_CANDIDATES = 10   # nearest neighbours requested from FAISS before reranking
NUM_RERANKED = 5      # passages kept after reranking

generator = ChatGPTGenerator()
# Build the reranker once instead of once per question (loop-invariant work).
reranker = CrossEncoderReranker()

# Load gold test data
with open("../data/data_processed/Train_Val_Test/gold_test_data.json", encoding="utf-8") as f:
    gold_data = json.load(f)

with open("../data/data_processed/Train_Val_Test/embedded_chunks.json", encoding="utf-8") as f:
    all_docs = json.load(f)

# Load FAISS index from <working directory>/data/embeddings rather than a
# machine-specific absolute path.
faiss_path = str(Path().resolve() / "data" / "embeddings")
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# index metadata; only load index directories produced by this project.
faiss_index = FAISS.load_local(
    faiss_path,
    embedder,
    allow_dangerous_deserialization=True
)

# Separate datasets
finqa_eval_dataset = []
finder_eval_dataset = []

finqa_count = 0
finder_count = 0

for sample in gold_data:
    if finqa_count >= MAX_PER_SUBSET and finder_count >= MAX_PER_SUBSET:
        break

    question = sample["question"]
    reference = sample["answer"]
    gold_context = sample.get("gold_context", {})

    # Determine reference contexts first: they decide which subset the sample
    # belongs to, so a sample for an already-full subset can be skipped before
    # paying for embedding, reranking and generation.
    reference_contexts = []
    if isinstance(gold_context, dict):
        reference_contexts = list(gold_context.values())
    elif isinstance(gold_context, str) and gold_context.strip():
        reference_contexts = [gold_context.strip()]

    has_gold_context = bool(reference_contexts)
    subset_count = finqa_count if has_gold_context else finder_count
    if subset_count >= MAX_PER_SUBSET:
        continue

    # Embed and search
    query_embedding = np.array(embedder.embed_query(question), dtype="float32").reshape(1, -1)
    _, neighbor_ids = faiss_index.index.search(query_embedding, NUM_CANDIDATES)
    # FAISS pads the id array with -1 when it finds fewer than k neighbours;
    # without the `i >= 0` check those would silently pick all_docs[-1].
    # NOTE(review): assumes FAISS row ids line up with positions in
    # embedded_chunks.json — confirm against how the index was built.
    candidate_texts = [
        all_docs[i]
        for i in neighbor_ids[0]
        if i >= 0 and "text" in all_docs[i]
    ]

    # Rerank
    reranked_docs = reranker.rerank(question, candidate_texts, top_k=NUM_RERANKED)
    retrieved_contexts = [doc["text"] for doc in reranked_docs]

    # Generate
    response = generator.generate(question, retrieved_contexts)
    print("Question:", question)
    print("Generated Answer:", response)
    print("-" * 80)

    record = {
        "user_input": question,
        # Copy so the record does not share a list object with the generator call.
        "retrieved_contexts": list(retrieved_contexts),
        "response": response,
        "reference": reference,
    }

    # Append based on presence of gold context
    if has_gold_context:
        record["reference_contexts"] = reference_contexts
        finqa_eval_dataset.append(record)
        finqa_count += 1
    else:
        finder_eval_dataset.append(record)
        finder_count += 1

# Evaluate
# NOTE(review): evaluate_ragas_dataset is not imported in any visible cell; it
# must already be available in the session.
print("\n🔍 Evaluating FinQA subset...")
finqa_results = asyncio.run(evaluate_ragas_dataset(finqa_eval_dataset))
print(finqa_results)

print("\n🔍 Evaluating FinDER subset...")
finder_results = asyncio.run(evaluate_ragas_dataset(
    finder_eval_dataset,
    metrics_list=["faithfulness", "answer_accuracy"]
))
print(finder_results)

Question: GM operating margin 2023 vs 2022, GM.
Generated Answer: 5.4 % in 2023 vs 6.6 % in 2022
--------------------------------------------------------------------------------
Question: what percentage of incremental risk-weighted assets are student loans at january 1 , 2010?
Generated Answer: I don’t know
--------------------------------------------------------------------------------
Question: what is the growth rate in net revenue in 2003 for entergy corporation?
Generated Answer: 14.4
--------------------------------------------------------------------------------
Question: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.
Generated Answer: NGC’s board-supervised program combines continuous investment in NIST-aligned defenses, third-party maturity assessments, and active participation in multiple industry ISACs, signaling strong risk management; this visible commitment reassures investors, underpins valuation, and strengthens the company

You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1785 > 512). Running this sequence through the model will result in indexing errors


{'llm_context_precision_with_reference': 0.0000, 'non_llm_context_precision_with_reference': 0.0000, 'context_recall': 0.0000, 'non_llm_context_recall': 0.0000, 'context_entity_recall': 0.0000, 'faithfulness_with_hhem': 0.0000, 'nv_accuracy': 0.0000, 'string_present': 0.0000}

🔍 Evaluating FinDER subset...


You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2038 > 512). Running this sequence through the model will result in indexing errors


{'faithfulness_with_hhem': 0.0000, 'nv_accuracy': 0.8750}


In [11]:
print(finder_results)

{'faithfulness_with_hhem': 0.0000, 'nv_accuracy': 0.8750}


In [None]:
async def test_ceiling_performance(
    num_samples=10,
    train_path="../data/data_processed/Train_Val_Test/gold_test_data.json",
):
    """
    Measure ceiling performance by generating answers from the gold context.

    Only samples whose ``source`` is "FinQA" or "ConvFinQA" are used. Each
    sample's gold context is handed directly to the generator, the answers are
    scored with RAGAS ("answer_accuracy", "string_presence"), and a plain
    exact-match rate is printed alongside.

    Args:
        num_samples: If truthy, selects the window
            ``[num_samples : num_samples + 10]`` of the filtered samples, i.e.
            it acts as a start offset for a fixed 10-sample window. If falsy,
            all filtered samples are used.
            NOTE(review): the name suggests a sample *limit*; confirm whether
            ``[:num_samples]`` was intended.
        train_path: Path to the JSON dataset passed to ``load_train_data``.
            Added after ``num_samples`` so existing positional calls keep
            working; the original body referenced ``train_path`` without
            defining it.

    Returns:
        None. Progress and results are printed.

    NOTE(review): ``load_train_data``, ``get_gold_context`` and
    ``evaluate_ragas_dataset`` are not defined in any visible cell; they must
    already be available in the session.
    """
    # Load and filter dataset
    train_data = load_train_data(train_path)
    train_data = [s for s in train_data if s.get("source") in ("FinQA", "ConvFinQA")]

    # Select the sample window *after* filtering (see docstring note).
    if num_samples:
        train_data = train_data[num_samples:num_samples + 10]

    # Instantiate generator (prompts now handled internally)
    generator = ChatGPTGenerator()

    dataset = []
    print("\nProcessing samples...")

    for i, sample in enumerate(train_data, 1):
        question = sample["question"]
        true_answer = sample["answer"]
        source = sample.get("source", "Unknown")

        relevant_context = get_gold_context(sample)

        # Time the generation call; the original captured start/end but never
        # reported them.
        start = time.perf_counter()
        generated_answer = generator.generate(
            question=question,
            retrieved_docs=[relevant_context],
        )
        latency_s = time.perf_counter() - start

        dataset.append({
            "user_input": question,
            "retrieved_contexts": [relevant_context],
            "response": generated_answer,
            "reference": true_answer
        })

        # Print progress
        print(f"\nSample {i}:")
        print(f"Source: {source}")
        print(f"Question: {question}")
        print(f"Gold Context: {relevant_context}")
        print(f"Generated Answer: {generated_answer}")
        print(f"True Answer: {true_answer}")
        print(f"Generation Latency: {latency_s:.2f}s")
        print("-" * 80)

    # Evaluate with RAGAS
    metrics_list = ["answer_accuracy", "string_presence"]
    print("\nEvaluating ceiling performance...")
    results = await evaluate_ragas_dataset(dataset, metrics_list=metrics_list)

    print("\nCeiling Performance Results:")
    print(results)

    # Additional stats. str() keeps the comparison from failing when an answer
    # is stored as a number rather than a string.
    total_samples = len(dataset)
    exact_matches = sum(
        1
        for record in dataset
        if str(record["response"]).strip() == str(record["reference"]).strip()
    )

    print("\nAdditional Statistics:")
    print(f"Total Samples: {total_samples}")
    print(f"Exact Matches: {exact_matches}")
    if total_samples:
        print(f"Exact Match Rate: {exact_matches / total_samples:.2%}")
    else:
        # Avoid ZeroDivisionError when filtering/slicing left no samples.
        print("Exact Match Rate: n/a (no samples)")

# Run the ceiling-performance test with its default arguments. In a notebook
# kernel `__name__` is "__main__", so this fires whenever the cell is executed.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running (as in a Jupyter kernel) unless nested loops are enabled — confirm.
if __name__ == "__main__":
    asyncio.run(test_ceiling_performance())

In [None]:
# TODO: add latency logging to the retriever.