In [1]:
# --- Standard library ---
import json
import sys
from pathlib import Path

# --- Third-party ---
import pandas as pd
import torch

# Make the project's src/ directory importable.
# Assumes the kernel's working directory is notebooks/, so src/ is a sibling folder.
SRC_DIR = Path().resolve().parent / "src"
sys.path.append(str(SRC_DIR))

# --- Project modules (only resolvable once src/ is on sys.path) ---
from vectorrag.chunker import chunk_finder_sample, chunk_all
from vectorrag.embedder import embed_passages, embed_query
from vectorrag.retriever import build_faiss_index, search, save_index, load_index, rerank_search
from generator import ChatGPTGenerator

Using MPS


# VectorRAG Pipeline

This implementation defines a modular and extensible VectorRAG system for financial document question answering, supporting evaluation on datasets such as FinQA and FinDER. It is designed to run efficiently on local hardware, including Apple Silicon (M1/M2).

## Pipeline Overview

### Step 1 – Chunking
`chunker.py` splits each document sample into text and table-based passages using the `chunk_finder_sample()` function. This preserves document structure and prepares it for fine-grained retrieval.

### Step 2 – Embedding
`embedder.py` encodes all chunks into vector representations using the `BAAI/bge-base-en-v1.5` model. Batching and MPS acceleration are used for efficient local inference on MacBook hardware.

### Step 3 – Indexing
The resulting chunk embeddings are indexed with FAISS using L2 (Euclidean) distance (`retriever.py`), allowing fast nearest-neighbor retrieval.

### Step 4 – Dense Retrieval
Given a user query, `embed_query()` embeds it into the same space and retrieves the top-k similar chunks from the FAISS index.

### Step 5 – Cross-Encoder Reranking (optional)
If enabled, `reranker.py` re-scores the top-k retrieved documents using a cross-encoder (`cross-encoder/ms-marco-MiniLM-L-6-v2`) for improved semantic relevance.

### Step 6 – Answer Generation
`generator.py` uses the top reranked chunks to construct a prompt for the OpenAI API (or local Fin-R1 model), generating a grounded answer to the user query.

---

## Key Features

- Embedding model: `BAAI/bge-base-en-v1.5`
- FAISS-based vector retrieval (L2 / Euclidean distance)
- Optional cross-encoder reranking via `cross-encoder/ms-marco-MiniLM-L-6-v2`
- Modular architecture for easy experimentation
- Optimized for CPU or MPS (Mac) environments

In [2]:
import json

# Location of the merged dataset, stored as JSON Lines (one JSON object per line).
# The original literal started with a doubled slash ("//Users/..."); it is normalised
# to a single leading slash here.
# NOTE(review): this is still a machine-specific absolute path — consider deriving it
# from a project-level data directory so the notebook runs on other machines.
MERGED_DATASET_PATH = Path(
    "/Users/alex/Documents/Data Science Master/thesis_RAG/data/data_processed/merged_dataset.json"
)

# Read explicitly as UTF-8 and skip blank lines, which json.loads() would reject.
with open(MERGED_DATASET_PATH, "r", encoding="utf-8") as f:
    merged_dataset = [json.loads(line) for line in f if line.strip()]

# Tabular view of the same records. Later inspection cells read `df`, so it has to be
# defined here for the notebook to survive Restart & Run All.
# NOTE(review): restored from the previously commented-out line — confirm this is how
# `df` was originally built.
df = pd.DataFrame(merged_dataset)

In [7]:
# Peek at the first ten records to sanity-check the columns
df.head(n=10)

Unnamed: 0,ID,question,answer,context,gold_context,operation,source
0,ADI/2009/page_49.pdf,what is the the interest expense in 2009?,380,['interest rate to a variable interest rate ba...,{'text_1': 'if libor changes by 100 basis poin...,"divide(100, 100), divide(3.8, #0)",FinQA
1,AAL/2018/page_13.pdf,what was the total operating expenses in 2018 ...,41932,['the following table shows annual aircraft fu...,{'table_1': 'year the 2018 of gallons is 4447 ...,"divide(9896, 23.6%)",FinQA
2,INTC/2013/page_71.pdf,what percentage of total cash and investments ...,53%,['the fair value of our grants receivable is d...,{'table_1': '( in millions ) the available-for...,"divide(14001, 26302)",FinQA
3,ETR/2008/page_313.pdf,what is the growth rate in net revenue in 2008?,-3.2%,"[""entergy louisiana , llc management's financi...",{'table_1': 'the 2007 net revenue of amount ( ...,"subtract(959.2, 991.1), divide(#0, 991.1)",FinQA
4,C/2010/page_272.pdf,what was the growth rate of the loans held-for...,56.25%,"['the significant changes from december 31 , 2...",{'table_1': 'in billions of dollars the decemb...,"divide(2.5, 1.6), divide(#0, 1.6)",FinQA
5,AMT/2012/page_121.pdf,for acquired customer-related and network loca...,7.4,['american tower corporation and subsidiaries ...,{'text_0': 'american tower corporation and sub...,"add(75.0, 72.7), divide(#0, 20)",FinQA
6,GIS/2019/page_45.pdf,in 2019 what was the percent of the net earnin...,63.6%,['free cash flow conversion rate we believe th...,{'table_1': 'in millions the net earnings incl...,"divide(1786.2, 2807.0)",FinQA
7,IPG/2009/page_89.pdf,what percentage decrease occurred from 2011-20...,96.55%,['notes to consolidated financial statements 2...,{'table_1': 'the deferred acquisition payments...,"subtract(34.8, 1.2), divide(#0, 34.8), multipl...",FinQA
8,CDNS/2018/page_32.pdf,how is net change in cash from financing activ...,56.6,['issuer purchases of equity securities in jan...,{'table_3': 'period the december 2 2018 2013 d...,"multiply(1327657, 42.61), divide(#0, const_100...",FinQA
9,GIS/2008/page_83.pdf,what is the change in net assets from 2007 to ...,6.9,['contributions and future benefit payments we...,{'text_4': 'it had net assets of $ 2309.9 mill...,"subtract(2309.9, 2303.0)",FinQA


In [14]:
# Inspect which Python types appear in the `context` column
context_col = df["context"]

# Count the types found in the context column
print("Type breakdown:")
print(context_col.apply(type).value_counts())

# Upper bound on how much of each sample value is echoed, so a single long context
# does not flood the cell output.
MAX_PREVIEW_CHARS = 300

# Show a few example entries. Iterating over head(3) rather than range(3) avoids an
# IndexError when the frame holds fewer than three rows.
print("\nSample values:")
for i, value in enumerate(context_col.head(3)):
    preview = str(value)
    if len(preview) > MAX_PREVIEW_CHARS:
        preview = preview[:MAX_PREVIEW_CHARS] + "…"
    print(f"Row {i} type: {type(value)}")
    print(f"Row {i} value: {preview}")
    print("─" * 60)

Type breakdown:
context
<class 'str'>     11505
<class 'list'>     5696
Name: count, dtype: int64

Sample values:
Row 0 type: <class 'str'>
Row 0 value: ['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local cu

In [3]:
# Break the loaded records into text and table chunks; chunk_all also hands back
# the accompanying metadata object.
all_chunks, chunk_metadata = chunk_all(merged_dataset)

n_chunks = len(all_chunks)
print(f"Total chunks created: {n_chunks}")



Total chunks created: 6113




In [4]:
# Number of passages handed to the embedder per batch
EMBED_BATCH_SIZE = 64

# Encode every chunk; the work is done batch by batch inside embed_passages
embeddings = embed_passages(all_chunks, batch_size=EMBED_BATCH_SIZE)

embeddings_shape = embeddings.shape
print(f"Embeddings shape: {embeddings_shape}")

Embedding 6113 passages in batches of 64...


🔄 Embedding progress: 100%|██████████| 96/96 [03:14<00:00,  2.02s/it]

Embeddings shape: torch.Size([6113, 768])





In [7]:
# Output folder for everything derived from the embedding step; created on demand
save_dir = Path("data/embeddings")
save_dir.mkdir(parents=True, exist_ok=True)

embeddings_file = save_dir / "embeddings.pt"
metadata_file = save_dir / "chunk_metadata.json"

# Persist the embedding tensor
torch.save(embeddings, embeddings_file)

# Persist the chunk metadata as a single JSON document
metadata_file.write_text(json.dumps(chunk_metadata))

print(f"✅ Saved embeddings and metadata to {save_dir}")

✅ Saved embeddings and metadata to data/embeddings


In [5]:
# Build the FAISS index over the chunk embeddings
index = build_faiss_index(embeddings)

# Make sure the target folder exists so this cell does not depend on another cell
# having created it first.
# NOTE(review): save_index may already create missing directories — harmless if so.
Path("data/embeddings").mkdir(parents=True, exist_ok=True)

# Persist the index to disk for later reuse
save_index(index, path="data/embeddings/faiss_index.index")


FAISS index saved to data/embeddings/faiss_index.index


In [6]:
# Smoke-test the retrieval pipeline end to end with a single question
query = "What are the obligations of the borrower?"

# Step 1: encode the question with the query embedder
query_embedding = embed_query(query)

# Step 2: search FAISS and rerank the hits.
# `all_documents` must be in the same order as the embeddings the index was built from.
rerank_kwargs = dict(
    query=query,
    query_embedding=query_embedding,
    index=index,
    all_documents=all_chunks,
    top_k=10,
    rerank_k=5,
    return_scores=True,
)
top_docs, scores = rerank_search(**rerank_kwargs)

# Step 3: list the reranked passages, numbered from 1, with their scores
print("Top Reranked Passages:")
for rank, (passage, passage_score) in enumerate(zip(top_docs, scores), start=1):
    print(f"{rank}. (Score: {passage_score:.4f})\n{passage}\n---")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Top Reranked Passages:
1. (Score: -1.0604)
[Text] Loan commitments provide for financing on predetermined terms as long as the client continues to meet specified criteria. These agreements generally carry variable rates of interest and have fixed expiration dates or termination clauses. We typically charge a fee for our loan commitments. Since a commitment may expire without resulting in a loan, our aggregate outstanding commitments may significantly exceed our eventual cash outlay.
Loan commitments involve credit risk not reflected on our Consolidated Balance Sheets. We mitigate exposure to credit risk with internal controls that guide how we review and approve applications for credit, establish credit limits and, when necessary, demand collateral. In particular, we evaluate the creditworthiness of each prospective borrower on a case-by-case basis and, when appropriate, adjust the allowance for credit losses on lending-related commitments. Additional information pertaining to this all

In [None]:
# Save for eval: append this query's reranked results as one JSON line
eval_log_path = Path("data/eval/reranked_log.jsonl")

# Opening in append mode creates the file but not its parent folders, so create them first
eval_log_path.parent.mkdir(parents=True, exist_ok=True)

reranked_metadata = {
    "query": query,
    "reranked_chunks": list(top_docs),
    # json.dumps only accepts built-in numbers; cast in case the reranker returns
    # numpy/torch scalars (NOTE(review): exact score type not visible here).
    "scores": [float(s) for s in scores],
}

with open(eval_log_path, "a") as f:
    f.write(json.dumps(reranked_metadata) + "\n")