# Build the Knowledge Base 

In [2]:
import json

with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/merged_dataset.json", "r") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [3]:
if data:
    first_entry = data[0]
    print("First entry in the dataset:")
    print(json.dumps(first_entry, indent=2))
else:
    print("The dataset is empty.")

First entry in the dataset:
{
  "ID": "ADI/2009/page_49.pdf",
  "question": "what is the the interest expense in 2009?",
  "answer": "380",
  "context": "['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local c

In [4]:
source_counts = {}
for item in data:
    source = item.get("source")
    if source:
        source_counts[source] = source_counts.get(source, 0) + 1

for source, count in source_counts.items():
    print(f"{source}: {count} samples")

FinQA: 6203 samples
ConvFinQA: 5302 samples
FinDER: 5696 samples


Dedup the context: <br>
If a sample has Finder and ConFinQA or FinQA as source keep the latter two over the first two as the ID contains meta data. 

In [None]:
import json

def load_jsonl(path: str):
    """Load a JSONL file into a list of dicts."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

file_path = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/merged_dataset.json"
data = load_jsonl(file_path)
print(f"Loaded {len(data)} records.")
print("Sample keys:", list(data[0].keys()))


Loaded 17201 records.
Sample keys: ['ID', 'question', 'answer', 'context', 'gold_context', 'operation', 'source']


In [5]:
from collections import defaultdict, Counter
import hashlib
import json
import os

def norm_text(s):
    if s is None:
        return ""
    return " ".join(str(s).split()).strip().lower()

def hash_text(s):
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def audit_id_context_consistency(
    data, 
    id_key="ID", 
    context_key="context", 
    sample_chars=180
):
    # group rows by ID
    by_id = defaultdict(list)
    for idx, row in enumerate(data):
        rid = row.get(id_key)
        by_id[rid].append((idx, row))

    exact_dup_groups = 0
    conflict_groups = 0
    removable_row_indices = []  # rows we *could* remove because context matches exactly
    conflicts_detail = {}       # {ID: {hash: {"indices":[...], "sample": "...", "count": n}}}

    for rid, rows in by_id.items():
        if rid is None or len(rows) == 1:
            continue  # unique or missing id, skip

        # bucket by normalized context hash
        buckets = defaultdict(list)
        rep = {}
        for idx, row in rows:
            ctx_norm = norm_text(row.get(context_key, ""))
            h = hash_text(ctx_norm)
            buckets[h].append(idx)
            if h not in rep:
                rep[h] = ctx_norm[:sample_chars]

        if len(buckets) == 1:
            # all contexts identical -> safe to dedupe this ID group
            exact_dup_groups += 1
            # mark all but the first as removable
            all_idxs = next(iter(buckets.values()))
            removable_row_indices.extend(sorted(all_idxs)[1:])
        else:
            # same ID, different context -> conflict
            conflict_groups += 1
            conflicts_detail[str(rid)] = {
                h: {
                    "count": len(idxs),
                    "indices": sorted(idxs),
                    "sample": rep[h]
                } for h, idxs in buckets.items()
            }

    stats = {
        "total_rows": len(data),
        "total_ids": len(by_id),
        "duplicate_id_groups": sum(1 for rid, rows in by_id.items() if rid is not None and len(rows) > 1),
        "exact_context_dup_groups": exact_dup_groups,
        "conflict_groups": conflict_groups,
        "removable_rows_count": len(removable_row_indices),
    }

    print(
        f"Rows: {stats['total_rows']}\n"
        f"IDs (unique incl. None): {stats['total_ids']}\n"
        f"Duplicate ID groups: {stats['duplicate_id_groups']}\n"
        f"  - Exact-context groups (safe to collapse): {stats['exact_context_dup_groups']}\n"
        f"  - CONFLICT groups (same ID, different context): {stats['conflict_groups']}\n"
        f"Rows removable safely (exact dups): {stats['removable_rows_count']}"
    )
    return stats, removable_row_indices, conflicts_detail


stats, removable_row_indices, conflicts = audit_id_context_consistency(
    data,
    id_key="ID",
    context_key="context"
)


conflicts_path = "id_context_conflicts.json"
with open(conflicts_path, "w", encoding="utf-8") as f:
    json.dump(conflicts, f, ensure_ascii=False, indent=2)
print(f"Conflicts saved to: {conflicts_path} (IDs with multiple distinct contexts).")


Rows: 17201
IDs (unique incl. None): 7803
Duplicate ID groups: 1997
  - Exact-context groups (safe to collapse): 768
  - CONFLICT groups (same ID, different context): 1229
Rows removable safely (exact dups): 1361
Conflicts saved to: id_context_conflicts.json (IDs with multiple distinct contexts).


In [6]:
def dedupe_only_when_context_matches(
    data, 
    id_key="ID", 
    context_key="context", 
    keep="first"  # or "last"
):
    """
    Remove duplicates *only* when rows share the same ID *and* their normalized context is identical.
    If same ID maps to multiple distinct contexts, keep them all (flag separately).
    """
    # First pass: compute buckets for each ID
    by_id = defaultdict(list)
    for idx, row in enumerate(data):
        rid = row.get(id_key)
        ctx_norm = norm_text(row.get(context_key, ""))
        h = hash_text(ctx_norm)
        by_id[rid].append((idx, h))

    # Decide which indices to keep
    keep_indices = set()
    for rid, entries in by_id.items():
        if rid is None:
            # keep all rows without IDs
            keep_indices.update(idx for idx, _ in entries)
            continue

        # group by context-hash inside this ID
        by_hash = defaultdict(list)
        for idx, h in entries:
            by_hash[h].append(idx)

        if len(by_hash) == 1:
            # Exact-match group: keep only first (or last)
            for h, idxs in by_hash.items():
                idxs_sorted = sorted(idxs)
                if keep == "last":
                    keep_indices.add(idxs_sorted[-1])
                else:
                    keep_indices.add(idxs_sorted[0])
        else:
            # Conflict: multiple contexts for same ID -> keep all
            keep_indices.update(idx for idx, _ in entries)

    cleaned = [row for i, row in enumerate(data) if i in keep_indices]
    removed = len(data) - len(cleaned)
    return cleaned, removed

cleaned_data_safe, removed_count = dedupe_only_when_context_matches(
    data,
    id_key="ID",
    context_key="context",
    keep="first"
)

print(f"Safely removed {removed_count} rows (only exact duplicates by ID+context).")
print(f"Remaining rows: {len(cleaned_data_safe)}")

out_path = "dataset_clean_exact_id_context.jsonl"
with open(out_path, "w", encoding="utf-8") as f:
    for row in cleaned_data_safe:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
print(f"Cleaned dataset written to: {out_path}")

Safely removed 1361 rows (only exact duplicates by ID+context).
Remaining rows: 15840
Cleaned dataset written to: dataset_clean_exact_id_context.jsonl


In [7]:
from collections import defaultdict

def check_duplicate_ids(data, id_key="ID"):
    """Check if IDs are unique in the dataset."""
    seen = set()
    duplicates = defaultdict(list)
    missing = []

    for idx, row in enumerate(data):
        rid = row.get(id_key)
        if rid is None:
            missing.append(idx)
            continue
        if rid in seen:
            duplicates[rid].append(idx)
        else:
            seen.add(rid)
    
    print(f"Total records: {len(data)}")
    print(f"Unique IDs: {len(seen)}")
    print(f"Duplicate IDs: {len(duplicates)}")
    print(f"Rows with missing IDs: {len(missing)}")
    return duplicates, missing

duplicates, missing = check_duplicate_ids(data, id_key="ID")


Total records: 17201
Unique IDs: 7803
Duplicate IDs: 1997
Rows with missing IDs: 0


In [8]:
cleaned_data_safe, removed_count = dedupe_only_when_context_matches(
    data,                      
    id_key="ID",               
    context_key="context",     
    keep="first"               
)

print(f"Safely removed {removed_count} rows (only exact duplicates by ID+context).")
print(f"Remaining rows: {len(cleaned_data_safe)}")

output_path = "dataset_clean_exact_id_context.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for row in cleaned_data_safe:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print(f"Cleaned dataset written to: {output_path}")

Safely removed 1361 rows (only exact duplicates by ID+context).
Remaining rows: 15840
Cleaned dataset written to: dataset_clean_exact_id_context.jsonl


Check if all contexts are still in there after deduplicating: 

In [9]:
from collections import Counter
import hashlib

def normalize_context(text):
    """Lowercase, trim spaces, collapse whitespace for fair comparison."""
    if text is None:
        return ""
    return " ".join(str(text).split()).strip().lower()

def hash_context(text):
    """Hash for memory-efficient comparison."""
    return hashlib.sha256(normalize_context(text).encode("utf-8")).hexdigest()

# Hash contexts for original dataset
original_contexts = {hash_context(row.get("context")) for row in data}

# Hash contexts for cleaned dataset
cleaned_contexts = {hash_context(row.get("context")) for row in cleaned_data_safe}

print(f"Original unique contexts: {len(original_contexts)}")
print(f"Cleaned unique contexts: {len(cleaned_contexts)}")

Original unique contexts: 8925
Cleaned unique contexts: 8925


In [10]:
cleaned_context_list = [normalize_context(row.get("context")) for row in cleaned_data_safe]
original_context_list = [normalize_context(row.get("context")) for row in data]

cleaned_context_list[:5]

["['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses .', 'relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008 , a 10% ( 10 % ) unfavorabl

# Create a Vector DB 

In [11]:
rows = cleaned_data_safe 

In [12]:
rows[0]

{'ID': 'ADI/2009/page_49.pdf',
 'question': 'what is the the interest expense in 2009?',
 'answer': '380',
 'context': "['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses .', '

In [13]:
import re
from collections import Counter


rows = cleaned_data_safe

# Regex handles: TICKER/YEAR/page49.pdf  or page_49.pdf or page-49.pdf
PAGE_RE = re.compile(
    r"(?P<ticker>[A-Za-z]{1,15})/(?P<year>(19|20)\d{2})/page[_-]?(?P<page>\d+)\.pdf$",
    flags=re.IGNORECASE
)

def extract_ticker_year_page_from_id(id_value: str):
    """Parse 'TICKER/YEAR/page_12.pdf' from the ID field."""
    if not id_value:
        return None, None, None
    s = str(id_value).strip()

    m = PAGE_RE.search(s)
    if m:
        return m.group("ticker").upper(), int(m.group("year")), int(m.group("page"))

    # Fallback: .../TICKER/YEAR/<file-with-digits>.pdf
    parts = s.strip("/").split("/")
    if len(parts) >= 3:
        ticker = parts[-3].upper()
        year_s = parts[-2]
        file_part = parts[-1]
        year = int(year_s) if year_s.isdigit() and len(year_s) == 4 else None

        m2 = re.search(r"(\d+)", file_part)
        page = int(m2.group(1)) if m2 else None

        if re.fullmatch(r"[A-Za-z]{1,15}", ticker):
            return ticker, year, page

    return None, None, None

parsed_ok = Counter()
parsed_fail = Counter()

for r in rows:
    src = (r.get("source") or r.get("title") or "").strip().lower()
    # Only parse FinQA / ConFinQA / ConvFinQA
    if src in {"finqa", "confinqa", "convfinqa"}:
        # IMPORTANT: parse from ID, not source_id
        t, y, p = extract_ticker_year_page_from_id(r.get("ID"))
        r["ticker"], r["year"], r["page"] = t, y, p
        if t is not None and y is not None and p is not None:
            parsed_ok[src] += 1
        else:
            parsed_fail[src] += 1

print("Parsed counts:", parsed_ok)
print("Failed counts:", parsed_fail)

Parsed counts: Counter({'convfinqa': 5302, 'finqa': 4842})
Failed counts: Counter()


In [14]:
rows[0]

{'ID': 'ADI/2009/page_49.pdf',
 'question': 'what is the the interest expense in 2009?',
 'answer': '380',
 'context': "['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses .', '

In [15]:
import json

out_path = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/context_with_metadata_dedup_enriched.jsonl"
with open(out_path, "w", encoding="utf-8") as f:
    for r in rows:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"Wrote: {out_path}")


Wrote: /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/context_with_metadata_dedup_enriched.jsonl


## Chunk the Data 

In [16]:
from langchain.docstore.document import Document

records = rows  

def build_docs(records):
    docs = []
    for i, r in enumerate(records):
        text = str(r.get("context") or "")
        meta = {
            "row_index": i,
            "ID": r.get("ID"),
            "source": r.get("source") or r.get("title"),
            "source_id": r.get("source_id") or r.get("ID"),
            "ticker": r.get("ticker"),
            "year": r.get("year"),
            "page": r.get("page"),
        }
        docs.append(Document(page_content=text, metadata=meta))
    return docs

docs_raw = build_docs(records)
print(f"Built {len(docs_raw)} base docs")

Built 15840 base docs


In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

def chunk_documents_recursive(docs: list[Document], chunk_size=1500, chunk_overlap=200) -> list[Document]:
    """
    Split documents into chunks of approximately `chunk_size` characters
    with an overlap of `chunk_overlap` characters.
    Uses a recursive splitter to handle texts without natural separators.
    """
    # Create the recursive splitter with fallback separators
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunked = []
    for doc in docs:
        # Copy metadata from the original doc
        base_meta = doc.metadata.copy()
        row_index = base_meta.get("row_index")

        # Split the current doc
        pieces = splitter.split_documents([doc])

        # Attach metadata to each chunk, including a unique chunk ID
        for local_idx, chunk in enumerate(pieces):
            chunk.metadata = {
                **base_meta,
                "chunk_id": f"row_{row_index}_chunk_{local_idx}"
            }
            chunked.append(chunk)

    return chunked

In [18]:
chunked_docs = chunk_documents_recursive(
    docs_raw, 
    chunk_size=1500, 
    chunk_overlap=200
)

print(f"Created {len(chunked_docs)} chunks.")

Created 56325 chunks.


In [19]:
chunked_docs[0].metadata, chunked_docs[0].page_content[:160]

({'row_index': 0,
  'ID': 'ADI/2009/page_49.pdf',
  'source': 'FinQA',
  'source_id': 'ADI/2009/page_49.pdf',
  'ticker': 'ADI',
  'year': 2009,
  'page': 49,
  'chunk_id': 'row_0_chunk_0'},
 "['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor chan")

In [20]:
sizes = [len(doc.page_content) for doc in chunked_docs]
print(f"Max chunk size: {max(sizes)}")
print(f"Average chunk size: {sum(sizes)//len(sizes)}")

Max chunk size: 1500
Average chunk size: 1139


In [21]:
# count the number of chunks per source
source_counts = {}
for rec in records:
    source = rec.get("source")
    if source:
        source_counts[source] = source_counts.get(source, 0) + 1

print("Chunks per source:")
for source, count in source_counts.items():
    print(f" - {source}: {count}")


Chunks per source:
 - FinQA: 4842
 - ConvFinQA: 5302
 - FinDER: 5696


## Embed Chunks 

In [26]:
import os
import json
from collections import Counter
from tqdm import tqdm
import math

try:
    from langchain_openai import OpenAIEmbeddings
except ImportError:
    from langchain.embeddings.openai import OpenAIEmbeddings

# --- 1) Initialize embedder ---
embedder = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002",  # consider "text-embedding-3-small"
)

# --- 2) Prepare data ---
records = []
texts = []

def clean_text(s):
    if s is None:
        return ""
    return " ".join(str(s).split())

for doc in chunked_docs:
    meta = doc.metadata or {}
    text = clean_text(doc.page_content)
    records.append({
        "chunk_id":  meta.get("chunk_id"),
        "row_index": meta.get("row_index"),
        "source":    meta.get("source"),
        "ID":        meta.get("ID"),
        "source_id": meta.get("source_id"),
        "ticker":    meta.get("ticker"),
        "year":      meta.get("year"),
        "page":      meta.get("page"),
        "text":      text,
        "empty_text": (len(text) == 0),
    })
    texts.append(text)

print(f"Prepared {len(records)} chunks for embedding")

# --- 3) Quick source count ---
src_counts = Counter((r.get("source") or "<missing>").strip().lower() for r in records)
print("Counts per source (input):", dict(src_counts))

# --- 4) Embed in batches with progress bar ---
BATCH_SIZE = 256
embeddings = []

print("\nEmbedding in progress...")
for start in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding batches"):
    batch = texts[start:start + BATCH_SIZE]
    vecs = embedder.embed_documents(batch)
    if len(vecs) != len(batch):
        raise RuntimeError(f"Embed size mismatch at batch starting {start}")
    embeddings.extend(vecs)

# --- 5) Validation and attach embeddings ---
if len(embeddings) != len(records):
    raise RuntimeError(f"Embedding count mismatch: {len(embeddings)} vs {len(records)}")

dim = None
nan_count = 0
for rec, emb in zip(records, embeddings):
    if dim is None:
        dim = len(emb)
    if len(emb) != dim:
        raise ValueError(f"Inconsistent embedding dims: expected {dim}, got {len(emb)}")
    if any((v is None) or (isinstance(v, float) and math.isnan(v)) for v in emb):
        nan_count += 1
    rec["embedding"] = emb

if nan_count:
    raise ValueError(f"Found {nan_count} embeddings containing NaNs/None.")

print(f"Embedding dimension: {dim}")

# --- 6) Save to JSON ---
output_path = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/embedded_chunks2.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

print(f"Embedded chunks saved to: {output_path}")

# --- 7) Optional sanity: count non-empty text per source ---
non_empty_counts = Counter(
    (r.get("source") or "<missing>").strip().lower()
    for r in records if not r["empty_text"]
)
print("Non-empty text counts per source:", dict(non_empty_counts))

Prepared 56325 chunks for embedding
Counts per source (input): {'finqa': 19576, 'convfinqa': 20762, 'finder': 15987}

Embedding in progress...


Embedding batches: 100%|██████████| 221/221 [13:22<00:00,  3.63s/it]


Embedding dimension: 1536
Embedded chunks saved to: /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/embedded_chunks2.json
Non-empty text counts per source: {'finqa': 19576, 'convfinqa': 20762, 'finder': 15987}


In [None]:
# check the first sample in /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/embedded_chunks2.json
import json
from pprint import pprint

# Path to your embeddings file
EMBEDDED_JSON = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/embedded_chunks2.json"

# Load data
with open(EMBEDDED_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Total records: {len(data)}\n")

# Take the first sample
first = data[0]

# Preview the keys
print("Keys in first record:", list(first.keys()), "\n")

# Pretty-print key values, truncating long fields like embeddings
preview = first.copy()
if "embedding" in preview:
    preview["embedding"] = f"<{len(preview['embedding'])}-dim vector; head={preview['embedding'][:5]}>"

pprint(preview)


Total records: 56325

Keys in first record: ['chunk_id', 'row_index', 'source', 'ID', 'source_id', 'ticker', 'year', 'page', 'text', 'empty_text', 'embedding'] 

{'ID': 'ADI/2009/page_49.pdf',
 'chunk_id': 'row_0_chunk_0',
 'embedding': '<1536-dim vector; head=[-0.02362525276839733, '
              '-0.011167091317474842, 0.0005769053823314607, '
              '-0.023784972727298737, -0.004731705877929926]>',
 'empty_text': False,
 'page': 49,
 'row_index': 0,
 'source': 'FinQA',
 'source_id': 'ADI/2009/page_49.pdf',
 'text': "['interest rate to a variable interest rate based on the three-month "
         'libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , '
         "2009 ) .', 'if libor changes by 100 basis points , our annual "
         "interest expense would change by $ 3.8 million .', 'foreign currency "
         "exposure as more fully described in note 2i .', 'in the notes to "
         'consolidated financial statements contained in item 8 of this annual '
     

## Build FAISS Index 

In [2]:
import os, json, faiss, numpy as np
from collections import OrderedDict, Counter
import math

EMBEDDED_JSON = "/Users/christel/Desktop/Thesis/thesis_repo/Can_be_deleted/Die_wahrheit/embedded_chunks2.json"
OUT_DIR       = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/"
INDEX_PATH    = os.path.join(OUT_DIR, "faiss_index.idx")
META_PATH     = os.path.join(OUT_DIR, "retriever_metadata.pkl")

with open(EMBEDDED_JSON, "r", encoding="utf-8") as f:
    recs = json.load(f)

def _norm(s):
    return " ".join((s or "").split())

# ---- sanity: required fields present? ----
missing_src = sum(1 for r in recs if not r.get("source"))
missing_id  = sum(1 for r in recs if not r.get("ID"))
missing_emb = sum(1 for r in recs if "embedding" not in r)
if missing_src or missing_id or missing_emb:
    raise ValueError(
        f"Refusing to build: missing source={missing_src}, missing ID={missing_id}, "
        f"missing embedding field={missing_emb}"
    )

# ---- counts before any dedupe ----
before_cnt = Counter((r.get("source") or "<missing>").strip().lower() for r in recs)
print("Counts BEFORE dedupe:", dict(before_cnt))

# ---- safer dedupe: prefer (ID, chunk_id), then (ID, text), then (source, text) ----
dedup = OrderedDict()
for r in recs:
    id_ = _norm(r.get("ID"))
    cid = _norm(r.get("chunk_id"))
    src = (r.get("source") or "").strip().lower()
    txt = _norm(r.get("text", ""))

    if id_ and cid:
        key = ("id_chunk", id_, cid)
    elif id_:
        key = ("id_text", id_, txt)
    else:
        key = ("src_text", src, txt)

    if key not in dedup:
        dedup[key] = r

recs = list(dedup.values())

# ---- counts after dedupe ----
after_cnt = Counter((r.get("source") or "<missing>").strip().lower() for r in recs)
print("Counts AFTER  dedupe:", dict(after_cnt))

# ---- build matrix (float32), basic validation ----
X = np.asarray([r["embedding"] for r in recs], dtype="float32")
if X.ndim != 2:
    raise ValueError(f"Embedding array shape is {X.shape}, expected 2-D [n, d]")
if not np.isfinite(X).all():
    bad = np.isnan(X).sum() + np.isinf(X).sum()
    raise ValueError(f"Found {bad} non-finite values in embeddings")

# Cosine via inner product => L2-normalize rows
faiss.normalize_L2(X)

# ---- aligned metadata (row order == FAISS row order) ----
meta = [{
    "chunk_id": r.get("chunk_id"),
    "row_index": r.get("row_index"),
    "text": r.get("text", ""),
    "ID": r.get("ID"),
    "source": r.get("source"),
    "page": r.get("page"),
    "year": r.get("year"),
    "ticker": r.get("ticker"),
} for r in recs]

# ---- build exact IP index ----
d = X.shape[1]
index = faiss.IndexFlatIP(d)
index.add(X)

os.makedirs(OUT_DIR, exist_ok=True)
faiss.write_index(index, INDEX_PATH)
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print(f"Saved index -> {INDEX_PATH} (ntotal={index.ntotal})")
print(f"Saved aligned meta -> {META_PATH} (len={len(meta)})")
assert index.ntotal == len(meta), "FAISS rows and metadata length must match"

Counts BEFORE dedupe: {'finqa': 19576, 'convfinqa': 20762, 'finder': 15987}
Counts AFTER  dedupe: {'finqa': 19576, 'convfinqa': 20762, 'finder': 15987}
Saved index -> /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/faiss_index.idx (ntotal=56325)
Saved aligned meta -> /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/retriever_metadata.pkl (len=56325)


# Implement the Retriever 

In [4]:
import sys
from pathlib import Path
sys.path.append("/Users/christel/Desktop/Thesis/thesis_repo")


from src.retrievers.vectorrag.index_faiss import build_faiss_index_from_json
from src.retrievers.vectorrag.document_loader import load_json_documents
from src.retrievers.vectorrag.chunker import chunk_documents
from src.retrievers.vectorrag.embedder import init_embedder
from src.retrievers.vectorrag.index_faiss import build_faiss_index, load_faiss_index
from src.retrievers.vectorrag.retriever import rerank_search, faiss_search
from src.retrievers.vectorrag.reranker import CrossEncoderReranker

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
import numpy as np
import pickle
import faiss
import os

# ---- Paths ----
INDEX_PATH = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/faiss_index.idx"
META_PATH  = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/retriever_metadata.pkl"

# ---- Load FAISS index ----
faiss_index = faiss.read_index(INDEX_PATH)

# ---- Load metadata from pickle ----
with open(META_PATH, "rb") as f:  # <-- rb (read-binary) mode for pickle
    aligned_meta = pickle.load(f)

# ---- Load embedder ----
try:
    from langchain_openai import OpenAIEmbeddings
except ImportError:
    from langchain.embeddings.openai import OpenAIEmbeddings

embedder = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002"  # Or "text-embedding-3-small" for newer models
)

# ---- Utility to embed query ----
def make_query_tensor(query: str) -> torch.Tensor:
    vec = np.array(embedder.embed_query(query), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(vec)
    return torch.from_numpy(vec)


In [27]:
def search_faiss_raw(query: str, top_k: int = 5):
    query_tensor = make_query_tensor(query)
    indices, scores = faiss_search(faiss_index, query_tensor, top_k=top_k)
    
    results = []
    for rank, (idx, score) in enumerate(zip(indices, scores), 1):
        m = aligned_meta[int(idx)]  # metadata aligned to index
        results.append({
            "rank": rank,
            "score": float(score),
            "chunk_id": m.get("chunk_id"),
            "ID": m.get("ID"),
            "source": m.get("source"),
            "text": m.get("text", ""),
            "row_index": m.get("row_index"),
            "page": m.get("page"),
            "ticker": m.get("ticker"),
            "year": m.get("year"),
        })
    return results


In [None]:
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [None]:
train_df = pd.DataFrame(json.load(open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/Train_Val_Test/df_train.json")))
print(train_df[["question"]].head(1))
print(train_df[["gold_context"]].head(1))

                                                                                                                            question
0  for acquired customer-related and network location intangibles , what is the expected annual amortization expenses , in millions?
                                                                                                                                                                                                                                                                                                                                                                                                          gold_context
0  {'text_0': 'american tower corporation and subsidiaries notes to consolidated financial statements ( 3 ) consists of customer-related intangibles of approximately $ 75.0 million and network location intangibles of approximately $ 72.7 million .', 'text_1': 'the customer-related intangibles and network location intangibles 

In [20]:
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
pd.reset_option("display.max_colwidth")

In [28]:
query = "for acquired customer-related and network location intangibles , what is the expected annual amortization expenses , in millions?"
results = search_faiss_raw(query, top_k=5)

print(f"\nTop results for query: {query}\n")
for r in results:
    preview = r["text"].replace("\n", " ")[:200]
    print(f"{r['rank']:>2}. score={r['score']:.3f} | ID={r['ID']} | source={r['source']}")
    print(f"    {preview}...\n")


Top results for query: for acquired customer-related and network location intangibles , what is the expected annual amortization expenses , in millions?

 1. score=0.876 | ID=None | source=None
    ['( 1 ) consists of customer relationships of approximately $ 205.4 million and network location intangibles of approximately $ 55.5 million .', 'the customer relationships and network location intang...

 2. score=0.873 | ID=None | source=None
    92 | 2017 form 10-k finite-lived intangible assets are amortized over their estimated useful lives and tested for impairment if events or changes in circumstances indicate that the asset may be impair...

 3. score=0.872 | ID=None | source=None
    ['( 1 ) consists of customer-related intangibles of approximately $ 0.4 million and network location intangibles of approximately $ 0.7 million .', 'the customer-related intangibles and network locati...

 4. score=0.870 | ID=None | source=None
    ['american tower corporation and subsidiaries notes to

In [29]:
import numpy as np
import faiss
import os

# ---- Make sure embedder is initialized ----
try:
    from langchain_openai import OpenAIEmbeddings
except ImportError:
    from langchain.embeddings.openai import OpenAIEmbeddings

embedder = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002"
)

# ---- Define the function ----
def embed_query_vec(query: str) -> np.ndarray:
    """Return normalized query vector for cosine FAISS search."""
    v = np.array(embedder.embed_query(query), dtype="float32")[None, :]
    faiss.normalize_L2(v)
    return v

In [None]:
# show raw FAISS output (indices + scores) and check uniqueness
q = query
k = 5

v = embed_query_vec(q)                 
scores, idxs = index.search(v, k)

print("indices:", idxs[0].tolist())
print("scores: ", [round(float(s), 3) for s in scores[0]])
print("unique idxs:", len(set(idxs[0].tolist())), "of", len(idxs[0]))

import hashlib
def h(s): return hashlib.sha256((s or "").encode("utf-8")).hexdigest()[:10]

for i, s in zip(idxs[0], scores[0]):
    m = meta[int(i)]
    print(f"i={int(i):6d} score={float(s):.3f} chunk_id={m.get('chunk_id')} id={m.get('ID')} src={m.get('source')}"
          f" text_hash={h(m.get('text',''))} len={len(m.get('text',''))}")


indices: [6084, 8719, 3491, 17, 11401]
scores:  [0.876, 0.873, 0.872, 0.87, 0.868]
unique idxs: 5 of 5
i=  6084 score=0.876 chunk_id=row_2118_chunk_1 id=None src=None text_hash=f1730790f4 len=1495
i=  8719 score=0.873 chunk_id=row_5094_chunk_0 id=None src=None text_hash=2d91bb20e4 len=1326
i=  3491 score=0.872 chunk_id=row_1040_chunk_1 id=None src=None text_hash=55974a32a9 len=1498
i=    17 score=0.870 chunk_id=row_5_chunk_0 id=None src=None text_hash=18eef8cab3 len=1486
i= 11401 score=0.868 chunk_id=row_7696_chunk_1 id=None src=None text_hash=f32fb15569 len=997


Try to improve performance using a reranker: 

In [None]:
# --- CONFIG ---
TOP_K_FAISS = 50   # retrieve this many from FAISS
RERANK_K    = 10   # keep this many after reranking
QUERY       = q

# --- imports ---
import os, json, numpy as np, faiss, torch
from src.retrievers.vectorrag.retriever import rerank_search    
from src.retrievers.vectorrag.reranker import CrossEncoderReranker 

# Load index + metadata
index = faiss.read_index(INDEX_PATH)
with open(META_PATH, "r", encoding="utf-8") as f:
    aligned_meta = json.load(f)

# Build the “all_documents” list in FAISS order (must align with index add-order)
all_documents = [m.get("text", "") for m in aligned_meta]

# ---- Build the SAME embedder used for documents ----
try:
    from langchain_openai import OpenAIEmbeddings
except ImportError:
    from langchain.embeddings.openai import OpenAIEmbeddings

embedder = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    model="text-embedding-ada-002",
)

def make_query_tensor(text: str) -> torch.Tensor:
    """Embed + L2-normalize so inner product = cosine."""
    v = np.array(embedder.embed_query(text), dtype="float32").reshape(1, -1)
    faiss.normalize_L2(v)
    return torch.from_numpy(v)

# ---- Call your reranker pipeline ----
q_emb = make_query_tensor(QUERY)

reranked_texts, reranked_scores = rerank_search(
    query=QUERY,
    query_embedding=q_emb,
    index=index,
    all_documents=all_documents,
    top_k=TOP_K_FAISS,
    rerank_k=RERANK_K,
    return_scores=True
)

# ---- Pretty print ----
print(f"\nTop {RERANK_K} results for query: {QUERY}\n")
for i, (txt, sc) in enumerate(zip(reranked_texts, reranked_scores), 1):
    preview = (txt or "").replace("\n", " ")[:220]
    print(f"{i:>2}. score={float(sc):.4f}  {preview} ...\n")



Top 10 results for query: for acquired customer-related and network location intangibles , what is the expected annual amortization expenses , in millions?

 1. score=0.9933  ( 1 ) consists of customer-related intangibles of approximately $ 0.4 million and network location intangibles of approximately $ 0.7 million . the customer-related intangibles and network location intangibles are being  ...

 2. score=0.9933  ( 1 ) consists of customer-related intangibles of approximately $ 80.0 million and network location intangibles of approximately $ 38.0 million . the customer-related intangibles and network location intangibles are bein ...

 3. score=0.9931  american tower corporation and subsidiaries notes to consolidated financial statements ( 3 ) consists of customer-related intangibles of approximately $ 75.0 million and network location intangibles of approximately $ 72 ...

 4. score=0.9928  ( 1 ) consists of customer relationships of approximately $ 205.4 million and network locati