# Add meta data to the data

## Create a meta data file 

In [1]:
# check the /Users/christel/Desktop/Thesis/thesis_repo/notebooks/context_with_metadata_dedup_enriched.jsonl file for rows with missing ticker, year or page key 
import json
# Load the JSONL file
file_path = "/Users/christel/Desktop/Thesis/thesis_repo/notebooks/context_with_metadata_dedup_enriched.jsonl"
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file]

# Check for missing keys
missing_ticker = [item for item in data if "ticker" not in item]
missing_year = [item for item in data if "year" not in item]
missing_page = [item for item in data if "page" not in item]

print(f"Missing ticker: {len(missing_ticker)} samples")
print(f"Missing year: {len(missing_year)} samples")
print(f"Missing page: {len(missing_page)} samples")

Missing ticker: 0 samples
Missing year: 0 samples
Missing page: 0 samples


In [4]:
# print the type of keys each sample has 
for i, item in enumerate(data):
    keys = set(item.keys())
    print(f"Sample {i} keys: {keys}")
    if i >= 10:  # Limit output to first 10 samples
        break

Sample 0 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 1 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 2 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 3 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 4 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 5 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 6 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 7 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 8 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 9 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}
Sample 10 keys: {'source_id', 'year', 'ticker', 'source', 'context', 'title', 'page'}


In [5]:
# print the first sample
if data:
    print("First sample:")
    print(json.dumps(data[0], indent=2))

First sample:
{
  "context": "['interest rate to a variable interest rate based on the three-month libor plus 2.05% ( 2.05 % ) ( 2.34% ( 2.34 % ) as of october 31 , 2009 ) .', 'if libor changes by 100 basis points , our annual interest expense would change by $ 3.8 million .', 'foreign currency exposure as more fully described in note 2i .', 'in the notes to consolidated financial statements contained in item 8 of this annual report on form 10-k , we regularly hedge our non-u.s .', 'dollar-based exposures by entering into forward foreign currency exchange contracts .', 'the terms of these contracts are for periods matching the duration of the underlying exposure and generally range from one month to twelve months .', 'currently , our largest foreign currency exposure is the euro , primarily because our european operations have the highest proportion of our local currency denominated expenses .', 'relative to foreign currency exposures existing at october 31 , 2009 and november 1 , 2008

Add another key called chunks, which lists all chunks that where made from this context document with their IDs

In [6]:
import json, re, hashlib
from ast import literal_eval
from collections import defaultdict

meta_path = "/Users/christel/Desktop/Thesis/thesis_repo/notebooks/context_with_metadata_dedup_enriched.jsonl"   # your metadata file
docs_path = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/unique_contexts_filtered.json"
embedded_chunks_path = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json"  # or .jsonl
out_path = "metadata_enriched_with_chunk_ids2.jsonl"

# ---------- helpers ----------
def normalize_text(s):
    if s is None:
        return ""
    if isinstance(s, list):
        s = "\n\n".join(map(str, s))
    else:
        s = str(s).strip()
        if s.startswith("[") and s.endswith("]"):
            # handle stringified lists
            try:
                v = literal_eval(s)
                if isinstance(v, list):
                    s = "\n\n".join(map(str, v))
            except Exception:
                pass
    s = re.sub(r"\s+", " ", s.strip())
    return s

def sha256(s):
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def load_json_or_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        first = f.read(1)
        f.seek(0)
        if first == "[":
            return json.load(f)
        else:
            return [json.loads(line) for line in f if line.strip()]

# ---------- 1) load the original contexts used for chunking ----------
raw_contexts = load_json_or_jsonl(docs_path)  # list[str]
row_index_to_context = {i: normalize_text(x if isinstance(x, str) else "") for i, x in enumerate(raw_contexts)}
context_to_row_index = {v: i for i, v in row_index_to_context.items()}

# ---------- 2) load embedded chunks and group chunk_ids by row_index ----------
chunks = load_json_or_jsonl(embedded_chunks_path)
chunks_by_row = defaultdict(list)
for ch in chunks:
    ridx = ch.get("row_index")
    cid  = ch.get("chunk_id")
    if isinstance(ridx, int) and cid:
        chunks_by_row[ridx].append(cid)

# ---------- 3) load metadata rows and enrich with doc_id + chunk_ids ----------
meta_rows = load_json_or_jsonl(meta_path)

def get_source(row):
    return (row.get("source") or row.get("title") or "").strip()

out = []
missing_match = 0
for m in meta_rows:
    ctx_norm = normalize_text(m.get("context", ""))
    ridx = context_to_row_index.get(ctx_norm)  # None if this meta row didn't exist in the file you chunked
    source = get_source(m)
    doc_id = f"{source}|{m.get('source_id') or ''}"
    doc_hash = sha256(ctx_norm)

    out.append({
        **m,  # keep original metadata
        "doc_id": doc_id,
        "doc_hash": doc_hash,
        "row_index": ridx,                        # helpful for joins
        "chunk_ids": chunks_by_row.get(ridx, []), # empty if not chunked yet
    })
    if ridx is None:
        missing_match += 1

# ---------- 4) write enriched metadata ----------
with open(out_path, "w", encoding="utf-8") as f:
    for row in out:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

print(f"Wrote: {out_path}")
print(f"Total meta rows: {len(meta_rows)} | matched to a row_index: {len(meta_rows)-missing_match} | unmatched: {missing_match}")


Wrote: metadata_enriched_with_chunk_ids2.jsonl
Total meta rows: 7696 | matched to a row_index: 5203 | unmatched: 2493


In [7]:
#inspect the output
import json
# Load the enriched metadata file
with open(out_path, 'r', encoding='utf-8') as file:
    enriched_data = [json.loads(line) for line in file] 

# print the type of keys each sample has
for i, item in enumerate(enriched_data):
    keys = set(item.keys())
    print(f"Sample {i} keys: {keys}")
    if i >= 10:  # Limit output to first 10 samples
        break

Sample 0 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 'doc_id', 'context', 'chunk_ids', 'row_index', 'title', 'page'}
Sample 1 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 'doc_id', 'context', 'chunk_ids', 'row_index', 'title', 'page'}
Sample 2 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 'doc_id', 'context', 'chunk_ids', 'row_index', 'title', 'page'}
Sample 3 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 'doc_id', 'context', 'chunk_ids', 'row_index', 'title', 'page'}
Sample 4 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 'doc_id', 'context', 'chunk_ids', 'row_index', 'title', 'page'}
Sample 5 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 'doc_id', 'context', 'chunk_ids', 'row_index', 'title', 'page'}
Sample 6 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 'doc_id', 'context', 'chunk_ids', 'row_index', 'title', 'page'}
Sample 7 keys: {'source_id', 'year', 'ticker', 'source', 'doc_hash', 

In [9]:
#print the row_index and chunk_id, doc_hash, doc_id of the first sample
if enriched_data:
    print("First sample:")
    print(json.dumps(enriched_data[5], indent=2))
    print("Row index:", enriched_data[5].get("row_index"))
    print("Chunk IDs:", enriched_data[5].get("chunk_ids"))
    print("Doc Hash:", enriched_data[5].get("doc_hash"))
    print("Doc ID:", enriched_data[5].get("doc_id"))
else:
    print("No enriched data found.")

First sample:
{
  "context": "['american tower corporation and subsidiaries notes to consolidated financial statements ( 3 ) consists of customer-related intangibles of approximately $ 75.0 million and network location intangibles of approximately $ 72.7 million .', 'the customer-related intangibles and network location intangibles are being amortized on a straight-line basis over periods of up to 20 years .', '( 4 ) the company expects that the goodwill recorded will be deductible for tax purposes .', 'the goodwill was allocated to the company 2019s international rental and management segment .', 'on september 12 , 2012 , the company entered into a definitive agreement to purchase up to approximately 348 additional communications sites from telef f3nica mexico .', 'on september 27 , 2012 and december 14 , 2012 , the company completed the purchase of 279 and 2 communications sites , for an aggregate purchase price of $ 63.5 million ( including value added tax of $ 8.8 million ) .', 'th

In [10]:
# print the number of samples where chunk_ids is empty
empty_chunk_ids_count = sum(1 for item in enriched_data if not item.get("chunk_ids"))
print(f"Number of samples with empty chunk_ids: {empty_chunk_ids_count}")
# print the number of samples with chunk_ids
non_empty_chunk_ids_count = sum(1 for item in enriched_data if item.get("chunk_ids"))
print(f"Number of samples with non-empty chunk_ids: {non_empty_chunk_ids_count}")

Number of samples with empty chunk_ids: 2493
Number of samples with non-empty chunk_ids: 5203


In [13]:
# number of samples that have a non "none" or "null" ticker, year or page key and non empty chunk_ids
non_empty_ticker_year_page_count = sum(1 for item in enriched_data if 
    item.get("ticker") not in [None, "null", "none"] and
    item.get("year") not in [None, "null", "none"] and
    item.get("page") not in [None, "null", "none"] and
    item.get("chunk_ids")
)
print(f"Number of samples with non-empty ticker, year, page and chunk_ids: {non_empty_ticker_year_page_count}")

Number of samples with non-empty ticker, year, page and chunk_ids: 0


# Update the knowledge graph 

Update one chunk to check whether the code is working

In [12]:
import os, json
from dotenv import load_dotenv
from langchain_neo4j import Neo4jGraph

load_dotenv()

META_PATH = "/Users/christel/Desktop/Thesis/thesis_repo/notebooks/metadata_enriched_with_chunk_ids2.jsonl"
TARGET_CHUNK_ID = "row_7_chunk_1"

graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
)

UPSERT_ONE = """
MATCH (c:Chunk {id: $chunk_id})
SET  c.source      = coalesce($source, c.source),
     c.source_id   = coalesce($source_id, c.source_id),
     c.ticker      = coalesce($ticker, c.ticker),
     c.year        = coalesce(toInteger($year), c.year),
     c.page        = coalesce(toInteger($page), c.page),
     c.text_len    = coalesce(toInteger($text_len), c.text_len),
     c.has_metadata = ($ticker IS NOT NULL OR $year IS NOT NULL OR $page IS NOT NULL)
RETURN c
"""

def stream_json_or_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        first = f.read(1)
        f.seek(0)
        if first == "[":
            for r in json.load(f):
                yield r
        else:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

def find_doc_row_for_chunk(path, chunk_id):
    """Return a metadata row whose chunk_ids list contains chunk_id."""
    for r in stream_json_or_jsonl(path):
        # case 1: per-chunk file (has 'chunk_id')
        if r.get("chunk_id") == chunk_id:
            return r
        # case 2: per-document file (has 'chunk_ids': list)
        ids = r.get("chunk_ids")
        if isinstance(ids, list) and chunk_id in ids:
            return r
    return None

rec = find_doc_row_for_chunk(META_PATH, TARGET_CHUNK_ID)
if not rec:
    print(f"No metadata found in {META_PATH} for chunk_id={TARGET_CHUNK_ID}")
else:
    params = {
        "chunk_id":  TARGET_CHUNK_ID,
        "source":    rec.get("source") or rec.get("title"),
        "source_id": rec.get("source_id"),
        "ticker":    rec.get("ticker"),
        "year":      rec.get("year"),
        "page":      rec.get("page"),
        # you likely don’t have per-chunk text here; 0 is fine
        "text_len":  len(rec.get("text") or "") if "text" in rec else 0,
    }
    res = graph.query(UPSERT_ONE, params)
    print("Updated node:", res[0]["c"] if res else "Chunk not found in DB")

Updated node: {'text_len': 0, 'has_metadata': False, 'id': 'row_7_chunk_1', 'source': 'Finder', 'source_id': 'aaa2a2aa', 'text': 'Senior Vice President, Chief Accounting Officer\n\n\nFredric J. Tomczyk. Mr. Tomczyk is our current Chief Executive Officer and director. He has served as our CEO since September 2023. Mr. Tomczyk served on our Board as an independent director from July 2019 to September 2023. He is the retired President and Chief Executive Officer of TD Ameritrade Holding Corporation, a position he held from October 2008 to October 2016. Prior to this position, he held positions of increasing responsibility and leadership with the TD organization from 1999. Mr. Tomczyk was also a member of the TD Ameritrade board of directors from 2006 to 2007 and 2008 to 2016. Prior to joining the TD organization in 1999, Mr. Tomczyk was President and Chief Executive Officer of London Life. He currently serves on the board of Willis Towers Watson PLC, a publicly traded company, and is a me

it worked. We will now perform this operation on the rest of the chunks 

In [14]:
import os, json
from dotenv import load_dotenv
from langchain_neo4j import Neo4jGraph

load_dotenv()

META_PATH = "/Users/christel/Desktop/Thesis/thesis_repo/notebooks/metadata_enriched_with_chunk_ids2.jsonl"

graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD"),
)

# 0) (optional) ensure unique chunk IDs
graph.query("CREATE CONSTRAINT unique_chunk_id IF NOT EXISTS FOR (c:Chunk) REQUIRE c.id IS UNIQUE")

# 1) REMOVE the old text_len property from all chunks (one statement)
graph.query("MATCH (c:Chunk) REMOVE c.text_len")

# 2) Upsert metadata to chunks (no text_len; has_metadata computed from node values)
UPSERT_CHUNK_META = """
UNWIND $rows AS r
WITH r
WHERE r.chunk_id IS NOT NULL
MATCH (c:Chunk {id: r.chunk_id})
SET  c.source    = coalesce(r.source,    c.source),
     c.source_id = coalesce(r.source_id, c.source_id),
     c.ticker    = coalesce(r.ticker,    c.ticker),
     c.year      = coalesce(toInteger(r.year), c.year),
     c.page      = coalesce(toInteger(r.page), c.page)
SET  c.has_metadata = any(v IN [c.ticker, c.year, c.page] WHERE v IS NOT NULL)
"""

def stream_json_or_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        first = f.read(1)
        f.seek(0)
        if first == "[":
            for r in json.load(f):
                yield r
        else:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

def per_chunk_rows(meta_path):
    """Yield dicts like {'chunk_id', 'source', 'source_id', 'ticker','year','page'}."""
    for r in stream_json_or_jsonl(meta_path):
        # prefer 'source', fallback to 'title'
        src = (r.get("source") or r.get("title") or None)
        base = {
            "source": src,
            "source_id": r.get("source_id"),
            "ticker": r.get("ticker"),
            "year":   r.get("year"),
            "page":   r.get("page"),
        }
        if isinstance(r.get("chunk_ids"), list):
            for cid in r["chunk_ids"]:
                if cid:
                    yield {"chunk_id": cid, **base}
        elif r.get("chunk_id"):  # if you ever have per-chunk file
            yield {"chunk_id": r["chunk_id"], **base}

# 3) Batch update
BATCH = 1000
batch, total = [], 0
for row in per_chunk_rows(META_PATH):
    batch.append(row)
    if len(batch) >= BATCH:
        graph.query(UPSERT_CHUNK_META, {"rows": batch})
        total += len(batch)
        batch.clear()
if batch:
    graph.query(UPSERT_CHUNK_META, {"rows": batch})
    total += len(batch)

print(f"Updated metadata on ~{total} chunks.")


Updated metadata on ~9104 chunks.


# Update the knowledge base 

## Did it work? Gold in context

In [100]:
# === Load JSON files ===
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/merged_gold_eval_dataset.json") as f:
    gold_data = json.load(f)

with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/existing_embeddings_with_meta_data.jsonl") as f:
    embedded_data = [json.loads(line) for line in f]

In [101]:
print(f"Gold data length: {len(gold_data)}")
print(f"Embedded data length: {len(embedded_data)}")

Gold data length: 149
Embedded data length: 10146


In [105]:
import json
import pandas as pd

# === Extract all embedded text chunks ===
embedded_texts = [entry["text"].lower().strip() for entry in embedded_data if "text" in entry]

# === Process gold dataset ===
results = []

for sample in gold_data:
    sample_id = sample.get("ID", "N/A")
    reference_contexts = sample.get("reference_contexts") or []

    
    # Clean and normalize
    reference_contexts = [ctx.lower().strip() for ctx in reference_contexts if isinstance(ctx, str) and ctx.strip()]
    
    reference_present = bool(reference_contexts)
    
    embedded_found = any(
        any(ref in text for text in embedded_texts)
        for ref in reference_contexts
    ) if reference_present else False

    results.append({
        "ID": sample_id,
        "reference_context_present": reference_present,
        "reference_context_embedded": embedded_found
    })

# === Create DataFrame ===
df = pd.DataFrame(results)
df.head(149)


Unnamed: 0,ID,reference_context_present,reference_context_embedded
0,182f0809,False,False
1,c593f878,False,False
2,cb08b8b0,False,False
3,GPN/2009/page_85.pdf,True,True
4,AMT/2012/page_123.pdf,True,True
...,...,...,...
144,AMT/2004/page_91.pdf,False,False
145,AON/2011/page_61.pdf,False,False
146,BDX/2016/page_21.pdf,False,False
147,CAT/2017/page_69.pdf,False,False


In [106]:
print(df)

                        ID  reference_context_present  \
0                 182f0809                      False   
1                 c593f878                      False   
2                 cb08b8b0                      False   
3     GPN/2009/page_85.pdf                       True   
4    AMT/2012/page_123.pdf                       True   
..                     ...                        ...   
144   AMT/2004/page_91.pdf                      False   
145   AON/2011/page_61.pdf                      False   
146   BDX/2016/page_21.pdf                      False   
147   CAT/2017/page_69.pdf                      False   
148  EOG/2017/page_102.pdf                      False   

     reference_context_embedded  
0                         False  
1                         False  
2                         False  
3                          True  
4                          True  
..                          ...  
144                       False  
145                       False  
146      

In [107]:
# count the number of samples where reference_context_embedded is True
embedded_count = df["reference_context_embedded"].sum()
print(f"Number of samples with embedded reference contexts: {embedded_count}")

Number of samples with embedded reference contexts: 50


In [109]:
# count the number of samples where reference_context_present in df is True
present_count = df["reference_context_present"].sum()
print(f"Number of samples with present reference contexts: {present_count}")


Number of samples with present reference contexts: 50


In [111]:
# count the number of samples where reference_context_present is True and reference_context_embedded is True
count_present_and_embedded = df[(df["reference_context_present"]) & (df["reference_context_embedded"])].shape[0]
print(f"Number of samples with reference_context_present and reference_context_embedded: {count_present_and_embedded}")

Number of samples with reference_context_present and reference_context_embedded: 50


In [110]:
# count the number of samples in gold_data where gold_context is not empty
gold_context_count = sum(1 for sample in gold_data if sample.get("gold_context"))
print(f"Number of samples with non-empty gold_context: {gold_context_count}")

Number of samples with non-empty gold_context: 81


In [118]:
# count the number of samples in gold_data where reference_context is not empty
reference_context_count = sum(1 for sample in gold_data if sample.get("reference_contexts"))
print(f"Number of samples with non-empty gold_context: {reference_context_count}")

Number of samples with non-empty gold_context: 118


In [126]:
long_reference_context_count = sum(
    1 for sample in gold_data
    if any(
        isinstance(ctx, str) and len(ctx.strip()) > 10
        for ctx in sample.get("reference_contexts") or []
    )
)

print(f"Number of samples with reference_context longer than 10 characters: {long_reference_context_count}")


Number of samples with reference_context longer than 10 characters: 50


In [127]:
long_gold_context_count = sum(
    1 for sample in gold_data
    if isinstance(sample.get("gold_context"), dict) and any(
        isinstance(text, str) and len(text.strip()) > 10
        for text in sample["gold_context"].values()
    )
)

print(f"Number of samples with gold_context strings longer than 10 characters: {long_gold_context_count}")


Number of samples with gold_context strings longer than 10 characters: 81


In [139]:
import re
import unicodedata

def normalize_for_search(text):
    # Normalize unicode, lowercase, collapse whitespace
    text = unicodedata.normalize("NFKD", text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Replace tabs/newlines/multiple spaces with single space
    return text.strip()

def ctrl_f_like_match(snippet, full_text):
    if not isinstance(snippet, str) or not isinstance(full_text, str):
        return False
    snippet_norm = normalize_for_search(snippet)
    full_text_norm = normalize_for_search(full_text)
    return snippet_norm in full_text_norm



In [146]:
gold_data = pd.DataFrame(gold_data)
gold_data.head()

Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts
0,182f0809,"GM operating margin 2023 vs 2022, GM.","To calculate the operating profit margin, we d...","[CONSOLIDATED INCOME STATEMENTS\n(In millions,...",,Division,FinDER,Arithmetic,[]
1,c593f878,NGC's cyber investments boost investor confide...,The provided information allows us to assess h...,[We recognize the critical importance of maint...,,,FinDER,Non-numerical,[]
2,cb08b8b0,"SBA Comm., credit evals & DTA quality receivab...",The details provided illustrate that SBA Commu...,[Site leasing revenues\n\nRevenue from site le...,,,FinDER,Non-numerical,[]
3,GPN/2009/page_85.pdf,"Q: For GPN, what was the fair value of share a...",265%,['notes to consolidated financial statements 2...,{'text_2': 'the total fair value of share awar...,"subtract(6.2, 1.7), divide(#0, 1.7)",ConvFinQA,Compositional,[the total fair value of share awards vested d...
4,AMT/2012/page_123.pdf,what was the cost per tower in American Tower’...,85607,['american tower corporation and subsidiaries ...,"{'text_8': '( 201ccolombia movil 201d ) , wher...","multiply(182.0, const_1000000), divide(#0, 2126)",FinQA,Compositional,"[( 201ccolombia movil 201d ) , whereby atc sit..."


In [156]:
sample_1 = gold_data[gold_data["ID"] == "VLO/2018/page_99.pdf"]
# filter gold_context and context for sample 1
gold_context_1 = sample_1["gold_context"]
context_1 = sample_1["context"]

In [158]:
print(sample_1)

                      ID                                           question  \
96  VLO/2018/page_99.pdf  Q: For Valero, what was the sum spent on commo...   

   answer                                            context  \
96    4.1  ['table of contents valero energy corporation ...   

                                         gold_context  \
96  {'text_8': 'during the years ended december 31...   

                      operation     source reasoning_type  \
96  add(1.5, 1.3), add(#0, 1.3)  ConvFinQA  Compositional   

                                   reference_contexts  
96  [during the years ended december 31 , 2018 , 2...  


In [155]:
print(gold_context_1)

96    {'text_8': 'during the years ended december 31...
Name: gold_context, dtype: object


In [157]:
print(context_1)

96    ['table of contents valero energy corporation ...
Name: context, dtype: object


In [160]:
import re
import unicodedata

# === Normalize like Ctrl+F ===
def normalize(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKD", text)
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# === Extract gold context dict from Series (row 96) ===
gold_dict = gold_context_1.iloc[0]  # safely extract the dict from the Series

# === Normalize gold_context strings ===
gold_texts = [
    normalize(text)
    for text in gold_dict.values()
    if isinstance(text, str) and text.strip()
]

# === Extract and normalize context string ===
context_chunks = context_1.iloc[0]  # safely extract list from Series
context_text = normalize(" ".join(chunk for chunk in context_chunks if isinstance(chunk, str)))

# === Perform match ===
match_found = any(gold in context_text for gold in gold_texts)

# === Print result ===
print("✅ Match found!" if match_found else "❌ No match found.")


❌ No match found.


In [141]:
print(gold_context_1)

{}


In [None]:
match_found = any(
    ctrl_f_like_match(gold_text, ctx)
    for gold_text in gold_texts
    for ctx in context_texts
)


In [133]:
present_count = 0

for sample in gold_data:
    if has_gold_context(sample):
        present_count += 1

print(f"Samples with gold_context present: {present_count}")


Samples with gold_context present: 81


In [137]:
# sample 1 == samples where ID == AWK\/2018\/page_152.pdf
sample_1 = df[df["ID"] == "VLO\/2018\/page_99.pdf"]

present = has_gold_context(sample_1)
print(f"Sample 1 has gold_context present: {present}")

Sample 1 has gold_context present: False


In [131]:
count_gold_context_in_context = 0

for sample in gold_data:
    gold_context = sample.get("gold_context")
    context_list = sample.get("context", [])
    
    if isinstance(gold_context, dict) and gold_context and isinstance(context_list, list):
        gold_texts = [normalize(val) for val in gold_context.values() if isinstance(val, str)]
        context_texts = [normalize(ctx) for ctx in context_list if isinstance(ctx, str)]

        if any(
            any(gold_text in context_text for context_text in context_texts)
            for gold_text in gold_texts
        ):
            count_gold_context_in_context += 1

print(f"Number of samples where gold_context is present in context: {count_gold_context_in_context}")


Number of samples where gold_context is present in context: 0


## Identify contexts that need to be embedded 

In [20]:
# filter for finder and convfinqa samples in /Users/christel/Desktop/Thesis/thesis_repo/data/gold_test_data_updated.json
import json

# Load the gold test data
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/merged_gold_eval_dataset.json", "r") as f:
    gold_test_data = json.load(f)

# Filter for Finder and ConvFinQA samples
finqa_samples = []
convfinqa_samples = []

for item in gold_test_data:
    source = item.get("source", "").lower().strip()
    
    if source == "finqa":
        finqa_samples.append(item)
    elif source == "convfinqa":
        convfinqa_samples.append(item)

# Print results
print(f"Total samples in gold test data: {len(gold_test_data)}")
print(f"Finder samples: {len(finqa_samples)}")
print(f"ConvFinQA samples: {len(convfinqa_samples)}")
print(f"Combined Finder + ConvFinQA samples: {len(finqa_samples) + len(convfinqa_samples)}")

# Combine both if needed
finder_and_convfinqa_samples = finqa_samples + convfinqa_samples

# Optional: Save filtered samples to new files
with open("finqa_samples.json", "w") as f:
    json.dump(finqa_samples, f, indent=2)

with open("convfinqa_samples.json", "w") as f:
    json.dump(convfinqa_samples, f, indent=2)

with open("finder_and_convfinqa_samples.json", "w") as f:
    json.dump(finder_and_convfinqa_samples, f, indent=2)

print("\nSaved filtered samples to separate files.")

# Preview first sample from each
if finqa_samples:
    print("\nFirst Finder sample:")
    print(json.dumps(finqa_samples[0], indent=2))

if convfinqa_samples:
    print("\nFirst ConvFinQA sample:")
    print(json.dumps(convfinqa_samples[0], indent=2))

Total samples in gold test data: 149
Finder samples: 44
ConvFinQA samples: 37
Combined Finder + ConvFinQA samples: 81

Saved filtered samples to separate files.

First Finder sample:
{
  "ID": "AMT/2012/page_123.pdf",
  "question": "what was the cost per tower in American Tower\u2019s colombia movil acquisition?",
  "answer": "85607",
  "context": "['american tower corporation and subsidiaries notes to consolidated financial statements the allocation of the purchase price was finalized during the year ended december 31 , 2012 .', 'the following table summarizes the allocation of the aggregate purchase consideration paid and the amounts of assets acquired and liabilities assumed based upon their estimated fair value at the date of acquisition ( in thousands ) : purchase price allocation .']\n\tfinal purchase price allocation\nnon-current assets\t$ 2\nproperty and equipment\t3590\nintangible assets ( 1 )\t1062\nother non-current liabilities\t-91 ( 91 )\nfair value of net assets acquired\

In [21]:
# check for contexts in finqa and confinqa that are identical 
import json
from collections import defaultdict

def normalize_context(context):
    """Normalize context for comparison"""
    if isinstance(context, list):
        return ' '.join(str(item) for item in context)
    elif isinstance(context, str):
        return context.strip()
    else:
        return str(context).strip()

# Extract and normalize contexts from both datasets
finqa_contexts = {}
convfinqa_contexts = {}

for item in finqa_samples:
    context = normalize_context(item.get("context", ""))
    if context:
        finqa_contexts[context] = item

for item in convfinqa_samples:
    context = normalize_context(item.get("context", ""))
    if context:
        convfinqa_contexts[context] = item

# Find identical contexts
identical_contexts = []
finqa_context_set = set(finqa_contexts.keys())
convfinqa_context_set = set(convfinqa_contexts.keys())

common_contexts = finqa_context_set.intersection(convfinqa_context_set)

print(f"Total FinQA contexts: {len(finqa_context_set)}")
print(f"Total ConvFinQA contexts: {len(convfinqa_context_set)}")
print(f"Identical contexts between FinQA and ConvFinQA: {len(common_contexts)}")

# Detailed analysis of identical contexts
if common_contexts:
    print(f"\nFound {len(common_contexts)} identical contexts:")
    
    for i, context in enumerate(list(common_contexts)[:5]):  # Show first 5 examples
        finqa_item = finqa_contexts[context]
        convfinqa_item = convfinqa_contexts[context]
        
        print(f"\n--- Identical Context {i+1} ---")
        print(f"Context preview: {context[:100]}...")
        print(f"FinQA question: {finqa_item.get('question', 'N/A')}")
        print(f"ConvFinQA question: {convfinqa_item.get('question', 'N/A')}")
        print(f"FinQA answer: {finqa_item.get('answer', 'N/A')}")
        print(f"ConvFinQA answer: {convfinqa_item.get('answer', 'N/A')}")
    
    if len(common_contexts) > 5:
        print(f"\n... and {len(common_contexts) - 5} more identical contexts")

    # Save identical context pairs for further analysis
    identical_pairs = []
    for context in common_contexts:
        identical_pairs.append({
            "context": context,
            "finqa_item": finqa_contexts[context],
            "convfinqa_item": convfinqa_contexts[context]
        })
    
    with open("identical_contexts_finqa_convfinqa.json", "w") as f:
        json.dump(identical_pairs, f, indent=2)
    
    print(f"\nSaved {len(identical_pairs)} identical context pairs to 'identical_contexts_finqa_convfinqa.json'")

else:
    print("\nNo identical contexts found between FinQA and ConvFinQA datasets.")

# Additional statistics
print(f"\nContext overlap percentage:")
print(f"FinQA contexts that appear in ConvFinQA: {len(common_contexts)/len(finqa_context_set)*100:.2f}%")
print(f"ConvFinQA contexts that appear in FinQA: {len(common_contexts)/len(convfinqa_context_set)*100:.2f}%")


Total FinQA contexts: 44
Total ConvFinQA contexts: 37
Identical contexts between FinQA and ConvFinQA: 0

No identical contexts found between FinQA and ConvFinQA datasets.

Context overlap percentage:
FinQA contexts that appear in ConvFinQA: 0.00%
ConvFinQA contexts that appear in FinQA: 0.00%


In [22]:
# number of unique contexts in gold_test_data
import pandas as pd
gold_test_data_df = pd.DataFrame(gold_test_data)
gold_test_data_df.head()

Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts
0,182f0809,"GM operating margin 2023 vs 2022, GM.","To calculate the operating profit margin, we d...","[CONSOLIDATED INCOME STATEMENTS\n(In millions,...",,Division,FinDER,Arithmetic,[]
1,c593f878,NGC's cyber investments boost investor confide...,The provided information allows us to assess h...,[We recognize the critical importance of maint...,,,FinDER,Non-numerical,[]
2,cb08b8b0,"SBA Comm., credit evals & DTA quality receivab...",The details provided illustrate that SBA Commu...,[Site leasing revenues\n\nRevenue from site le...,,,FinDER,Non-numerical,[]
3,GPN/2009/page_85.pdf,"Q: For GPN, what was the fair value of share a...",265%,['notes to consolidated financial statements 2...,{'text_2': 'the total fair value of share awar...,"subtract(6.2, 1.7), divide(#0, 1.7)",ConvFinQA,Compositional,[the total fair value of share awards vested d...
4,AMT/2012/page_123.pdf,what was the cost per tower in American Tower’...,85607,['american tower corporation and subsidiaries ...,"{'text_8': '( 201ccolombia movil 201d ) , wher...","multiply(182.0, const_1000000), divide(#0, 2126)",FinQA,Compositional,"[( 201ccolombia movil 201d ) , whereby atc sit..."


In [25]:
def merge_context(row):
    # Handle None or list in 'context'
    context_raw = row['context']
    context_str = ' '.join(context_raw) if isinstance(context_raw, list) else (context_raw or "")
    
    # Handle None or list in 'reference_contexts'
    references = row['reference_contexts']
    if not references:
        references = []

    # Append references not already in context
    additions = [ref for ref in references if ref not in context_str]
    
    # Combine and return
    full_context = context_str + ' ' + ' '.join(additions) if additions else context_str
    return full_context

# Apply function
gold_test_data_df['full_context'] = gold_test_data_df.apply(merge_context, axis=1)


In [26]:
gold_test_data_df.head()

Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts,full_context
0,182f0809,"GM operating margin 2023 vs 2022, GM.","To calculate the operating profit margin, we d...","[CONSOLIDATED INCOME STATEMENTS\n(In millions,...",,Division,FinDER,Arithmetic,[],"CONSOLIDATED INCOME STATEMENTS\n(In millions, ..."
1,c593f878,NGC's cyber investments boost investor confide...,The provided information allows us to assess h...,[We recognize the critical importance of maint...,,,FinDER,Non-numerical,[],We recognize the critical importance of mainta...
2,cb08b8b0,"SBA Comm., credit evals & DTA quality receivab...",The details provided illustrate that SBA Commu...,[Site leasing revenues\n\nRevenue from site le...,,,FinDER,Non-numerical,[],Site leasing revenues\n\nRevenue from site lea...
3,GPN/2009/page_85.pdf,"Q: For GPN, what was the fair value of share a...",265%,['notes to consolidated financial statements 2...,{'text_2': 'the total fair value of share awar...,"subtract(6.2, 1.7), divide(#0, 1.7)",ConvFinQA,Compositional,[the total fair value of share awards vested d...,['notes to consolidated financial statements 2...
4,AMT/2012/page_123.pdf,what was the cost per tower in American Tower’...,85607,['american tower corporation and subsidiaries ...,"{'text_8': '( 201ccolombia movil 201d ) , wher...","multiply(182.0, const_1000000), divide(#0, 2126)",FinQA,Compositional,"[( 201ccolombia movil 201d ) , whereby atc sit...",['american tower corporation and subsidiaries ...


In [50]:
# Convert contexts to strings for counting unique values
def normalize_context(context):
    """Convert context to string format for comparison"""
    if isinstance(context, list):
        return ' '.join(str(item) for item in context)
    elif isinstance(context, str):
        return context.strip()
    else:
        return str(context).strip()

# Apply normalization and count unique contexts
gold_test_data_df['context_normalized'] = gold_test_data_df['full_context'].apply(normalize_context)
unique_contexts_count = gold_test_data_df['context_normalized'].nunique()

print(f"Unique contexts in gold_test_data: {unique_contexts_count} contexts")

Unique contexts in gold_test_data: 149 contexts


In [51]:
print(len(gold_test_data_df))

149


check whether any of the context already exist in the database

In [52]:
import pandas as pd
import json

# Step 1: Load the JSONL file (each line is a separate JSON object)
jsonl_path = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/existing_embeddings_with_meta_data.jsonl"
with open(jsonl_path, "r") as f:
    json_data = [json.loads(line) for line in f]

# Step 2: Extract the "text" values from the JSON data
existing_texts_set = set(entry["text"] for entry in json_data)

# Step 3: Define a function to check if context is in the existing texts
def is_duplicate(context):
    if isinstance(context, list):
        context_str = " ".join(context)
    else:
        context_str = context or ""
    return context_str in existing_texts_set

# Step 4: Apply the function and store the result in 'dup' column
gold_test_data_df["dup"] = gold_test_data_df["context"].apply(is_duplicate)


In [53]:
import pandas as pd
import json

# Step 1: Load the JSONL file
jsonl_path = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/existing_embeddings_with_meta_data.jsonl"
with open(jsonl_path, "r") as f:
    json_data = [json.loads(line) for line in f]

# Step 2: Extract the set of source_ids from the JSON data
existing_source_ids = set(entry["source_id"] for entry in json_data if "source_id" in entry)

# Step 3: Check which IDs from gold_test_data_df are in existing_source_ids
# Replace 'your_id_column' with the actual column name in your DataFrame
def check_id_exists(row_id):
    return row_id in existing_source_ids

# If your ID column is named 'id' or 'source_id', update this accordingly
gold_test_data_df["id_exists"] = gold_test_data_df["ID"].apply(check_id_exists)


In [54]:
gold_test_data_df.head()

Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts,full_context,context_normalized,dup,id_exists,ticker,date,page
0,182f0809,"GM operating margin 2023 vs 2022, GM.","To calculate the operating profit margin, we d...","[CONSOLIDATED INCOME STATEMENTS\n(In millions,...",,Division,FinDER,Arithmetic,[],"CONSOLIDATED INCOME STATEMENTS\n(In millions, ...","CONSOLIDATED INCOME STATEMENTS\n(In millions, ...",True,True,,,
1,c593f878,NGC's cyber investments boost investor confide...,The provided information allows us to assess h...,[We recognize the critical importance of maint...,,,FinDER,Non-numerical,[],We recognize the critical importance of mainta...,We recognize the critical importance of mainta...,True,True,,,
2,cb08b8b0,"SBA Comm., credit evals & DTA quality receivab...",The details provided illustrate that SBA Commu...,[Site leasing revenues\n\nRevenue from site le...,,,FinDER,Non-numerical,[],Site leasing revenues\n\nRevenue from site lea...,Site leasing revenues\n\nRevenue from site lea...,False,True,,,
3,GPN/2009/page_85.pdf,"Q: For GPN, what was the fair value of share a...",265%,['notes to consolidated financial statements 2...,{'text_2': 'the total fair value of share awar...,"subtract(6.2, 1.7), divide(#0, 1.7)",ConvFinQA,Compositional,[the total fair value of share awards vested d...,['notes to consolidated financial statements 2...,['notes to consolidated financial statements 2...,False,True,GPN,2009.0,85.0
4,AMT/2012/page_123.pdf,what was the cost per tower in American Tower’...,85607,['american tower corporation and subsidiaries ...,"{'text_8': '( 201ccolombia movil 201d ) , wher...","multiply(182.0, const_1000000), divide(#0, 2126)",FinQA,Compositional,"[( 201ccolombia movil 201d ) , whereby atc sit...",['american tower corporation and subsidiaries ...,['american tower corporation and subsidiaries ...,False,True,AMT,2012.0,123.0


In [79]:
# print a row where id_exists is False 
missing_samples = gold_test_data_df[gold_test_data_df["id_exists"] == False]
missing_samples.head(5)

Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts,full_context,context_normalized,dup,id_exists,ticker,date,page
58,782c2953,Impact of DVA property sale proceeds on CF inv...,The data shows that proceeds from the sale of ...,[The following table summarizes our major sour...,,,FinDER,Non-numerical,[],The following table summarizes our major sourc...,The following table summarizes our major sourc...,True,False,,,
126,MSI/2008/page_110.pdf,"Q: For MSI, what was the balance of stockholde...",700%,['stockholders 2019 equity derivative instrume...,{'table_1': 'the balance at january 1 of 2008 ...,"subtract(16, 2), divide(#0, 2)",ConvFinQA,Compositional,,['stockholders 2019 equity derivative instrume...,['stockholders 2019 equity derivative instrume...,False,False,MSI,2008.0,110.0
127,AMT/2008/page_14.pdf,"Q: For AMT, what was the number of owned sites...",23200,['( 201cati 201d ) and spectrasite communicati...,{'table_1': 'country the united states of numb...,"add(19400, 2500), add(#0, 1100), add(#1, 200)",ConvFinQA,Compositional,,['( 201cati 201d ) and spectrasite communicati...,['( 201cati 201d ) and spectrasite communicati...,False,False,AMT,2008.0,14.0
128,TFX/2014/page_44.pdf,"Q: For TFX, what was the percentage difference...",2%,['stock performance graph the following graph ...,{'table_1': 'company / index the teleflex inco...,"subtract(102, 100), divide(#0, 100)",ConvFinQA,Compositional,,['stock performance graph the following graph ...,['stock performance graph the following graph ...,False,False,TFX,2014.0,44.0
129,CE/2014/page_32.pdf,"Q: For CE, what was the number of shares purch...",27.7,['celanese purchases of its equity securities ...,{'text_2': '( 2 ) our board of directors has a...,"multiply(468128, 59.25), divide(#0, const_1000...",ConvFinQA,Compositional,,['celanese purchases of its equity securities ...,['celanese purchases of its equity securities ...,False,False,CE,2014.0,32.0


In [80]:
print(len(missing_samples))

17


In [35]:
import json

# Load unique_contexts_filtered.json
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/unique_contexts_filtered.json", "r") as f:
    unique_contexts_filtered = json.load(f)

# Normalize contexts from unique_contexts_filtered (convert to set for faster lookup)
normalized_unique_contexts = set()
for context in unique_contexts_filtered:
    normalized_context = normalize_context(context)
    if normalized_context:  # Skip empty contexts
        normalized_unique_contexts.add(normalized_context)

# Check for matches in gold_test_data_df
matches = []
gold_contexts_normalized = []

for idx, row in gold_test_data_df.iterrows():
    gold_context = normalize_context(row['context'])
    gold_contexts_normalized.append(gold_context)
    
    if gold_context in normalized_unique_contexts:
        matches.append({
            'index': idx,
            'source': row.get('source', 'N/A'),
            'question': row.get('question', 'N/A')[:100] + '...' if len(str(row.get('question', ''))) > 100 else row.get('question', 'N/A'),
            'context_preview': gold_context[:100] + '...' if len(gold_context) > 100 else gold_context
        })

# Print results
print(f"Total contexts in unique_contexts_filtered: {len(normalized_unique_contexts)}")
print(f"Total contexts in gold_test_data: {len(gold_contexts_normalized)}")
print(f"Matching contexts found: {len(matches)}")

if matches:
    print(f"\nFirst 10 matches:")
    for i, match in enumerate(matches[:10]):
        print(f"\nMatch {i+1}:")
        print(f"  Index: {match['index']}")
        print(f"  Source: {match['source']}")
        print(f"  Question: {match['question']}")
        print(f"  Context preview: {match['context_preview']}")
    
    if len(matches) > 10:
        print(f"\n... and {len(matches) - 10} more matches")
    
    # Save detailed matches for further analysis
    #with open("gold_test_matches_in_unique_contexts.json", "w") as f:
    #    json.dump(matches, f, indent=2)
    
    #print(f"\nSaved {len(matches)} matches to 'gold_test_matches_in_unique_contexts.json'")

else:
    print("\nNo matching contexts found between gold_test_data and unique_contexts_filtered.")

# Calculate overlap percentage
overlap_percentage = (len(matches) / len(gold_test_data_df)) * 100
print(f"\nOverlap: {overlap_percentage:.2f}% of gold test contexts are found in unique_contexts_filtered")

# Check for unique contexts in gold_test_data that are NOT in unique_contexts_filtered
unique_in_gold = set(gold_contexts_normalized) - normalized_unique_contexts
print(f"Contexts in gold_test_data but NOT in unique_contexts_filtered: {len(unique_in_gold)}")

Total contexts in unique_contexts_filtered: 5956
Total contexts in gold_test_data: 149
Matching contexts found: 68

First 10 matches:

Match 1:
  Index: 0
  Source: FinDER
  Question: GM operating margin 2023 vs 2022, GM.
  Context preview: CONSOLIDATED INCOME STATEMENTS
(In millions, except per share amounts)
 	Years Ended December 31,
 	...

Match 2:
  Index: 1
  Source: FinDER
  Question: NGC's cyber investments boost investor confidence, enhance valuation, and bolster stability.
  Context preview: We recognize the critical importance of maintaining the safety and security of our systems and data ...

Match 3:
  Index: 2
  Source: FinDER
  Question: SBA Comm., credit evals & DTA quality receivables.
  Context preview: Site leasing revenues

Revenue from site leasing is recognized on a straight-line basis over the cur...

Match 4:
  Index: 6
  Source: FinDER
  Question: R&D expense as a % of net product revs for VRTX in 2023.
  Context preview: VERTEX PHARMACEUTICALS INCORPORATED
Con

In [81]:
# Normalize the context column (if not already done)
if 'context_normalized' not in missing_samples.columns:
    missing_samples['context_normalized'] = missing_samples['context'].apply(normalize_context)

# Extract the normalized contexts from matches
matched_contexts = set()
for match in matches:
    # Get the original row to extract its normalized context
    original_row = gold_test_data_df.iloc[match['index']]
    matched_context = normalize_context(original_row['context'])
    matched_contexts.add(matched_context)

# Filter out rows where the normalized context matches any context in matches
missing_samples = missing_samples[
    ~missing_samples['context_normalized'].isin(matched_contexts)
]

# Reset the index
missing_samples = missing_samples.reset_index(drop=True)

print(f"Original dataframe size: {len(gold_test_data_df)}")
print(f"Number of matches to remove: {len(matched_contexts)}")
print(f"Filtered dataframe size: {len(missing_samples)}")
print(f"Removed {len(gold_test_data_df) - len(missing_samples)} rows")

# Display the first few rows of the filtered dataframe
missing_samples.head()

Original dataframe size: 149
Number of matches to remove: 68
Filtered dataframe size: 17
Removed 132 rows


Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts,full_context,context_normalized,dup,id_exists,ticker,date,page
0,782c2953,Impact of DVA property sale proceeds on CF inv...,The data shows that proceeds from the sale of ...,[The following table summarizes our major sour...,,,FinDER,Non-numerical,[],The following table summarizes our major sourc...,The following table summarizes our major sourc...,True,False,,,
1,MSI/2008/page_110.pdf,"Q: For MSI, what was the balance of stockholde...",700%,['stockholders 2019 equity derivative instrume...,{'table_1': 'the balance at january 1 of 2008 ...,"subtract(16, 2), divide(#0, 2)",ConvFinQA,Compositional,,['stockholders 2019 equity derivative instrume...,['stockholders 2019 equity derivative instrume...,False,False,MSI,2008.0,110.0
2,AMT/2008/page_14.pdf,"Q: For AMT, what was the number of owned sites...",23200,['( 201cati 201d ) and spectrasite communicati...,{'table_1': 'country the united states of numb...,"add(19400, 2500), add(#0, 1100), add(#1, 200)",ConvFinQA,Compositional,,['( 201cati 201d ) and spectrasite communicati...,['( 201cati 201d ) and spectrasite communicati...,False,False,AMT,2008.0,14.0
3,TFX/2014/page_44.pdf,"Q: For TFX, what was the percentage difference...",2%,['stock performance graph the following graph ...,{'table_1': 'company / index the teleflex inco...,"subtract(102, 100), divide(#0, 100)",ConvFinQA,Compositional,,['stock performance graph the following graph ...,['stock performance graph the following graph ...,False,False,TFX,2014.0,44.0
4,CE/2014/page_32.pdf,"Q: For CE, what was the number of shares purch...",27.7,['celanese purchases of its equity securities ...,{'text_2': '( 2 ) our board of directors has a...,"multiply(468128, 59.25), divide(#0, const_1000...",ConvFinQA,Compositional,,['celanese purchases of its equity securities ...,['celanese purchases of its equity securities ...,False,False,CE,2014.0,32.0


In [82]:
# number of samples in gold_test_filtered_df that have source == FinDER 
# number of samples in gold_test_filtered_df that have source == FinDER 
finder_count = len(missing_samples[missing_samples['source'] == 'FinDER'])
print(f"Number of samples with source == 'FinDER': {finder_count}")

Number of samples with source == 'FinDER': 1


In [84]:
# print samples that has source == FinDER 
finder_samples = missing_samples[missing_samples['source'] == 'FinDER']
finder_samples.head()

Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts,full_context,context_normalized,dup,id_exists,ticker,date,page
0,782c2953,Impact of DVA property sale proceeds on CF inv...,The data shows that proceeds from the sale of ...,[The following table summarizes our major sour...,,,FinDER,Non-numerical,[],The following table summarizes our major sourc...,The following table summarizes our major sourc...,True,False,,,


## enrich the database with meta data 

In [85]:
# Create new columns by splitting the ID
def extract_metadata_from_id(id_str):
    """Extract ticker, date, and page from ID string like 'AMT/2012/page_123.pdf'"""
    if pd.isna(id_str) or not isinstance(id_str, str):
        return pd.Series([None, None, None])
    
    parts = id_str.split('/')
    
    if len(parts) >= 3:
        ticker = parts[0]
        date = parts[1]
        # Extract page number from the filename (e.g., 'page_123.pdf' -> '123')
        page_part = parts[2]
        if 'page_' in page_part:
            page = page_part.split('page_')[1].split('.')[0]
        else:
            page = None
    else:
        ticker = None
        date = None
        page = None
    
    return pd.Series([ticker, date, page])

# Apply the function to create the new columns
missing_samples[['ticker', 'date', 'page']] = missing_samples['ID'].apply(extract_metadata_from_id)

# Display the results
print("Sample of new columns:")
print(missing_samples[['ID', 'ticker', 'date', 'page']].head(10))

# Check for any missing values
print(f"\nMissing values:")
print(f"Ticker: {missing_samples['ticker'].isna().sum()}")
print(f"Date: {missing_samples['date'].isna().sum()}")
print(f"Page: {missing_samples['page'].isna().sum()}")

Sample of new columns:
                      ID ticker  date  page
0               782c2953   None  None  None
1  MSI/2008/page_110.pdf    MSI  2008   110
2   AMT/2008/page_14.pdf    AMT  2008    14
3   TFX/2014/page_44.pdf    TFX  2014    44
4    CE/2014/page_32.pdf     CE  2014    32
5  HFC/2017/page_103.pdf    HFC  2017   103
6    DG/2006/page_58.pdf     DG  2006    58
7   STT/2011/page_94.pdf    STT  2011    94
8   UAA/2017/page_52.pdf    UAA  2017    52
9   HII/2018/page_64.pdf    HII  2018    64

Missing values:
Ticker: 1
Date: 1
Page: 1


In [86]:
missing_samples.head()

Unnamed: 0,ID,question,answer,context,gold_context,operation,source,reasoning_type,reference_contexts,full_context,context_normalized,dup,id_exists,ticker,date,page
0,782c2953,Impact of DVA property sale proceeds on CF inv...,The data shows that proceeds from the sale of ...,[The following table summarizes our major sour...,,,FinDER,Non-numerical,[],The following table summarizes our major sourc...,The following table summarizes our major sourc...,True,False,,,
1,MSI/2008/page_110.pdf,"Q: For MSI, what was the balance of stockholde...",700%,['stockholders 2019 equity derivative instrume...,{'table_1': 'the balance at january 1 of 2008 ...,"subtract(16, 2), divide(#0, 2)",ConvFinQA,Compositional,,['stockholders 2019 equity derivative instrume...,['stockholders 2019 equity derivative instrume...,False,False,MSI,2008.0,110.0
2,AMT/2008/page_14.pdf,"Q: For AMT, what was the number of owned sites...",23200,['( 201cati 201d ) and spectrasite communicati...,{'table_1': 'country the united states of numb...,"add(19400, 2500), add(#0, 1100), add(#1, 200)",ConvFinQA,Compositional,,['( 201cati 201d ) and spectrasite communicati...,['( 201cati 201d ) and spectrasite communicati...,False,False,AMT,2008.0,14.0
3,TFX/2014/page_44.pdf,"Q: For TFX, what was the percentage difference...",2%,['stock performance graph the following graph ...,{'table_1': 'company / index the teleflex inco...,"subtract(102, 100), divide(#0, 100)",ConvFinQA,Compositional,,['stock performance graph the following graph ...,['stock performance graph the following graph ...,False,False,TFX,2014.0,44.0
4,CE/2014/page_32.pdf,"Q: For CE, what was the number of shares purch...",27.7,['celanese purchases of its equity securities ...,{'text_2': '( 2 ) our board of directors has a...,"multiply(468128, 59.25), divide(#0, const_1000...",ConvFinQA,Compositional,,['celanese purchases of its equity securities ...,['celanese purchases of its equity securities ...,False,False,CE,2014.0,32.0


## create the new chunks and embeddings <br>
The result should be appended to: <br>
- embeddings: /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json
- new contexts: /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/unique_contexts_filtered.json
- meta data: /Users/christel/Desktop/Thesis/thesis_repo/notebooks/existing_embeddings_with_meta_data.jsonl

In [87]:
# inspect the output file existing_embeddings_with_meta_data.jsonl
import json 

# Load the existing embeddings with meta data
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/existing_embeddings_with_meta_data.jsonl", 'r', encoding='utf-8') as file:
    existing_embeddings = [json.loads(line) for line in file]

# print one sample 
existing_embeddings_df = pd.DataFrame(existing_embeddings)
existing_embeddings_df.head()

Unnamed: 0,chunk_id,text,source,source_id,ticker,year,page
0,row_0_chunk_0,"Cboe Global Markets, Inc. and Subsidiaries\n\n...",Finder,b33fcee7,,,
1,row_0_chunk_1,15.6\n\n\n Goodwill impairment\n\n—\n\n460.9\...,Finder,b33fcee7,,,
2,row_1_chunk_0,"Employees\n\nAs of December 31, 2023, we emplo...",Finder,b8a1383c,,,
3,row_2_chunk_0,North\n\n\nCorporate\n\nAmerican\n\n\nEurope a...,Finder,9826209b,,,
4,row_3_chunk_0,"In 2011, the Board of Directors approved an in...",Finder,30eb0cd9,,,


In [88]:
import json

# Load unique_contexts_filtered.json - it's a single JSON array, not JSONL
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/unique_contexts_filtered.json", 'r', encoding='utf-8') as file:
    unique_contexts = json.load(file)  # Use json.load() instead of json.loads(line)

print(f"Loaded {len(unique_contexts)} unique contexts")

unique_contexts_df = pd.DataFrame(unique_contexts)
print(len(unique_contexts_df))

Loaded 5956 unique contexts
5956


In [89]:
count = sum(
    all(g in (row['reference_contexts'] or []) for g in (row['gold_context'] if isinstance(row['gold_context'], list) else [row['gold_context']]))
    for _, row in missing_samples.iterrows()
)

print(f"Number of rows where all gold_context values are in reference_contexts: {count}")

Number of rows where all gold_context values are in reference_contexts: 1


In [90]:
# drop samples with source == FinDER from missing_samples
print(f"Number of samples before filtering FinDER: {len(missing_samples)}")
missing_samples = missing_samples[missing_samples['source'] != 'FinDER']
print(f"Number of samples after filtering FinDER: {len(missing_samples)}")

Number of samples before filtering FinDER: 17
Number of samples after filtering FinDER: 16


In [91]:
import os, json, re
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings  # pip install langchain-openai

# ------------------ CONFIG ------------------
load_dotenv()

DOCS_PATH = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/unique_contexts_filtered.json"
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200
EMBED_MODEL = "text-embedding-ada-002"  # same as before

assert isinstance(missing_samples, pd.DataFrame)

# ------------------ HELPERS ------------------
def normalize_context(s: str) -> str:
    if s is None:
        return ""
    s = str(s).strip()
    return re.sub(r"\s+", " ", s)

def to_int_or_none(x):
    try:
        if pd.isna(x): return None
        s = str(x).strip()
        if not s: return None
        # accept YYYY or YYYY-mm-dd
        return int(s[:4])
    except Exception:
        return None

# ------------------ 1) LOAD EXISTING CONTEXTS ------------------
if Path(DOCS_PATH).exists():
    with open(DOCS_PATH, "r", encoding="utf-8") as f:
        contexts = json.load(f)
else:
    contexts = []

start_row_index = len(contexts)
existing_norm = set(normalize_context(c) for c in contexts)

# ------------------ 3) SELECT NEW UNIQUE CONTEXTS ------------------
new_contexts_to_append = []          # object 1 (list[str], in append order)
new_doc_rows = []                    # helper rows for chunking/embedding

seen_new_norm = set()
for _, r in missing_samples.iterrows():
    ctx = normalize_context(r.get("context_normalized", ""))
    if not ctx:
        continue
    # skip if already in file or duplicated within the batch
    if ctx in existing_norm or ctx in seen_new_norm:
        continue

    row_index = start_row_index + len(new_contexts_to_append)
    new_contexts_to_append.append(ctx)
    seen_new_norm.add(ctx)

    new_doc_rows.append({
        "row_index": row_index,
        "context": ctx,
        "source": (r.get("source") or "").strip(),
        "source_id": r.get("source_id") or r.get("ID"),  # handles either column
        "ticker": (None if pd.isna(r.get("ticker")) else r.get("ticker")),
        "year": to_int_or_none(r.get("date")),
        "page": (None if pd.isna(r.get("page")) else int(r.get("page"))),
    })

print(f"Will append {len(new_contexts_to_append)} new contexts "
      f"(existing file currently has {start_row_index}).")

# ------------------ 4) CHUNK NEW ROWS ------------------
splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

all_new_chunks = []   # for embedding
for row in new_doc_rows:
    doc = Document(page_content=row["context"], metadata={"row_index": row["row_index"]})
    chunks = splitter.split_documents([doc])
    for local_idx, ch in enumerate(chunks):
        all_new_chunks.append({
            "chunk_id": f"row_{row['row_index']}_chunk_{local_idx}",
            "row_index": row["row_index"],
            "text": ch.page_content,
            "source": row["source"],
            "source_id": row["source_id"],
            "ticker": row["ticker"],
            "year": row["year"],
            "page": row["page"],
        })

print(f"Created {len(all_new_chunks)} new chunks to embed.")

# ------------------ 5) EMBED NEW CHUNKS ------------------
embedded_new_chunks_to_append = []   # object 2 (list[dict])
if all_new_chunks:
    embedding_model = OpenAIEmbeddings(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model=EMBED_MODEL
    )

    for ch in all_new_chunks:
        try:
            emb = embedding_model.embed_query(ch["text"])
            embedded_new_chunks_to_append.append({
                "chunk_id": ch["chunk_id"],
                "row_index": ch["row_index"],
                "text": ch["text"],
                "embedding": emb,
            })
        except Exception as e:
            print(f"Failed embedding {ch['chunk_id']}: {e}")

print(f"Embedded {len(embedded_new_chunks_to_append)} / {len(all_new_chunks)} new chunks.")

# ------------------ 6) BUILD PER-CHUNK METADATA RECORDS ------------------
chunk_meta_records_to_append = []    # object 3 (list[dict])
for ch in all_new_chunks:
    chunk_meta_records_to_append.append({
        "chunk_id": ch["chunk_id"],
        "text": ch["text"],
        "source": ch["source"],
        "source_id": ch["source_id"],
        "ticker": ch["ticker"],
        "year": ch["year"],
        "page": ch["page"],
    })

# ------------------ 7) SHOW SUMMARIES & SAMPLES ------------------
import itertools, json as _json

def preview_list(name, seq, n=2):
    print(f"\n{name}: {len(seq)}")
    sample = list(itertools.islice(seq, n))
    if sample:
        print(f"Sample {name} item:")
        print(_json.dumps(sample[0], ensure_ascii=False, indent=2) if isinstance(sample[0], dict) else sample[0][:400])

preview_list("new_contexts_to_append", new_contexts_to_append, n=1)
preview_list("embedded_new_chunks_to_append", embedded_new_chunks_to_append, n=1)
preview_list("chunk_meta_records_to_append", chunk_meta_records_to_append, n=1)

Will append 16 new contexts (existing file currently has 5956).
Created 16 new chunks to embed.
Embedded 16 / 16 new chunks.

new_contexts_to_append: 16
Sample new_contexts_to_append item:
['stockholders 2019 equity derivative instruments activity , net of tax , included in non-owner changes to equity within the consolidated statements of stockholders 2019 equity for the years ended december 31 , 2008 , 2007 and 2006 is as follows: .'] 2008 2007 2006 balance at january 1 $ 2014 $ 16 $ 2 increase ( decrease ) in fair value -9 ( 9 ) -6 ( 6 ) 75 reclassifications to earnings 2 -10 ( 10

embedded_new_chunks_to_append: 16
Sample embedded_new_chunks_to_append item:
{
  "chunk_id": "row_5956_chunk_0",
  "row_index": 5956,
  "text": "['stockholders 2019 equity derivative instruments activity , net of tax , included in non-owner changes to equity within the consolidated statements of stockholders 2019 equity for the years ended december 31 , 2008 , 2007 and 2006 is as follows: .'] 2008 2007 200

### Check whether gold_context is present full_context

In [92]:
# count the number of samples in missing_samples where full_context has the same length as context_normalized 
same_length_count = sum(
    1 for _, row in missing_samples.iterrows()
    if len(normalize_context(row['full_context'])) == len(normalize_context(row['context_normalized']))
)
print(f"\nNumber of samples where full_context length equals context_normalized length: {same_length_count}")


Number of samples where full_context length equals context_normalized length: 16


### Update the database 

In [93]:
# inspect last sample in the /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json file
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json", 'r', encoding='utf-8') as file:
    embedded_chunks = json.load(file)

print(f"Loaded {len(embedded_chunks)} embedded chunks")
if embedded_chunks:
    last_sample = embedded_chunks[-1]
    print("Last sample in embedded_chunks:")
    print(json.dumps(last_sample, indent=2))

Loaded 10130 embedded chunks
Last sample in embedded_chunks:
{
  "chunk_id": "row_5955_chunk_0",
  "row_index": 5955,
  "text": "['92 | 2017 form 10-k finite-lived intangible assets are amortized over their estimated useful lives and tested for impairment if events or changes in circumstances indicate that the asset may be impaired .', 'in 2016 , gross customer relationship intangibles of $ 96 million and related accumulated amortization of $ 27 million as well as gross intellectual property intangibles of $ 111 million and related accumulated amortization of $ 48 million from the resource industries segment were impaired .', 'the fair value of these intangibles was determined to be insignificant based on an income approach using expected cash flows .', 'the fair value determination is categorized as level 3 in the fair value hierarchy due to its use of internal projections and unobservable measurement inputs .', 'the total impairment of $ 132 million was a result of restructuring acti

In [94]:
# inspect last sample in the /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json file
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/unique_contexts_filtered.json", 'r', encoding='utf-8') as file:
    unique_contexts = json.load(file)

print(f"Loaded {len(unique_contexts)} unique contexts")

with open("/Users/christel/Desktop/Thesis/thesis_repo/notebooks/metadata_enriched_with_chunk_ids2.jsonl", 'r', encoding='utf-8') as file:
    meta_data_file = [json.loads(line) for line in file if line.strip()]

print(f"Loaded {len(meta_data_file)} meta data lines")

Loaded 5956 unique contexts
Loaded 7696 meta data lines


In [None]:
from pathlib import Path
import json

DOCS_PATH = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/unique_contexts_filtered.json"
EMB_PATH  = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json"

# ------------------ 1) Append to unique_contexts_filtered.json ------------------
if Path(DOCS_PATH).exists():
    with open(DOCS_PATH, "r", encoding="utf-8") as f:
        contexts = json.load(f)
else:
    contexts = []

before_len_contexts = len(contexts)
contexts.extend(new_contexts_to_append)
after_len_contexts = len(contexts)

with open(DOCS_PATH, "w", encoding="utf-8") as f:
    json.dump(contexts, f, ensure_ascii=False, indent=2)

print(f"[unique_contexts_filtered.json] {before_len_contexts} → {after_len_contexts} entries")


# ------------------ 2) Append to embedded_chunks.json (deduplicated by chunk_id) ------------------
if Path(EMB_PATH).exists():
    with open(EMB_PATH, "r", encoding="utf-8") as f:
        existing_embs = json.load(f)
else:
    existing_embs = []

before_len_embs = len(existing_embs)
existing_ids = {e["chunk_id"] for e in existing_embs}
to_add = [e for e in embedded_new_chunks_to_append if e["chunk_id"] not in existing_ids]
existing_embs.extend(to_add)
after_len_embs = len(existing_embs)

with open(EMB_PATH, "w", encoding="utf-8") as f:
    json.dump(existing_embs, f, ensure_ascii=False)

print(f"[embedded_chunks.json] {before_len_embs} → {after_len_embs} entries (+{len(to_add)} added)")

[unique_contexts_filtered.json] 5956 → 5972 entries
[embedded_chunks.json] 10130 → 10146 entries (+16 added)
[existing_embeddings_with_meta_data.jsonl] 0 → 16 lines (+16 added)


In [96]:
CHUNK_META_PATH = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/existing_embeddings_with_meta_data.jsonl"
# ------------------ 3) Append to existing_embeddings_with_meta_data.jsonl ------------------
if Path(CHUNK_META_PATH).exists():
    with open(CHUNK_META_PATH, "r", encoding="utf-8") as f:
        num_lines_before = sum(1 for _ in f)
else:
    num_lines_before = 0

with open(CHUNK_META_PATH, "a", encoding="utf-8") as f:
    for record in chunk_meta_records_to_append:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

num_lines_after = num_lines_before + len(chunk_meta_records_to_append)
print(f"[existing_embeddings_with_meta_data.jsonl] {num_lines_before} → {num_lines_after} lines (+{len(chunk_meta_records_to_append)} added)")

[existing_embeddings_with_meta_data.jsonl] 10130 → 10146 lines (+16 added)


In [97]:
# print the last 5 examples in /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json", 'r', encoding='utf-8') as file:
    embedded_chunks = json.load(file)
print(f"Loaded {len(embedded_chunks)} embedded chunks")
if embedded_chunks:
    last_samples = embedded_chunks[-5:]  # Get the last 5 samples
    print("Last 5 samples in embedded_chunks:")
    for i, sample in enumerate(last_samples):
        print(f"Sample {i+1}:")
        print(json.dumps(sample, indent=2, ensure_ascii=False))
else:
    print("No embedded chunks found.")

Loaded 10146 embedded chunks
Last 5 samples in embedded_chunks:
Sample 1:
{
  "chunk_id": "row_5967_chunk_0",
  "row_index": 5967,
  "text": "['american tower corporation and subsidiaries notes to consolidated financial statements 2014 ( continued ) 7 .', 'derivative financial instruments under the terms of the credit facility , the company is required to enter into interest rate protection agreements on at least 50% ( 50 % ) of its variable rate debt .', 'under these agreements , the company is exposed to credit risk to the extent that a counterparty fails to meet the terms of a contract .', 'such exposure is limited to the current value of the contract at the time the counterparty fails to perform .', 'the company believes its contracts as of december 31 , 2004 are with credit worthy institutions .', 'as of december 31 , 2004 , the company had two interest rate caps outstanding with an aggregate notional amount of $ 350.0 million ( each at an interest rate of 6.0% ( 6.0 % ) ) that ex

In [98]:
# print the record with row_5951_chunk_0 from /Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/existing_embeddings_with_meta_data.jsonl
with open("/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/existing_embeddings_with_meta_data.jsonl", 'r', encoding='utf-8') as file:
    for line in file:
        record = json.loads(line.strip())
        if record.get("chunk_id") == "row_5951_chunk_0":
            print("Found record with chunk_id 'row_5951_chunk_0':")
            print(json.dumps(record, indent=2, ensure_ascii=False))
            break
    else:
        print("No record found with chunk_id 'row_5951_chunk_0'.")

Found record with chunk_id 'row_5951_chunk_0':
{
  "chunk_id": "row_5951_chunk_0",
  "text": "['on-balance sheet securitizations the company engages in on-balance sheet securitizations .', 'these are securitizations that do not qualify for sales treatment ; thus , the assets remain on the company 2019s balance sheet .', 'the following table presents the carrying amounts and classification of consolidated assets and liabilities transferred in transactions from the consumer credit card , student loan , mortgage and auto businesses , accounted for as secured borrowings : in billions of dollars december 31 , december 31 .'] in billions of dollars december 31 2008 december 31 2007 cash $ 0.3 $ 0.1 available-for-sale securities 0.1 0.2 loans 7.5 7.4 allowance for loan losses -0.1 ( 0.1 ) -0.1 ( 0.1 ) total assets $ 7.8 $ 7.6 long-term debt $ 6.3 $ 5.8 other liabilities 0.3 0.4 total liabilities $ 6.6 $ 6.2 ['all assets are restricted from being sold or pledged as collateral .', 'the cash flo

# Update the knowledge graph 

Run create_kg.py to append the embedded chunks and new_meta_data_update_file.py to update the meta data. <br>

Test the meta data updating file: 

In [3]:
from neo4j import GraphDatabase

def get_driver():
    uri = os.getenv("NEO4J_URI")
    user = os.getenv("NEO4J_USERNAME")
    pwd = os.getenv("NEO4J_PASSWORD")

    if not all([uri, user, pwd]):
        raise RuntimeError("Missing Neo4j connection info in environment variables")

    # 🔥 DO NOT pass `encrypted` explicitly anymore
    return GraphDatabase.driver(uri, auth=(user, pwd))


In [4]:
import json

# Prepare test metadata for a single chunk
chunk_id = "row_5947_chunk_0"
metadata = {
    "chunk_id": chunk_id,
    "source": "ConvFinQA",
    "source_id": "AAPL/2012/page_71.pdf",
    "ticker": "AAPL",
    "year": 2012,
    "page": 71
}

CHECK_EXISTENCE_Q = """
MATCH (c:Chunk {id: $chunk_id})-[:PART_OF]->(d:Document)
RETURN count(*) AS cnt
"""

UPDATE_META_Q = """
MATCH (c:Chunk {id: $chunk_id})-[:PART_OF]->(d:Document)
SET  d.source    = coalesce($source,    d.source),
     d.source_id = coalesce($source_id, d.source_id),
     d.ticker    = coalesce($ticker,    d.ticker),
     d.year      = coalesce(toInteger($year), d.year),
     c.page      = coalesce(toInteger($page), c.page),
     c.has_metadata = any(v IN [d.ticker, d.year, c.page] WHERE v IS NOT NULL)
"""

# Run update
driver = get_driver()
with driver.session() as session:
    exists = session.run(CHECK_EXISTENCE_Q, {"chunk_id": chunk_id}).single()["cnt"]
    if exists == 0:
        print(f"Chunk not found: {chunk_id}")
    else:
        session.run(UPDATE_META_Q, metadata)
        print(f"Metadata updated for: {chunk_id}")


Metadata updated for: row_5947_chunk_0


### Test the new update function 

In [3]:
test_chunk = next(c for c in embedded_chunks if c["chunk_id"] == "row_5823_chunk_0")

In [4]:
print(test_chunk)

{'chunk_id': 'row_5823_chunk_0', 'row_index': 5823, 'text': "['commitments .', 'for a further description of the loan loss reserve and related accounts , see 201cmanaging global risk 201d and notes 1 and 18 to the consolidated financial statements on pages 51 , 122 and 165 , respectively .', 'securitizations the company securitizes a number of different asset classes as a means of strengthening its balance sheet and accessing competitive financing rates in the market .', 'under these securitization programs , assets are sold into a trust and used as collateral by the trust to obtain financing .', 'the cash flows from assets in the trust service the corresponding trust securities .', 'if the structure of the trust meets certain accounting guidelines , trust assets are treated as sold and are no longer reflected as assets of the company .', 'if these guidelines are not met , the assets continue to be recorded as the company 2019s assets , with the financing activity recorded as liabiliti

In [None]:
import os
import json
import logging
import signal

from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential

from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_neo4j import Neo4jGraph
from langchain_community.graphs.graph_document import Node, Relationship

# === Setup Logging ===
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

# === Load Environment Variables ===
load_dotenv()

# === Config ===
EMBEDDINGS_JSON  = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json"
CHUNK_VECTOR_PROPERTY = "embedding"
TIMEOUT_SECONDS  = 300

# === Init LLM ===
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def init_llm():
    return ChatOpenAI(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name="gpt-4o-mini"
    )

llm = init_llm()

# === Init Neo4j client ===
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD")
)

# === Timeout Utilities ===
class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException()

signal.signal(signal.SIGALRM, timeout_handler)

# === (Optional) Ensure uniqueness constraints ===
graph.query("CREATE CONSTRAINT unique_document_id IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE")
graph.query("CREATE CONSTRAINT unique_chunk_id    IF NOT EXISTS FOR (c:Chunk)    REQUIRE c.id IS UNIQUE")
graph.query("CREATE CONSTRAINT unique_entity_id   IF NOT EXISTS FOR (e:Entity)   REQUIRE e.id IS UNIQUE")

# === Load your precomputed embeddings ===
with open(EMBEDDINGS_JSON, "r") as f:
    embedded_chunks = json.load(f)

# === Pick your test sample ===
test_chunk = next(c for c in embedded_chunks if c["chunk_id"] == "row_5823_chunk_0")
chunk_id   = test_chunk["chunk_id"]     # e.g. 'row_5823_chunk_0'
row_index  = test_chunk["row_index"]    # e.g. 5823
text       = test_chunk["text"]
embedding  = test_chunk["embedding"]

logging.info(f"Inserting single chunk: {chunk_id} (row_index={row_index})")

# --- Step 1: MERGE Document ↔ Chunk and set properties + vector ---
signal.alarm(TIMEOUT_SECONDS)
try:
    graph.query(
        """
        MERGE (d:Document {id: $doc_id})
        MERGE (c:Chunk    {id: $chunk_id})
        SET d.row_index = $row_index,
            c.text       = $text
        MERGE (c)-[:PART_OF]->(d)
        WITH c
        CALL db.create.setNodeVectorProperty(c, $vector_prop, $vector)
        """,
        {
            "doc_id":      str(row_index),
            "chunk_id":    chunk_id,
            "row_index":   row_index,
            "text":        text,
            "vector_prop": CHUNK_VECTOR_PROPERTY,
            "vector":      embedding,
        }
    )
    signal.alarm(0)
    logging.info("Document & Chunk inserted (with embedding).")
except TimeoutException:
    logging.error("Timeout during chunk/document insertion.")
    raise


# --- Step 2: Extract entities from the chunk text ---
doc_transformer = LLMGraphTransformer(llm=llm)

signal.alarm(TIMEOUT_SECONDS)
try:
    fake_doc   = Document(page_content=text, metadata={"row_index": row_index})
    graph_docs = doc_transformer.convert_to_graph_documents([fake_doc])
    signal.alarm(0)
    if not graph_docs or not graph_docs[0].nodes:
        logging.warning("No entities extracted.")
except TimeoutException:
    logging.error("Timeout during entity extraction.")
    raise

# --- Step 3: Normalize ONLY the extracted nodes + build HAS_ENTITY rels ---
for graph_doc in graph_docs:
    entities = []
    relationships = []

    # Normalize each LLM‐extracted node
    for n in graph_doc.nodes:
        raw_id   = (n.id or "").strip()
        raw_type = (n.type or "").strip()
        if not raw_id or not raw_type:
            logging.warning(f"Skipping malformed: id={raw_id}, type={raw_type}")
            continue

        eid = raw_id.lower()
        n.id   = eid
        n.type = "Entity"
        n.properties["id"]            = eid
        n.properties["original_type"] = raw_type

        entities.append(n)
        # build a rel from our existing chunk to this entity
        relationships.append(
            Relationship(
                source=Node(id=chunk_id, type="Chunk"), 
                target=n, 
                type="HAS_ENTITY"
            )
        )

    # overwrite the graph_doc
    graph_doc.nodes = entities
    graph_doc.relationships = relationships


# --- Step 4: Push entities + HAS_ENTITY → chunk
signal.alarm(TIMEOUT_SECONDS)
try:
    graph.add_graph_documents(
        graph_docs,
        baseEntityLabel="Entity",   # all nodes in graph_doc.nodes get :Entity
        include_source=False
    )
    signal.alarm(0)
    logging.info("Entities + relationships inserted.")
except TimeoutException:
    logging.error("Timeout during graph insertion.")
    raise


All files: 

In [1]:
import os
import json
import logging
import signal

from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential

from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_neo4j import Neo4jGraph
from langchain_community.graphs.graph_document import Node, Relationship

# === Setup Logging ===
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

# === Load Environment Variables ===
load_dotenv()

# === Config ===
EMBEDDINGS_JSON        = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json"
CHUNK_VECTOR_PROPERTY  = "embedding"
TIMEOUT_SECONDS        = 300
START_FROM_ROW         = 5823   # ← only process row_index >= this

# === Init LLM with retries ===
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def init_llm():
    return ChatOpenAI(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model_name="gpt-4o-mini"
    )

llm = init_llm()

# === Init Neo4j client ===
graph = Neo4jGraph(
    url=os.getenv("NEO4J_URI"),
    username=os.getenv("NEO4J_USERNAME"),
    password=os.getenv("NEO4J_PASSWORD")
)

# === Timeout Utilities ===
class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException()

signal.signal(signal.SIGALRM, timeout_handler)

# === Ensure uniqueness constraints ===
graph.query("CREATE CONSTRAINT unique_document_id IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE")
graph.query("CREATE CONSTRAINT unique_chunk_id    IF NOT EXISTS FOR (c:Chunk)    REQUIRE c.id IS UNIQUE")
graph.query("CREATE CONSTRAINT unique_entity_id   IF NOT EXISTS FOR (e:Entity)   REQUIRE e.id IS UNIQUE")

# === Load Precomputed Embeddings ===
with open(EMBEDDINGS_JSON, "r") as f:
    embedded_chunks = json.load(f)

# === Prepare the LLM→Graph transformer ===
doc_transformer = LLMGraphTransformer(llm=llm)

# === Main Loop over all chunks ===
for item in embedded_chunks:
    chunk_id  = item["chunk_id"]
    row_index = item["row_index"]
    text      = item["text"]
    vector    = item["embedding"]

    # Skip until we reach our starting row
    if row_index < START_FROM_ROW:
        continue

    logging.info(f"⏳ Processing chunk {chunk_id} (row_index={row_index})")

    # --- Step 1: MERGE Document & Chunk + set text & embedding ---
    signal.alarm(TIMEOUT_SECONDS)
    try:
        graph.query(
            """
            MERGE (d:Document {id: $doc_id})
            SET d.row_index = $row_index
            MERGE (c:Chunk {id: $chunk_id})
            SET c.text = $text
            MERGE (c)-[:PART_OF]->(d)
            WITH c
            CALL db.create.setNodeVectorProperty(c, $vector_prop, $vector)
            """,
            {
                "doc_id":      str(row_index),
                "row_index":   row_index,
                "chunk_id":    chunk_id,
                "text":        text,
                "vector_prop": CHUNK_VECTOR_PROPERTY,
                "vector":      vector,
            }
        )
        signal.alarm(0)
        logging.info("Document & Chunk upserted (with embedding).")
    except TimeoutException:
        logging.error(f"Timeout inserting chunk {chunk_id}")
        continue
    except Exception as e:
        logging.error(f"Error inserting chunk {chunk_id}: {e}")
        continue

    # --- Step 2: Extract entities from the chunk text ---
    signal.alarm(TIMEOUT_SECONDS)
    try:
        fake_doc   = Document(page_content=text, metadata={"row_index": row_index})
        graph_docs = doc_transformer.convert_to_graph_documents([fake_doc])
        signal.alarm(0)
    except TimeoutException:
        logging.error(f"Timeout extracting entities for {chunk_id}")
        continue
    except Exception as e:
        logging.error(f"LLM graph extraction failed for {chunk_id}: {e}")
        continue

    if not graph_docs or not graph_docs[0].nodes:
        logging.info(f"No entities found for {chunk_id}")
        continue

    # --- Step 3: Normalize entities + build relationships to the chunk ---
    graph_doc = graph_docs[0]
    entities     = []
    relationships = []
    chunk_node   = Node(id=chunk_id, type="Chunk")

    for n in graph_doc.nodes:
        raw_id   = (n.id or "").strip()
        raw_type = (n.type or "").strip()
        if not raw_id or not raw_type:
            logging.warning(f"Skipping malformed entity: id={raw_id}, type={raw_type}")
            continue

        eid = raw_id.lower()
        n.id   = eid
        n.type = "Entity"
        n.properties["id"]            = eid
        n.properties["original_type"] = raw_type

        entities.append(n)
        relationships.append(
            Relationship(source=chunk_node, target=n, type="HAS_ENTITY")
        )

    graph_doc.nodes = entities
    graph_doc.relationships = relationships

    # --- Step 4: Manually MERGE each Entity + HAS_ENTITY rel per chunk ---
    signal.alarm(TIMEOUT_SECONDS)
    try:
        # ❶ upsert all entities for this chunk
        for ent in entities:
            graph.query(
                """
                MERGE (e:Entity {id: $entity_id})
                SET e.original_type = $original_type
                """,
                {
                    "entity_id":    ent.id,
                    "original_type": ent.properties["original_type"],
                },
            )

        # ❷ upsert the relationships from chunk→entity
        for ent in entities:
            graph.query(
                """
                MATCH (c:Chunk {id: $chunk_id})
                MATCH (e:Entity {id: $entity_id})
                MERGE (c)-[:HAS_ENTITY]->(e)
                """,
                {
                    "chunk_id":    chunk_id,
                    "entity_id":   ent.id,
                },
            )

        signal.alarm(0)
        logging.info(f"✅ Entities & relationships inserted for {chunk_id}")
    except TimeoutException:
        logging.error(f"⏰ Timeout inserting entities for {chunk_id}")
        signal.alarm(0)
        continue

2025-08-06 13:33:41,655 Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT unique_document_id IF NOT EXISTS FOR (e:Document) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT unique_document_id FOR (e:Document) REQUIRE (e.id) IS UNIQUE` already exists.} {position: None} for query: 'CREATE CONSTRAINT unique_document_id IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE'
2025-08-06 13:33:41,720 Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Schema.IndexOrConstraintAlreadyExists} {category: SCHEMA} {title: `CREATE CONSTRAINT unique_chunk_id IF NOT EXISTS FOR (e:Chunk) REQUIRE (e.id) IS UNIQUE` has no effect.} {description: `CONSTRAINT unique_chunk_id FOR (e:Chunk) REQUIRE (e.id) IS UNIQUE` already exists.} {position: None} for query: 'CREATE CONSTRAINT unique_chunk_id    IF NOT EXISTS FOR (c:Chun

# Test the retriever 

In [6]:
import sys
import os
from dotenv import load_dotenv

# Adjust path to include the module directory (if needed)
#sys.path.append("/Users/christel/Desktop/Thesis/thesis_repo/src/retrievers/graphrag")
#/Users/christel/Desktop/Thesis/thesis_repo/src/retrievers/graphrag/graphRAG_retriever.py
# Add the full path to the parent "src" folder
sys.path.append("/Users/christel/Desktop/Thesis/thesis_repo")

# Load environment variables
load_dotenv()

# Now import your function
from src.retrievers.graphrag.graphRAG_retriever import retrieve
# Ensure your .env is loaded (if not already)


# Call the retriever with a test query
query = "For STT, what was the percent change in the value of commercial paper outstanding between 2010 and 2011?"
results = retrieve(query)

# Display results
print("=== Stage 1 Output ===")
print(results["stage1"])

print("\n=== Top Retrieved Chunks ===")
for r in results["candidates"]:
    print(f"Chunk ID: {r['chunkId']}")
    print(f"Fused Score: {r['fusedScore']:.4f}")
    print(f"Entities: {r['entities']}")
    print(f"Text: {r['text'][:300]}...\n")


2025-08-06 17:07:23,871 HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


=== Stage 1 Output ===
{'expandedText': 'For STT, what was the percent change in the value of commercial paper outstanding between 2010 and 2011?', 'entityIds_hi': [], 'entityIds_med': [], 'filters': {'year': [2010, 2011], 'period': None, 'issuer_texts': ['STT']}}

=== Top Retrieved Chunks ===
Chunk ID: row_5962_chunk_0
Fused Score: 0.8411
Entities: []
Text: ['we maintain an effective universal shelf registration that allows for the public offering and sale of debt securities , capital securities , common stock , depositary shares and preferred stock , and warrants to purchase such securities , including any shares into which the preferred stock and dep...

Chunk ID: row_5848_chunk_0
Fused Score: 0.8025
Entities: []
Text: ['page 26 of 100 our calculation of adjusted net earnings is summarized below: .'] ( $ in millions except per share amounts ) 2010 2009 2008 net earnings attributable to ball corporation as reported $ 468.0 $ 387.9 $ 319.5 discontinued operations net of tax 74.9 2.2 -