# Create the knowledge graph 

In [None]:
import json

# Read the precomputed chunk records from disk into memory.
with open(
    "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/Keep/embedded_chunks.json",
    mode="r",
    encoding="utf-8",
) as fh:
    embedded_chunks = json.load(fh)

# Sanity check: report how many records were loaded and, when there is at
# least one, pretty-print the final record.
print(f"Loaded {len(embedded_chunks)} embedded chunks")
if embedded_chunks:
    last_sample = embedded_chunks[-1]
    print("Last sample in embedded_chunks:")
    pretty = json.dumps(last_sample, indent=2)
    print(pretty)

Loaded 56325 embedded chunks
Last sample in embedded_chunks:
{
  "chunk_id": "row_15839_chunk_0",
  "row_index": 15839,
  "text": "['Schwab\u2019s principal contractual obligations as of December 31, 2023 include payments on brokered CDs; payments on FHLB borrowings, other short-term borrowings, and long-term debt; lease payments including legally-binding minimum lease payments for leases signed but not yet commenced; credit-related financial instruments, representing our banking subsidiaries\u2019 commitments to extend credit to banking clients, purchase mortgage loans, and fund CRA investments; and purchase obligations for services such as advertising and marketing, telecommunications, hardware- and software-related agreements, and professional services. For information on our contractual obligations for brokered CDs, FHLB borrowings, other short-term borrowings, long-term debt, leases, and credit-related financial instruments, see Item 8 \u2013 Notes 11, 12, 13, and 14. As of Decemb

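Test run: push a single chunk through the whole pipeline (Document/Chunk upsert, entity extraction, graph write):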

In [None]:
from neo4j import GraphDatabase
import os

def get_driver():
    """Create a Neo4j driver from the ``NEO4J_*`` environment variables.

    Reads ``NEO4J_URI``, ``NEO4J_USERNAME`` and ``NEO4J_PASSWORD``.

    Returns:
        Whatever ``GraphDatabase.driver`` returns for that URI, authenticated
        with the ``(username, password)`` pair.

    Raises:
        RuntimeError: If any of the three variables is unset or empty.
    """
    settings = [
        os.getenv(name)
        for name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
    ]
    # Fail before touching the driver when anything is missing or blank.
    for value in settings:
        if not value:
            raise RuntimeError("Missing Neo4j connection info in environment variables")

    uri, user, pwd = settings
    return GraphDatabase.driver(uri, auth=(user, pwd))

In [None]:
# Select the record used for the single-chunk test run.  next() is given a
# default so that a missing id fails with a descriptive error instead of a
# bare StopIteration.
test_chunk = next(
    (c for c in embedded_chunks if c["chunk_id"] == "row_5823_chunk_0"),
    None,
)
if test_chunk is None:
    raise LookupError("Chunk 'row_5823_chunk_0' not found in embedded_chunks")

In [None]:
# Show the selected test record in full.
# NOTE(review): if the record carries its embedding vector, the whole vector
# is printed too — confirm that is wanted.
print(test_chunk)

In [None]:
import os
import json
import logging
import signal

from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential

from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_neo4j import Neo4jGraph
from langchain_community.graphs.graph_document import Node, Relationship

# === Setup Logging ===
# Root logger at INFO; each record is prefixed with its timestamp.
logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

# === Load Environment Variables ===
# Runs before the os.getenv(...) lookups for the OpenAI key and NEO4J_* settings.
load_dotenv()

# === Config ===
# NOTE(review): the first notebook cell reads this file name from
# ".../data_processed/Keep/" — confirm which location is current.
EMBEDDINGS_JSON: str = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json"
# Name of the Chunk node property that receives the embedding vector.
CHUNK_VECTOR_PROPERTY: str = "embedding"
# Seconds handed to signal.alarm() around each pipeline step.
TIMEOUT_SECONDS: int = 300

# === Init LLM ===
@retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
def init_llm():
    """Construct the chat model used for entity extraction.

    The tenacity decorator re-invokes this function on failure: at most three
    attempts, with exponential waiting configured as multiplier=1, min=4,
    max=10.

    Returns:
        A ``ChatOpenAI`` instance for model ``gpt-4o-mini`` that uses the key
        read from ``OPENAI_API_KEY``.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    return ChatOpenAI(model_name="gpt-4o-mini", openai_api_key=api_key)


llm = init_llm()

# === Init Neo4j client ===
# Connection settings come from the environment; an unset variable is passed
# through as None.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI"),
    username=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
)

# === Timeout Utilities ===
class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM timer goes off."""


def timeout_handler(signum, frame):
    """SIGALRM handler that interrupts the running code.

    Both arguments are supplied by the signal machinery and are not used.

    Raises:
        TimeoutException: Always.
    """
    raise TimeoutException


# After this, signal.alarm(n) makes TimeoutException surface n seconds later.
signal.signal(signal.SIGALRM, timeout_handler)

# Uniqueness constraints on `id` for the three node labels written by this
# cell; IF NOT EXISTS makes re-running the cell harmless.
for constraint_cypher in (
    "CREATE CONSTRAINT unique_document_id IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE",
    "CREATE CONSTRAINT unique_chunk_id    IF NOT EXISTS FOR (c:Chunk)    REQUIRE c.id IS UNIQUE",
    "CREATE CONSTRAINT unique_entity_id   IF NOT EXISTS FOR (e:Entity)   REQUIRE e.id IS UNIQUE",
):
    graph.query(constraint_cypher)

# === Load precomputed embeddings ===
# Decode explicitly as UTF-8 (as the first notebook cell does) so the result
# does not depend on the platform's default encoding.
with open(EMBEDDINGS_JSON, "r", encoding="utf-8") as f:
    embedded_chunks = json.load(f)

# === test sample ===
# next() gets a default so a missing id produces a descriptive error rather
# than a bare StopIteration.
test_chunk = next(
    (c for c in embedded_chunks if c["chunk_id"] == "row_5823_chunk_0"),
    None,
)
if test_chunk is None:
    raise LookupError(f"Chunk 'row_5823_chunk_0' not found in {EMBEDDINGS_JSON}")

# Unpack the fields used by the steps below.
chunk_id   = test_chunk["chunk_id"]
row_index  = test_chunk["row_index"]
text       = test_chunk["text"]
embedding  = test_chunk["embedding"]

logging.info(f"Inserting single chunk: {chunk_id} (row_index={row_index})")

# --- Step 1: MERGE Document, Chunk and set properties + vector ---
# One Cypher statement upserts the Document (id = str(row_index)) and the
# Chunk, links them with PART_OF, and stores the embedding through
# db.create.setNodeVectorProperty.  The SIGALRM timer bounds the call.
signal.alarm(TIMEOUT_SECONDS)
try:
    graph.query(
        """
        MERGE (d:Document {id: $doc_id})
        MERGE (c:Chunk    {id: $chunk_id})
        SET d.row_index = $row_index,
            c.text       = $text
        MERGE (c)-[:PART_OF]->(d)
        WITH c
        CALL db.create.setNodeVectorProperty(c, $vector_prop, $vector)
        """,
        {
            "doc_id":      str(row_index),
            "chunk_id":    chunk_id,
            "row_index":   row_index,
            "text":        text,
            "vector_prop": CHUNK_VECTOR_PROPERTY,
            "vector":      embedding,
        }
    )
    signal.alarm(0)
    logging.info("Document & Chunk inserted (with embedding).")
except TimeoutException:
    logging.error("Timeout during chunk/document insertion.")
    raise
finally:
    # Disarm the timer on every exit path.  Without this, a non-timeout error
    # leaves SIGALRM pending and it fires later in an unrelated cell.
    signal.alarm(0)


# --- Step 2: Extract entities from the chunk text ---
doc_transformer = LLMGraphTransformer(llm=llm)

# The chunk text is wrapped in a one-off Document so the transformer can
# consume it; the SIGALRM timer bounds the LLM call.
signal.alarm(TIMEOUT_SECONDS)
try:
    fake_doc   = Document(page_content=text, metadata={"row_index": row_index})
    graph_docs = doc_transformer.convert_to_graph_documents([fake_doc])
    signal.alarm(0)
    if not graph_docs or not graph_docs[0].nodes:
        logging.warning("No entities extracted.")
except TimeoutException:
    logging.error("Timeout during entity extraction.")
    raise
finally:
    # Disarm the timer on every exit path.  Without this, a non-timeout error
    # (e.g. an API failure) leaves SIGALRM pending and it fires later in an
    # unrelated cell.
    signal.alarm(0)

# --- Step 3: Normalize the extracted nodes + build HAS_ENTITY rels ---
# Every usable node is relabelled to the single type "Entity", its id is
# lower-cased, and the type reported by the LLM is kept as a property.
for graph_doc in graph_docs:
    kept_nodes = []
    chunk_links = []

    for node in graph_doc.nodes:
        raw_id = (node.id or "").strip()
        raw_type = (node.type or "").strip()

        # A node needs both an id and a type to be usable.
        if not (raw_id and raw_type):
            logging.warning(f"Skipping malformed: id={raw_id}, type={raw_type}")
            continue

        eid = raw_id.lower()
        node.id = eid
        node.type = "Entity"
        node.properties.update({"id": eid, "original_type": raw_type})
        kept_nodes.append(node)

        # Relationship from the existing chunk to this entity.
        chunk_source = Node(id=chunk_id, type="Chunk")
        chunk_links.append(
            Relationship(source=chunk_source, target=node, type="HAS_ENTITY")
        )

    # Replace the LLM output with the normalized nodes and chunk links.
    graph_doc.nodes = kept_nodes
    graph_doc.relationships = chunk_links


# --- Step 4: Push entities + HAS_ENTITY → chunk
# Written with explicit Cypher, the same pattern the full-run cell uses,
# instead of graph.add_graph_documents(..., baseEntityLabel="Entity").
# That argument is a boolean switch, not a label name: a truthy value makes
# the library merge nodes and relationship endpoints under its own base
# label, so the HAS_ENTITY edge would not attach to the :Chunk from step 1.
signal.alarm(TIMEOUT_SECONDS)
try:
    for graph_doc in graph_docs:
        for ent in graph_doc.nodes:
            graph.query(
                """
                MERGE (e:Entity {id: $entity_id})
                SET e.original_type = $original_type
                WITH e
                MATCH (c:Chunk {id: $chunk_id})
                MERGE (c)-[:HAS_ENTITY]->(e)
                """,
                {
                    "chunk_id":      chunk_id,
                    "entity_id":     ent.id,
                    "original_type": ent.properties.get("original_type"),
                },
            )
    signal.alarm(0)
    logging.info("Entities + relationships inserted.")
except TimeoutException:
    logging.error("Timeout during graph insertion.")
    raise
finally:
    # Disarm the timer on every exit path so a failed write cannot leave
    # SIGALRM pending in the kernel.
    signal.alarm(0)


Full run: process every chunk (resuming from `START_FROM_ROW`):

In [None]:
import os
import json
import logging
import signal

from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_exponential

from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_neo4j import Neo4jGraph
from langchain_community.graphs.graph_document import Node, Relationship

# === Setup Logging ===
# Root logger at INFO; each record is prefixed with its timestamp.
logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)

# === Load Environment Variables ===
# Runs before the os.getenv(...) lookups for the OpenAI key and NEO4J_* settings.
load_dotenv()

# === Config ===
# NOTE(review): the first notebook cell reads this file name from
# ".../data_processed/Keep/" — confirm which location is current.
EMBEDDINGS_JSON: str = "/Users/christel/Desktop/Thesis/thesis_repo/data/data_processed/embedded_chunks.json"
# Name of the Chunk node property that receives the embedding vector.
CHUNK_VECTOR_PROPERTY: str = "embedding"
# Seconds handed to signal.alarm() around each pipeline step.
TIMEOUT_SECONDS: int = 300
# Records whose row_index is below this value are skipped by the main loop.
START_FROM_ROW: int = 5823

# === Init LLM with retries ===
_llm_retry = retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)


@_llm_retry
def init_llm():
    """Return the ``gpt-4o-mini`` chat model used by the graph transformer.

    The API key is read from ``OPENAI_API_KEY``.  The tenacity policy above
    re-invokes the function on failure: three attempts at most, exponential
    waiting with multiplier=1, min=4, max=10.
    """
    settings = {
        "openai_api_key": os.getenv("OPENAI_API_KEY"),
        "model_name": "gpt-4o-mini",
    }
    return ChatOpenAI(**settings)


llm = init_llm()

# === Init Neo4j client ===
# Connection settings come from the environment; an unset variable is passed
# through as None.
graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI"),
    username=os.environ.get("NEO4J_USERNAME"),
    password=os.environ.get("NEO4J_PASSWORD"),
)

# === Timeout Utilities ===
class TimeoutException(Exception):
    """Signals that a step was still running when its SIGALRM timer expired."""


def timeout_handler(signum, frame):
    """Turn a delivered SIGALRM into a ``TimeoutException``.

    The two arguments come from the signal machinery and are ignored.
    """
    raise TimeoutException


# Register the handler; signal.alarm(n) then interrupts work after n seconds.
signal.signal(signal.SIGALRM, timeout_handler)

# === Ensure uniqueness constraints ===
# One constraint on `id` per node label used below; IF NOT EXISTS makes
# re-running the cell harmless.
for constraint_cypher in (
    "CREATE CONSTRAINT unique_document_id IF NOT EXISTS FOR (d:Document) REQUIRE d.id IS UNIQUE",
    "CREATE CONSTRAINT unique_chunk_id    IF NOT EXISTS FOR (c:Chunk)    REQUIRE c.id IS UNIQUE",
    "CREATE CONSTRAINT unique_entity_id   IF NOT EXISTS FOR (e:Entity)   REQUIRE e.id IS UNIQUE",
):
    graph.query(constraint_cypher)

# === Load Precomputed Embeddings ===
# Decode explicitly as UTF-8 (as the first notebook cell does) so the result
# does not depend on the platform's default encoding.
with open(EMBEDDINGS_JSON, "r", encoding="utf-8") as f:
    embedded_chunks = json.load(f)

# === Prepare the LLM→Graph transformer ===
# Built once and reused for every chunk in the main loop.
doc_transformer = LLMGraphTransformer(llm=llm)

# === Main Loop over all chunks ===
# Each record goes through four steps:
#   1. upsert its Document and Chunk nodes (text + embedding),
#   2. let the LLM extract entities from the text,
#   3. normalize those entities,
#   4. upsert the entities and link them to the chunk.
# A failure or timeout in any step is logged and the loop moves on to the
# next record.  Every step disarms its SIGALRM timer in a `finally`, so no
# alarm stays pending after an error.
for item in embedded_chunks:
    chunk_id  = item["chunk_id"]
    row_index = item["row_index"]
    text      = item["text"]
    vector    = item["embedding"]

    # Skip until we reach our starting row
    if row_index < START_FROM_ROW:
        continue

    logging.info(f"Processing chunk {chunk_id} (row_index={row_index})")

    # --- Step 1: MERGE Document & Chunk + set text & embedding ---
    signal.alarm(TIMEOUT_SECONDS)
    try:
        graph.query(
            """
            MERGE (d:Document {id: $doc_id})
            SET d.row_index = $row_index
            MERGE (c:Chunk {id: $chunk_id})
            SET c.text = $text
            MERGE (c)-[:PART_OF]->(d)
            WITH c
            CALL db.create.setNodeVectorProperty(c, $vector_prop, $vector)
            """,
            {
                "doc_id":      str(row_index),
                "row_index":   row_index,
                "chunk_id":    chunk_id,
                "text":        text,
                "vector_prop": CHUNK_VECTOR_PROPERTY,
                "vector":      vector,
            }
        )
        signal.alarm(0)
        logging.info("Document & Chunk upserted (with embedding).")
    except TimeoutException:
        logging.error(f"Timeout inserting chunk {chunk_id}")
        continue
    except Exception as e:
        logging.error(f"Error inserting chunk {chunk_id}: {e}")
        continue
    finally:
        signal.alarm(0)

    # --- Step 2: Extract entities from the chunk text ---
    signal.alarm(TIMEOUT_SECONDS)
    try:
        fake_doc   = Document(page_content=text, metadata={"row_index": row_index})
        graph_docs = doc_transformer.convert_to_graph_documents([fake_doc])
        signal.alarm(0)
    except TimeoutException:
        logging.error(f"Timeout extracting entities for {chunk_id}")
        continue
    except Exception as e:
        logging.error(f"LLM graph extraction failed for {chunk_id}: {e}")
        continue
    finally:
        signal.alarm(0)

    if not graph_docs or not graph_docs[0].nodes:
        logging.info(f"No entities found for {chunk_id}")
        continue

    # --- Step 3: Normalize entities + build relationships to the chunk ---
    # Ids are lower-cased, every node becomes type "Entity", and the type
    # reported by the LLM is kept in the `original_type` property.
    graph_doc = graph_docs[0]
    entities     = []
    relationships = []
    chunk_node   = Node(id=chunk_id, type="Chunk")

    for n in graph_doc.nodes:
        raw_id   = (n.id or "").strip()
        raw_type = (n.type or "").strip()
        if not raw_id or not raw_type:
            logging.warning(f"Skipping malformed entity: id={raw_id}, type={raw_type}")
            continue

        eid = raw_id.lower()
        n.id   = eid
        n.type = "Entity"
        n.properties["id"]            = eid
        n.properties["original_type"] = raw_type

        entities.append(n)
        relationships.append(
            Relationship(source=chunk_node, target=n, type="HAS_ENTITY")
        )

    graph_doc.nodes = entities
    graph_doc.relationships = relationships

    # --- Step 4: Manually MERGE each Entity + HAS_ENTITY rel per chunk ---
    # One UNWIND statement upserts every entity and its link to the chunk,
    # replacing two round trips per entity with a single query.  Rows are
    # applied in list order, so a repeated id still ends with the last
    # original_type.
    signal.alarm(TIMEOUT_SECONDS)
    try:
        graph.query(
            """
            UNWIND $entities AS ent
            MERGE (e:Entity {id: ent.id})
            SET e.original_type = ent.original_type
            WITH e
            MATCH (c:Chunk {id: $chunk_id})
            MERGE (c)-[:HAS_ENTITY]->(e)
            """,
            {
                "chunk_id": chunk_id,
                "entities": [
                    {
                        "id":            ent.id,
                        "original_type": ent.properties["original_type"],
                    }
                    for ent in entities
                ],
            },
        )
        signal.alarm(0)
        logging.info(f"Entities & relationships inserted for {chunk_id}")
    except TimeoutException:
        logging.error(f"Timeout inserting entities for {chunk_id}")
        continue
    except Exception as e:
        # Same policy as steps 1 and 2: a database error on one chunk must
        # not abort the whole run.
        logging.error(f"Error inserting entities for {chunk_id}: {e}")
        continue
    finally:
        signal.alarm(0)

# Implement the Retriever 

In [None]:
import sys
import os
from dotenv import load_dotenv

#sys.path.append("/Users/christel/Desktop/Thesis/thesis_repo/src/retrievers/graphrag")
#/Users/christel/Desktop/Thesis/thesis_repo/src/retrievers/graphrag/graphRAG_retriever.py
# Make the repository root importable so the `src` package resolves.
sys.path.append("/Users/christel/Desktop/Thesis/thesis_repo")

# Load .env before the retriever module is imported.
load_dotenv()
from src.retrievers.graphrag.graphRAG_retriever import retrieve

# Run one example question through the retriever.
query = "For STT, what was the percent change in the value of commercial paper outstanding between 2010 and 2011?"
results = retrieve(query)

# First-stage output, printed as returned.
print("=== Stage 1 Output ===")
print(results["stage1"])

# One entry per candidate: id, fused score, entities, and the first 300
# characters of its text.
print("\n=== Top Retrieved Chunks ===")
for hit in results["candidates"]:
    print(f"Chunk ID: {hit['chunkId']}")
    print(f"Fused Score: {hit['fusedScore']:.4f}")
    print(f"Entities: {hit['entities']}")
    print(f"Text: {hit['text'][:300]}...\n")
