# GraphRAG Prep (neo4j-graphrag)

This notebook mirrors the original prep workflow but relies on the official `neo4j-graphrag` utilities for vector indexing, embedding, and optional retrieval checks.



In [None]:
import sys

# Report the interpreter backing this kernel, then confirm the cell executed.
print(sys.executable, "Kernel ready.", sep="\n")



## Environment Setup

Load project configuration and verify required Neo4j credentials.



In [None]:
import os
from pathlib import Path

try:
    from dotenv import load_dotenv
except ImportError:
    load_dotenv = None

# The project root is the parent of the current working directory; the
# optional ``.env`` file is looked up there.
PROJECT_ROOT = Path("..").resolve()
ENV_PATH = PROJECT_ROOT / ".env"

# Load the file only when python-dotenv was importable and the file exists.
if load_dotenv is not None and ENV_PATH.exists():
    load_dotenv(ENV_PATH)

# Connection settings come from the process environment.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

# Collect the names of all settings that are unset or empty so that a single
# error reports every missing variable at once.
missing = [
    name
    for name, value in zip(
        ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD"),
        (NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD),
    )
    if not value
]
if missing:
    raise ValueError(
        "Missing required environment variables: " + ", ".join(missing)
    )

print("Loaded Neo4j credentials from environment.")



## Connect to Neo4j

Initialize a shared Neo4j driver that downstream steps reuse.



In [None]:
from neo4j import GraphDatabase

# An unset or empty NEO4J_DATABASE collapses to None
# (presumably selecting the server's default database; confirm).
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE") or None

# One driver for the whole notebook; later cells open their sessions from it.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

print("Driver initialized.")



In [None]:
# Round-trip a trivial query to confirm that the URI and credentials work.
with driver.session(database=NEO4J_DATABASE) as session:
    result = session.run("RETURN 1 AS ok")
    record = result.single()
    print(record["ok"])



## Optional Reset

Use this only when you need a clean workspace. The cell below is commented out by default: uncomment it, then re-comment any `session.run` lines you do not require.



In [None]:
# Uncomment the lines below if you want to wipe previously generated lexical graph data.
# with driver.session(database=NEO4J_DATABASE) as session:
#     session.run("MATCH (c:Chunk) DETACH DELETE c")
#     session.run("MATCH (d:Document) DETACH DELETE d")
#     session.run("MATCH (n:__Entity__) DETACH DELETE n")
#     session.run("MATCH ()-[r:FROM_DOCUMENT|NEXT_CHUNK|FROM_CHUNK]->() DELETE r")
#     print("Cleared lexical graph nodes and relationships.")



## Configure Neo4j GraphRAG Pipeline

The official `SimpleKGPipeline` orchestrates chunking, entity extraction, and embedding. Adjust the schema or model choices as needed for your corpus.



In [None]:
import asyncio
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.llm.openai_llm import OpenAILLM

# Model selection and generation parameters, each overridable via environment.
EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
LLM_MODEL = os.getenv("OPENAI_LLM_MODEL", "gpt-4o")
LLM_MAX_TOKENS = int(os.getenv("OPENAI_LLM_MAX_TOKENS", "2000"))
LLM_TEMPERATURE = float(os.getenv("OPENAI_LLM_TEMPERATURE", "0"))

# Extraction schema handed to the pipeline: node labels, relationship types,
# and the (source, relationship, target) triples built from them.
# NOTE(review): "Document" is also the label this notebook counts and resets as
# lexical-graph data; confirm that extracted "Document" entities sharing that
# label is intended.
SCHEMA_CONFIG = {
    "node_types": ["Person", "Organization", "Location", "Event", "Document"],
    "relationship_types": [
        "MENTIONS",
        "ASSOCIATED_WITH",
        "LOCATED_IN",
        "RELATED_TO",
        "DESCRIBES",
    ],
    "patterns": [
        ("Person", "ASSOCIATED_WITH", "Organization"),
        ("Organization", "LOCATED_IN", "Location"),
        ("Event", "RELATED_TO", "Organization"),
        ("Document", "DESCRIBES", "Event"),
        ("Document", "MENTIONS", "Person"),
    ],
}

# LLM passed to the pipeline, configured from the settings above.
llm = OpenAILLM(
    model_name=LLM_MODEL,
    model_params={"max_tokens": LLM_MAX_TOKENS, "temperature": LLM_TEMPERATURE},
)

# Embedding model passed to the pipeline.
embedder = OpenAIEmbeddings(model=EMBED_MODEL)

# The pipeline reuses the shared driver and targets the configured database.
# Inputs are supplied as raw text rather than PDFs (from_pdf=False).
kg_pipeline = SimpleKGPipeline(
    llm=llm,
    driver=driver,
    embedder=embedder,
    schema=SCHEMA_CONFIG,
    from_pdf=False,
    on_error="IGNORE",
    perform_entity_resolution=True,
    neo4j_database=NEO4J_DATABASE,
)

print(f"Pipeline ready (LLM={LLM_MODEL}, embedding={EMBED_MODEL}).")



## Run KG Builder on Markdown Sources

Point the pipeline at your markdown drop. Each file is chunked, embedded, and merged into Neo4j automatically.



In [None]:
from typing import Iterable

try:
    from tqdm.auto import tqdm
except ImportError:
    tqdm = None

# Markdown sources live under <project root>/KnowledgeGraph/source_data.
SOURCE_DIR = PROJECT_ROOT.joinpath("KnowledgeGraph", "source_data").resolve()
if not SOURCE_DIR.exists():
    raise FileNotFoundError(f"Missing source directory: {SOURCE_DIR}")

# PIPELINE_FILE_LIMIT of 0 (or unset) means "no limit": ``int(...) or None``
# maps 0 to None.
LIMIT_FILES = int(os.getenv("PIPELINE_FILE_LIMIT", "0")) or None
RUN_PIPELINE = False  # flip to True to start ingestion

# Recursive, sorted discovery. A slice whose stop is None keeps the whole
# list, so the limit needs no separate branch.
md_files: list[Path] = sorted(SOURCE_DIR.rglob("*.md"))[:LIMIT_FILES]

print(f"Found {len(md_files)} markdown files under {SOURCE_DIR}.")

async def _process_files(files: Iterable[Path]) -> None:
    """Ingest each file in *files* through ``kg_pipeline``, one at a time.

    Every file is read as UTF-8 and handed to ``kg_pipeline.run_async``
    together with its path; the path is also recorded as ``source_path`` in
    the document metadata. When ``tqdm`` is available a progress bar wraps
    the iteration.

    Files that are empty or contain only whitespace are skipped with a
    message instead of being sent to the pipeline.

    Args:
        files: Paths of the text files to ingest, processed in iteration order.
    """
    iterator = files
    if tqdm is not None:
        iterator = tqdm(files, desc="Ingesting", unit="file")
    for path in iterator:
        text = path.read_text(encoding="utf-8")
        # NOTE(review): SimpleKGPipeline appears to reject an empty ``text``
        # when from_pdf=False, which would abort the whole batch on the first
        # empty file; confirm against the installed neo4j-graphrag version.
        if not text.strip():
            print(f"Skipping empty file: {path}")
            continue
        metadata = {"source_path": str(path)}
        await kg_pipeline.run_async(
            file_path=str(path),
            text=text,
            document_metadata=metadata,
        )

def _run_blocking(coro):
    """Run *coro* to completion and return its result.

    ``asyncio.run`` raises ``RuntimeError`` when the calling thread already
    has a running event loop, as is the case inside a Jupyter/IPython kernel.
    In that situation the coroutine is driven by ``asyncio.run`` on a
    short-lived worker thread, and this call blocks until it finishes.
    Exceptions raised by the coroutine propagate to the caller either way.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_running = False
    else:
        loop_running = True

    if not loop_running:
        # Plain script or fresh thread: asyncio.run can own the loop directly.
        return asyncio.run(coro)

    # Imported here so the cell does not depend on an import in another cell.
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


if RUN_PIPELINE:
    _run_blocking(_process_files(md_files))
    print("Ingestion finished.")
else:
    print("Pipeline not executed. Set RUN_PIPELINE = True to run.")



## Vector Index Setup

The pipeline stores chunk embeddings on `Chunk.embedding`. Use the official helpers to manage the vector index.


In [None]:
from neo4j_graphrag.indexes import (
    create_vector_index,
    drop_index_if_exists,
    retrieve_vector_index_info,
)

# The index covers the ``embedding`` property of ``Chunk`` nodes.
VECTOR_LABEL = "Chunk"
VECTOR_PROPERTY = "embedding"

# Name, dimension and similarity function can be overridden via environment.
# NOTE(review): the default dimension of 1536 presumably matches the default
# embedding model; confirm when OPENAI_EMBED_MODEL is changed.
VECTOR_INDEX_NAME = os.environ.get("NEO4J_VECTOR_INDEX", "chunk_embedding_idx")
EMBED_DIM = int(os.environ.get("OPENAI_EMBED_DIM", "1536"))
SIMILARITY_FN = os.environ.get("NEO4J_SIMILARITY_FN", "cosine").lower()

# The index is dropped and recreated only when ingestion is enabled.
if RUN_PIPELINE:
    drop_index_if_exists(driver, VECTOR_INDEX_NAME, neo4j_database=NEO4J_DATABASE)
    create_vector_index(
        driver=driver,
        name=VECTOR_INDEX_NAME,
        label=VECTOR_LABEL,
        embedding_property=VECTOR_PROPERTY,
        dimensions=EMBED_DIM,
        similarity_fn=SIMILARITY_FN,
        neo4j_database=NEO4J_DATABASE,
    )

# The lookup always runs, whether or not the index was (re)created above.
index_info = retrieve_vector_index_info(
    driver=driver,
    index_name=VECTOR_INDEX_NAME,
    label_or_type=VECTOR_LABEL,
    embedding_property=VECTOR_PROPERTY,
    neo4j_database=NEO4J_DATABASE,
)
print("Index info:", index_info)



## Validate Neo4j Load

Run quick diagnostics to confirm that documents, chunks, and embeddings landed as expected.



In [None]:
from textwrap import dedent


def _single_value(query: str, key: str = "c"):
    """Run *query* and return field *key* of its single record.

    Returns 0 when ``single()`` yields no record (or a falsy one).
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        row = session.run(query).single()
        if not row:
            return 0
        return row[key]


def _list_records(query: str, limit: int = 5):
    """Run *query* with *limit* bound to ``$limit`` and return ``Result.data()``."""
    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query, {"limit": limit})
        return result.data()

# Node counts per label, as a quick check on what is in the database.
stats = dict(
    documents=_single_value("MATCH (d:Document) RETURN count(d) AS c"),
    chunks=_single_value("MATCH (c:Chunk) RETURN count(c) AS c"),
    entities=_single_value("MATCH (e:__Entity__) RETURN count(e) AS c"),
)
print("Counts:", stats)

# Fetch a few chunks that carry an embedding and report each vector's length;
# only the first three rows are printed.
sample_embeddings = _list_records(
    "MATCH (c:Chunk) WHERE c.embedding IS NOT NULL "
    "RETURN elementId(c) AS id, size(c.embedding) AS dim LIMIT $limit"
)
print("Embedding sample:", sample_embeddings[:3])



## Optional: End-to-End Question Answering

Instantiate Neo4j's `GraphRAG` helper to verify retrieval quality against the freshly populated index.



In [None]:
from neo4j_graphrag.generation import GraphRAG
from neo4j_graphrag.retrievers import VectorRetriever

RUN_RAG_CHECK = False
TEST_QUESTION = "What recent announcements involve tenders or awards?"

if not RUN_RAG_CHECK:
    print("Set RUN_RAG_CHECK = True to issue a sample question.")
else:
    # Separate LLM instance for question answering; temperature is pinned to 0
    # here regardless of LLM_TEMPERATURE.
    qa_llm = OpenAILLM(
        model_name=LLM_MODEL,
        model_params={"max_tokens": LLM_MAX_TOKENS, "temperature": 0},
    )
    qa_embedder = OpenAIEmbeddings(model=EMBED_MODEL)

    # The retriever queries the vector index configured earlier in the notebook.
    retriever = VectorRetriever(
        driver=driver,
        index_name=VECTOR_INDEX_NAME,
        embedder=qa_embedder,
        neo4j_database=NEO4J_DATABASE,
    )
    graph_rag = GraphRAG(retriever=retriever, llm=qa_llm)

    # Ask the test question using the top five retrieved items; only the
    # answer is requested, not the retrieved context.
    rag_result = graph_rag.search(
        query_text=TEST_QUESTION,
        retriever_config={"top_k": 5},
        return_context=False,
    )
    print(rag_result.answer)



## Next Steps

- Update `RUN_PIPELINE` and `RUN_RAG_CHECK` flags before execution.
- Tune `SCHEMA_CONFIG` to match the entities and relationships you expect the model to extract.
- Adjust model names or environment variables to align with your OpenAI deployment.
- Once satisfied, consider checking the generated graph in Neo4j Browser (`CALL db.schema.visualization()`) or running ad-hoc Cypher validations.

