#### Confirm Working


In [2]:
import sys

# Sanity check: report which interpreter backs this kernel, then confirm the
# kernel is alive. One print call, newline-separated, yields the same two lines.
print(sys.executable, "Kernel working. Proceed", sep="\n")

/home/sepeh/.cache/pypoetry/virtualenvs/graphrag-developer-challenge3-4suqu0LP-py3.12/bin/python
Kernel working. Proceed


#### Connect to Neo4j


In [3]:
import sys
from pathlib import Path

# The parent of the current working directory is treated as the repository
# root that contains the project-local `KnowledgeGraph` package.
current_dir = Path.cwd()
parent_dir = current_dir.parent
print(f"Current directory: {current_dir}")
print(f"Parent directory: {parent_dir}")

# Put that root at the front of the import search path, but only once, so
# re-running this cell does not keep growing sys.path.
repo_root_entry = str(parent_dir)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)
print(f"KnowledgeGraph exists: {(parent_dir / 'KnowledgeGraph').exists()}")

# These imports must stay below the sys.path adjustment above.
from KnowledgeGraph.config import load_neo4j_graph
from KnowledgeGraph.NodeRag import (
    ensure_vector_index,
    preprocess_and_ingest,
)

# Only the first value produced by load_neo4j_graph() is kept; any remaining
# values are discarded.
graph, *_ = load_neo4j_graph()
print("Successfully connected to Neo4j!")

Current directory: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3/Main Functions
Parent directory: /mnt/c/Users/sepeh/OneDrive/Documents/Git/GraphRag_Developer_Challenge3
KnowledgeGraph exists: True


  from .autonotebook import tqdm as notebook_tqdm


Successfully connected to Neo4j!


#### Reset NodeRAG graph


In [3]:
# Cypher that removes every node carrying one of the NodeRAG labels, together
# with any relationships attached to those nodes (DETACH DELETE).
purge_cypher = """
    MATCH (n)
    WHERE n:NR_Passage OR n:NR_Document OR n:NR_Entity OR n:NR_Relationship
    DETACH DELETE n
    """

print("Clearing existing NodeRAG nodes...")
graph.query(purge_cypher)
print("Nodes cleared.")

# Dropping the index is best-effort: a failure is reported and the cell
# continues instead of aborting the reset.
try:
    graph.query("DROP INDEX `NR_Passage` IF EXISTS")
except Exception as exc:
    print(f"Index drop skipped: {exc}")
else:
    print("Dropped vector index `NR_Passage` (if it existed).")

Clearing existing NodeRAG nodes...
Nodes cleared.
Dropped vector index `NR_Passage` (if it existed).


#### Create NodeRAG vector index


In [4]:
# Ask the project helper (imported from KnowledgeGraph.NodeRag) to set up the
# vector index on the connected graph. Its internals are not visible here.
ensure_vector_index(graph=graph)
# NOTE(review): this message is printed unconditionally once the call returns;
# it does not verify the index itself — presumably the helper raises on
# failure (confirm).
print("Vector index `NR_Passage` is ready.")

Vector index `NR_Passage` is ready.


#### Ingest NodeRAG data


In [None]:
# Run the project's preprocessing + ingestion helper against the connected
# graph with progress display enabled, keeping its return value for reporting.
# NOTE(review): presumably this is what populates the NR_* nodes counted in the
# verification cell — confirm in KnowledgeGraph.NodeRag.
stats = preprocess_and_ingest(graph=graph, show_progress=True)
print("Ingestion summary:", stats)

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 137218.23it/s]
Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 127875.12it/s]


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


#### Verify counts


In [4]:
def _count_of(cypher):
    """Run a count query on `graph` and return the `c` value of its first row."""
    return graph.query(cypher)[0]["c"]


# One count per NodeRAG label.
doc_count = _count_of("MATCH (d:NR_Document) RETURN count(d) AS c")
passage_count = _count_of("MATCH (p:NR_Passage) RETURN count(p) AS c")
entity_count = _count_of("MATCH (e:NR_Entity) RETURN count(e) AS c")
relationship_count = _count_of("MATCH (r:NR_Relationship) RETURN count(r) AS c")

# Emit the four summary lines in the same order, one per line.
print(
    f"Documents: {doc_count}",
    f"Passages: {passage_count}",
    f"Entities: {entity_count}",
    f"Relationship nodes: {relationship_count}",
    sep="\n",
)



Documents: 0
Passages: 0
Entities: 0
Relationship nodes: 0
