# Neo4j Import of GraphRAG Result Parquet files

In [1]:
import os
from dotenv import load_dotenv
import time
import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Get the directory of the current script
current_dir = '/Users/royzalta/Documents/github-local/harry-potter-graphrag-hebrew/notebooks'

# Get the root directory (one level up from src)
root_dir = os.path.dirname(current_dir)

# Load environment variables from the root directory
load_dotenv(os.path.join(root_dir, ".env"))

# Neo4j connection details
GRAPHRAG_FOLDER = '/Users/royzalta/Documents/github-local/harry-potter-graphrag-hebrew/notebooks/data/graphrag/output/20240911-163559/artifacts'

In [3]:
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = "neo4j"

In [4]:
print(f"NEO4J_URI: {NEO4J_URI}")
print(f"NEO4J_USER: {NEO4J_USERNAME}")
print(f"NEO4J_PASSWORD: {'*' * len(NEO4J_PASSWORD) if NEO4J_PASSWORD else 'Not set'}")
print(f"NEO4J_DATABASE: {NEO4J_DATABASE}")

NEO4J_URI: neo4j+s://115b3577.databases.neo4j.io
NEO4J_USER: neo4j
NEO4J_PASSWORD: *******************************************
NEO4J_DATABASE: neo4j


In [5]:
# Create a Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

## Batched Import
The batched import function takes a Cypher insert statement (needs to use the variable value for the row) and a dataframe to import. It will send by default 1k rows at a time as query parameter to the database to be inserted.


In [6]:
def batched_import(statement, df, batch_size=1000, neo4j_db="neo4j"):
    """
    Import a dataframe into Neo4j using a batched approach.

    Parameters:
        statement (str): The Cypher query to execute.
        df (pd.DataFrame): The dataframe to import.
        batch_size (int): The number of rows to import in each batch.
        neo4j_db (str): The database name to use (default is 'neo4j').
    """
    total = len(df)
    start_s = time.time()
    
    for start in range(0, total, batch_size):
        batch = df.iloc[start : min(start + batch_size, total)]
        result = driver.execute_query(
            "UNWIND $rows AS value " + statement,
            rows=batch.to_dict("records"),
            database_=neo4j_db  # Defaulting to 'neo4j'
        )
        print(result.summary.counters)
    
    print(f"{total} rows in {time.time() - start_s} s.")
    return total

## Indexes and Constraints
Indexes in Neo4j are only used to find the starting points for graph queries, e.g. quickly finding two nodes to connect. Constraints exist to avoid duplicates, we create them mostly on id's of Entity types.

We use some Types as markers with two underscores before and after to distinguish them from the actual entity types.

The default relationship type here is RELATED but we could also infer a real relationship-type from the description or the types of the start and end-nodes.

__Entity__
__Document__
__Chunk__
__Community__
__Covariate__


In [7]:
# create constraints, idempotent operation

statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint entity_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.name is unique;
create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


## Import Process
Importing the Documents
We're loading the parquet file for the documents and create nodes with their ids and add the title property. We don't need to store text_unit_ids as we can create the relationships and the text content is also contained in the chunks.


In [8]:
doc_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_documents.parquet", columns=["id", "title"]
)
doc_df.head(2)

Unnamed: 0,id,title
0,2f9e3a08cb643fbc248c8d28a6e427e7,graphrag_input.txt


In [9]:
# Import documents
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

{'_contains_updates': True, 'labels_added': 1, 'nodes_created': 1, 'properties_set': 2}
1 rows in 0.389178991317749 s.


1

## Loading Text Units
We load the text units, create a node per id and set the text and number of tokens. Then we connect them to the documents that we created before.


In [10]:
text_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_text_units.parquet",
    columns=["id", "text", "n_tokens", "document_ids"],
)
text_df.head(2)

Unnamed: 0,id,text,n_tokens,document_ids
0,e192067a9b0e0ac05d221f1f584b4cc3,יסה על מטאטא\n - משחק קווידיץ' הראשון\n - ...,300,[2f9e3a08cb643fbc248c8d28a6e427e7]
1,d9a43380e87731712c89805799a78bf8,�ורים ששואבים שמחה ומחשבות טובות)\n\n5. **אירו...,300,[2f9e3a08cb643fbc248c8d28a6e427e7]


In [11]:
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

{'_contains_updates': True, 'labels_added': 36, 'relationships_created': 36, 'nodes_created': 36, 'properties_set': 108}
36 rows in 0.539341926574707 s.


36

In [12]:
entities_df = pd.read_parquet(
                f"{GRAPHRAG_FOLDER}/create_final_entities.parquet"
        )
# entities_df["type"] = entities_df["type"].map(
#                 {
#                         "character"     : "דמויות",
#                         "magical_object": "חפצים קסומים",
#                         "place"         : "מקומות",
#                         "event"         : "אירועים",
#                         "institution"   : "מוסדות",
#                 }
#         )
entities_df.head(10)

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,הוגוורטס,מוסדות,Here is a comprehensive summary of the data:\n...,0,,"[4cfeb842bc272b77517936cef90592ba, 8d0d3650eb5...","[-0.023707514628767967, 0.11131846904754639, -..."
1,4119fd06010c494caa07f439b333f4c5,סוורוס סנייפ,דמויות,"Based on the provided data, here is a comprehe...",1,,"[d9a43380e87731712c89805799a78bf8, e192067a9b0...","[-0.020804958418011665, 0.08280223608016968, -..."
2,d3835bf3dda84ead99deadbeac5d0d7d,רון ויזלי,דמויות,"Based on the provided data, here is a comprehe...",2,,"[05b0de3772b1084ed1fad0352565299a, e192067a9b0...","[-0.03052452951669693, 0.069986492395401, -0.1..."
3,077d2820ae1845bcbb1803379a3d1eae,הרמיוני גריינג'ר,דמויות,"Based on the provided data, here is a comprehe...",3,,"[05b0de3772b1084ed1fad0352565299a, 8d0d3650eb5...","[-0.0023231941740959883, 0.059761013835668564,..."
4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,ניקולס פלאמל,דמויות,ניקולס פלאמל הוא אחד מבעלי ברית של הארי פוטר,4,,[e192067a9b0e0ac05d221f1f584b4cc3],"[-0.017206992954015732, 0.06735004484653473, -..."
5,19a7f254a5d64566ab5cc15472df02de,קווירל/וולדמורט,אנטי-דמויות,קווירל/וולדמורט הם אויביה של הארי פוטר)**Relat...,5,,[e192067a9b0e0ac05d221f1f584b4cc3],"[-0.0037687437143176794, 0.015865307301282883,..."
6,e7ffaee9d31d4d3c96e04f911d0a8f9e,הארי פוטר,,"Based on the provided data, here is a comprehe...",6,,"[4cfeb842bc272b77517936cef90592ba, a72cf9935f5...","[-0.04667172208428383, 0.0914478451013565, -0...."
7,f7e11b0e297a44a896dc67928368f600,אב,,,7,,[e192067a9b0e0ac05d221f1f584b4cc3],"[-0.02220052480697632, 0.0551026277244091, -0...."
8,1fd3fa8bb5a2408790042ab9573779ee,האגריד,,,8,,[e192067a9b0e0ac05d221f1f584b4cc3],"[0.0002496620872989297, 0.05938221886754036, -..."
9,27f9fbe6ad8c4a8b9acee0d3596ed57c,אירועים,EVENT,A collection of events that take place in the ...,9,,[d9a43380e87731712c89805799a78bf8],"[-0.05220653861761093, 0.06364908814430237, -0..."


In [13]:
print(entities_df['type'].unique())

['מוסדות' 'דמויות' 'אנטי-דמויות' '' 'EVENT' 'ארגונים' 'מקומות' 'אירועים'
 'אובייקטים' 'חפצים' 'אירוע']


In [14]:
entity_statement = """
        MERGE (e:__Entity__ {id: value.id})
        SET e.name = value.name,
            e.type = value.type,
            e.description = value.description
        """

batched_import(entity_statement, entities_df)

{'_contains_updates': True, 'labels_added': 59, 'nodes_created': 59, 'properties_set': 236}
59 rows in 1.9124531745910645 s.


59

## Import Relationships
For the relationships we find the source and target node by name, using the base __Entity__ type. After creating the RELATED relationships, we set the description as attribute.

In [15]:
rel_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_relationships.parquet",
    columns=[
        "source",
        "target",
        "id",
        "rank",
        "weight",
        "human_readable_id",
        "description",
        "text_unit_ids",
    ],
)
rel_df.head(2)

Unnamed: 0,source,target,id,rank,weight,human_readable_id,description,text_unit_ids
0,הוגוורטס,הארי פוטר,958beecdb5bb4060948415ffd75d2b03,18,18.0,0,"Based on the provided data, here is a comprehe...","[4cfeb842bc272b77517936cef90592ba, d9a43380e87..."
1,הוגוורטס,גילדורי לוקהרט,b999ed77e19e4f85b7f1ae79af5c002a,3,6.0,1,גילדורי לוקהרט הוא מורה חדש בהוגוורטס,[8d0d3650eb5633bc84790fc05b52b64e]


In [16]:
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

{'_contains_updates': True, 'relationships_created': 40, 'properties_set': 240}
40 rows in 0.4526710510253906 s.


40