# Neo4j Import of GraphRAG Result Parquet files

In [2]:
import os
from dotenv import load_dotenv
import time
import pandas as pd
from neo4j import GraphDatabase

In [10]:
# Get the directory of the current script
current_dir = '/Users/royzalta/Documents/github-local/harry-potter-graphrag-hebrew/notebooks'

# Get the root directory (one level up from src)
root_dir = os.path.dirname(current_dir)

# Load environment variables from the root directory
load_dotenv(os.path.join(root_dir, ".env"))

# Neo4j connection details
GRAPHRAG_FOLDER = os.path.join(
    root_dir, "rag", "output", "20240907-102753", "artifacts"
)


In [11]:
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")


In [12]:
print(f"NEO4J_URI: {NEO4J_URI}")
print(f"NEO4J_USER: {NEO4J_USERNAME}")
print(f"NEO4J_PASSWORD: {'*' * len(NEO4J_PASSWORD) if NEO4J_PASSWORD else 'Not set'}")
print(f"NEO4J_DATABASE: {NEO4J_DATABASE}")

NEO4J_URI: neo4j+s://0492ae55.databases.neo4j.io
NEO4J_USER: neo4j
NEO4J_PASSWORD: *******************************************
NEO4J_DATABASE: neo4j


In [13]:
# Create a Neo4j driver
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

## Batched Import
The batched import function takes a Cypher insert statement (needs to use the variable value for the row) and a dataframe to import. It will send by default 1k rows at a time as query parameter to the database to be inserted.


In [14]:
def batched_import(statement, df, batch_size=1000):
    """
    Import a dataframe into Neo4j using a batched approach.

    Parameters: statement is the Cypher query to execute, df is the dataframe to import, and batch_size is the number of rows to import in each batch.
    """
    total = len(df)
    start_s = time.time()
    for start in range(0, total, batch_size):
        batch = df.iloc[start : min(start + batch_size, total)]
        result = driver.execute_query(
            "UNWIND $rows AS value " + statement,
            rows=batch.to_dict("records"),
            database_=NEO4J_DATABASE,
        )
        print(result.summary.counters)
    print(f"{total} rows in {time.time() - start_s} s.")
    return total

## Indexes and Constraints
Indexes in Neo4j are only used to find the starting points for graph queries, e.g. quickly finding two nodes to connect. Constraints exist to avoid duplicates, we create them mostly on id's of Entity types.

We use some Types as markers with two underscores before and after to distinguish them from the actual entity types.

The default relationship type here is RELATED but we could also infer a real relationship-type from the description or the types of the start and end-nodes.

__Entity__
__Document__
__Chunk__
__Community__
__Covariate__


In [15]:
# create constraints, idempotent operation

statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint entity_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.name is unique;
create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for statement in statements:
    if len((statement or "").strip()) > 0:
        print(statement)
        driver.execute_query(statement)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


## Import Process
Importing the Documents
We're loading the parquet file for the documents and create nodes with their ids and add the title property. We don't need to store text_unit_ids as we can create the relationships and the text content is also contained in the chunks.


In [16]:
doc_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_documents.parquet", columns=["id", "title"]
)
doc_df.head(2)

Unnamed: 0,id,title
0,a71d007ffb7c77590c9be35798d94230,harry_potter1.txt
1,fb610134bc58d43aac4a214277a720a9,harry_potter3.txt


In [17]:
# Import documents
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

batched_import(statement, doc_df)

{'_contains_updates': True, 'labels_added': 7, 'nodes_created': 7, 'properties_set': 14}
7 rows in 0.3757362365722656 s.


7

## Loading Text Units
We load the text units, create a node per id and set the text and number of tokens. Then we connect them to the documents that we created before.


In [18]:
text_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_text_units.parquet",
    columns=["id", "text", "n_tokens", "document_ids"],
)
text_df.head(2)

Unnamed: 0,id,text,n_tokens,document_ids
0,25358b9dc4926d6904d2943822d3a487,"לי, שהמתינה במטבח, צעקה עליהם. אחרי שנזפה בהם,...",1000,[250965faa3d5db03a88af69fdf5126b8]
1,73237cdcc9c7bf169508ba76f5844025,אשו שלקווירל. וולדמורט/קווירל מנסה לקחת את האב...,1000,[a71d007ffb7c77590c9be35798d94230]


In [19]:
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

{'_contains_updates': True, 'labels_added': 76, 'relationships_created': 76, 'nodes_created': 76, 'properties_set': 228}
76 rows in 0.8387048244476318 s.


76

In [25]:
entities_df = pd.read_parquet(
                f"{GRAPHRAG_FOLDER}/create_final_entities.parquet"
        )
# entities_df["type"] = entities_df["type"].map(
#                 {
#                         "character"     : "דמויות",
#                         "magical_object": "חפצים קסומים",
#                         "place"         : "מקומות",
#                         "event"         : "אירועים",
#                         "institution"   : "מוסדות",
#                 }
#         )
entities_df.head(10)

Unnamed: 0,id,name,type,description,human_readable_id,graph_embedding,text_unit_ids,description_embedding
0,b45241d70f0e43fca764df95b2b81f77,לי,דמויות,לי היא דמות שממתינה במטבח וצורכת על בני משפחתה...,0,,[25358b9dc4926d6904d2943822d3a487],"[0.03342330828309059, 0.009608256630599499, -0..."
1,4119fd06010c494caa07f439b333f4c5,הארי,דמויות,"הארי הוא דמות מרכזית בסיפור, קוסם צעיר שמתנדב ...",1,,"[1f4dc57e2d638f5f8451e83bcd37398f, 25358b9dc49...","[0.019130228087306023, 0.02678035944700241, -0..."
2,d3835bf3dda84ead99deadbeac5d0d7d,ארתור וויזלי,דמויות,"ארתור וויזלי הוא אב משפחת וויזלי, המוכר כאביו ...",2,,"[1f4dc57e2d638f5f8451e83bcd37398f, 25358b9dc49...","[0.054881103336811066, -0.0048440974205732346,..."
3,077d2820ae1845bcbb1803379a3d1eae,מולי וויזלי,דמויות,Molly Weasley is the mother of Ron Weasley and...,3,,"[1f4dc57e2d638f5f8451e83bcd37398f, 25358b9dc49...","[0.04892512038350105, -0.0020493371412158012, ..."
4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,ג'יני וויזלי,דמויות,"ג'יני היא בת משפחת וויזלי, לומדת בהוגוורטס ומא...",4,,[25358b9dc4926d6904d2943822d3a487],"[0.005863657221198082, -0.01551152765750885, -..."
5,19a7f254a5d64566ab5cc15472df02de,הוגוורטס,מוסדות,הוגוורטס הוא בית הספר לקוסמים שבו לומדים הארי ...,5,,"[216f6dc26f594711a928a689e1eba6e7, 25358b9dc49...","[-0.0016549237770959735, -0.033621206879615784..."
6,e7ffaee9d31d4d3c96e04f911d0a8f9e,המכונית המעופפת,חפצים קסומים,המכונית המעופפת היא כלי רכב קסום שבו משתמשים ה...,6,,[25358b9dc4926d6904d2943822d3a487],"[-0.022205879911780357, -0.01790991984307766, ..."
7,f7e11b0e297a44a896dc67928368f600,הערבה המפליקה,מקומות,הערבה המפליקה היא עץ תוקפני שבו נתקלים הארי ור...,7,,[25358b9dc4926d6904d2943822d3a487],"[-0.01209294144064188, 0.003610714105889201, -..."
8,1fd3fa8bb5a2408790042ab9573779ee,צרחן,אירועים,צרחן הוא עונש שמקבל רון מאמו בעקבות התעלולים ש...,8,,[25358b9dc4926d6904d2943822d3a487],"[0.010652403347194195, 0.007777633145451546, 0..."
9,27f9fbe6ad8c4a8b9acee0d3596ed57c,רון,,רון ויזלי הוא חברו הטוב של הארי פוטר. במהלך אש...,9,,"[1f4dc57e2d638f5f8451e83bcd37398f, 216f6dc26f5...","[-0.005318960640579462, 0.02122366987168789, -..."


In [21]:
print(entities_df['type'].unique())

[nan]


In [22]:
entity_statement = """
        MERGE (e:__Entity__ {id: value.id})
        SET e.name = value.name,
            e.type = value.type,
            e.description = value.description
        """

batched_import(entity_statement, entities_df)

{'_contains_updates': True, 'labels_added': 54, 'nodes_created': 54, 'properties_set': 216}
54 rows in 1.0383820533752441 s.


54

## Import Relationships
For the relationships we find the source and target node by name, using the base __Entity__ type. After creating the RELATED relationships, we set the description as attribute.

In [23]:
rel_df = pd.read_parquet(
    f"{GRAPHRAG_FOLDER}/create_final_relationships.parquet",
    columns=[
        "source",
        "target",
        "id",
        "rank",
        "weight",
        "human_readable_id",
        "description",
        "text_unit_ids",
    ],
)
rel_df.head(2)

Unnamed: 0,source,target,id,rank,weight,human_readable_id,description,text_unit_ids
0,לי,הארי,2670deebfa3f4d69bb82c28ab250a209,13,8.0,0,לי מכינה ארוחת בוקר להארי ולבניה,[25358b9dc4926d6904d2943822d3a487]
1,הארי,ג'יני וויזלי,404309e89a5241d6bff42c05a45df206,14,9.0,1,ג'יני מאוהבת בהארי ומבלה איתו,[25358b9dc4926d6904d2943822d3a487]


In [24]:
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // not necessary to merge on id as there is only one relationship per pair
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

{'_contains_updates': True, 'relationships_created': 56, 'properties_set': 336}
56 rows in 0.4678981304168701 s.


56