In [1]:
import os
import time

import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Connection settings are read from the environment so real credentials do not have to be
# stored in the notebook; the fallbacks reproduce the previous hard-coded local defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

# Shared driver used by every import cell below.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [3]:
def batched_import(statement, df, batch_size=1000, database="neo4j"):
    """
    Import a dataframe into Neo4j using a batched approach.

    Each batch is sent as the ``$rows`` parameter of one query built by prefixing
    ``statement`` with ``UNWIND $rows AS value``, so ``statement`` refers to the
    current row as ``value``. The module-level ``driver`` runs the queries, and the
    update counters of every batch are printed.

    Parameters:
        statement: Cypher fragment applied to each unwound row (bound to ``value``).
        df: dataframe to import; each batch is converted with ``to_dict('records')``.
        batch_size: maximum number of rows sent per query; must be positive.
        database: name of the database the queries run against.

    Returns:
        The total number of rows in ``df``.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, which would import nothing while
        # still printing and returning `total` as if every row had been sent.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total = len(df)
    query = "UNWIND $rows AS value " + statement
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
    started = time.perf_counter()
    for start in range(0, total, batch_size):
        # iloc slicing clamps at the end of the frame, so the last batch may be shorter.
        batch = df.iloc[start:start + batch_size]
        result = driver.execute_query(query,
                                      rows=batch.to_dict('records'),
                                      database_=database)
        print(result.summary.counters)
    print(f'{total} rows in {time.perf_counter() - started} s.')
    return total

In [4]:
# Uniqueness constraints for the node labels and the RELATED relationship used by the import.
# Every constraint needs its own name: with IF NOT EXISTS, a statement that reuses an existing
# constraint name is skipped without an error, so the constraint it describes is never created.
# NOTE(review): a database that already ran an earlier version of this cell may still hold
# `entity_id` on __Community__ / `entity_title` on __Entity__ only; drop and recreate if needed.
statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint community_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.name is unique;
create constraint covariate_title if not exists for (e:__Covariate__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for statement in statements:
    # Splitting on ";" leaves whitespace-only fragments; skip them.
    if statement.strip():
        print(statement)
        # Target the same database that batched_import writes to.
        driver.execute_query(statement, database_="neo4j")


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.name is unique

create constraint entity_title if not exists for (e:__Covariate__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


In [7]:
# Load the id and title columns of the documents table and preview the first rows.
documents_parquet = '/Users/shaansuri/Downloads/artifacts/create_final_documents.parquet'
document_columns = ["id", "title"]
doc_df = pd.read_parquet(documents_parquet, columns=document_columns)
print(doc_df.head(2))

                                 id     title
0  c305886e4aa2f6efcf64b57762777055  book.txt


In [8]:
# import documents
# One __Document__ node per row, keyed by id, with the row's title copied onto it.
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
"""

# Run the import: the chunk import matches existing __Document__ nodes to create its
# PART_OF relationships, so the documents have to be written first.
batched_import(statement, doc_df)

In [10]:
# Load the text-unit columns used by the chunk import and preview the first two rows.
text_units_parquet = '/Users/shaansuri/Downloads/artifacts/create_final_text_units.parquet'
text_unit_columns = ["id", "text", "n_tokens", "document_ids"]
text_df = pd.read_parquet(text_units_parquet, columns=text_unit_columns)
text_df.head(2)

Unnamed: 0,id,text,n_tokens,document_ids
0,680dd6d2a970a49082fa4f34bf63a34e,﻿The Project Gutenberg eBook of A Christmas Ca...,300,[c305886e4aa2f6efcf64b57762777055]
1,95f1f8f5bdbf0bee3a2c6f2f4a4907f6,THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...,300,[c305886e4aa2f6efcf64b57762777055]


In [11]:
# Import text chunks.
# For each row: MERGE a __Chunk__ node on id and copy text and n_tokens onto it, then, for every
# id in the row's document_ids, MATCH the existing __Document__ node and MERGE a PART_OF
# relationship from the chunk to it. Document ids with no matching node produce no relationship.
statement = """
MERGE (c:__Chunk__ {id:value.id})
SET c += value {.text, .n_tokens}
WITH c, value
UNWIND value.document_ids AS document
MATCH (d:__Document__ {id:document})
MERGE (c)-[:PART_OF]->(d)
"""

batched_import(statement, text_df)

{'_contains_updates': True, 'properties_set': 462}
231 rows in 0.9863979816436768 s.


231

In [13]:
# Load the entity columns used by the entity import and preview the first five rows.
entities_parquet = '/Users/shaansuri/Downloads/artifacts/create_final_entities.parquet'
entity_columns = [
    "name",
    "type",
    "description",
    "human_readable_id",
    "id",
    "description_embedding",
    "text_unit_ids",
]
entity_df = pd.read_parquet(entities_parquet, columns=entity_columns)
entity_df.head(5)

Unnamed: 0,name,type,description,human_readable_id,id,description_embedding,text_unit_ids
0,"""PROJECT GUTENBERG""","""ORGANIZATION""",Project Gutenberg is a pioneering organization...,0,b45241d70f0e43fca764df95b2b81f77,"[-0.020793898031115532, 0.02951139025390148, 0...","[01e84646075b255eab0a34d872336a89, 10bab8e9773..."
1,"""UNITED STATES""","""GEO""",The United States is prominently recognized fo...,1,4119fd06010c494caa07f439b333f4c5,"[-0.009704762138426304, 0.013335365802049637, ...","[01e84646075b255eab0a34d872336a89, 28f242c4515..."
2,"""CHARLES DICKENS""","""PERSON""",Charles Dickens is the renowned British noveli...,2,d3835bf3dda84ead99deadbeac5d0d7d,"[0.05020756274461746, 0.0023800835479050875, -...","[680dd6d2a970a49082fa4f34bf63a34e, 95f1f8f5bdb..."
3,"""ARTHUR RACKHAM""","""PERSON""","Arthur Rackham is renowned as an illustrator, ...",3,077d2820ae1845bcbb1803379a3d1eae,"[0.016978472471237183, 0.01494782418012619, -0...","[680dd6d2a970a49082fa4f34bf63a34e, 95f1f8f5bdb..."
4,"""A CHRISTMAS CAROL""","""EVENT""","""A Christmas Carol"" is a classic literary work...",4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,"[-0.011152847670018673, 0.01469416357576847, -...","[680dd6d2a970a49082fa4f34bf63a34e, 95f1f8f5bdb..."


In [None]:
# Import entities. The query calls apoc.* procedures/functions, so APOC must be available.
# For each row:
#   - MERGE an __Entity__ node on id; set human_readable_id, description, and name with the
#     double-quote characters removed.
#   - Store description_embedding through db.create.setNodeVectorProperty.
#   - Add a label derived from type (quotes removed, upper camel case); no label is added when
#     type is null or empty.
#   - For every id in text_unit_ids, MATCH the __Chunk__ node and MERGE (chunk)-[:HAS_ENTITY]->(entity).
entity_statement = """
MERGE (e:__Entity__ {id:value.id})
SET e += value {.human_readable_id, .description, name:replace(value.name,'"','')}
WITH e, value
CALL db.create.setNodeVectorProperty(e, "description_embedding", value.description_embedding)
CALL apoc.create.addLabels(e, case when coalesce(value.type,"") = "" then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND value.text_unit_ids AS text_unit
MATCH (c:__Chunk__ {id:text_unit})
MERGE (c)-[:HAS_ENTITY]->(e)
"""

batched_import(entity_statement, entity_df)

In [15]:
# Load the relationship columns used by the relationship import and preview the first five rows.
relationships_parquet = '/Users/shaansuri/Downloads/artifacts/create_final_relationships.parquet'
relationship_columns = [
    "source",
    "target",
    "id",
    "rank",
    "weight",
    "human_readable_id",
    "description",
    "text_unit_ids",
]
rel_df = pd.read_parquet(relationships_parquet, columns=relationship_columns)
rel_df.head(5)

Unnamed: 0,source,target,id,rank,weight,human_readable_id,description,text_unit_ids
0,"""PROJECT GUTENBERG""","""A CHRISTMAS CAROL""",b84d71ed9c3b45819eb3205fd28e13a0,20,1.0,0,"""Project Gutenberg is responsible for releasin...",[680dd6d2a970a49082fa4f34bf63a34e]
1,"""PROJECT GUTENBERG""","""SUZANNE SHELL""",b0b464bc92a541e48547fe9738378dab,15,1.0,1,"""Suzanne Shell produced the eBook version of '...",[680dd6d2a970a49082fa4f34bf63a34e]
2,"""PROJECT GUTENBERG""","""JANET BLENKINSHIP""",44c65dda6fb7472dae36f6eea720ab47,15,1.0,2,"""Janet Blenkinship produced the eBook version ...",[680dd6d2a970a49082fa4f34bf63a34e]
3,"""PROJECT GUTENBERG""","""UNITED STATES""",5d97ff82691c4482973d73d1860e4757,15,8.0,3,Project Gutenberg operates within the United S...,"[01e84646075b255eab0a34d872336a89, 28f242c4515..."
4,"""PROJECT GUTENBERG""","""GENERAL TERMS OF USE AND REDISTRIBUTING PROJE...",2567445079794d1e84f17abc48776002,14,1.0,4,"""Project Gutenberg establishes and enforces th...",[da3ca9f93aac15c67f6acf3cca2fc229]


In [16]:
# Import relationships between entities.
# Source and target are looked up by entity name with the double-quote characters removed, the
# same normalisation the entity import applies to name. Rows whose source or target entity is
# not found are skipped by the MATCH clauses.
rel_statement = """
    MATCH (source:__Entity__ {name:replace(value.source,'"','')})
    MATCH (target:__Entity__ {name:replace(value.target,'"','')})
    // merge on id so that re-running the import matches the existing relationship instead of creating a duplicate
    MERGE (source)-[rel:RELATED {id: value.id}]->(target)
    SET rel += value {.rank, .weight, .human_readable_id, .description, .text_unit_ids}
    RETURN count(*) as createdRels
"""

batched_import(rel_statement, rel_df)

{'_contains_updates': True, 'properties_set': 1710}
342 rows in 0.5565528869628906 s.


342

In [18]:
# Load the community columns used by the community import and preview the first rows.
communities_parquet = '/Users/shaansuri/Downloads/artifacts/create_final_communities.parquet'
community_columns = ["id", "level", "title", "text_unit_ids", "relationship_ids"]
community_df = pd.read_parquet(communities_parquet, columns=community_columns)
print(community_df.head(2))

  id  level        title                                      text_unit_ids  \
0  2      0  Community 2  [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0...   
1  4      0  Community 4  [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb...   

                                    relationship_ids  
0  [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b...  
1  [929f30875e1744b49e7b416eaf5a790c, 4920fda0318...  


In [19]:
# Import communities.
# For each row: MERGE a __Community__ node whose `community` property is the row's id and copy
# level and title onto it. The section that would link chunks via HAS_CHUNK is disabled by the
# /* ... */ comment inside the query. Then, for every id in relationship_ids, MATCH the RELATED
# relationship with that id and MERGE an IN_COMMUNITY relationship from both of its endpoint
# entities to the community.
statement = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURN count(distinct c) as createdCommunities
"""

batched_import(statement, community_df)

{'_contains_updates': True, 'properties_set': 94}
47 rows in 1.0728602409362793 s.


47

In [20]:
# Load the community-report columns used by the report import and preview the first two rows.
community_reports_parquet = '/Users/shaansuri/Downloads/artifacts/create_final_community_reports.parquet'
community_report_columns = [
    "id",
    "community",
    "level",
    "title",
    "summary",
    "findings",
    "rank",
    "rank_explanation",
    "full_content",
]
community_report_df = pd.read_parquet(community_reports_parquet, columns=community_report_columns)
community_report_df.head(2)

Unnamed: 0,id,community,level,title,summary,findings,rank,rank_explanation,full_content
0,e7822326-4da8-4954-afa9-be7f4f5791a5,42,2,Scrooge's Supernatural Encounters: Marley's Gh...,This report delves into the pivotal supernatur...,[{'explanation': 'Marley's Ghost plays a cruci...,8.0,The impact severity rating is high due to the ...,# Scrooge's Supernatural Encounters: Marley's ...
1,8a5afac1-99ef-4f01-a1b1-f044ce392ff9,43,2,The Ghost's Influence on Scrooge's Transformation,This report delves into the pivotal role of 'T...,"[{'explanation': 'The Ghost, identified at tim...",8.5,The impact severity rating is high due to the ...,# The Ghost's Influence on Scrooge's Transform...


In [21]:
# Import community reports.
# For each row: MERGE a __Community__ node on its `community` property and copy level, title,
# rank, rank_explanation, full_content and summary onto it. Then iterate over the positions of
# the row's findings list and, for each one, MERGE a Finding node attached to that community by
# HAS_FINDING, with the list position as the Finding's id, and copy the finding's entries onto
# the node as properties.
community_statement = """
MERGE (c:__Community__ {community:value.community})
SET c += value {.level, .title, .rank, .rank_explanation, .full_content, .summary}
WITH c, value
UNWIND range(0, size(value.findings)-1) AS finding_idx
WITH c, value, finding_idx, value.findings[finding_idx] as finding
MERGE (c)-[:HAS_FINDING]->(f:Finding {id:finding_idx})
SET f += finding
"""
batched_import(community_statement, community_report_df)

{'_contains_updates': True, 'properties_set': 682}
47 rows in 0.533484935760498 s.


47