In [1]:
import re

def parse_graph_line(line):
    pattern = r'<([^>]+)><([^>]+)><([^>]+)>\.'
    match = re.match(pattern, line.strip())
    if match:
        return match.groups()
    return None

test_line = '<Prince Hamlet><is><T:bold>.'
parsed = parse_graph_line(test_line)
print(f"Test: {test_line}")
print(f"Parsed: {parsed}")

Test: <Prince Hamlet><is><T:bold>.
Parsed: ('Prince Hamlet', 'is', 'T:bold')


In [2]:
from neo4j import GraphDatabase

# Before running this cell:
# Open neo4j desktop and create a database with username 'neo4j' and password 'password'

URI = "neo4j://127.0.0.1:7687"
AUTH = ("neo4j", "password")

driver = GraphDatabase.driver(URI, auth=AUTH)

try:
    driver.verify_connectivity()
    print("Connection to Neo4j successful!")
except Exception as e:
    print(f"Connection error: {e}")

Connection to Neo4j successful!


In [13]:
import time

def load_graph_to_neo4j(driver, graph_file_path, batch_size=10000):

    def process_batch(tx, batch):
        query = """
        UNWIND $batch AS row
        MERGE (s {name: row.s_name})
        ON CREATE SET s.type = row.s_type
        MERGE (o {name: row.o_name})
        ON CREATE SET o.type = row.o_type
        MERGE (s)-[r:RELATION {type: row.rel}]->(o)
        """
        tx.run(query, batch=batch)
    

    print("Clearing database...")
    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n")
    
    print(f"\nStarting import from '{graph_file_path}'...")
    print(f"Batch size: {batch_size}")
    
    start_time = time.time()
    current_batch = []
    processed_count = 0
    batch_count = 0
    
    with open(graph_file_path, 'r', encoding='utf-8') as fp:
        for line in fp:
            parsed = parse_graph_line(line)
            if parsed:
                subject, relation, obj = parsed
                
                s_type = 'Trait' if subject.startswith('T:') else 'Question' if subject.startswith('Q:') else 'Answer' if subject.startswith('A:') else 'Character'
                o_type = 'Trait' if obj.startswith('T:') else 'Question' if obj.startswith('Q:') else 'Answer' if obj.startswith('A:') else 'Character'
                
                current_batch.append({
                    's_name': subject.replace("Q:", "").replace("A:", "").replace("T:", ""),
                    's_type': s_type,
                    'o_name': obj.replace("Q:", "").replace("A:", "").replace("T:", ""),
                    'o_type': o_type,
                    'rel': relation.replace(' ', '_')
                })
                
                if len(current_batch) >= batch_size:
                    with driver.session() as session:
                        session.execute_write(process_batch, current_batch)
                    processed_count += len(current_batch)
                    batch_count += 1
                    elapsed = time.time() - start_time
                    rate = processed_count / elapsed if elapsed > 0 else 0
                    print(f"  Batch {batch_count} done - {processed_count:,} triplets ({rate:.0f}/sec)")
                    current_batch = []
    
    if current_batch:
        with driver.session() as session:
            session.execute_write(process_batch, current_batch)
        processed_count += len(current_batch)
        batch_count += 1
        print(f"  Batch {batch_count} done - {processed_count:,} triplets")
    
    elapsed = time.time() - start_time
    print(f"\n✓ Graph loaded in {elapsed:.1f}s ({processed_count/elapsed:.0f} triplets/sec)")
    
    print("\nFetching statistics...")
    with driver.session() as session:
        result = session.run("MATCH (n) RETURN count(n) as node_count")
        node_count = result.single()["node_count"]
        
        result = session.run("MATCH ()-[r]->() RETURN count(r) as rel_count")
        rel_count = result.single()["rel_count"]
        
        print(f"  Nodes: {node_count:,}")
        print(f"  Relationships: {rel_count:,}")

In [14]:
load_graph_to_neo4j(driver, 'graph.txt', batch_size=50000)

Clearing database...

Starting import from 'graph.txt'...
Batch size: 50000
  Batch 1 done - 50,000 triplets (3090/sec)
  Batch 2 done - 100,000 triplets (2009/sec)
  Batch 3 done - 150,000 triplets (1660/sec)
  Batch 4 done - 200,000 triplets (1446/sec)
  Batch 5 done - 250,000 triplets (1301/sec)
  Batch 6 done - 300,000 triplets (1194/sec)
  Batch 7 done - 350,000 triplets (1100/sec)
  Batch 8 done - 400,000 triplets (1023/sec)
  Batch 9 done - 450,000 triplets (951/sec)
  Batch 10 done - 500,000 triplets (893/sec)
  Batch 11 done - 550,000 triplets (840/sec)
  Batch 12 done - 600,000 triplets (796/sec)
  Batch 13 done - 643,044 triplets

✓ Graph loaded in 851.6s (755 triplets/sec)

Fetching statistics...
  Nodes: 3,050
  Relationships: 489,467


In [16]:
# Example queries:

# 1. Get all nodes by type
def get_nodes_by_type(driver, node_type, limit=10):
    with driver.session() as session:
        result = session.run("""
            MATCH (n {type: $node_type})
            RETURN n.name as name
            ORDER BY name
            LIMIT $limit
        """, node_type=node_type, limit=limit)
        
        return [record["name"] for record in result]

# 2. Find what a node is connected to
def get_node_relations(driver, node_name):
    with driver.session() as session:
        result = session.run("""
            MATCH (n {name: $name})-[r:RELATION]->(o)
            RETURN r.type as relation_type, o.name as target, o.type as target_type
            ORDER BY relation_type, target
        """, name=node_name)
        
        return [(record["relation_type"], record["target"], record["target_type"]) for record in result]

# 3. Find character traits with specific relation
def find_character_traits(driver, character_name, relation_type='is'):
    with driver.session() as session:
        result = session.run("""
            MATCH (c {name: $name, type: 'Character'})-[r:RELATION {type: $rel_type}]->(t {type: 'Trait'})
            RETURN t.name as trait
            ORDER BY trait
        """, name=character_name, rel_type=relation_type)
        
        return [record["trait"] for record in result]

# Example usage:
print("First 10 characters:")
characters = get_nodes_by_type(driver, 'Character', limit=10)
print(characters)

print("\nFirst 10 traits:")
traits = get_nodes_by_type(driver, 'Trait', limit=10)
print(traits)

if characters:
    print(f"\n{characters[0]} - all relations:")
    relations = get_node_relations(driver, characters[0])
    for rel_type, target, target_type in relations[:10]:
        print(f"  {rel_type} -> {target} ({target_type})")
    
    print(f"\n{characters[0]} - traits 'is':")
    char_traits = find_character_traits(driver, characters[0], 'is')
    for trait in char_traits[:10]:
        print(f"  {trait}")

First 10 characters:
["'Chief' Bromden", 'A-Train', 'A.J. Soprano', 'Aang', 'Aaron Burr', 'Aaron Hotchner', 'Aaron Samuels', 'Abbey Bartlet', 'Abbi Abrams', 'Abby Littman']

First 10 traits:
['absent', 'abstract', 'accepting', 'accommodating', 'active', 'adamant', 'aggressive', 'agitated', 'aloof', 'ambiguous']

'Chief' Bromden - all relations:
  definitely_is -> abstract (Trait)
  definitely_is -> agitated (Trait)
  definitely_is -> apprehensive (Trait)
  definitely_is -> biased (Trait)
  definitely_is -> careless (Trait)
  definitely_is -> complex (Trait)
  definitely_is -> confused (Trait)
  definitely_is -> contentious (Trait)
  definitely_is -> conventional (Trait)
  definitely_is -> daring (Trait)

'Chief' Bromden - traits 'is':
  abstract
  ambiguous
  antagonistic
  anxious
  arrogant
  awkward
  blunt
  boastful
  brash
  callous


In [None]:
# Close the connection when done
driver.close()