In [2]:
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
Collecting spacy<3.8.0,>=3.7.2 (from en-core-web-sm==3.7.1)
  Using cached spacy-3.7.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (27 kB)
Using cached spacy-3.7.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (6.3 MB)
Installing collected packages: spacy, en-core-web-sm
  Attempting uninstall: spacy
    Found existing installation: spacy 3.7.0
    Uninstalling spacy-3.7.0:
      Successfully uninstalled spacy-3.7.0
Successfully installed en-core-web-sm-3.7.1 spacy-3.7.5
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
import json
from collections import Counter
from pathlib import Path

import spacy

In [4]:
# Load the English spaCy pipeline `en_core_web_sm`. The resulting module-level
# `nlp` object is what the sentence-splitting and entity-extraction functions
# below call.
nlp = spacy.load("en_core_web_sm")
# Alternative pipeline, left disabled:
# nlp = spacy.load("en_core_web_md")


In [5]:
# Sample excerpt for demonstration purposes. Replace it with a more detailed
# summary to get a richer graph.

summary = """
Holden Caulfield, a teenager in New York City, narrates the novel. Struggling with depression, he recounts his experiences in school, relationships, and his disdain for 'phonies.'
"""

# Parse the whole summary once. Note: the pipeline functions defined below call
# nlp() on their own input, so they do not read this module-level `doc`.
doc = nlp(summary)

In [12]:
# split document into sentences
def split_document_sent(text):
    """Split ``text`` into sentences using the module-level spaCy pipeline.

    Args:
        text: Raw text to segment.

    Returns:
        list[str]: Sentence strings in document order, each stripped of
        leading and trailing whitespace. Sentences that are empty after
        stripping are dropped, so blank or whitespace-only input yields ``[]``.
    """
    doc = nlp(text)
    # A span made only of whitespace strips down to "", which would otherwise
    # be handed downstream as an empty "sentence".
    stripped = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in stripped if sentence]

# define custom relationship extraction and text processing
def process_text(text, verbose=False):
    """Run the spaCy pipeline over ``text`` and hand back the parsed document.

    Args:
        text: Text to parse.
        verbose: When true, echo the parsed text and its
            ``(entity text, entity label)`` pairs to stdout.

    Returns:
        The object produced by calling ``nlp`` on ``text``.
    """
    parsed = nlp(text)
    if not verbose:
        return parsed

    print(f"Text: {parsed.text}")
    labelled = []
    for ent in parsed.ents:
        labelled.append((ent.text, ent.label_))
    print(f"Entities: {labelled}")
    # Relation-extraction logic can be added here
    return parsed

# Pipeline to run entity extraction
def extract_entities(text, verbose=False, output_path='processed_data.json'):
    """Extract named entities sentence by sentence and export them as JSON.

    The text is split with ``split_document_sent``; each sentence is parsed
    with ``process_text`` and its entities are collected as
    ``(entity text, entity label)`` pairs.

    Args:
        text: Document to analyse.
        verbose: Forwarded to ``process_text``.
        output_path: Destination of the JSON export. Defaults to
            ``'processed_data.json'``, the file name that used to be
            hard-coded, so existing calls behave as before.

    Returns:
        None. The per-sentence records are written to ``output_path`` as a
        list of ``{'text': ..., 'entities': [...]}`` objects, and the count of
        each entity label is printed.
    """
    processed_data = []
    entity_counts = Counter()

    for sent in split_document_sent(text):
        doc = process_text(sent, verbose)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        # Store processed data for each sentence
        processed_data.append({'text': doc.text, 'entities': entities})

        # Count by label (second element of each pair)
        entity_counts.update(label for _, label in entities)

    # Explicit encoding so the export does not depend on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f)

    # Display summary
    print(f"Entity counts: {entity_counts}")

# Run the pipeline on the summary text. With verbose=True each sentence and its
# entities are printed; the per-sentence records are also written to
# processed_data.json, which the LLM step below reads back.
verbose = True
extract_entities(summary, verbose)

Text: Holden Caulfield, a teenager in New York City, narrates the novel.
Entities: [('Holden Caulfield', 'PERSON'), ('New York City', 'GPE')]
Text: Struggling with depression, he recounts his experiences in school, relationships, and his disdain for 'phonies.'
Entities: []
Entity counts: Counter({'PERSON': 1, 'GPE': 1})


In [17]:
## Create node and relationships using LLMs

import json
from pathlib import Path
from langchain_ollama import ChatOllama

# Load the per-sentence entity records exported by the extraction step
json_path = Path("processed_data.json")
with open(json_path, "r", encoding="utf-8") as f:
    processed_data = json.load(f)

# Prepare nodes and relationships (filled once the LLM response is parsed)
nodes = []
relationships = []

# Formulate a prompt: the instructions first, then the JSON records.
# The instruction text ends with ".\n\n" so it no longer runs straight into the
# "JSON data:" header (previously the two were concatenated without a separator).
prompt = (
    "Extract entities and relationships from the following JSON data. For each entry in data['entities'], "
    "create a 'node' dictionary with fields 'id' (unique identifier), 'name' (entity text), and 'type' (entity label). "
    "For entities that have meaningful connections, define 'relationships' as dictionaries with 'source' (source node id), "
    "'target' (target node id), and 'relationship' (type of connection). Create max 30 nodes, format relationships in the format of capital letters and _ inbetween words and format the entire response in the JSON output containing only variables nodes and relationships without any text inbetween.\n\n"
    "JSON data:\n"
    f"{json.dumps(processed_data)}"
)



In [20]:
# Chat-model client used for the structuring request below. It targets the
# "llama3.1" model at http://host.docker.internal:11434 — presumably an Ollama
# server running on the Docker host; adjust base_url for your setup.
llm = ChatOllama(
    base_url="http://host.docker.internal:11434",
    model="llama3.1"
)



In [26]:
# Ask the model to turn the entity records into nodes and relationships.
response = llm.invoke(
    [
        {"role": "system", "content": "You are a helpful assistant that structures data into nodes and relationships."},
        {"role": "user", "content": prompt}
    ]
)

# The reply may be wrapped in Markdown code fences; strip them before parsing.
text = response.content.replace("```json", "").replace("```","")
try:
    output = json.loads(text)
except json.JSONDecodeError:
    # The model sometimes adds prose around the JSON. Fall back to the span
    # between the first "{" and the last "}"; if there is no such span, re-raise
    # the original decoding error unchanged.
    _json_start, _json_end = text.find("{"), text.rfind("}")
    if _json_start == -1 or _json_end <= _json_start:
        raise
    text = text[_json_start:_json_end + 1]
    output = json.loads(text)


In [28]:
# Inspect the structure parsed from the LLM reply before using it.
print(output)

{'nodes': [{'id': 'node_1', 'name': 'Holden Caulfield', 'type': 'PERSON'}, {'id': 'node_2', 'name': 'New York City', 'type': 'GPE'}], 'relationships': [{'source': 'node_1', 'target': 'node_2', 'relationship': 'LOCATION_OF_RESIDENCE'}, {'source': 'node_1', 'target': 'node_1', 'relationship': 'MENTAL_HEALTH_CONDITION'}]}


In [30]:
# Populate nodes and relationships lists from the parsed LLM output.
# Fresh lists are assigned (instead of extending the existing ones) so that
# re-running this cell does not append the same entries a second time.
nodes = list(output.get("nodes", []))
relationships = list(output.get("relationships", []))

# Generate Queries

In [33]:
def generate_cypher_queries(nodes, relationships):
    """Build Cypher statements that create the given nodes and relationships.

    The values come from LLM output, so they are not trusted: property values
    are emitted as escaped string literals, and labels / relationship types
    are reduced to plain identifiers. Well-formed input produces exactly the
    same statements as plain interpolation would.

    Args:
        nodes: Iterable of dicts with ``'id'``, ``'name'`` and ``'type'`` keys.
        relationships: Iterable of dicts with ``'source'``, ``'target'`` and
            ``'relationship'`` keys; source/target refer to node ``'id'`` values.

    Returns:
        list[str]: One ``CREATE`` statement per node, followed by one
        ``MATCH ... CREATE`` statement per relationship, in input order.

    Raises:
        KeyError: If a node or relationship dict lacks one of the keys above.
    """
    import re  # local import keeps this cell self-contained

    def _literal(value):
        """Render ``value`` as a single-quoted Cypher string literal."""
        # Backslash first, so the backslashes added for quotes are not doubled.
        escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    def _identifier(value):
        """Reduce ``value`` to a bare label / relationship-type name."""
        cleaned = re.sub(r"\W", "_", str(value), flags=re.ASCII)
        # An unquoted identifier cannot be empty or start with a digit.
        if not cleaned or cleaned[0].isdigit():
            cleaned = "_" + cleaned
        return cleaned

    queries = []

    # Create nodes
    for node in nodes:
        queries.append(
            f"CREATE (n:{_identifier(node['type'])} "
            f"{{id: {_literal(node['id'])}, name: {_literal(node['name'])}}})"
        )

    # Create relationships between already-created nodes, matched by id
    for rel in relationships:
        queries.append(
            f"MATCH (a {{id: {_literal(rel['source'])}}}), (b {{id: {_literal(rel['target'])}}}) "
            f"CREATE (a)-[:{_identifier(rel['relationship'])}]->(b)"
        )

    return queries

# Build the Cypher statements from the LLM-derived nodes and relationships,
# then print them for inspection before anything is sent to the database.
cypher_queries = generate_cypher_queries(nodes, relationships)
print(cypher_queries)

["CREATE (n:PERSON {id: 'node_1', name: 'Holden Caulfield'})", "CREATE (n:GPE {id: 'node_2', name: 'New York City'})", "MATCH (a {id: 'node_1'}), (b {id: 'node_2'}) CREATE (a)-[:LOCATION_OF_RESIDENCE]->(b)", "MATCH (a {id: 'node_1'}), (b {id: 'node_1'}) CREATE (a)-[:MENTAL_HEALTH_CONDITION]->(b)"]


# Execute Queries

In [38]:
from neo4j import GraphDatabase

# Initialize the Neo4j driver for Memgraph (modify the URI if necessary)
uri = "bolt://host.docker.internal:7687"
# Empty user/password are passed through as-is — presumably the target instance
# runs without authentication. If yours needs credentials, load them from the
# environment rather than hard-coding them here.
user = ""
password = ""
# NOTE(review): nothing in the visible cells closes this driver; call
# driver.close() once the notebook is done with the database.
driver = GraphDatabase.driver(uri, auth=(user, password))

# Function to execute Cypher queries in Memgraph
def execute_cypher_queries(queries, clear_existing=True):
    """Run each Cypher statement through the module-level ``driver``.

    Args:
        queries: Iterable of Cypher query strings, executed in order within a
            single session.
        clear_existing: When true (the default, matching the earlier
            behaviour) every node and relationship in the database is deleted
            before ``queries`` run. Pass ``False`` to add to the existing graph.

    Returns:
        None. One line per query is printed: either a confirmation or the
        error it raised. A failing query does not stop the remaining ones; an
        error from the initial wipe is not caught and propagates.
    """
    with driver.session() as session:
        if clear_existing:
            session.run("MATCH (n) DETACH DELETE n;").consume()
        for query in queries:
            try:
                # consume() drains the result here, so an error raised while
                # the statement executes is reported for this query rather
                # than surfacing on a later run() call or at session close.
                session.run(query).consume()
                print(f"Executed query: {query}")
            except Exception as e:
                # Deliberately best-effort: report the failure and carry on.
                print(f"Error executing query: {query}. Error: {e}")

# Execute the generated Cypher queries.
# Note: execute_cypher_queries first deletes everything already stored in the
# target database, then runs the statements one by one.
execute_cypher_queries(cypher_queries)

Executed query: CREATE (n:PERSON {id: 'node_1', name: 'Holden Caulfield'})
Executed query: CREATE (n:GPE {id: 'node_2', name: 'New York City'})
Executed query: MATCH (a {id: 'node_1'}), (b {id: 'node_2'}) CREATE (a)-[:LOCATION_OF_RESIDENCE]->(b)
Executed query: MATCH (a {id: 'node_1'}), (b {id: 'node_1'}) CREATE (a)-[:MENTAL_HEALTH_CONDITION]->(b)
