# Setup

In [67]:
from neo4j import GraphDatabase, basic_auth
from neo4j.auth_management import AuthManagers
import os

# Replace with your Neo4j Aura credentials
uri = os.environ.get('NEO4J_AURA_URI')
pw = os.environ.get('NEO4J_AURA_PW')

In [68]:
from openai import OpenAI
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# Helper Graph Class

In [103]:
from neo4j import GraphDatabase

class Neo4jGraph:
    def __init__(self, uri, pw):
        """
        Initialize the Neo4jGraph with a connection to the Neo4j Aura database.
        """
        auth = ("neo4j", pw)
        self.driver = GraphDatabase.driver(uri, auth=AuthManagers.static(auth))

    def close(self):
        """
        Close the connection to the database.
        """
        self.driver.close()

    def generate_embedding(self, text):
        """
        Generate an embedding for the given text using OpenAI API.

        :param text: The text to generate the embedding for.
        :return: A list of floats representing the embedding.
        """
        return client.embeddings.create(
            input=text,
            model="text-embedding-3-small"
        ).data[0].embedding

    def insert_triplet(self, triplet):
        """
        Insert a triplet (head, relation, tail) with properties and embeddings.
        Ensure that all nodes have the "Entity" label. If nodes already exist (based on 'name'), they will not be recreated.

        :param triplet: A dictionary containing:
                        - 'head': name of the head node
                        - 'type': type of the relationship
                        - 'tail': name of the tail node
                        - 'head_label' (optional): label for the head node
                        - 'tail_label' (optional): label for the tail node
                        - 'head_properties' (optional): properties for the head node
                        - 'tail_properties' (optional): properties for the tail node
                        - 'relation_properties' (optional): properties for the relationship
        """
        head_name = triplet['head']
        tail_name = triplet['tail']
        relation_type = triplet['type']

        # Ensure that the Entity label is added to all nodes
        head_label = 'Entity' + (':' + triplet.get('head_label', '') if triplet.get('head_label') else '')
        tail_label = 'Entity' + (':' + triplet.get('tail_label', '') if triplet.get('tail_label') else '')

        head_properties = triplet.get('head_properties', {})
        tail_properties = triplet.get('tail_properties', {})
        relation_properties = triplet.get('relation_properties', {})
        relation_properties['type'] = relation_type  # Store the relation type as a property

        # Ensure 'name' is included in properties
        head_properties['name'] = head_name
        tail_properties['name'] = tail_name

        # Generate embeddings for head and tail
        head_embedding = self.generate_embedding(head_name)
        tail_embedding = self.generate_embedding(tail_name)

        # Add the embedding as a property
        head_properties['embedding'] = head_embedding
        tail_properties['embedding'] = tail_embedding

        query = """
        MERGE (h:{head_label} {{name: $head_name}})
        SET h += $head_properties
        MERGE (t:{tail_label} {{name: $tail_name}})
        SET t += $tail_properties
        MERGE (h)-[r:RELATION]->(t)
        SET r += $relation_properties
        RETURN h, r, t
        """.format(head_label=head_label, tail_label=tail_label)

        with self.driver.session() as session:
            session.run(
                query,
                head_name=head_name,
                tail_name=tail_name,
                head_properties=head_properties,
                tail_properties=tail_properties,
                relation_properties=relation_properties
            )
        print(f"Inserted triplet with embeddings for {head_name} and {tail_name}, both labeled as 'Entity'.")


    def delete_entity(self, name):
        query = """
        MATCH (n {name: $name})
        DETACH DELETE n
        """
        with self.driver.session() as session:
            session.run(query, name=name)


    def delete_relation(self, head_name, tail_name, relation_type):
        """
        Delete a relationship between two nodes based on 'name' and relationship type.

        :param head_name: The 'name' property of the head/start node
        :param tail_name: The 'name' property of the tail/end node
        :param relation_type: The 'type' property of the relationship to delete
        """
        query = """
        MATCH (h {{name: $head_name}})-[r:RELATION {{type: $relation_type}}]->(t {{name: $tail_name}})
        DELETE r
        """

        with self.driver.session() as session:
            session.run(
                query,
                head_name=head_name,
                tail_name=tail_name,
                relation_type=relation_type
            )

    def get_entity_with_relations(self, name, related_names=None, relation_types=None, verbose=False):
        """
        Get an entity and its relationships, optionally filtering by related node names and relationship types.

        :param name: The 'name' property of the node to retrieve
        :param related_names: A list of 'name' properties of related nodes to filter by
        :param relation_types: A list of relationship 'type' properties to filter by
        :return: A list of dictionaries representing the relationships
        """
        # Start building the query
        query = """
        MATCH (n {name: $name})-[r]->(related)
        """
        # Add conditions based on optional parameters
        conditions = []
        if related_names:
            conditions.append("related.name IN $related_names")
        if relation_types:
            conditions.append("r.type IN $relation_types")
        if conditions:
            query += "WHERE " + " AND ".join(conditions) + "\n"
        query += "RETURN n, r, related"

        query += """\n
        UNION
        MATCH (n)-[r]->(related {name: $name})
        """
        # Add conditions based on optional parameters
        conditions = []
        if related_names:
            conditions.append("related.name IN $related_names")
        if relation_types:
            conditions.append("r.type IN $relation_types")
        if conditions:
            query += "WHERE " + " AND ".join(conditions) + "\n"
        query += "RETURN n, r, related"

        # Prepare parameters
        parameters = {'name': name}
        if related_names:
            parameters['related_names'] = related_names
        if relation_types:
            parameters['relation_types'] = relation_types

        if verbose:
            print(query)

        # Execute the query
        with self.driver.session() as session:
            result = session.run(query, **parameters)
            records = list(result)

            if not records:
                raise ValueError(f"Entity '{name}' not found or no relationships matching the criteria.")

            return [{
                'head': record['n']['name'],
                'relation': record['r']['type'],
                'tail': record['related']['name']
            } for record in records]

    def list_all_nodes(self):
        """
        List all nodes in the database.

        :return: A list of node dictionaries.
        """
        query = """
        MATCH (n)
        WITH n, apoc.map.removeKey(properties(n), 'embedding') AS props
        RETURN props
        """
        with self.driver.session() as session:
            result = session.run(query)
            nodes = [record["props"] for record in result]
            return nodes

    def list_all_relationships(self):
        """
        List all relationships in the database.

        :return: A list of relationship dictionaries.
        """
        query = "MATCH ()-[r]->() RETURN r"
        with self.driver.session() as session:
            result = session.run(query)
            relationships = [record["r"] for record in result]
            return relationships

    def wipe_db(self):
        """
        Wipe the entire database.
        """
        query = "MATCH (n) DETACH DELETE n"
        with self.driver.session() as session:
            session.run(query)

    def create_vector_index(self, label, property_name, dimensions):
        """
        Create a vector index for nodes with a specific label and property.

        :param label: The label of the nodes to create the index for.
        :param property_name: The property of the nodes containing the vector data.
        :param dimensions: The number of dimensions of the vector.
        """
        query = f"""
        CREATE VECTOR INDEX vector_index_{property_name} IF NOT EXISTS
        FOR (n:{label})
        ON (n.{property_name})
        OPTIONS {{
            indexConfig: {{
                `vector.dimensions`: {dimensions},
                `vector.similarity_function`: 'cosine'
            }}
        }}
        """
        with self.driver.session() as session:
            session.run(query)
        print(f"Vector index created for label '{label}' on property '{property_name}' with {dimensions} dimensions.")

    def search_by_embedding(self, text, top_k=5):
        """
        Search for nodes with embeddings closest to the given embedding, using cosine similarity.

        :param label: The label of the nodes to search for.
        :param property_name: The property where the embedding is stored.
        :param embedding: The vector embedding to search with.
        :param top_k: Number of nearest nodes to return.
        :return: A list of nodes with their similarity scores.
        """
        label = 'Entity'
        property_name = 'embedding'
        embedding = self.generate_embedding(text)
        query = f"""
        CALL db.index.vector.queryNodes('vector_index_{property_name}', $top_k, $embedding)
        YIELD node, score
        RETURN node, score
        ORDER BY score DESC
        LIMIT $top_k
        """
        with self.driver.session() as session:
            result = session.run(query, embedding=embedding, top_k=top_k)
            nodes = [{'node': record['node'], 'score': record['score']} for record in result]
            return nodes

# Getting Started with our Graph

In [104]:
graph = Neo4jGraph(uri, pw)

In [105]:
graph.wipe_db()

In [106]:
graph.list_all_nodes()

[]

In [107]:
graph.create_vector_index('Entity', 'embedding', 1536)

Vector index created for label 'Entity' on property 'embedding' with 1536 dimensions.


In [108]:
# Insert a triplet with properties, ensuring nodes are not duplicated
triplet = {
    'head': 'Napoleon Bonaparte',
    'head_label': 'Person',
    'head_properties': {
    },
    'type': 'participant in',
    'relation_properties': {
        'since': 1799
    },
    'tail': 'French Revolution',
    'tail_label': 'Event',
    'tail_properties': {
        'start_year': 1789,
        'end_year': 1799
    }
}

graph.insert_triplet(triplet)

Inserted triplet with embeddings for Napoleon Bonaparte and French Revolution, both labeled as 'Entity'.


In [109]:
graph.list_all_nodes()

[{'end_year': 1799, 'name': 'French Revolution', 'start_year': 1789},
 {'name': 'Napoleon Bonaparte'}]

In [110]:
# Get all relationships of an entity
try:
    relations = graph.get_entity_with_relations('Napoleon Bonaparte')
    for rel in relations:
        print(rel)
except ValueError as e:
    print(e)

{'head': 'Napoleon Bonaparte', 'relation': 'participant in', 'tail': 'French Revolution'}


In [111]:
# Search for nodes similar to the generated embedding
similar_nodes = graph.search_by_embedding(text='Napoleon', top_k=5)

for node in similar_nodes:
    print(f"Node: {node['node']['name']}, Similarity: {node['score']}")


Node: Napoleon Bonaparte, Similarity: 0.934220552444458
Node: French Revolution, Similarity: 0.7053217887878418


In [112]:
# Get relationships filtered by related node names
try:
    relations = graph.get_entity_with_relations(
        'Napoleon Bonaparte', related_names=['French Revolution']
    )
    for rel in relations:
        print(rel)
except ValueError as e:
    print(e)


{'head': 'Napoleon Bonaparte', 'relation': 'participant in', 'tail': 'French Revolution'}


In [113]:
# Expanded triplet with properties
triplet = {
    'head': 'Napoleon Bonaparte',
    'head_label': 'Person',
    'head_properties': {
        'occupation': 'Emperor of the French',
        'birth_date': '15 August 1769',
        'death_date': '5 May 1821'
    },
    'type': 'exiled from',
    'relation_properties': {
        'year': 1815,
        'reason': 'After defeat at Waterloo'
    },
    'tail': 'France',
    'tail_label': 'Country',
    'tail_properties': {
        'continent': 'Europe',
        'capital': 'Paris'
    }
}

# Insert the triplet into the graph
graph.insert_triplet(triplet)

Inserted triplet with embeddings for Napoleon Bonaparte and France, both labeled as 'Entity'.


In [114]:
# Get all relationships of an entity
try:
    relations = graph.get_entity_with_relations('Napoleon Bonaparte')
    for rel in relations:
        print(rel)
except ValueError as e:
    print(e)

{'head': 'Napoleon Bonaparte', 'relation': 'participant in', 'tail': 'French Revolution'}
{'head': 'Napoleon Bonaparte', 'relation': 'exiled from', 'tail': 'France'}


In [115]:
# Search for nodes similar to the generated embedding
similar_nodes = graph.search_by_embedding(text='Napoleon', top_k=5)

for node in similar_nodes:
    print(f"Node: {node['node']['name']}, Similarity: {node['score']}")


Node: Napoleon Bonaparte, Similarity: 0.9342186450958252
Node: French Revolution, Similarity: 0.7053217887878418
Node: France, Similarity: 0.6384046077728271


In [116]:
# Get relationships filtered by related node names
try:
    relations = graph.get_entity_with_relations(
        'Napoleon Bonaparte',
        related_names=['French Revolution']
    )
    for rel in relations:
        print(rel)
except ValueError as e:
    print(e)


{'head': 'Napoleon Bonaparte', 'relation': 'participant in', 'tail': 'French Revolution'}


In [117]:
edge_types = [edge.get('type') for edge in graph.list_all_relationships()]

In [118]:
print(f'Edge types: {edge_types}')

Edge types: ['participant in', 'exiled from']


# Cross-encoders to filter for best relationships

In [119]:
from sentence_transformers import CrossEncoder
import torch.nn.functional as F

# Initialize the model with softmax activation
model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-2-v2',
    max_length=512,
    default_activation_function=lambda x: F.softmax(x, dim=0)
)

In [120]:
def get_top_edges(query, names, threshold=0.9):
    scores = model.predict([(query, node) for node in names])

    top_names = sorted([dict(name=names[i], score=scores[i]) for i in range(len(names))], key=lambda x: x['score'], reverse=True)
    return [name for name in top_names if name['score'] > threshold]

print(get_top_edges("Napoleon's exile", edge_types))

[{'name': 'exiled from', 'score': 0.9580355}]


In [121]:
def get_top_relations(query, edge_types, threshold=0.7, verbose=False):
    top_nodes = [(n['node']['name'], n['score']) for n in graph.search_by_embedding(text=query, top_k=5) if n['score'] > threshold]
    top_edges = [_['name'] for _ in get_top_edges(query, edge_types, threshold)]

    if verbose:
        print(f'Top nodes: {top_nodes}')
        print(f'Top edges: {top_edges}')

    relations = []
    for node_name, score in top_nodes:
        try:
            relations += graph.get_entity_with_relations(node_name, relation_types=top_edges)
        except Exception as e:
            pass

    return [f'{relation["head"]} {relation["relation"]} {relation["tail"]}' for relation in relations]

get_top_relations("Napoleon's exile", edge_types, verbose=True)

Top nodes: [('Napoleon Bonaparte', 0.8197953701019287)]
Top edges: ['exiled from']


['Napoleon Bonaparte exiled from France']

In [122]:
graph.get_entity_with_relations(
        'Napoleon Bonaparte',
        relation_types=['exiled from']
    )

[{'head': 'Napoleon Bonaparte', 'relation': 'exiled from', 'tail': 'France'}]

# Graph RAG

In [123]:
Q_A_PROMPT = '''
You answer questions given information from a knowledge graph.
If the query cannot be answered by the knowledge graph, say "I don't know."

Query: {query}

Knowledge Relationships:
{kb_rels}

Response:'''

def GraphRAG(query, graph, threshold=0.7, verbose=False):
    edge_types = [edge.get('type') for edge in graph.list_all_relationships()]
    top_edges = [_['name'] for _ in get_top_edges(query, edge_types, threshold)]

    top_relations = get_top_relations(query, edge_types, threshold, verbose=verbose)

    prompt = Q_A_PROMPT.format(kb_rels="\n".join(top_relations), query=query)

    if verbose:
        print(f'Prompt: {prompt}')

    return client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
        temperature=0
    ).choices[0].message.content

In [124]:
print(GraphRAG("When was Napoleon born?", graph, verbose=True)) # I don't know.
print('---')
print(GraphRAG("Where was Napoleon exlied from?", graph, verbose=True))  # Napoleon Bonaparte was exiled from France.

Top nodes: [('Napoleon Bonaparte', 0.8150084018707275)]
Top edges: []
Prompt: 
You answer questions given information from a knowledge graph.
If the query cannot be answered by the knowledge graph, say "I don't know."

Query: When was Napoleon born?

Knowledge Relationships:
Napoleon Bonaparte participant in French Revolution
Napoleon Bonaparte exiled from France

Response:
I don't know.
---
Top nodes: [('Napoleon Bonaparte', 0.7915337085723877)]
Top edges: []
Prompt: 
You answer questions given information from a knowledge graph.
If the query cannot be answered by the knowledge graph, say "I don't know."

Query: Where was Napoleon exlied from?

Knowledge Relationships:
Napoleon Bonaparte participant in French Revolution
Napoleon Bonaparte exiled from France

Response:
Napoleon Bonaparte was exiled from France.


# Using Langchain to extract entities

In [125]:
!python3 -m pip install --upgrade --quiet langchain-core langchain-openai langchain_experimental langchain-community json-repair

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [126]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
import networkx as nx
from langchain_core.documents import Document
from langchain.chat_models import ChatOpenAI

os.environ["OPENAI_API_KEY"] = os.environ.get('OPENAI_API_KEY')
llm = ChatOpenAI(model_name='gpt-4o')

In [127]:
text = '''Napoleon Bonaparte[b] (born Napoleone di Buonaparte;[1][c] 15 August 1769 – 5 May 1821), later known by his regnal name Napoleon I, was a French military officer and statesman who rose to prominence during the French Revolution and led a series of successful campaigns across Europe during the French Revolutionary and Napoleonic Wars from 1796 to 1815. He was the leader of the French Republic as First Consul from 1799 to 1804, then of the French Empire as Emperor of the French from 1804 to 1814, and briefly again in 1815.

Born on the island of Corsica to a family of Italian origin, Napoleon moved to mainland France in 1779 and was commissioned as an officer in the French Royal Army in 1785. He supported the French Revolution in 1789, and promoted its cause in Corsica. He rose rapidly through the ranks after winning the siege of Toulon in 1793 and defeating royalist insurgents in Paris on 13 Vendémiaire in 1795. In 1796, Napoleon commanded a military campaign against the Austrians and their Italian allies in the War of the First Coalition, scoring decisive victories and becoming a national hero. He led an invasion of Egypt and Syria in 1798 which served as a springboard to political power. In November 1799, Napoleon engineered the Coup of 18 Brumaire against the Directory, and became First Consul of the Republic. He won the Battle of Marengo in 1800, which secured France's victory in the War of the Second Coalition, and in 1803 sold the territory of Louisiana to the United States. In December 1804, Napoleon crowned himself Emperor of the French, further expanding his power.'''

In [128]:
documents = [Document(page_content=text)]

In [129]:
documents

[Document(metadata={}, page_content="Napoleon Bonaparte[b] (born Napoleone di Buonaparte;[1][c] 15 August 1769 – 5 May 1821), later known by his regnal name Napoleon I, was a French military officer and statesman who rose to prominence during the French Revolution and led a series of successful campaigns across Europe during the French Revolutionary and Napoleonic Wars from 1796 to 1815. He was the leader of the French Republic as First Consul from 1799 to 1804, then of the French Empire as Emperor of the French from 1804 to 1814, and briefly again in 1815.\n\nBorn on the island of Corsica to a family of Italian origin, Napoleon moved to mainland France in 1779 and was commissioned as an officer in the French Royal Army in 1785. He supported the French Revolution in 1789, and promoted its cause in Corsica. He rose rapidly through the ranks after winning the siege of Toulon in 1793 and defeating royalist insurgents in Paris on 13 Vendémiaire in 1795. In 1796, Napoleon commanded a milita

In [130]:
llm_transformer_filtered = LLMGraphTransformer(
    llm=llm,
    # allowed_nodes=["Person", "Country", "Organization"],
    # allowed_relationships=["NATIONALITY", "LOCATED_IN", "WORKED_AT", "SPOUSE"],
)
graph_documents_filtered = llm_transformer_filtered.convert_to_graph_documents(
    documents
)

In [131]:
# Add edges to the graph
for edge in graph_documents_filtered[0].relationships:
    # Extract and transform edge data
    triplet = {
        'head': edge.source.id,
        'head_label': edge.source.type,
        'head_properties': edge.source.properties,
        'type': edge.type,
        'relation_properties': edge.properties,
        'tail': edge.target.id,
        'tail_label': edge.target.type,
        'tail_properties': edge.target.properties
    }
    print(triplet['head'], triplet['type'], triplet['tail'])

    # Insert the triplet into the Neo4j graph
    try:
        graph.insert_triplet(triplet)
    except Exception as e:
        print(e)


Napoleon Bonaparte BORN_ON 15 August 1769
Inserted triplet with embeddings for Napoleon Bonaparte and 15 August 1769, both labeled as 'Entity'.
Napoleon Bonaparte DIED_ON 5 May 1821
Inserted triplet with embeddings for Napoleon Bonaparte and 5 May 1821, both labeled as 'Entity'.
Napoleon Bonaparte KNOWN_AS Napoleon I
Inserted triplet with embeddings for Napoleon Bonaparte and Napoleon I, both labeled as 'Entity'.
Napoleon Bonaparte WAS_A French military officer
Inserted triplet with embeddings for Napoleon Bonaparte and French military officer, both labeled as 'Entity'.
Napoleon Bonaparte WAS_A statesman
Inserted triplet with embeddings for Napoleon Bonaparte and statesman, both labeled as 'Entity'.
Napoleon Bonaparte ROSE_TO_PROMINENCE_DURING French Revolution
Inserted triplet with embeddings for Napoleon Bonaparte and French Revolution, both labeled as 'Entity'.
Napoleon Bonaparte LEAD_CAMPAIGNS_DURING French Revolutionary Wars
Inserted triplet with embeddings for Napoleon Bonaparte 

In [136]:
print(GraphRAG("When was Napoleon born?", graph, verbose=True))  # now we know the answer
print('---')
print(GraphRAG("Where was Napoleon exlied from?", graph))  # same as before

Top nodes: [('Napoleon I', 0.8160932064056396), ('Napoleon Bonaparte', 0.8150107860565186), ('Napoleonic Wars', 0.752434492111206), ('Emperor of the French', 0.7418801784515381), ('Coup of 18 Brumaire', 0.7007098197937012)]
Top edges: []
Prompt: 
You answer questions given information from a knowledge graph.
If the query cannot be answered by the knowledge graph, say "I don't know."

Query: When was Napoleon born?

Knowledge Relationships:
Napoleon Bonaparte KNOWN_AS Napoleon I
Napoleon Bonaparte SUPPORTED French Revolution
Napoleon Bonaparte exiled from France
Napoleon Bonaparte BORN_ON 15 August 1769
Napoleon Bonaparte DIED_ON 5 May 1821
Napoleon Bonaparte KNOWN_AS Napoleon I
Napoleon Bonaparte WAS_A French military officer
Napoleon Bonaparte WAS_A statesman
Napoleon Bonaparte LEAD_CAMPAIGNS_DURING French Revolutionary Wars
Napoleon Bonaparte LEAD_CAMPAIGNS_DURING Napoleonic Wars
Napoleon Bonaparte FIRST_CONSUL_OF French Republic
Napoleon Bonaparte EMPEROR_OF French Empire
Napoleon B