### This notebook contains the code for storing a sample pdf or History of AI and Advancements into the Neo4j Graph Database.

### Steps Involved:
1. Loading the pdf using pypdf loader.
2. Use OpenAI Embeddings for semantic chunking of the document.
3. Use LLMGraphTransformer from langchain to convert the chunks into Nodes and Relationships.
4. Store the Nodes and Relationships into the Neo4j Database.
5. Query the Graph Database and retrieve the information.


In [2]:
from dotenv import load_dotenv
import os
load_dotenv()


True

Fetch the environment variables from .env

In [3]:
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")
neo4j_uri = os.getenv("NEO4J_URI")
openai_api_key = os.getenv("OPENAI_API_KEY")

In [4]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
loader = PyPDFLoader('./advancements_in_ai.pdf')

In [6]:
docs = loader.load()

## Semantic chunking of the docs

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.callbacks import get_openai_callback
callback = get_openai_callback()

embeddings = OpenAIEmbeddings(model = "text-embedding-3-small", openai_api_key=openai_api_key)


In [11]:
splitter = SemanticChunker(OpenAIEmbeddings())
with callback:
    chunks  = splitter.split_documents(docs)


In [12]:
print(f"Total tokens used: {callback.total_tokens}")
print(f"Total cost: ${callback.total_cost:.6f}")


AttributeError: '_GeneratorContextManager' object has no attribute 'total_tokens'

Now, let's use LLMGraphTransformer from Langchain to construct Knowledge Graph.

In [None]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
llm_transformer = LLMGraphTransformer(llm=llm)

In [None]:
graph_documents = llm_transformer.convert_to_graph_documents(chunks)
print(f"Nodes: {graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

In [None]:
from langchain_neo4j import Neo4jGraph

Let's initialize the neo4j graph.

In [None]:
graph = Neo4jGraph(
    database="my-new-db",
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password
)

In [None]:
# adding the graph documents to the graph
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)

In [None]:
print(graph)

In [None]:
print(graph_documents)

In [None]:
print(graph.get_schema)

In [None]:
! pip install --quiet yfiles_jupyter_graphs

In [None]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget

def showGraph():
    driver = GraphDatabase.driver(
        uri=neo4j_uri,
        auth=(neo4j_username, neo4j_password)
    )
    session = driver.session()
    widget = GraphWidget(graph=session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s, r, t").graph())
    widget.node_label_mapping = 'id'
    return widget



In [None]:
showGraph()

## Now let's query the Graph Database. We will provide a query in Natural Language and the langchain's GraphCypherQAChain would convert it into the Cypher query and retrieve the required info from the graph.

In [None]:
from langchain.chains import GraphCypherQAChain

In [None]:
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0), graph=graph, verbose=True, allow_dangerous_requests=True
)

Got this error due to incompatible langchain version. Let's upgrade langchain and use langchain_community instead.

In [None]:
! pip install --upgrade langchain langchain-community langchain-neo4j


In [None]:
from langchain_community.graphs import Neo4jGraph  

graph = Neo4jGraph(
    database="neo4j",
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password
)

In [None]:
chain = GraphCypherQAChain.from_llm(
    ChatOpenAI(temperature=0), graph=graph, verbose=True, allow_dangerous_requests=True
)

In [None]:
chain.invoke({"query": "who co-founded Anthropic?"})

## For now LLM decideds nodes and relationships itself using the LLMGraphTransformer from Langchain. Now let's define system message and human message and chain it to the LLMGraphTransformer

In [9]:
from langchain_core.prompts import PromptTemplate

In [23]:
from langchain_core.prompts import SystemMessagePromptTemplate, PromptTemplate

system_prompt = SystemMessagePromptTemplate.from_template("""
Knowledge Graph Extraction Guidelines:
                                                          
### Important: Avoid using duplicate nodes or relationships. For example Growth Opportunites and Growth_Opportunities must be same node.

1. Overview
You are an advanced algorithm designed to extract structured information for building a knowledge graph.

Capture as much information as possible without sacrificing accuracy.

Do not add any information that is not explicitly stated in the text.

2. Nodes
Entities and concepts are represented as nodes.

Ensure clarity and simplicity, making the knowledge graph accessible to a wide audience.

Node Labeling
Consistency: Use basic and general labels for node types.

Example: Label a person as "person" instead of "scientist" or "mathematician".

Node IDs:

Do not use integers as node IDs.

Use human-readable identifiers from the text.

3. Relationships
Relationships connect entities and concepts.

Use consistent, general, and timeless relationship types.

DO NOT use momentary or overly specific relationship types (e.g., "BECAME_PROFESSOR").

USE general types instead (e.g., "PROFESSOR").

4. Coreference Resolution
Ensure entity consistency throughout the knowledge graph.

When an entity is mentioned multiple times with variations (e.g., "John Doe," "Joe," "he"), always use the most complete identifier (e.g., "John Doe").

5. Strict Compliance
Follow these rules precisely.

Non-compliance will result in termination.



 """)



In [None]:
from langchain.prompts import HumanMessagePromptTemplate
from langchain_experimental.graph_transformers.llm import ChatPromptTemplate
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer


human_prompt = HumanMessagePromptTemplate(
    prompt = PromptTemplate.from_template("""
You are tasked with extracting a knowledge graph from a document titled "Advancements in Artificial Intelligence: A Comprehensive Overview".

Design a schema and extract entities and relationships to construct a Neo4j knowledge graph that captures:

Entities (Nodes):
- Organizations: OpenAI, Anthropic, DeepSeek, Google DeepMind, Microsoft, Nvidia.
- Researchers: Key individuals mentioned for organization transitions.
- AI Models: GPT-4, Claude, DeepSeek-V3, AlphaGo, o3, Gemini.
- Concepts: AI Ethics, AI Safety, AGI (Artificial General Intelligence), AI Democratization.

Relationships (Edges):
- Developed_By (Model ➔ Organization)
- Founded_By (Researcher ➔ Organization)
- Collaborates_With (Organization ➔ Organization)
- Transitioned_From_To (Researcher ➔ Organization ➔ Organization)
- Focuses_On (Organization ➔ Concept)
- Enables (Technology ➔ Application)

Instructions:
- Extract key entities and associate them with appropriate labels (e.g., Organization, Researcher, Model, Concept).
- Create meaningful relationships based on the document context.
- Capture important thematic concepts like AI safety, open-source AI, AGI, multimodal AI, and ethical AI development.
- Focus particularly on key player organizations, model developments, researcher movements, and societal implications.

Output:
- List of Nodes (with labels and properties).
- List of Relationships (with source node, target node, relationship type).
- Generate Cypher queries to create these nodes and relationships in Neo4j.
"


""")
    
)


In [25]:
chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])

In [35]:
from langchain_openai import ChatOpenAI
import os
llm= ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY"))

In [36]:
llm_transformer = LLMGraphTransformer(
    llm = llm,
    prompt = chat_prompt
)

In [37]:
from langchain.callbacks import get_openai_callback
 
with get_openai_callback() as cb:
    graph_documents = llm_transformer.convert_to_graph_documents(docs)
 
print("-----------------*Cost Estimation*------------------")
print(f"Estimated total cost: ${cb.total_cost:.4f}")
print(f"Total tokens used: {cb.total_tokens}")
print(f"Prompt tokens: {cb.prompt_tokens}")
print(f"Completion tokens: {cb.completion_tokens}")

-----------------*Cost Estimation*------------------
Estimated total cost: $0.1466
Total tokens used: 35344
Prompt tokens: 27580
Completion tokens: 7764


In [39]:
print("-----------------*Cost Estimation*------------------")
print(f"Estimated total cost: ${cb.total_cost:.4f}")
print(f"Total tokens used: {cb.total_tokens}")
print(f"Prompt tokens: {cb.prompt_tokens}")
print(f"Completion tokens: {cb.completion_tokens}")

-----------------*Cost Estimation*------------------
Estimated total cost: $0.1466
Total tokens used: 35344
Prompt tokens: 27580
Completion tokens: 7764


In [38]:
graph_documents

[GraphDocument(nodes=[Node(id='Cloud Marketplaces', type='Entity', properties={}), Node(id='Isvs', type='Entity', properties={}), Node(id='Enterprise Users', type='Entity', properties={}), Node(id='Cloud Service Providers', type='Entity', properties={}), Node(id='Growth Opportunities', type='Concept', properties={})], relationships=[Relationship(source=Node(id='Cloud Marketplaces', type='Entity', properties={}), target=Node(id='Software Procurement', type='Concept', properties={}), type='ENABLE', properties={}), Relationship(source=Node(id='Isvs', type='Entity', properties={}), target=Node(id='Software', type='Concept', properties={}), type='OFFER', properties={}), Relationship(source=Node(id='Cloud Service Providers', type='Entity', properties={}), target=Node(id='Cloud Marketplaces', type='Entity', properties={}), type='HOST', properties={}), Relationship(source=Node(id='Enterprise Users', type='Entity', properties={}), target=Node(id='Cloud Marketplaces', type='Entity', properties={

In [40]:
graph = Neo4jGraph(
    database="",
    url=neo4j_uri,
    username=neo4j_username,
    password=neo4j_password
)

In [41]:
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=False
)

In [43]:
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget

def showGraph():
    driver = GraphDatabase.driver(
        uri=neo4j_uri,
        auth=(neo4j_username, neo4j_password)
    )
    session = driver.session()
    widget = GraphWidget(graph=session.run("MATCH (s)-[r:!MENTIONS]->(t) RETURN s, r, t").graph())
    widget.node_label_mapping = 'id'
    return widget



In [44]:
showGraph()

GraphWidget(layout=Layout(height='670px', width='100%'))

In [60]:
graph.schema

'Node properties:\nConcept {id: STRING}\nEntity {id: STRING}\nPlatform {id: STRING}\nRelationship properties:\n\nThe relationships:\n(:Concept)-[:IMPACTS]->(:Concept)\n(:Concept)-[:IMPACTS]->(:Entity)\n(:Concept)-[:FOCUSES_ON]->(:Concept)\n(:Concept)-[:INCLUDES]->(:Concept)\n(:Concept)-[:IMPACT]->(:Concept)\n(:Concept)-[:IMPACT]->(:Entity)\n(:Concept)-[:RELATED_TO]->(:Concept)\n(:Concept)-[:INFLUENCE]->(:Concept)\n(:Concept)-[:INFLUENCE]->(:Entity)\n(:Concept)-[:INFLUENCE]->(:Platform)\n(:Concept)-[:DRIVE]->(:Concept)\n(:Concept)-[:DRIVE]->(:Entity)\n(:Concept)-[:AFFECT]->(:Concept)\n(:Concept)-[:AFFECT]->(:Entity)\n(:Concept)-[:ENABLE]->(:Concept)\n(:Concept)-[:ENABLE]->(:Entity)\n(:Concept)-[:SUPPORT]->(:Concept)\n(:Concept)-[:SUPPORT]->(:Entity)\n(:Concept)-[:ENHANCE]->(:Concept)\n(:Concept)-[:ENHANCE]->(:Entity)\n(:Concept)-[:ENHANCE]->(:Platform)\n(:Concept)-[:ENABLES]->(:Concept)\n(:Concept)-[:ENABLES]->(:Entity)\n(:Concept)-[:OFFERS_SOFTWARE_ON]->(:Concept)\n(:Concept)-[:OFFERS_