In [1]:
from langchain_core.documents import Document

text = """
Marie Curie, 7 November 1867 – 4 July 1934, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
Also, Robin Williams.
"""
documents = [Document(page_content=text)]


In [2]:
from langchain_openai import ChatOpenAI
import getpass
import os

llm = ChatOpenAI(model='gpt-4o')


In [3]:
from langchain_experimental.graph_transformers import LLMGraphTransformer

no_schema = LLMGraphTransformer(llm=llm)


In [4]:
data = await no_schema.aconvert_to_graph_documents(documents)


In [5]:
data

[GraphDocument(nodes=[Node(id='Marie Curie', type='Person', properties={}), Node(id='Poland', type='Place', properties={}), Node(id='France', type='Place', properties={}), Node(id='Radioactivity', type='Concept', properties={}), Node(id='Nobel Prize', type='Award', properties={}), Node(id='Pierre Curie', type='Person', properties={}), Node(id='Curie Family', type='Group', properties={}), Node(id='University Of Paris', type='Organization', properties={}), Node(id='Robin Williams', type='Person', properties={})], relationships=[Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Poland', type='Place', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='France', type='Place', properties={}), type='NATIONALITY', properties={}), Relationship(source=Node(id='Marie Curie', type='Person', properties={}), target=Node(id='Radioactivity', type='Concept', properties={}), 

In [6]:
!pip install neo4j-graphrag

Collecting neo4j-graphrag
  Downloading neo4j_graphrag-1.9.0-py3-none-any.whl.metadata (18 kB)
Collecting fsspec<2025.0.0,>=2024.9.0 (from neo4j-graphrag)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting json-repair<0.45.0,>=0.44.1 (from neo4j-graphrag)
  Downloading json_repair-0.44.1-py3-none-any.whl.metadata (12 kB)
Collecting neo4j<6.0.0,>=5.17.0 (from neo4j-graphrag)
  Downloading neo4j-5.28.2-py3-none-any.whl.metadata (5.9 kB)
Collecting numpy<3.0.0,>=2.0.0 (from neo4j-graphrag)
  Using cached numpy-2.3.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pypdf<6.0.0,>=5.1.0 (from neo4j-graphrag)
  Downloading pypdf-5.9.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyyaml<7.0.0,>=6.0.2 (from neo4j-graphrag)
  Using cached PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting tenacity<10.0.0,>=9.1.2 (from neo4j-graphrag)
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting types-pyyaml<7.0.0.0,>=

In [20]:
from neo4j import GraphDatabase
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter


# Initialize Neo4j driver
driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "test1234"))

# Initialize LLM
ex_llm = OpenAILLM(
    model_name="gpt-4o",
    model_params={
        "response_format": {"type": "json_object"},
        "temperature": 0
    }
)

# Initialize embedder
embedder = OpenAIEmbeddings()

# Define node labels and relation types
node_labels = [
            "mission", "spacecraft", "rover", "orbiter", "lander", "probe",
            "planet", "moon", "asteroid", "comet",
            "facility", "laboratory", "launch_site", "research_center",
            "company", "organization", "agency",
            "technology", "instrument", "system", "equipment",
            "scientist", "engineer", "researcher",
            "discovery", "finding", "observation"
        ]
rel_types = [
            "MANAGES", "OPERATES", "BUILT", "LAUNCHED_FROM", "STUDIES",
            "EXPLORES", "DISCOVERS", "USES_TECHNOLOGY", "COLLABORATES_WITH",
            "LOCATED_AT", "PART_OF", "DEPENDS_ON", "COMMUNICATES_WITH",
            "TRANSMITS_TO", "RECEIVES_FROM", "ANALYZES", "MEASURES",
            "OBSERVES", "MONITORS", "CONTROLS", "SUPPORTS"
        ]

# Initialize KG pipeline
kg_builder_pdf = SimpleKGPipeline(
    llm=ex_llm,
    driver=driver,
    text_splitter=FixedSizeSplitter(chunk_size=500, chunk_overlap=100),
    embedder=embedder,
    entities=node_labels,
    relations=rel_types,
    prompt_template=_get_additional_instructions(),
    from_pdf=True
)

# List of PDF files
pdf_file_paths = [
    'data/pdfs/exoplanet_detection_methods.pdf',
    'data/pdfs/hubble_space_telescope.pdf',
    'data/pdfs/ion_propulsion_technology.pdf',
    'data/pdfs/voyager_program.pdf',
    'data/pdfs/nasa_initial_knowledge.pdf',
    'data/pdfs/voyager_program.pdf'
]

# Run KG builder for each PDF
for path in pdf_file_paths:
    graph_data = await kg_builder_pdf.run_async(file_path=path)




In [19]:
def _get_additional_instructions() -> str:
    """Get additional instructions for NASA-specific extraction."""
    return """
    You are a NASA space exploration researcher tasked with extracting information from NASA documentation 
    and structuring it in a property graph to inform further space exploration and research Q&A.

    Extract the entities (nodes) and specify their type from the following Input text.
    Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.

    Return result as JSON using the following format:
    {{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
      "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

    Do not return any additional information other than the JSON in it.
    """

In [16]:
from neo4j_graphrag.indexes import create_vector_index

create_vector_index(driver, name="text_embeddings", label="Chunk",
                    embedding_property="embedding", dimensions=1536, similarity_fn="cosine")


In [32]:
from neo4j_graphrag.retrievers import VectorRetriever

vector_retriever = VectorRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    return_properties=["text"],
)


import json

vector_res = vector_retriever.get_search_results(
    query_text="When was Voyager 1 launched?",
    top_k=3
)

for i in vector_res.records:
    print("====\n" + json.dumps(i.data(), indent=4))


====
{
    "node": {
        "text": "Voyager Program Overview\nThe Voyager program consists of two robotic probes, Voyager 1 and Voyager 2, \nlaunched in 1977 to study the outer Solar System and interstellar space. Both \nspacecraft are still operational and continue to send data back to Earth.\nVoyager 1 and Voyager 2 were designed to take advantage of a rare planetary \nalignment that occurs only once every 176 years. This alignment allowed the \nspacecraft to visit Jupiter, Saturn, Uranus, and Neptune using gravity assists.\nKey "
    },
    "nodeLabels": [
        "__KGBuilder__",
        "Chunk"
    ],
    "elementId": "4:ce55c410-1d81-4077-85b0-706b09aff537:26",
    "id": "4:ce55c410-1d81-4077-85b0-706b09aff537:26",
    "score": 0.9327101111412048
}
====
{
    "node": {
        "text": "Voyager Program Overview\nThe Voyager program consists of two robotic probes, Voyager 1 and Voyager 2, \nlaunched in 1977 to study the outer Solar System and interstellar space. Both \nspacecraft

In [35]:
from neo4j_graphrag.retrievers import VectorCypherRetriever

graph_retriever = VectorCypherRetriever(
    driver,
    index_name="text_embeddings",
    embedder=embedder,
    retrieval_query="""
        // 1) Go out 2-3 hops in the entity graph and get relationships
WITH node AS chunk
MATCH (chunk)<-[:FROM_CHUNK]-()-[relList:!FROM_CHUNK]-{1,2}()
UNWIND relList AS rel

// 2) Collect relationships and text chunks
WITH collect(DISTINCT chunk) AS chunks, 
  collect(DISTINCT rel) AS rels

// 3) Format and return context
RETURN '=== text ===\n' + apoc.text.join([c in chunks | c.text], '\n---\n') + '\n\n=== kg_rels ===\n' +
  apoc.text.join([r in rels | startNode(r).name + ' - ' + type(r) + '(' + coalesce(r.details, '') + ')' +  ' -> ' + endNode(r).name ], '\n---\n') AS info
"""
)


In [36]:
vc_res = graph_retriever.get_search_results(
    query_text="When was Voyager 1 launched?", 
    top_k=3
)




In [37]:
kg_rel_pos = vc_res.records[0]['info'].find('\n\n=== kg_rels ===\n')


print("# Text Chunk Context:")
print(vc_res.records[0]['info'][:kg_rel_pos])

print("# KG Context From Relationships:")
print(vc_res.records[0]['info'][kg_rel_pos:])


# Text Chunk Context:
=== text ===

# KG Context From Relationships:


=== kg_rels ===



In [22]:
# Initialize LLM
from neo4j_graphrag.generation import RagTemplate
from neo4j_graphrag.generation.graphrag import GraphRAG



llm = OpenAILLM(
    model_name="gpt-4o",
    model_params={"temperature": 0.0}
)

# Define RAG prompt template
rag_template = RagTemplate(
    template='''
        Answer the Question using the following Context. 
        Only respond with information mentioned in the Context. 
        Do not inject any speculative information not mentioned.

        # Question:
        {query_text}

        # Context:
        {context}

        # Answer:
    ''',
    expected_inputs=['query_text', 'context']
)

# # Create Vector and Graph RAG pipelines
# vector_rag = GraphRAG(
#     llm=llm,
#     retriever=vector_retriever,
#     prompt_template=rag_template
# )

graph_rag = GraphRAG(
    llm=llm,
    retriever=graph_retriever,
    prompt_template=rag_template
)

# Define question
q = (
    "When was Voyager 1 launched?"
    "Include the launch date in the answer"
)

# Perform RAG search and get answers
# vector_rag.search(q, retriever_config={'top_k': 5}).answer
graph_rag.search(q, retriever_config={'top_k': 5}).answer


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Variable length relationships must not use relationship type expressions. (line 4, column 56 (offset: 304))
"        MATCH (chunk)<-[:FROM_CHUNK]-(entity)-[relList:!FROM_CHUNK*1..2]-(nb)"
                                                        ^}