In [46]:
# Load secrets and connection settings from a local .env file into the process
# environment: LLAMA_CLOUD_API_KEY for LlamaParse, plus the NEO4J_* values that
# are read from os.environ further down.
# NOTE(review): presumably OPENAI_API_KEY is expected in the same file — confirm.
from dotenv import load_dotenv
load_dotenv()
import os


In [47]:
# Uncomment to install the Neo4j graph-store integration if it is missing.
# `%pip` installs into the environment of the running kernel.
# %pip install llama-index-graph-stores-neo4j

In [4]:
import nest_asyncio

# Patch asyncio so that event loops may be nested.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop while the libraries used below drive asyncio themselves — confirm.
nest_asyncio.apply()

# Setup Model
Here we use `gpt-4o-mini` as the LLM and OpenAI's `text-embedding-3-small` model for embeddings.

In [5]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Register the chat model as the library-wide default and keep a local handle to it.
Settings.llm = llm = OpenAI(model="gpt-4o-mini")

# Same for the embedding model; `embed_model` is passed explicitly further down.
Settings.embed_model = embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [57]:
import requests

# Source PDF: the 2023 proposed budget book, hosted on Dropbox.
# `dl=1` requests the raw file; a share link ending in `dl=0` targets the
# browser preview page instead.
url = "https://www.dropbox.com/scl/fi/vip161t63s56vd94neqlt/2023-CSF_Proposed_Budget_Book_June_2023_Master_Web.pdf?rlkey=hemoce3w1jsuf6s2bz87g549i&dl=1"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=120)

# Fail loudly on HTTP errors instead of saving an error page under a .pdf name.
response.raise_for_status()

# A PDF carries the "%PDF" marker at (or very near) the start of the file;
# anything else is most likely an HTML landing page.
if b"%PDF" not in response.content[:1024]:
    raise ValueError(
        "Downloaded content does not look like a PDF "
        f"(Content-Type: {response.headers.get('Content-Type')!r})"
    )

with open("budget_2023.pdf", "wb") as file:
    file.write(response.content)

In [60]:
from llama_parse import LlamaParse

# Parse the downloaded PDF into plain-text documents with LlamaParse.
# NOTE(review): presumably authenticates with LLAMA_CLOUD_API_KEY from the environment — confirm.
parser = LlamaParse(result_type="text")
docs = parser.load_data("./budget_2023.pdf")

Started parsing the file under job_id b05d3628-8e59-4f40-a4cf-7fa17c1aae86


In [61]:
# Number of documents returned by the parser.
len(docs)

362

In [62]:
from copy import deepcopy
from llama_index.core.schema import TextNode, Document
from llama_index.core import VectorStoreIndex


def get_sub_docs(docs, separator="\n---\n"):
    """Split every document into smaller documents at a text separator.

    Args:
        docs: Iterable of objects exposing ``.text`` (str) and ``.metadata``.
        separator: Exact string on which each document's text is split.
            Defaults to ``"\\n---\\n"``, the value this notebook has always
            used. Must be non-empty (``str.split`` rejects an empty separator
            with ``ValueError``).

    Returns:
        A flat list of ``Document`` objects, one per chunk, in the original
        order. Every chunk receives its own deep copy of the parent's
        metadata, so editing one sub-document never affects its siblings or
        the source document. A document that does not contain the separator
        yields exactly one sub-document holding its full text.
    """
    return [
        Document(text=chunk, metadata=deepcopy(doc.metadata))
        for doc in docs
        for chunk in doc.text.split(separator)
    ]

In [63]:
# Split each parsed document on the page separator so that every page becomes
# its own Document. In the recorded run the count is 362 both before and after
# this step, i.e. no document contained the separator.
sub_docs = get_sub_docs(docs)

In [64]:
# Number of documents after splitting (compare with len(docs) above).
len(sub_docs)

362

# Initialize Graph Store
Here we use Neo4j but you can also use our other integrations like Nebula (see an example notebook).

To launch Neo4j locally, first ensure you have Docker installed. Then you can launch the database with the following command:

docker run \
    -p 7474:7474 -p 7687:7687 \
    -v $PWD/data:/data -v $PWD/plugins:/plugins \
    --name neo4j-apoc \
    -e NEO4J_apoc_export_file_enabled=true \
    -e NEO4J_apoc_import_file_enabled=true \
    -e NEO4J_apoc_import_file_use__neo4j__config=true \
    -e NEO4JLABS_PLUGINS=\[\"apoc\"\] \
    neo4j:latest
From here, you can open the db at http://localhost:7474/. On this page, you will be asked to sign in. Use the default username/password of neo4j and neo4j.

Once you login for the first time, you will be asked to change the password.

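After this, you are ready to create your first property graph! The next cell reads the connection details from the `NEO4J_URI`, `NEO4J_USERNAME` and `NEO4J_PASSWORD` environment variables, so add them to your `.env` file first (for the local Docker instance the URI is typically `bolt://localhost:7687`).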

In [65]:
# Neo4j connection settings are taken from the process environment, which
# load_dotenv() populated from the .env file at the top of the notebook.
# Check them up front: a missing value would otherwise be passed on as None
# and only surface later as a less obvious connection error.
_missing_neo4j_vars = [
    name
    for name in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
    if not os.environ.get(name)
]
if _missing_neo4j_vars:
    raise RuntimeError(
        "Missing Neo4j settings: "
        + ", ".join(_missing_neo4j_vars)
        + ". Add them to your .env file or export them before running this cell."
    )

NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

In [67]:
# Display the URI as a quick check that the environment values were picked up.
# NOTE(review): the rendered output exposes the database host; consider
# clearing this cell's output before sharing the notebook.
NEO4J_URI

'neo4j+s://368c1712.databases.neo4j.io'

In [66]:
from llama_index.graph_stores.neo4j import Neo4jPGStore
# Handle to the Neo4j database that will hold the property graph; all three
# connection values come from the environment-derived constants above.
graph_store = Neo4jPGStore(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

In [68]:
# NOTE(review): `vec_store` is not referenced anywhere else in the visible part
# of the notebook — presumably a placeholder for an optional separate vector
# store; confirm or remove.
vec_store = None

In [69]:
from llama_index.core.indices.property_graph import (
    ImplicitPathExtractor,
    SimpleLLMPathExtractor,
)
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [71]:
# Build the property graph from the per-page documents and store it in Neo4j.
# This is the expensive step: every chunk goes through the LLM path extractor
# (roughly five minutes in the recorded run).
index = PropertyGraphIndex.from_documents(
    sub_docs,
    # Reuse the notebook-wide embedding model (text-embedding-3-small) rather
    # than constructing a second instance, so indexing and the baseline vector
    # index below are guaranteed to share one embedding configuration.
    embed_model=embed_model,
    kg_extractors=[
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            # Dedicated LLM instance for extraction, with its own temperature.
            llm=OpenAI(model="gpt-4o-mini", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    property_graph_store=graph_store,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 362/362 [00:01<00:00, 223.87it/s]
Extracting implicit paths: 100%|██████████| 438/438 [00:00<00:00, 48670.19it/s]
Extracting paths from text: 100%|██████████| 438/438 [05:07<00:00,  1.42it/s]
Generating embeddings: 100%|██████████| 5/5 [00:04<00:00,  1.16it/s]
Generating embeddings: 100%|██████████| 10/10 [00:03<00:00,  2.78it/s]


# Load an existing index (run this cell instead of the one above if the graph is already stored in Neo4j)

In [33]:
# Attach to a graph that is already stored in Neo4j instead of rebuilding it.
# Use this cell OR the from_documents cell — both assign `index`.
index = PropertyGraphIndex.from_existing(
    graph_store,
    # Reuse the notebook-wide embedding model (text-embedding-3-small) rather
    # than constructing a second instance, so the embedding configuration is
    # defined in exactly one place.
    embed_model=embed_model,
    kg_extractors=[
        ImplicitPathExtractor(),
        SimpleLLMPathExtractor(
            # Dedicated LLM instance for extraction, with its own temperature.
            llm=OpenAI(model="gpt-4o-mini", temperature=0.3),
            num_workers=4,
            max_paths_per_chunk=10,
        ),
    ],
    show_progress=True,
)

# Define Vector Retriever
Here we define our vector context retriever - it returns initial nodes via vector search, and traverses the relations to pull in more nodes/context.

In [74]:
from llama_index.core.indices.property_graph import VectorContextRetriever

# Retriever over the property graph: vector search picks the starting nodes,
# then relations are followed to pull in additional context.
kg_retriever = VectorContextRetriever(
    index.property_graph_store,
    # Queries must be embedded with the same model configuration as the rest
    # of the notebook, so reuse the shared instance instead of building a new one.
    embed_model=embed_model,
    similarity_top_k=2,  # number of starting nodes taken from vector search
    path_depth=1,  # follow relations one hop out from each starting node
    include_text=True,
)

In [75]:
# Query the graph retriever and list what comes back.
# NOTE(review): the saved output of this cell is a Neo4j ClientError
# ("There is no such vector schema index: entity"), so the database had no
# vector index of that name when the cell was last run — TODO confirm the
# cause and re-run once the graph has been built.
question = "Give me all the programs that the mayor's budget includes"
nodes = kg_retriever.retrieve(question)

print(len(nodes))
for position, hit in enumerate(nodes):
    print(f">> IDX: {position}, {hit.get_content()}")

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.index.vector.queryNodes`: Caused by: java.lang.IllegalArgumentException: There is no such vector schema index: entity}

# Build Baseline Vector Index
We also build a "baseline" vector index. This follows the "naive" RAG pipeline approach of chunking and vector embedding. We use this as a comparison point.

In [36]:
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import RetrieverQueryEngine

# Baseline pipeline: embed the same per-page documents into a plain vector
# index, retrieve the two most similar chunks, and answer from those.
base_index = VectorStoreIndex.from_documents(
    documents=sub_docs,
    embed_model=embed_model,
)
base_retriever = base_index.as_retriever(similarity_top_k=2)
base_query_engine = RetrieverQueryEngine(retriever=base_retriever)

In [37]:
# Ask the baseline engine the same question that was put to the graph retriever.
baseline_question = "Give me all the programs that the mayor's budget includes"
response = base_query_engine.query(baseline_question)
print(response)

The provided information does not specify any programs included in the mayor's budget. For details on the programs, it would be necessary to refer to the actual budget document or additional resources that outline the budget specifics.
