<a href="https://colab.research.google.com/github/sadnyd/EdgarGraphLLM/blob/main/notebooks/final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install dotenv langchain_community neo4j langchain langchain_google_genai

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.2-py3-none-any.whl.metadata (4.7 kB)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3

In [None]:
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import GraphCypherQAChain
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata
# Warning control
import warnings
# Installs a blanket "ignore" filter: no warning category or module is
# specified, so all Python warnings (including deprecation notices from the
# imported libraries) are hidden for the cells that follow.
warnings.filterwarnings("ignore")

In [None]:
# Connection settings and API key are read from Colab's secret store
# (google.colab.userdata) so that no credential is hardcoded in the notebook.

# --- Neo4j connection ---
NEO4J_URI = userdata.get("EDGAR_URI")
NEO4J_USERNAME = userdata.get("EDGAR_USERNAME")
NEO4J_PASSWORD = userdata.get("EDGAR_PASSWORD")
NEO4J_DATABASE = userdata.get("EDGAR_DATABASE")

# --- Gemini ---
GEMINI_API_KEY = userdata.get("gemini")
GEMINI_ENDPOINT = (
    "https://generativelanguage.googleapis.com"
    "/v1beta/models/text-embedding-004:embedText"
)

In [None]:
# Global constants: vector index name, node label, and the two node
# properties (source text and its embedding) referenced by name.
VECTOR_INDEX_NAME = "form_10k_chunks"
VECTOR_NODE_LABEL = "Chunk"
VECTOR_SOURCE_PROPERTY = "text"
VECTOR_EMBEDDING_PROPERTY = "textEmbedding"

In [None]:
# Graph handle used by every query cell below; connection details come from
# the NEO4J_* settings loaded earlier.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
# Re-read the schema from the database, then print it wrapped to 60 columns.
kg.refresh_schema()
print(textwrap.fill(kg.schema, width=60))

Node properties: Chunk {chunkId: STRING, names: LIST,
formId: STRING, cik: STRING, cusip6: STRING, source: STRING,
f10kItem: STRING, chunkSeqId: INTEGER, text: STRING,
textEmbedding: LIST} Form {names: LIST, formId: STRING, cik:
STRING, cusip6: STRING, source: STRING} Company {names:
LIST, cusip6: STRING, companyName: STRING, cusip: STRING}
Manager {managerCik: STRING, managerName: STRING,
managerAddress: STRING} Relationship properties: SECTION
{f10kItem: STRING} OWNS_STOCK_IN {reportCalendarOrQuarter:
STRING, value: FLOAT, shares: INTEGER} The relationships:
(:Chunk)-[:NEXT]->(:Chunk) (:Chunk)-[:PART_OF]->(:Form)
(:Form)-[:SECTION]->(:Chunk) (:Company)-[:FILED]->(:Form)
(:Manager)-[:OWNS_STOCK_IN]->(:Company)


In [None]:
# Fetch one (Manager, Address) pair connected by LOCATED_AT.
# NOTE(review): the schema printed earlier in this notebook lists no `Address`
# label and no `LOCATED_AT` relationship, and the recorded output is `[]` —
# confirm the address data has been loaded into this database.
kg.query("""
MATCH (mgr:Manager)-[:LOCATED_AT]->(addr:Address)
RETURN mgr, addr
LIMIT 1
""")



[]

In [None]:
# Search the "fullTextManagerNames" full-text index for "royal bank" and
# return the managerName and match score of the first hit only (LIMIT 1).
kg.query("""
  CALL db.index.fulltext.queryNodes(
         "fullTextManagerNames",
         "royal bank") YIELD node, score
  RETURN node.managerName, score LIMIT 1
""")

[{'node.managerName': 'Royal Bank of Canada', 'score': 3.7019896507263184}]

In [None]:
# Take the first full-text hit for "royal bank" as `mgr`, then follow
# LOCATED_AT from that manager to its Address node.
# NOTE(review): the schema printed earlier lists no `Address` label and no
# `LOCATED_AT` relationship; the recorded output is `[]` even though the
# full-text lookup alone returns a manager — confirm address data is loaded.
kg.query("""
CALL db.index.fulltext.queryNodes(
         "fullTextManagerNames",
         "royal bank"
  ) YIELD node, score
WITH node as mgr LIMIT 1
MATCH (mgr:Manager)-[:LOCATED_AT]->(addr:Address)
RETURN mgr.managerName, addr
""")



[]

In [None]:
# Count Manager -> Address links per address.state and return the ten states
# with the highest counts.
# NOTE(review): depends on `Address` / `LOCATED_AT`, which the schema printed
# earlier does not list; the recorded output is `[]`.
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")



[]

In [None]:
# Count Company -> Address links per address.state, ordered from most to
# fewest (no LIMIT, so every state is returned).
# NOTE(review): depends on `Address` / `LOCATED_AT`, which the schema printed
# earlier does not list; the recorded output is `[]`.
kg.query("""
  MATCH p=(:Company)-[:LOCATED_AT]->(address:Address)
  RETURN address.state as state, count(address.state) as numCompanies
    ORDER BY numCompanies DESC
""")



[]

In [None]:
# For addresses whose state is 'California', count Manager -> Address links
# per city and return the ten cities with the highest counts.
# NOTE(review): depends on `Address` / `LOCATED_AT`, which the schema printed
# earlier does not list; the recorded output is `[]`.
kg.query("""
  MATCH p=(:Manager)-[:LOCATED_AT]->(address:Address)
         WHERE address.state = 'California'
  RETURN address.city as city, count(address.city) as numManagers
    ORDER BY numManagers DESC
    LIMIT 10
""")



[]

In [None]:
# For managers located in "San Francisco", sum the `value` property of their
# OWNS_STOCK_IN relationships and return the ten largest totals.
# NOTE(review): the address half of the pattern uses `Address` / `LOCATED_AT`,
# which the schema printed earlier does not list (OWNS_STOCK_IN and its
# `value` property are listed); the recorded output is `[]`.
kg.query("""
  MATCH p=(mgr:Manager)-[:LOCATED_AT]->(address:Address),
         (mgr)-[owns:OWNS_STOCK_IN]->(:Company)
         WHERE address.city = "San Francisco"
  RETURN mgr.managerName, sum(owns.value) as totalInvestmentValue
    ORDER BY totalInvestmentValue DESC
    LIMIT 10
""")



[]

In [None]:
# List the companyName of every Company whose linked Address has
# city = "Santa Clara".
# NOTE(review): depends on `Address` / `LOCATED_AT`, which the schema printed
# earlier does not list; the recorded output is `[]`.
kg.query("""
  MATCH (com:Company)-[:LOCATED_AT]->(address:Address)
         WHERE address.city = "Santa Clara"
  RETURN com.companyName
""")



[]

In [None]:
# Find companies whose address `location` is within a point.distance of 10000
# from the `location` of an Address with city = "Santa Clara".
# NOTE(review): presumably the 10000 threshold is in metres (geographic
# points) — confirm against how `location` was stored.
# NOTE(review): depends on `Address` / `LOCATED_AT`, and returns
# `com.companyAddress`; none of these appear in the schema printed earlier
# (Company lists names, cusip6, companyName, cusip). Recorded output is `[]`.
kg.query("""
  MATCH (sc:Address)
    WHERE sc.city = "Santa Clara"
  MATCH (com:Company)-[:LOCATED_AT]->(comAddr:Address)
    WHERE point.distance(sc.location, comAddr.location) < 10000
  RETURN com.companyName, com.companyAddress
""")



[]

In [None]:
# Find managers whose address `location` is within a point.distance of 10000
# from the `location` of an Address with city = "Santa Clara", returning each
# manager's name and managerAddress property.
# NOTE(review): presumably the 10000 threshold is in metres (geographic
# points) — confirm against how `location` was stored.
# NOTE(review): depends on `Address` / `LOCATED_AT`, which the schema printed
# earlier does not list; the recorded output is `[]`.
kg.query("""
  MATCH (address:Address)
    WHERE address.city = "Santa Clara"
  MATCH (mgr:Manager)-[:LOCATED_AT]->(managerAddress:Address)
    WHERE point.distance(address.location,
        managerAddress.location) < 10000
  RETURN mgr.managerName, mgr.managerAddress
""")



[]

In [None]:
# Prompt used to turn a natural-language question into a Cypher statement.
# Placeholders: {schema} (graph schema text) and {question} (user question).
#
# The few-shot example is deliberately built only from elements present in the
# schema this notebook prints (Manager, Company, OWNS_STOCK_IN.value). An
# example using relationships or labels that are absent from the schema would
# contradict the "use only the provided relationship types" instruction and
# steer the model toward queries that return nothing.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to
query a graph database.
Instructions:
Use only the provided relationship types and properties in the
schema. Do not use any other relationship types or properties that
are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than
for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
Examples: Here are a few examples of generated Cypher
statements for particular questions:

# Which investment firms hold the most valuable stock positions?
MATCH (mgr:Manager)-[owns:OWNS_STOCK_IN]->(:Company)
RETURN mgr.managerName, sum(owns.value) AS totalInvestmentValue
    ORDER BY totalInvestmentValue DESC
    LIMIT 10

The question is:
{question}"""

In [None]:
# Wrap the raw template text in a PromptTemplate, declaring the two
# placeholders it expects to be filled in.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    template=CYPHER_GENERATION_TEMPLATE,
    input_variables=["schema", "question"],
)

In [None]:
# Build the question -> Cypher -> answer chain on top of the graph, using
# Gemini (temperature 0) as the LLM and the custom Cypher-generation prompt.
cypherChain = GraphCypherQAChain.from_llm(
    ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        google_api_key=GEMINI_API_KEY,
        temperature=0
    ),
    graph=kg,
    verbose=True,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    # Required opt-in: without it the chain constructor raises ValueError.
    # The chain executes LLM-generated Cypher against the database, so the
    # Neo4j credentials used for `kg` should be scoped to the minimum
    # permissions needed (ideally read-only).
    allow_dangerous_requests=True,
)

ValueError: In order to use this chain, you must acknowledge that it can make dangerous requests by setting `allow_dangerous_requests` to `True`.You must narrowly scope the permissions of the database connection to only include necessary permissions. Failure to do so may result in data corruption or loss or reading sensitive data if such data is present in the database.Only use this chain if you understand the risks and have taken the necessary precautions. See https://python.langchain.com/docs/security for more information.