# Building a Knowledge Graph from JSON

In [1]:
%load_ext autoreload
%autoreload 2 

from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI


# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read connection settings and API credentials from the local .env file.
# override=True is passed so that .env values win over variables already set
# in the process environment.
load_dotenv('.env', override=True)
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
# Fall back to the database name 'neo4j' when the variable is unset or empty.
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_ENDPOINT = os.environ.get('OPENAI_ENDPOINT')

# Names shared by the vector index, the node label it covers, and the node
# properties holding the source text and its embedding.
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

### 10-K form json file for the company NetApp

- Publicly traded companies are required to file a Form 10-K each year with the Securities and Exchange Commission (SEC)
- [EDGAR database](https://www.sec.gov/edgar/search/)

In [15]:
# Relative path to the NetApp Form 10-K JSON file used throughout this notebook
netApp_form = "Data/0000950170-23-027948.json"


In [16]:
# Parse the filing inside a context manager so the file handle is closed as
# soon as the JSON has been read (json.load(open(...)) leaves it open).
with open(netApp_form) as form_file:
    netApp_form_as_object = json.load(form_file)

In [5]:
# Inspect the Python type that json.load produced for the filing
type(netApp_form_as_object)

dict

In [7]:
# List each top-level key of the filing alongside the Python type of its value
for k,v in netApp_form_as_object.items():
    print(k, type(v))

item1 <class 'str'>
item1a <class 'str'>
item7 <class 'str'>
item7a <class 'str'>
cik <class 'str'>
cusip6 <class 'str'>
cusip <class 'list'>
names <class 'list'>
source <class 'str'>


In [8]:
# Keep the raw text stored under the 'item1' key for the splitting experiments below
item1_text = netApp_form_as_object['item1']

In [9]:
# Preview the first 1,500 characters of the item1 text
item1_text[0:1500]

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

### Split Form 10-K sections into chunks
- Set up text splitter using LangChain. Let's first apply it to just "item1".

In [10]:
# Shared splitter for every Form 10-K section. Lengths are measured with the
# built-in len, so chunk_size and chunk_overlap are expressed in characters.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=200,
    chunk_size=2000,
)

In [11]:
# Split the item1 text into chunks with the splitter configured above
item1_text_chunks = text_splitter.split_text(item1_text)

In [12]:
# Check what kind of container split_text returned
type(item1_text_chunks)

list

In [13]:
# Count how many chunks item1 was split into
len(item1_text_chunks)

254

In [14]:
# Look at the first chunk to see what a single chunk contains
item1_text_chunks[0]

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

- Let's chunk all sections of the Form 10-K by using the next helper function.
- In this function I selected Items 1, 1A, 7, and 7A which contain the most critical business and financial information that analysts and investors typically focus on first. This represents the "core" of a 10-K filing because:

    - Item 1 gives the fundamental business overview
    - Item 1A identifies key risks
    - Item 7 provides management's financial analysis
    - Item 7A covers market risk exposures

- For testing purposes, I'll also limit the number of chunks in each section to 20

In [32]:
def split_form10k_data_from_file(file, splitter=None, max_chunks_per_item=20):
    """Split the core sections of a Form 10-K JSON file into chunk records.

    Reads the JSON file at ``file``, splits the text stored under the keys
    ``item1``, ``item1a``, ``item7`` and ``item7a``, and returns one record per
    kept chunk containing the chunk text plus identifying metadata.

    Args:
        file: Path to the JSON file. The file name without its extension is
            used as the form id, so a bare file name (no directory part) is
            accepted as well.
        splitter: Object exposing ``split_text(text)`` that returns a list of
            strings. Defaults to the notebook-level ``text_splitter``.
        max_chunks_per_item: Maximum number of chunks kept per section
            (default 20, intended for quick testing). ``None`` keeps them all.

    Returns:
        A list of dicts with the keys ``text``, ``f10kItem``, ``chunkSeqId``,
        ``formId``, ``chunkId``, ``names``, ``cik``, ``cusip6`` and ``source``.

    Raises:
        KeyError: If the JSON lacks one of the section or metadata keys.
    """
    if splitter is None:
        splitter = text_splitter  # splitter defined earlier in the notebook

    # Use a context manager so the file handle is closed after parsing.
    with open(file) as form_file:
        file_as_object = json.load(form_file)

    # The form id only depends on the path, so derive it once up front.
    # os.path also copes with paths that contain no directory separator.
    form_id = os.path.splitext(os.path.basename(file))[0]

    chunks_with_metadata = []  # accumulates one record per kept chunk
    for item in ['item1', 'item1a', 'item7', 'item7a']:  # sections pulled from the JSON
        print(f'Processing {item} from {file}')
        item_text_chunks = splitter.split_text(file_as_object[item])
        kept_chunks = item_text_chunks[:max_chunks_per_item]  # [:None] keeps everything
        for chunk_seq_id, chunk in enumerate(kept_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                # metadata from looping...
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': form_id,  # pulled from the file name
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',  # zero-padded to 4 digits
                # metadata from file...
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        print(f'\tSplit into {len(kept_chunks)} chunks')
    return chunks_with_metadata

In [33]:
# Chunk the NetApp filing and collect the per-chunk metadata records
netApp_form_chunks = split_form10k_data_from_file(netApp_form)

Processing item1 from Data/0000950170-23-027948.json
	Split into 20 chunks
Processing item1a from Data/0000950170-23-027948.json
	Split into 1 chunks
Processing item7 from Data/0000950170-23-027948.json
	Split into 1 chunks
Processing item7a from Data/0000950170-23-027948.json
	Split into 1 chunks


In [34]:
# Sanity-check the container type and how many chunk records were produced
print(type(netApp_form_chunks))
print(len(netApp_form_chunks))

<class 'list'>
23


In [40]:
# item1 contributes records 0-19 (capped at 20 chunks), so index 20 is the
# first record of the next section, item1a
netApp_form_chunks[20]

{'text': '>Item 1A\n\n\n\xa0\n\n\nRisk Factors\n\n\n\xa0\n\n\n14',
 'f10kItem': 'item1a',
 'chunkSeqId': 0,
 'formId': '0000950170-23-027948',
 'chunkId': '0000950170-23-027948-item1a-chunk0000',
 'names': ['Netapp Inc', 'NETAPP INC'],
 'cik': '1002047',
 'cusip6': '64110D',
 'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm'}

### Create graph nodes using the already created text chunks

In [41]:
# Cypher statement that MERGEs a :Chunk node keyed on chunkId. The remaining
# properties are written only when the node is newly created (ON CREATE SET),
# so an already existing node is returned unchanged.
# Expects a $chunkParam map carrying the fields referenced below.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId, 
        mergedChunk.cik = $chunkParam.cik, 
        mergedChunk.cusip6 = $chunkParam.cusip6, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.f10kItem = $chunkParam.f10kItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

- Set up connection to graph instance using LangChain

- Ideally, I would first create a new database, since the default one currently contains the movies data used in the notebook querying_with_graphql.ipynb. However, the free tier does not allow creating a new database or instance, so the new nodes are differentiated from the movie data by their `Chunk` label.

In [43]:
# Open a LangChain Neo4jGraph connection using the URI, credentials and
# database name read from the environment earlier in the notebook
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

- Let's test creating one chunk node for now. You can check on cloud to see that the node was created.

In [44]:
# Smoke test: run the merge query for the first chunk record only
kg.query(merge_chunk_node_query, 
         params={'chunkParam':netApp_form_chunks[0]})

[{'mergedChunk': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'cik': '1002047',
   'cusip6': '64110D',
   'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm',
   'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the 

- Create a uniqueness constraint to avoid duplicate chunks

In [45]:
# Require chunkId to be unique across :Chunk nodes. IF NOT EXISTS lets this
# cell be run again when the constraint is already in place.
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")


[]

- Loop through and create nodes for all chunks
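- This should leave 23 `Chunk` nodes in the graph: the text-splitting function above keeps at most 20 chunks per section, which gives 20 chunks for item1 plus 1 chunk each for item1a, item7, and item7a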

In [60]:
# Issue one merge query per chunk record.
# NOTE: node_count counts the queries issued, not the nodes actually created:
# the query uses MERGE, so a chunk whose chunkId already exists in the graph
# (e.g. the first chunk merged in the smoke test, or any re-run of this cell)
# is matched rather than created.
node_count = 0
for chunk in netApp_form_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query, 
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0000


[#F48D]  _: <CONNECTION> error: Failed to read from defunct connection ResolvedIPv4Address(('34.28.184.63', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): OSError('No data')
Unable to retrieve routing information
Transaction failed and will be retried in 0.9611597986339417s (Unable to retrieve routing information)
[#F2D7]  _: <CONNECTION> error: Failed to read from defunct connection IPv4Address(('si-306463c4-5134.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): OSError('No data')
Transaction failed and will be retried in 2.2297214251220847s (Failed to read from defunct connection IPv4Address(('si-306463c4-5134.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))))


Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0001
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0002
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0003
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0004
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0005
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0006
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0007
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0008
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0009
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0010
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0011
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0012
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0013
Creating `:Chunk` node for chunk ID 0000950170-23-0

In [62]:
# Count the :Chunk nodes currently stored in the graph
kg.query("""
         MATCH (n:Chunk)
         RETURN count(n) as nodeCount
         """)

[{'nodeCount': 23}]

### Create a vector index

In [63]:
# Create the vector index over the textEmbedding property of :Chunk nodes.
# The index name, label and property are written out literally here; they
# match VECTOR_INDEX_NAME, VECTOR_NODE_LABEL and VECTOR_EMBEDDING_PROPERTY.
# NOTE(review): 1536 presumably is the vector length of the OpenAI embedding
# model in use -- confirm if the model or endpoint changes.
kg.query("""
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")

[#F9B6]  _: <CONNECTION> error: Failed to read from defunct connection ResolvedIPv4Address(('34.28.184.63', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): OSError('No data')
Unable to retrieve routing information
Transaction failed and will be retried in 0.9200820382776858s (Unable to retrieve routing information)
[#F9BA]  _: <CONNECTION> error: Failed to read from defunct connection IPv4Address(('si-306463c4-5134.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): OSError('No data')
Transaction failed and will be retried in 2.3258141361502473s (Failed to read from defunct connection IPv4Address(('si-306463c4-5134.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))))


[]

In [65]:
# List the database's indexes to confirm the vector index was created
kg.query("SHOW INDEXES")

[{'id': 9,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 7, 5, 2, 1, 48, 857000000, tzinfo=<UTC>),
  'readCount': 78},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 7, 4, 20, 12, 46, 180000000, tzinfo=<UTC>),
  'readCou

### Calculate embedding vectors for chunks and populate index
- This query calculates the embedding vector and stores it as a property called `textEmbedding` on each `Chunk` node.

In [66]:
# For every :Chunk node that has no textEmbedding yet, encode chunk.text with
# genai.vector.encode (provider "OpenAI") inside the database and store the
# result through db.create.setNodeVectorProperty. The WHERE filter skips
# chunks that already carry an embedding. The API key and endpoint are passed
# as query parameters rather than being written into the Cypher text.
kg.query("""
    MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

[]

- Note that the text embedding property was added to the schema

In [68]:
# Refresh the cached schema first so the printout reflects the new property
kg.refresh_schema()
print(kg.schema)

Node properties:
Movie {title: STRING, released: INTEGER, tagline: STRING, taglineEmbedding: LIST}
Person {name: STRING, born: INTEGER}
Chunk {chunkId: STRING, names: LIST, formId: STRING, cik: STRING, cusip6: STRING, source: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, textEmbedding: LIST}
Relationship properties:
ACTED_IN {roles: LIST}
REVIEWED {summary: STRING, rating: INTEGER}
The relationships:
(:Person)-[:ACTED_IN]->(:Movie)
(:Person)-[:DIRECTED]->(:Movie)
(:Person)-[:PRODUCED]->(:Movie)
(:Person)-[:WROTE]->(:Movie)
(:Person)-[:FOLLOWS]->(:Person)
(:Person)-[:REVIEWED]->(:Movie)
(:Person)-[:WORKS_WITH]->(:Person)


### Use similarity search to find relevant chunks

In [70]:
def neo4j_vector_search(question, top_k=3):
  """Search for chunks similar to a question using the Neo4j vector index.

  The question is embedded inside the database with genai.vector.encode
  (provider "OpenAI") and the index named by VECTOR_INDEX_NAME is queried
  with that embedding.

  Args:
    question: Natural-language question to embed and search for.
    top_k: Number of nearest nodes requested from the index (default 3).

  Returns:
    The rows returned by kg.query; each row carries the keys 'score' and
    'text' named in the RETURN clause.
  """
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
  # Everything variable is passed as a query parameter, never interpolated
  # into the Cypher text.
  search_params = {
    'question': question,
    'openAiApiKey': OPENAI_API_KEY,
    'openAiEndpoint': OPENAI_ENDPOINT,
    'index_name': VECTOR_INDEX_NAME,
    'top_k': top_k,
  }
  return kg.query(vector_search_query, params=search_params)

- Let's test by asking questions about the company

In [71]:
# Run a sample question through the vector search helper
search_results = neo4j_vector_search(
    'In a single sentence, tell me about Netapp.'
)

[#FCD5]  _: <CONNECTION> error: Failed to read from defunct connection ResolvedIPv4Address(('34.28.184.63', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): OSError('No data')
Unable to retrieve routing information
Transaction failed and will be retried in 1.1333921860036371s (Unable to retrieve routing information)
[#FCD6]  _: <CONNECTION> error: Failed to read from defunct connection IPv4Address(('si-306463c4-5134.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))): OSError('No data')
Transaction failed and will be retried in 1.8656578531654806s (Failed to read from defunct connection IPv4Address(('si-306463c4-5134.production-orch-0696.neo4j.io', 7687)) (ResolvedIPv4Address(('34.28.184.63', 7687))))


In [73]:
# Show the first returned hit (similarity score plus chunk text)
print(search_results[0])

{'score': 0.930267333984375, 'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management,

### Set up a LangChain RAG workflow to chat with the form

In [74]:
# Wrap the existing :Chunk nodes and their vector index in a LangChain vector
# store. The database name is passed explicitly so the store talks to the same
# database as the Neo4jGraph connection `kg` instead of relying on a default.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=OpenAIEmbeddings(),
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
    index_name=VECTOR_INDEX_NAME,
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)


In [75]:
# Expose the vector store as a retriever, using as_retriever's default settings
retriever = neo4j_vector_store.as_retriever()

- Next, let's set up a RetrievalQAWithSourcesChain to carry out question answering

In [76]:
# Build the question-answering chain: a ChatOpenAI model at temperature 0,
# the "stuff" chain type, and the retriever above as the document source
chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0), 
    chain_type="stuff", 
    retriever=retriever
)

In [77]:
def prettychain(question: str, width: int = 60) -> None:
    """Ask the QA chain a question and print its answer, word-wrapped.

    Args:
        question: Question passed to the chain under the "question" key.
        width: Maximum line width used when wrapping the answer (default 60).

    Returns:
        None. The wrapped answer is printed, not returned.
    """
    # Use invoke() rather than calling the chain object directly, which
    # LangChain has deprecated; return_only_outputs drops the echoed inputs.
    response = chain.invoke({"question": question}, return_only_outputs=True)
    print(textwrap.fill(response['answer'], width))

- Now let's test the retriever

In [78]:
# Sample question for the chain
question = "What is Netapp's primary business?"

In [79]:
# Send the sample question through the chain and print the wrapped answer
prettychain(question)

NetApp's primary business is enterprise storage and data
management, cloud storage, and cloud operations.


In [80]:
# A factual question whose answer appears in the item1 overview text
prettychain("Where is Netapp headquartered?")

Netapp is headquartered in San Jose, California.


In [81]:
# The prompt itself can carry formatting instructions for the answer
prettychain("""
    Tell me about Netapp. 
    Limit your answer to a single sentence.
""")

NetApp is a global cloud-led, data-centric software company
that provides customers with the freedom to manage
applications and data across hybrid multicloud environments.


- Take a look at the next hallucination and how it's easily avoided in the subsequent cell

In [82]:
# Hallucination demo: the graph only holds NetApp chunks and the prompt has
# no instruction to admit uncertainty, so the recorded answer describes
# NetApp as if it were Apple
prettychain("""
    Tell me about Apple. 
    Limit your answer to a single sentence.
""")

Apple is a global cloud-led, data-centric software company
headquartered in San Jose, California, that provides
customers with the freedom to manage applications and data
across hybrid multicloud environments.


In [83]:
# Same question, now with an explicit instruction to say "I don't know"
# when the model is unsure
prettychain("""
    Tell me about Apple. 
    Limit your answer to a single sentence.
    If you are unsure about the answer, say you don't know.
""")

I don't know.
