In [1]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from utils import geocode_address_string


# Warning control: silence every Python warning for the rest of the kernel
# session. This keeps cell output short, but it also hides deprecation
# notices raised by the imported libraries.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pull connection settings and credentials from the local .env file.
# override=True lets values in .env replace variables already set in the process.
load_dotenv('.env', override=True)
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
# Fall back to 'neo4j' when the variable is unset or empty.
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Global constants: vector index name, plus the node label and the
# properties on that label that the index refers to.
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [3]:
# Graph handle used for every query in this notebook.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [10]:
import csv

# Load every row of the Form 13 CSV into memory; each row becomes a dict
# keyed by the column names in the file's header line.
# newline='' is what the csv module requires of the file object so that
# line breaks embedded inside quoted fields are parsed correctly.
with open('./data/form13.csv', mode='r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    all_form13s = list(csv_reader)

In [4]:
# Re-read the schema from the database, then print it wrapped to 60 columns.
kg.refresh_schema()
wrapped_schema = textwrap.fill(kg.schema, width=60)
print(wrapped_schema)

Node properties: Chunk {chunkId: STRING, names: LIST,
formId: STRING, cik: STRING, cusip6: STRING, source: STRING,
f10kItem: STRING, chunkSeqId: INTEGER, text: STRING,
textEmbedding: LIST} Form {names: LIST, formId: STRING, cik:
STRING, cusip6: STRING, source: STRING} Company {names:
LIST, cusip6: STRING, companyName: STRING, cusip: STRING}
Manager {managerCik: STRING, managerName: STRING,
managerAddress: STRING} Relationship properties: SECTION
{f10kItem: STRING} OWNS_STOCK_IN {reportCalendarOrQuarter:
STRING, value: FLOAT, shares: INTEGER} The relationships:
(:Chunk)-[:NEXT]->(:Chunk) (:Chunk)-[:PART_OF]->(:Form)
(:Form)-[:SECTION]->(:Chunk) (:Company)-[:FILED]->(:Form)
(:Manager)-[:OWNS_STOCK_IN]->(:Company)


In [None]:
# Require that no two Address nodes share the same `location` value.
# IF NOT EXISTS makes this cell safe to run more than once.
unique_address_constraint = """
CREATE CONSTRAINT unique_address 
  IF NOT EXISTS
  FOR (a:Address) 
  REQUIRE a.location IS UNIQUE
"""
kg.query(unique_address_constraint)

# Next: geocode each manager's address and write it to the graph with a parameterized Cypher query.


In [33]:
# Parameterized Cypher that attaches one geocoded address to one manager.
#   $address    - map with `location` (passed to point()), `city`, `state`, `country`
#   $managerCik - value matched against Manager.managerCik
# city/state/country are written only when the Address node is first created.
# All three assignments live in a single comma-separated ON CREATE SET,
# because a separate SET clause placed after ON CREATE SET is a standalone
# clause that runs on every execution, not just on creation.
cypher = """
    MERGE (a:Address {location: point($address.location) })
      ON CREATE
        SET a.city = $address.city,
            a.state = $address.state,
            a.country = $address.country
    WITH a
      MATCH (m:Manager)
        WHERE m.managerCik = $managerCik
    WITH a,m
      MERGE (m)-[:LOCATED_AT]->(a)
"""

In [38]:
# Geocode each manager's address and link the manager to its Address node.
# Rows are de-duplicated on (managerCik, managerAddress) so that a manager
# appearing in several Form 13 rows is geocoded and written only once; the
# query is MERGE-based, so repeating it for the same pair adds nothing new.
# NOTE(review): assumes geocode_address_string returns a map with the
# location/city/state/country keys that the query reads — confirm in utils.
seen_managers = set()
for form13 in all_form13s:
    manager_key = (form13['managerCik'], form13['managerAddress'])
    if manager_key in seen_managers:
        continue
    seen_managers.add(manager_key)
    address = geocode_address_string(form13['managerAddress'])
    kg.query(cypher, params={'address': address, 'managerCik': form13['managerCik']})

In [39]:
# Spot-check: show up to five Address nodes with the Manager located at each.
sample_query = """
  MATCH (a:Address)<-[:LOCATED_AT]-(m:Manager)
  RETURN a,m LIMIT 5
"""
kg.query(sample_query)

[{'a': {'country': 'Canada',
   'city': 'Toronto',
   'location': POINT(-79.3804133 43.6468457),
   'state': 'Ontario'},
  'm': {'managerCik': '1000275',
   'managerAddress': 'ROYAL BANK PLAZA, 200 BAY STREET, TORONTO, A6, M5J2J5',
   'managerName': 'Royal Bank of Canada'}},
 {'a': {'country': 'United States',
   'city': 'Denver',
   'location': POINT(-104.9933932 39.7513059),
   'state': 'Colorado'},
  'm': {'managerCik': '1002784',
   'managerAddress': '1875 Lawrence Street, Suite 300, Denver, CO, 80202-1805',
   'managerName': 'SHELTON CAPITAL MANAGEMENT'}},
 {'a': {'country': 'United States',
   'city': 'Columbus',
   'location': POINT(-82.9923517 39.9593222),
   'state': 'Ohio'},
  'm': {'managerCik': '1007280',
   'managerAddress': '277 E TOWN ST, COLUMBUS, OH, 43215',
   'managerName': 'PUBLIC EMPLOYEES RETIREMENT SYSTEM OF OHIO'}},
 {'a': {'country': 'United States',
   'city': 'Norfolk',
   'location': POINT(-76.2931849 36.84716299999999),
   'state': 'Virginia'},
  'm': {'man

In [40]:
# TODO: add company addresses