In [1]:
from dotenv import load_dotenv

load_dotenv(".env")

import os
import sys
import glob
from pathlib import Path
import pandas as pd
import numpy as np

# An empty-string entry on sys.path is resolved against the current working
# directory at import time, so modules located next to the notebook stay importable.
# NOTE(review): presumably required for the project-local `agent` package that
# is imported further down — confirm.
sys.path.append("")

# Identifier of the OpenAI text-embedding model used by this notebook.
# The commented-out imports around it are disabled legacy `openai` helpers.
# import openai
text_embedding_model = "text-embedding-ada-002"
# from openai.embeddings_utils import get_embedding

In [2]:
# import langchain
## For generating and persisting Embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from langchain.vectorstores import Neo4jVector

# 

In [3]:
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter



## Neo4j Connection

In [4]:
# Name of the Neo4j vector index that is created or re-opened in this cell.
index_name = "documents"
# True  -> re-read the PDF, split it, and hand the chunks to Neo4jVector.from_documents.
# False -> attach to the index that already exists in Neo4j.
rebuild = True


def _neo4j_connection():
    """Return the Neo4j connection keyword arguments read from the environment."""
    return {
        "url": os.environ["NEO4J_URI"],
        "username": os.environ["NEO4J_USERNAME"],
        "password": os.environ["NEO4J_PASSWORD"],
    }


if not rebuild:
    # Re-open the stored index without touching the source PDF.
    index = Neo4jVector.from_existing_index(
        OpenAIEmbeddings(),
        **_neo4j_connection(),
        index_name=index_name,
        text_node_property="text",  # Need to define if it is not default
    )
else:
    # Load the source PDF through PyPDFLoader.
    filepath = "./data/CompaniesAct2013.pdf"
    loader = PyPDFLoader(filepath)
    documents = loader.load()

    # Split into chunks of at most 1100 characters (measured with len) with a
    # 150-character overlap between neighbouring chunks.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1100,
        chunk_overlap=150,
        length_function=len,
        is_separator_regex=False,
    )
    pages = splitter.split_documents(documents)

    print("Number of chunks = ", len(pages))

    # Hand the chunks and the embedding model to Neo4jVector.
    index = Neo4jVector.from_documents(
        pages,
        OpenAIEmbeddings(),
        **_neo4j_connection(),
        index_name=index_name,  # vector by default
        node_label="Chunk",  # Chunk by default
        text_node_property="text",  # text by default
        embedding_node_property="embedding",  # embedding by default
        create_id_index=True,  # True by default
    )

Number of chunks =  1154


In [5]:
from neo4j import GraphDatabase

# (username, password) pair taken from the environment; later cells reuse it.
AUTH = (
    os.environ["NEO4J_USERNAME"],
    os.environ["NEO4J_PASSWORD"],
)
neo4j_uri = os.environ["NEO4J_URI"]

# Open a short-lived driver only to confirm that the server is reachable with
# these credentials; leaving the `with` block closes the driver again.
with GraphDatabase.driver(neo4j_uri, auth=AUTH) as driver:
    driver.verify_connectivity()



## Test the index

In [6]:
from openai import OpenAI

# Free-text question used to probe the vector index. It is deliberately not
# called `input`, which would shadow the builtin for the rest of the session.
question = "How to close a company"
# Number of nearest neighbours requested from the vector index.
TOP_K = 10

# Embed the question with the embedding model configured in the first cell,
# instead of repeating the model name as a second literal.
client = OpenAI()
response = client.embeddings.create(
    input=question,
    model=text_embedding_model,
)
embedding = response.data[0].embedding

# Query the vector index directly through Cypher. The index name, the neighbour
# count and the query vector are all passed as parameters, and the yielded
# columns are returned explicitly.
query = """
CALL db.index.vector.queryNodes($collection, $top_k, $embedding)
YIELD node AS chunks, score
RETURN chunks, score
"""
with GraphDatabase.driver(os.environ["NEO4J_URI"], auth=AUTH) as driver:
    records, summary, keys = driver.execute_query(
        query,
        parameters_={
            "embedding": embedding,
            "collection": index_name,
            "top_k": TOP_K,
        },
        database_="neo4j",
    )

# Sanity check: the procedure is asked for at most TOP_K rows.
assert len(records) <= TOP_K, "vector index returned more rows than requested"
len(records)

10

## Testing the Neo4j Retriever

In [1]:
import os
from agent.retrievers.neo4j_retriever import Neo4jRetriever

# Connection settings for the retriever, read from the environment.
neo4j_conn_params = {
    "username": os.environ["NEO4J_USERNAME"],
    "password": os.environ["NEO4J_PASSWORD"],
    "uri": os.environ["NEO4J_URI"],
}

# Project-local retriever, pointed at the 'documents' collection.
ret = Neo4jRetriever(conn_params=neo4j_conn_params, collection='documents')


In [2]:
# Ask the retriever a sample question, passing `num_docs` = 6 as its options.
question = "How can I close the company?"
retrieval_options = {'num_docs': 6}
records = ret.get_docs(question, retrieval_options)

In [4]:
# Display the metadata attached to the first retrieved record.
first_record = records[0]
first_record.metadata

{'page': 14,
 'source': './data/CompaniesAct2013.pdf',
 'id': 'aaa59050-9c06-11ee-8f1a-bedec676b353'}