In [7]:
import os

# Set up environment variables
# setdefault() only fills in a value when the variable is not already set, so
# real credentials exported before the kernel was started are never replaced
# by the placeholder strings below. Export the real values in the environment
# (or edit the placeholders locally) rather than committing secrets here.
os.environ.setdefault("OPENAI_API_KEY", "sk-proj")
os.environ.setdefault("NEO4J_URI", "uri")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
os.environ.setdefault("NEO4J_PASSWORD", "password")


In [8]:
# Neo4j Graph and Vector setup
from neo4j import GraphDatabase
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain.embeddings import SentenceTransformerEmbeddings

# Handle on the Neo4j database (no explicit arguments are passed here;
# presumably the connection details come from the NEO4J_* environment variables).
graph = Neo4jGraph()

# Sentence-transformer model used to embed text (384-dimensional vectors)
embeddings = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Hybrid index over the "text" property of Document nodes; vectors are stored
# on each node under the "embedding" property.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)

# Create the full-text index over entity ids unless it is already there
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS "
    "FOR (e:__Entity__) ON EACH [e.id]"
)


  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # 384 dimensions
  from tqdm.autonotebook import tqdm, trange





[]

In [9]:
# Data loading and preparation
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch the Wikipedia articles that match the query
raw_documents = WikipediaLoader(query="Elizabeth I").load()

# Chunk only the first three articles into 512-character pieces that
# overlap by 24 characters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=24,
    chunk_size=512,
)
documents = text_splitter.split_documents(raw_documents[:3])




  lis = BeautifulSoup(html).find_all('li')


In [11]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# LLM initialization
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

# Convert documents to graph format
llm_transformer = LLMGraphTransformer(llm=llm)
graph_documents = llm_transformer.convert_to_graph_documents(documents)

# Add documents to the graph (include_source=True also stores the source chunks)
graph.add_graph_documents(graph_documents, baseEntityLabel=True, include_source=True)

# The vector index was initialised in an earlier cell, before the source
# Document nodes written above existed, so those nodes carry no "embedding"
# property yet. Re-running from_existing_graph with the same settings embeds
# every Document node that is still missing one; without this step the vector
# half of the hybrid search cannot match the freshly ingested chunks.
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)

In [14]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from typing import Tuple, List, Optional

# Define entity extraction schema
# This model is handed to `llm.with_structured_output(Entities)` further down in
# this cell, so the LLM's reply is parsed into an `Entities` instance.
# NOTE(review): documented with `#` comments rather than a class docstring,
# because a docstring would presumably be included in the schema sent to the
# model and so change the request - confirm before adding one.
class Entities(BaseModel):
    # Required field (no default): the entity names found in the input text.
    names: List[str] = Field(..., description="Extracted entities from the text.")

# Two-message prompt asking the model to pull entities out of the user's question
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting organization and person entities from the text.",
        ),
        (
            "human",
            "Use the given format to extract information from the following input: {question}",
        ),
    ]
)

# Prompt piped into the LLM, whose output is constrained to the Entities schema
entity_chain = prompt | llm.with_structured_output(Entities)


In [19]:
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars

# Build the Neo4j full-text (Lucene) query string for one entity name
def generate_full_text_query(input: str) -> str:
    """Turn free text into a fuzzy, all-terms-required full-text query.

    The text is passed through ``remove_lucene_chars`` and split on
    whitespace; every resulting word gets a ``~2`` suffix and the words are
    combined with ``AND``.

    Args:
        input: Raw text, e.g. an entity name.

    Returns:
        The query string, or an empty string when no words remain.
    """
    sanitized = remove_lucene_chars(input)
    fuzzy_terms = []
    for token in sanitized.split():
        fuzzy_terms.append(f"{token}~2")
    return " AND ".join(fuzzy_terms)

# Structured data retrieval
def structured_retriever(question: str) -> str:
    """Collect graph relationships for every entity mentioned in the question.

    Entities are extracted from the question with ``entity_chain``. Each one
    is looked up in the ``entity`` full-text index (top 2 matching nodes) and
    the non-MENTIONS relationships of those nodes, in both directions, are
    rendered as ``source - TYPE -> target`` lines (at most 50 per entity).

    Args:
        question: The user's natural-language question.

    Returns:
        All relationship lines joined with newlines, one relationship per
        line; an empty string when nothing was found.
    """
    lines = []
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        full_text_query = generate_full_text_query(entity)
        if not full_text_query:
            # Nothing searchable is left after sanitizing the entity name; an
            # empty query string is rejected by the Lucene parser, so skip it.
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50""",
            {"query": full_text_query},
        )
        # Gather lines across entities and join once at the end, so the last
        # line of one entity is not glued onto the first line of the next.
        lines.extend(el["output"] for el in response)
    return "\n".join(lines)
print(structured_retriever("Who is Elizabeth I?"))



Elizabeth I - QUEEN_OF -> England
Elizabeth I - QUEEN_OF -> Ireland
Elizabeth I - LAST_MONARCH_OF -> House Of Tudor
Elizabeth I - CHILD_OF -> Anne Boleyn
Elizabeth I - CHILD_OF -> Henry VIII
Elizabeth I - CHILD_OF -> Henry VIII of England
Elizabeth I - ESTABLISHED -> Church of England
Elizabeth I - SUCCEEDED_BY -> James VI of Scotland
Elizabeth I - SUCCEEDED_BY -> House of Stuart
Elizabeth I - BORN_IN -> Greenwich Palace
Elizabeth I - SUCCESSION -> James Vi Of Scotland
Elizabeth I - SUBJECT_OF_DEBATE -> Norfolk Conspiracy
Elizabeth I - SUBJECT_OF_DEBATE -> Elizabethan Exclusion
Elizabeth I - AVOIDED -> Order Of Succession
Elizabeth I - CONCERNED_WITH -> Scotland
Elizabeth I - REJECTED -> Catholic Women
Elizabeth I - OUTLIVED -> Edward Vi
Elizabeth I - OUTLIVED -> Mary I
Elizabeth I - OUTLIVED -> Jane Grey
Elizabeth I - OUTLIVED -> Katherine Grey
Elizabeth I - OUTLIVED -> Mary Grey
Elizabeth I - OUTLIVED -> Margaret Clifford
Elizabeth I - SUCCESSOR -> James Vi Of Scotland
Elizabeth I - 

In [16]:
# Combine graph (structured) and vector-index (unstructured) retrieval
def retriever(question: str) -> str:
    """Build the context string that is given to the answering prompt.

    Args:
        question: The (standalone) question to search for.

    Returns:
        A text block containing the output of ``structured_retriever``
        followed by the page contents of the documents returned by
        ``vector_index.similarity_search``, joined with "#Document ".
    """
    graph_context = structured_retriever(question)
    matching_docs = vector_index.similarity_search(question)
    passages = "#Document ".join(doc.page_content for doc in matching_docs)
    return (
        "Structured data:\n"
        f"{graph_context}\n"
        "Unstructured data:\n"
        f"{passages}"
    )


In [17]:
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.runnables import RunnableBranch, RunnableLambda, RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Turn (human, ai) string pairs into chat message objects
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Convert a chat history into an alternating list of messages.

    Args:
        chat_history: Pairs of (human text, AI text), oldest first.

    Returns:
        A flat list with a ``HumanMessage`` followed by an ``AIMessage`` for
        each pair, in the original order.
    """
    messages = []
    for user_turn, assistant_turn in chat_history:
        messages.extend(
            (HumanMessage(content=user_turn), AIMessage(content=assistant_turn))
        )
    return messages

# Question condensing
# Prompt template with two placeholders, {chat_history} and {question}. It asks
# the model to rewrite a follow-up question so that it can be understood
# without the preceding conversation. It is used in the chat-history branch of
# `_search_query` below.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(
    """Given the following conversation and a follow-up question, rephrase the follow-up question to be standalone.
    Chat History:
    {chat_history}
    Follow Up Input: {question}
    Standalone question:"""
)

# Define search query processing
# Picks the text that is sent to the retriever:
#   * when the input carries a non-empty "chat_history", the history is
#     converted to messages and the follow-up question is rewritten into a
#     standalone question by the LLM;
#   * otherwise the "question" value is used unchanged.
_search_query = RunnableBranch(
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(run_name="HasChatHistoryCheck"),
        RunnablePassthrough.assign(chat_history=lambda x: _format_chat_history(x["chat_history"]))
        | CONDENSE_QUESTION_PROMPT
        # Reuse the notebook's configured `llm` instead of creating a second
        # ChatOpenAI client that silently falls back to the library's default model.
        | llm
        | StrOutputParser(),
    ),
    RunnableLambda(lambda x: x["question"]),
)

# Prompt for the final answer: the retrieved context followed by the question
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer:"
)

# Note: this rebinds the name `prompt` that the entity-extraction cell also uses
prompt = ChatPromptTemplate.from_template(template)

# End-to-end question-answering chain. The input is a dict with a "question"
# key and, optionally, a "chat_history" key. The output is the answer text.
chain = (
    RunnableParallel({
        # Search query (condensed when chat history is present) -> combined context
        "context": _search_query | retriever,
        # Pass only the question text to the prompt. RunnablePassthrough() here
        # would forward the whole input dict, so the prompt would render
        # "Question: {'question': '...'}" (plus any chat history) instead of
        # the question itself.
        "question": RunnableLambda(lambda x: x["question"]),
    })
    | prompt
    | llm
    | StrOutputParser()
)


In [18]:
# Example query
# Runs the full chain on a single question and prints the answer text. No
# "chat_history" key is supplied, so the question is used as the search query
# unchanged.
print(chain.invoke({"question": "Which house did Elizabeth I belong to?"}))


  words = [el for el in remove_lucene_chars(input).split() if el]


Elizabeth I belonged to the House of Tudor.


In [20]:
# Second example query; again no chat history is passed, so the question is
# searched exactly as written.
print(chain.invoke({"question": "Who was Elizabeth younger brother?"}))



Elizabeth's younger brother was Edward VI.
