In [8]:
# Dependencies used by this notebook. Uncomment the lines below and run this
# cell once to install them into the kernel's environment.
# %pip install pandas
# %pip install langchain langchain_community
# %pip install neo4j
# %pip install langchain_openai
# %pip install python-dotenv
# %pip install chromadb
# %pip install "numpy<2"

# Using Human-in-the-Loop in Graph RAG

LLMs are generally good at Cypher generation when best practices such as sharing examples and the schema are followed. However, hallucinations and incorrect queries are common with complex graphs. `GraphCypherQAChain` is commonly used for graph RAG. This example shows how to break the Graph RAG pipeline into separate steps so that the user can be asked for guidance on generating a better query.

In [9]:
import os
from getpass import getpass

import pandas as pd
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain_core.messages import AIMessage
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings

In [10]:
# Neo4j connection settings. Values already present in the environment (or in
# a local .env file) win; the URI and username fall back to the local defaults
# used by this example. The password is never stored in the notebook: it is
# read from NEO4J_PASSWORD or, if that is missing, requested interactively.
load_dotenv()

os.environ.setdefault('NEO4J_URI', 'bolt://localhost:7690')
os.environ.setdefault('NEO4J_USERNAME', 'neo4j')
if not os.environ.get('NEO4J_PASSWORD'):
    os.environ['NEO4J_PASSWORD'] = getpass('Neo4j password: ')

# Neo4jGraph() is called without arguments, so it relies on the NEO4J_*
# environment variables prepared above.
graph = Neo4jGraph()

In [11]:
# Load secrets from a local .env file (if present) into the process environment.
load_dotenv()

# Fail early with a readable message. Re-assigning os.getenv(...) into
# os.environ raises an opaque TypeError when the key is missing.
if not os.getenv("AZURE_OPENAI_API_KEY"):
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

# Non-secret Azure OpenAI settings used by this example.
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://sriks-openai.openai.azure.com/"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-05-01-preview"
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"] = "gpt-4o"
os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"] = "text-embedding-ada-002"

# Chat model used for Cypher generation, confidence scoring, the agent and the final answer.
llm = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
)

# Embedding model used by the few-shot example selector.
embeddings = AzureOpenAIEmbeddings(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT"],
)

In [12]:
# Few-shot question -> Cypher pairs shown to the LLM.
# The request id is a string, so it must be quoted in Cypher, and the ROOT
# filter is parenthesised because AND binds tighter than OR (otherwise the
# operationId condition would only apply to the second operand).
examples = [
    {
        "question": "What is the root cause request id 8ff8696695aa73588ac454809741e2ea",
        "query": "MATCH (n)-[r:DEPENDS_ON]->(m) where (n.id <> 'ROOT' OR m.id <> 'ROOT') and r.operationId = '8ff8696695aa73588ac454809741e2ea' RETURN n.id, r.duration, r.operation_Name, m.id ORDER BY r.duration DESC LIMIT 3",
    },
    {
        "question": "what is the longest running operation",
        "query": "MATCH (n)-[r:DEPENDS_ON]->(m) where n.id <> 'ROOT' OR m.id <> 'ROOT' RETURN n.id, r.duration, r.operation_Name, m.id ORDER BY r.duration DESC LIMIT 3",
    },
]

In [13]:
from langchain.chains import GraphCypherQAChain
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.example_selectors import SemanticSimilarityExampleSelector, MaxMarginalRelevanceExampleSelector
from langchain.vectorstores import Chroma
from langchain.chains.llm import LLMChain
from langchain.chains.graph_qa.cypher import construct_schema
from langchain.chains import SimpleSequentialChain, SequentialChain
from langchain.agents import AgentType, initialize_agent, load_tools

# Tooling for the human-intervention step: load the "human" tool for the agent.
tools = load_tools(["human"], llm)

# Refresh the schema held by the graph object before it is used in prompts.
graph.refresh_schema()

# Layout used to render one few-shot example (question followed by its Cypher).
EXAMPLES_PROMPT_TEMPLATE = """   
                Input: {question},
                Output: {query}
            """

example_prompt = PromptTemplate(
    template=EXAMPLES_PROMPT_TEMPLATE,
    input_variables=["question", "query"],
)

# Instruction block placed before the few-shot examples in the Cypher prompt.
# Placeholders: {schema} (graph schema text) and {question} (user question).
cypherPromptTemplate = """
You are an expert Neo4j Developer translating user questions into Cypher to answer questions.
Convert the user's question based on the schema.
Instructions: Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}

Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

Important: In the generated Cypher query, the RETURN statement must explicitly include the property values used in the query's filtering condition, alongside the main information requested from the original question.

Question: {question}
input:
"""

# Pick the single most similar example for each incoming question.
# input_keys restricts the embedded text to the question itself; without it the
# selector would also embed the example queries and, at lookup time, the whole
# schema string, which drowns out the question in the similarity search.
similaritySelector = SemanticSimilarityExampleSelector.from_examples(
    examples=examples,
    embeddings=embeddings,
    vectorstore_cls=Chroma,
    k=1,
    input_keys=["question"],
)

# Final few-shot prompt: instructions, the selected example, then the question.
cypherPrompt = FewShotPromptTemplate(
    example_selector=similaritySelector,
    example_prompt=example_prompt,
    input_variables=["question", "schema"],
    prefix=cypherPromptTemplate,
    suffix="the question is:{question}",
)

# Text-to-Cypher chain. It returns the raw query rows (return_direct=True)
# under the "input" key and keeps the generated Cypher in intermediate_steps
# so the later pipeline stages can review and re-run it.
cypherqachain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    # The keyword is `cypher_prompt`; the previous `cypherPrompt=` spelling is
    # not a from_llm parameter, so the custom few-shot prompt was never used.
    cypher_prompt=cypherPrompt,
    validate_cypher=True,
    use_function_response=True,
    return_intermediate_steps=True,
    return_direct=True,
    output_key="input",
    k=3,
    verbose=True,
)

# Schema text built from the graph's structured schema; read as a global by
# queryExtractor further down.
schema = construct_schema(graph.get_structured_schema, [], [])

In [14]:
# Smoke-test the Cypher chain on its own before wiring it into the full pipeline.
sample_request = {"query": "which nodes has max azure dependencies"}
cypherqachain.invoke(sample_request)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (crn:cloudRoleName)-[r:AZURE_DEPENDS_ON]->(sql:SQL)
RETURN crn.id, COUNT(r) AS azureDependencyCount
ORDER BY azureDependencyCount DESC
LIMIT 1
[0m

[1m> Finished chain.[0m


{'query': 'which nodes has max azure dependencies',
 'input': [{'crn.id': 'people-api', 'azureDependencyCount': 23}],
 'intermediate_steps': [{'query': 'cypher\nMATCH (crn:cloudRoleName)-[r:AZURE_DEPENDS_ON]->(sql:SQL)\nRETURN crn.id, COUNT(r) AS azureDependencyCount\nORDER BY azureDependencyCount DESC\nLIMIT 1\n'}]}

The code below creates a confidence chain that asks the LLM how confident it is in the generated query.

In [49]:
# Prompt asking the LLM to rate the generated query (1-5) given the query
# ({input}), the graph schema ({schema}) and the user's question ({question}).
confidence_prompt_template = """Given the {input}, {schema} and the question from user question:{question} how confident are you in the query on scale of 1-5, answer precisely in JSON format
score: 1, reason: The query is not clear, assumptions: The query is not clear"""

# The template references three placeholders, so all three are declared here;
# "question" was previously missing from input_variables.
confidence_prompt = PromptTemplate(
    template=confidence_prompt_template,
    input_variables=["input", "schema", "question"],
)

# ask llm on confidence of the query
confidenceChain = LLMChain(
    llm=llm,
    prompt=confidence_prompt,
)

In [50]:
# Question intended for the human reviewer.
human_interaction_prompt = """How do you want to proceed?"""

# ReAct-style agent equipped with the "human" tool loaded earlier.
# NOTE(review): `prompt` is not a documented initialize_agent parameter; it is
# forwarded as an extra keyword argument - confirm that it has any effect.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    verbose=True,
    prompt=human_interaction_prompt,
)

In [71]:

# Prompt for the final answer-generation step. Placeholders: {context} (query
# results), {question} (user question), {input} (the Cypher query) and
# {schema} (graph schema text).
QA_GENERATION_TEMPLATE = """
       Task: answer the question you are given based on the context provided.
       Instructions:
        You are an assistant that helps to form nice and human understandable answers. 
        Use the context information provided to generate a well organized and comprehensve answer to the user's question. 
        When the provided information contains multiple elements, structure your answer as a bulleted or numbered list to enhance clarity and readability.
        You must use the information to construct your answer. 
        The provided information is authoritative; do not doubt it or try to use your internal knowledge to correct it. 
        Make the answer sound like a response to the question without mentioning that you based the result on the given information. 
        If there is no information provided, say that the knowledge base returned empty results.

        Here's the information:
        {context}

        Question: {question}
        Answer:
        
        Also explain the assumptions you made while generating the answer given the query used to generate the context and the schema provided.
        query: {input}, schema: {schema}
            """
       
            
# Prompt object for the answer step; it consumes the dict returned by executeCypher.
qaPrompt = PromptTemplate(
    template=QA_GENERATION_TEMPLATE,
    input_variables=["context", "question", "input", "schema"],
)

# LLM chain that turns the query results into the final natural-language answer.
qaPromptChain = LLMChain(prompt=qaPrompt, llm=llm)

def queryExtractor(msg: dict) -> dict:
    """Reshape the Cypher chain's output for the downstream pipeline steps.

    Args:
        msg: Result dict of the Cypher chain. Must contain "query" (the user's
            question) and "intermediate_steps", whose first entry holds the
            generated Cypher under "query".

    Returns:
        A dict with "input" (generated Cypher), "schema" (the module-level
        ``schema`` variable) and "question" (the user's question).
    """
    generated_cypher = msg["intermediate_steps"][0]["query"]
    return {
        "input": generated_cypher,
        "schema": schema,  # module-level global set when the Cypher chain is built
        "question": msg["query"],
    }

def _strip_cypher_tag(query: str) -> str:
    """Remove a leading "cypher" language tag (left over from a code fence), if any."""
    text = query.strip()
    tag = "cypher"
    # Only a standalone leading tag is dropped; "cypher" inside labels, property
    # names or string literals further into the query is left untouched.
    if text.startswith(tag) and (len(text) == len(tag) or text[len(tag)].isspace()):
        text = text[len(tag):].lstrip()
    return text


def executeCypher(msg: dict) -> dict:
    """Run the Cypher query carried in ``msg`` and package the rows for the QA prompt.

    Args:
        msg: Dict with "input" (Cypher text, optionally prefixed by a "cypher"
            tag), "question" and "schema".

    Returns:
        A dict with "context" (rows returned by ``graph.query``) plus the
        original "question", "input" and "schema" values.
    """
    # NOTE(review): this executes msg["input"]; if the human-review agent is
    # expected to hand back a revised query, confirm which key carries it.
    cypher = _strip_cypher_tag(msg["input"])
    rows = graph.query(cypher)
    return {
        "context": rows,
        "question": msg["question"],
        "input": msg["input"],
        "schema": msg["schema"],
    }

# End-to-end pipeline, executed left to right.
final_chain = (
    cypherqachain        # question -> generated Cypher
    | queryExtractor     # pick out the Cypher, question and schema
    | confidenceChain    # LLM rates its confidence in the query
    | agent              # agent with the human tool
    | executeCypher      # run the Cypher against the graph
    | qaPromptChain      # turn the rows into the final answer
)

The execution below waits for the user to confirm that the generated Cypher query matches their expectations. The user can suggest improvements in natural language to generate a better query; once the user confirms, the query is run on the Neo4j instance and the final response is generated.

[Pending]: Ask for user confirmation only if the LLM is not confident. The LLM generates a confidence score based on the query and the shared schema.

In [74]:
# Run the full human-in-the-loop pipeline and print the generated answer.
user_request = {"query": "which nodes has max sql dependencies"}
result = final_chain.invoke(user_request)
print("############ Final Answer ############")
print(result['text'])



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (c:cloudRoleName)-[r:AZURE_DEPENDS_ON]->(s:SQL)
RETURN c.id, COUNT(r) as dependencyCount
ORDER BY dependencyCount DESC
LIMIT 1
[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe question appears to involve a Cypher query for a graph database, specifically querying for a `cloudRoleName` node that has dependencies on `SQL` nodes. The request is to return the ID of the `cloudRoleName` node with the highest number of `AZURE_DEPENDS_ON` relationships, ordered by the count of these relationships in descending order, and limited to one result.

The Cypher query provided seems mostly correct, but to ensure clarity and correctness, let's break down the query:

1. `MATCH (c:cloudRoleName)-[r:AZURE_DEPENDS_ON]->(s:SQL)`: This matches all `cloudRoleName` nodes that have `AZURE_DEPENDS_ON` relationships to `SQL` nodes.
2. `RETURN c.id, COUNT(r) as dependency

Notice the conversation `Observation: do not limit by 1 get all matches` and its influence on the final query generation. <br/>
The final answer begins after the line `############ Final Answer ############`.