## Knowledge Graph Exploration Notebook

This notebook is a brief exploration and tests of several different knowledge graph methods including property graph and knowledge graph index from LlamaIndex, langchain's knowledge graph and Microsoft's GraphRAG implmentation to assess cost, feasibility, ease-of-use, compatibility, performance etc. 

Ultimate conclusion was to use LLamaIndex's KnowledgeGraphIndex or PropertyGraphIndex as GraphRAG was quite expnesive and often timed out for bigger queries and though LangChain worked well it was hard to incorporate with our existing code.

In [3]:
%pip install llama-index
%pip install llama-index-llms-azure-openai -q
%pip install llama-index-graph-stores-nebula -q
%pip install llama-index-llms-openai -q
%pip install llama-index-embeddings-azure-openai -q
%pip install llama-index-vector-stores-neo4jvector
%pip install neo4j
%pip install llama-index llama-index-graph-stores-neo4j openai
%pip install langchain --upgrade
%pip install --upgrade --quiet  langchain-openai
%pip install -U langchain-community


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;4

In [2]:
import os
import sys
import logging
import traceback
import pandas as pd
import nest_asyncio
import asyncio
from datetime import datetime
from dotenv import load_dotenv
from typing import List
from huggingface_hub import InferenceApi
from transformers import pipeline
from IPython.display import Markdown, display

nest_asyncio.apply()
load_dotenv()

# LLamaIndex Imports
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, StorageContext, get_response_synthesizer,PropertyGraphIndex, Document, KnowledgeGraphIndex
from llama_index.core.evaluation import (DatasetGenerator,FaithfulnessEvaluator,RelevancyEvaluator)
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.readers.file import PyMuPDFReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.core.schema import IndexNode, NodeWithScore, Document, QueryBundle
from llama_index.core.extractors import (SummaryExtractor,QuestionsAnsweredExtractor)
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever, KGTableRetriever
from llama_index.core.query_engine import RetrieverQueryEngine, KnowledgeGraphQueryEngine, CitationQueryEngine
from llama_index.core.prompts.base import PromptTemplate, PromptType
from llama_index.graph_stores.neo4j import Neo4jGraphStore
from llama_index.core.base.llms.types import ChatMessage, MessageRole
from llama_index.core.retrievers import BaseRetriever


# Giskard imports
import giskard
from giskard.rag import AgentAnswer, evaluate, RAGReport, KnowledgeBase, generate_testset, QATestset
from giskard.rag.metrics.ragas_metrics import ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision
from giskard.llm import set_llm_model, set_llm_api
from giskard.llm.client import get_default_client
from giskard.llm.embeddings import set_default_embedding, get_default_embedding

def remove_openai_api_key():
    if "OPENAI_API_KEY" in os.environ:
        del os.environ["OPENAI_API_KEY"]

# Need to specify this here otherwise it doesn't work - Giskard Problem (?)
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("GSK_AZURE_OPENAI_API_KEY")
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("GSK_AZURE_OPENAI_ENDPOINT")
os.environ["AZURE_API_VERSION"] = os.getenv("AZURE_API_VERSION")
os.environ["GSK_LLM_API"] = "azure"
os.environ["GSK_LLM_MODEL"] = "gpt-4o-mini"
set_llm_api("azure")
set_llm_model('gpt-4o-mini')

AZURE_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_DEPLOYMENT_NAME")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")


embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
Settings.embed_model = embed_model

# Setup LLM
llm_gpt4o = AzureOpenAI(
    deployment_name="gpt-4o-mini",
    temperature=0, 
    api_key=AZURE_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_API_VERSION
)

llm_gpt35 = AzureOpenAI(
    deployment_name="gpt35",
    temperature=0, 
    api_key=AZURE_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_API_VERSION
)

# Setup LLM
llm_gpt4o_ = AzureOpenAI(
    deployment_name="gpt4o",
    temperature=0,
    api_key=os.getenv("GPT4O_API_KEY"),
    azure_endpoint=os.getenv("GPT4O_AZURE_ENDPOINT"),
    api_version=os.getenv("GPT4O_API_VERSION")
)

Settings.llm = llm_gpt35

# Verify LLM setup
client = get_default_client()

#print("Client base URL:", client._client._base_url)
#print("Client API key:", client._client.api_key)
#print("Client API version:", client._client._api_version)
#print("Client model:", client.model)

assert client._client._base_url == f'{os.environ["AZURE_OPENAI_ENDPOINT"]}/openai/'
assert client._client.api_key == os.environ["AZURE_OPENAI_API_KEY"]
assert client._client._api_version == os.environ["OPENAI_API_VERSION"]

url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")
database = os.getenv("NEO4J_DATABASE")

In [None]:
from llama_index.core.prompts.base import PromptTemplate, PromptType
from llama_index.core.query_engine import KnowledgeGraphQueryEngine
from llama_index.core import StorageContext
from llama_index.graph_stores.neo4j import Neo4jGraphStore


graph_store = Neo4jPropertyGraphStore(
    username = username,
    password = password,
    url = url,
    database=database
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)


from llama_index.core.prompts.base import PromptTemplate, PromptType


DEFAULT_KG_RESPONSE_ANSWER_PROMPT_TMPL = """
The original question is given below.
This question has been translated into a Graph Database query.
Both the Graph query and the response are given below.
Given the Graph Query response, synthesise a response to the original question.

Original question: {query_str}
Graph query: {kg_query_str}
Graph response: {kg_response_str}
Response:
"""

DEFAULT_KG_RESPONSE_ANSWER_PROMPT = PromptTemplate(
    DEFAULT_KG_RESPONSE_ANSWER_PROMPT_TMPL,
    prompt_type=PromptType.QUESTION_ANSWER,
)

# Prompt
DEFAULT_NEO4J_NL2CYPHER_PROMPT_TMPL = (
    "Task:Generate Cypher statement to query a graph database.\n"
    "Instructions:\n"
    "Use only the provided relationship types and properties in the schema.\n"
    "Do not use any other relationship types or properties that are not provided.\n"
    "Schema:\n"
    "{schema}\n"
    "Note: Do not include any explanations or apologies in your responses.\n"
    "Do not respond to any questions that might ask anything else than for you "
    "to construct a Cypher statement. \n"
    "Do not include any text except the generated Cypher statement.\n"
    "Do not include any backticks, headings or anything, just return the query"
    "\n"
    "The question is:\n"
    "{query_str}\n"
)

DEFAULT_NEO4J_NL2CYPHER_PROMPT = PromptTemplate(
    DEFAULT_NEO4J_NL2CYPHER_PROMPT_TMPL,
    prompt_type=PromptType.TEXT_TO_GRAPH_QUERY,
)

graph_query_synthesis_prompt = DEFAULT_NEO4J_NL2CYPHER_PROMPT

# Initialize the KnowledgeGraphQueryEngine with the required prompts
query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    llm=llm_gpt4o,
    graph_query_synthesis_prompt=graph_query_synthesis_prompt,
    graph_response_answer_prompt=DEFAULT_KG_RESPONSE_ANSWER_PROMPT,  # or your custom prompt
    verbose=True,
    refresh_schema=False
)

response = query_engine.query(
    "Tell me about LL144?",
)

In [None]:
graph_store = Neo4jPropertyGraphStore(
    username = username,
    password = password,
    url = url,
    database=database
)

index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm_gpt4o,
    embed_model=embed_model
)

nodes = index.as_retriever(include_text=False).retrieve("Bias audit")
print(nodes[0].text)

In [None]:


from llama_index.core.indices.property_graph import TextToCypherRetriever

DEFAULT_RESPONSE_TEMPLATE = (
"Generated Cypher query:\n{query}\n\n" "Cypher Response:\n{response}"
)
DEFAULT_ALLOWED_FIELDS = ["text", "label", "type"]


index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm_gpt4o,
    embed_model=embed_model,
)
DEFAULT_TEXT_TO_CYPHER_TEMPLATE = index.property_graph_store.text_to_cypher_template,


cypher_retriever = TextToCypherRetriever(
    index.property_graph_store,
    # customize the LLM, defaults to Settings.llm
    llm=llm_gpt4o,
    # customize the text-to-cypher template.
    # Requires `schema` and `question` template args
    text_to_cypher_template=index.property_graph_store.text_to_cypher_template,
    # customize how the cypher result is inserted into
    # a text node. Requires `query` and `response` template args
    response_template=DEFAULT_RESPONSE_TEMPLATE,
    # an optional callable that can clean/verify generated cypher
    cypher_validator=None,
    # allowed fields in the resulting
    allowed_output_field=DEFAULT_ALLOWED_FIELDS
)
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(
    cypher_retriever,
)

response = query_engine.query(
    "Tell me about a bias audit?",
)
print(str(response))


In [None]:
graph_store = Neo4jPropertyGraphStore(
    username = username,
    password = password,
    url = url,
    database=database
)
from llama_index.core.prompts.base import PromptTemplate, PromptType

# Define your custom prompt templates or use existing ones
graph_query_synthesis_prompt = PromptTemplate(
    "Answer the question",
    prompt_type=PromptType.QUESTION_ANSWER,
)

from llama_index.core.query_engine import KnowledgeGraphQueryEngine
from llama_index.core import StorageContext

storage_context = StorageContext.from_defaults(graph_store=graph_store)

query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    llm=llm_gpt4o,
    #graph_query_synthesis_prompt=graph_query_synthesis_prompt,
    #graph_response_answer_prompt=DEFAULT_KG_RESPONSE_ANSWER_PROMPT,
    verbose=True
)

response = query_engine.query(
    "Tell me about a bias audit?",
)

# Define retriever
retriever = index.as_retriever(
    include_text=False,  # include source text in returned nodes, default True
)
results = retriever.retrieve("What is a Bias Audit?")
for record in results:
    print(record.text)

print(results)

Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.32 (from langchain)
  Downloading langchain_core-0.2.32-py3-none-any.whl.metadata (6.2 kB)
Downloading langchain-0.2.14-py3-none-any.whl (997 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m997.8/997.8 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading langchain_core-0.2.32-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.8/389.8 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-core, langchain
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 0.2.23
    Uninstalling langchain-core-0.2.23:
      Successfully uninstalled langchain-core-0.2.23
  Attempting uninstall: langchain
    Found existing installation: langchain 0.2.11
    Uninstalling langchain-0.2.11:
      Successfully uninstalled langchain-0.

In [4]:
from langchain_openai import AzureOpenAI, AzureChatOpenAI
llm_langchain = AzureChatOpenAI(deployment_name="gpt35", azure_endpoint=AZURE_ENDPOINT)

from langchain_openai import AzureOpenAIEmbeddings
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain
from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)
graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=llm_langchain,
    qa_llm=llm_langchain,
    validate_cypher=True, # Validate relationship directions
    verbose=True
)
     

In [17]:
cypher_res=cypher_chain.invoke({"query": "What do you know about LL144"})



[1m> Entering new GraphCypherQAChain chain...[0m


BadRequestError: Error code: 400 - {'error': {'code': 'OperationNotSupported', 'message': 'The completion operation does not work with the specified model, gpt-35-turbo. Please choose different model and try again. You can learn more about which models can be used with each operation here: https://go.microsoft.com/fwlink/?linkid=2197993.'}}

In [5]:
%pip install --upgrade --quiet  langchain sentence_transformers
%pip install langchain-huggingface


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

import os
from langchain_community.vectorstores.neo4j_vector import Neo4jVector
from langchain.document_loaders import WikipediaLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
     
embed_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

retrieval_query="""
 MATCH (node)-[:PART_OF]->(d:Document)
WITH d, apoc.text.join(collect(node.text),"\n----\n") as text, avg(score) as score
RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName)} as metadata
"""

neo_db=Neo4jVector.from_existing_index(
        embedding=embed_model,  
        url=url,
        username=username,
        password=password,
        database="neo4j",
        index_name="vector",
        retrieval_query=retrieval_query,

    )

qa = RetrievalQA.from_chain_type(
    llm=llm_langchain, chain_type="stuff", retriever=neo_db.as_retriever(), return_source_documents=True
)

  from tqdm.autonotebook import tqdm, trange


In [8]:
####
qa.invoke("what is a bias audit and why do i need one?")



{'query': 'what is a bias audit and why do i need one?',
 'result': "A bias audit is an impartial evaluation conducted by an independent auditor. In the context of automated employment decision tools (AEDTs), a bias audit specifically refers to the testing of an AEDT to assess its impact on different categories of individuals, such as race/ethnicity and sex.\n\nYou would need a bias audit if you are an employer or an employment agency using an AEDT to screen candidates or employees for employment decisions. It is required by law (Local Law 144 of 2021) in New York City. The purpose of a bias audit is to identify any potential biases or disparities in the AEDT's decision-making process that may disproportionately affect certain groups of individuals. By conducting a bias audit, you can ensure that the AEDT is fair and does not have a discriminatory impact on protected categories of individuals.",
 'source_documents': [Document(metadata={'source': 'LL144_Definitions.pdf'}, page_content='

In [28]:
## LLamaIndex version of the above code
from llama_index.core import Settings
import os
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.azure_openai import AzureOpenAI

embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
retrieval_query="""
 MATCH (node)-[:PART_OF]->(d:Document)
WITH d, apoc.text.join(collect(node.text),"\n----\n") as text, avg(score) as score
RETURN text, score, {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName)} as metadata
"""


neo_db = Neo4jVectorStore(
    embedding=embed_model,  
    embedding_dimension=384,
    url=url,
    username=username,
    password=password,
    database="neo4j",
    index_name="vector",
    retrieval_query=retrieval_query,
    text_node_property="text",
    #keyword_index_name="Term"
)

#index = VectorStoreIndex(
#    vector_store=neo_db,
#    embed_model=embed_model
#)

index = VectorStoreIndex.from_vector_store(neo_db)
retriever = index.as_retriever()

llm = AzureOpenAI(
    deployment_name="gpt35",
    temperature=0, 
    api_key=AZURE_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_API_VERSION
)

Settings.llm = llm

#query_engine = RetrieverQueryEngine(
#    retriever=retriever,
#)

query_engine = index.as_query_engine()

response = query_engine.query("What is a bias audit?")

Failed to write data to connection ResolvedIPv4Address(('35.241.237.34', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
Failed to write data to connection IPv4Address(('2542e8be.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
Failed to write data to connection ResolvedIPv4Address(('35.241.237.34', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
Failed to write data to connection IPv4Address(('2542e8be.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
Failed to write data to connection ResolvedIPv4Address(('35.241.237.34', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
Failed to write data to connection IPv4Address(('2542e8be.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
Failed to write data to connection ResolvedIPv4Address(('35.241.237.34', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
Failed to write data to connection IPv4Address(('2542e8be.databases.neo4j.io', 7687)) (Resolv

NameError: name 'source' is not defined

#### Try use LLamaIndex to create KG as well

In [4]:
# Evaluate with Giskard
loader = PyMuPDFReader()
#file_extractor = {".pdf": loader}
documents1 = loader.load(file_path="../legal_data/LL144/LL144.pdf")
documents2 = loader.load(file_path="../legal_data/LL144/LL144_Definitions.pdf")
documents = documents1 + documents2

splitter = SentenceSplitter(chunk_size=512)

In [5]:
graph_store = Neo4jGraphStore(
    username=username,
    password=password,
    url=url,
    database=database,
)

storage_context = StorageContext.from_defaults(graph_store=graph_store)

Settings.llm = llm_gpt4o_
Settings.embed_model=embed_model

graph_index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    max_triplets_per_chunk=5,
    llm = llm_gpt4o_,
    embed_model=embed_model,
    include_embeddings=True,
    transformations=[splitter]
)

In [6]:
query_engine = graph_index.as_query_engine(include_text=True, response_mode="tree_summarize")
response = query_engine.query("Tell me more about bias audit")

In [7]:
print(response)

A bias audit is a process used to evaluate automated employment decision tools (AEDTs) to ensure they do not unfairly discriminate against individuals based on categories such as race or gender. This involves calculating metrics like the impact ratio, which compares the selection rate for a specific category to the selection rate of the most selected category. The audit also examines the selection rate, which is the rate at which individuals in a category are chosen to advance in the hiring process, and the scoring rate, which is the rate at which individuals in a category receive a score above the sample’s median score. The primary goal of a bias audit is to identify and mitigate any biases that may arise from the use of these automated tools, ensuring fair treatment across different demographic groups.


In [8]:
vector_index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
    transformations=[splitter]
)

In [9]:


class CustomRetriever(BaseRetriever):
    """Custom retriever that performs both Vector search and Knowledge Graph search"""
    
    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        kg_retriever: KGTableRetriever,
        mode: str = "OR"
    ) -> None:
        """Init params."""
        
        self._vector_retriever = vector_retriever
        self._kg_retriever = kg_retriever
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")
        self._mode = mode
        
    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]: 
        """Retrieve nodes given query."""
        
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        kg_nodes = self._kg_retriever.retrieve(query_bundle)
        print(f"vector_nodes: {vector_nodes}")
        vector_ids = {n.node.id_ for n in vector_nodes}
        kg_ids = {n.node.id_ for n in kg_nodes}
        
        combined_dict = {n.node.id_: n for n in vector_nodes}
        combined_dict.update({n.node.id_: n for n in kg_nodes})
        
        if self._mode == "AND":
            retrieve_ids = vector_ids.intersection(kg_ids)
        else:
            retrieve_ids = vector_ids.union(kg_ids)

        retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
        return retrieve_nodes


In [10]:
from llama_index.core.schema import QueryBundle, NodeWithScore, TextNode
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever, KGTableRetriever
from transformers import pipeline
from typing import List, Optional
import asyncio
from llama_index.core.postprocessor import SentenceTransformerRerank

class CustomRetrieverWithQueryRewriting(BaseRetriever):
    """Custom retriever that performs query rewriting, Vector search, BM25 search, and Knowledge Graph search."""
    
    def __init__(
        self,
        llm,  # LLM for query generation
        vector_retriever: Optional[VectorIndexRetriever] = None,
        bm25_retriever: Optional[BaseRetriever] = None,
        kg_retriever: Optional[KGTableRetriever] = None,
        mode: str = "OR",
        rewriter: bool = True,
        classifier_model: Optional[str] = None,  # Optional classifier model
        device: str = 'mps',  # Set to 'mps' as the default device
        reranker_model_name: Optional[str] = None,  # Model name for SentenceTransformerRerank
        verbose: bool = False  # Verbose flag
    ) -> None:
        """Init params."""
        self._vector_retriever = vector_retriever
        self._bm25_retriever = bm25_retriever
        self._kg_retriever = kg_retriever
        self._llm = llm
        self._rewriter = rewriter
        self._mode = mode
        self._reranker_model_name = reranker_model_name  # Store the model name for the reranker
        self._reranker = None  # Initialize reranker as None initially
        self.verbose = verbose  # Set verbose flag

        # Initialize the classifier if provided
        self.classifier = None
        if classifier_model:
            self.classifier = pipeline("text-classification", model=classifier_model, device=device)

        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")

    def classify_query_and_get_topk(self, query: str) -> int:
        """Classify the query and determine the top-k value based on the classification."""
        if self.classifier:
            classification = self.classifier(query)[0]
            label = int(classification['label'].split('_')[-1])
            if self.verbose:
                print(f"Query Classification: {classification['label']} with score {classification['score']}")
            if label == 0:
                top_k = 5
            elif label == 1:
                top_k = 7
            elif label == 2:
                top_k = 7
            if self.verbose:
                print(f"Selected top-k for the query: {top_k}")
            return top_k
        # Default top-k if no classifier is used
        top_k = 5
        if self.verbose:
            print(f"Classifier not provided. Default top-k: {top_k}")
        return top_k

    def classify_query(self, query_str: str) -> str:
        """Classify the query into one of the predefined categories using LLM."""
        classification_prompt = (
            f"Classify the following query into one of the following categories: '5-300. Definitions', "
            f"'5-301 Bias Audit', '5-302 Data Requirements', '§ 5-303 Published Results', '§ 5-304 Notice to Candidates and Employees'. "
            f"If it doesn't fit into any category, respond with 'None'. Return the classification, do not output absolutely anything else. Query: '{query_str}'"
        )
        response = self._llm.complete(classification_prompt)
        category = response.text.strip()
        return category if category in [
            '5-300. Definitions', '5-301 Bias Audit', 
            '5-302 Data Requirements', '§ 5-303 Published Results', 
            '§ 5-304 Notice to Candidates and Employees'
        ] else None

    def generate_queries(self, query_str: str, category: str, num_queries: int = 3) -> List[str]:
        """Generate query variations using the LLM, taking into account the category if applicable."""
        
        query_gen_prompt_str = (
            f"You are an expert at distilling a user question into sub-questions that can be used to fully answer the original query. "
            f"First, identify the key words from the original question below: \n"
            f"{query_str}"
            f"Generate {num_queries} sub-queries that cover the different aspects needed to fully address the user's query.\n\n"
            f"Here is an example: \n"
            f"Original Question: What does test data mean and what do I need to know about it?"
            f"Output:"
            f"definition of 'test data'\n"
            f"test data requirements and conditions for a bias audit\n"
            f"examples of the use of test data in a bias audit\n\n"
            f"Output the rewritten sub-queries, one on each line, do not output absolutely anything else"
        )

        query_gen_prompt = PromptTemplate(query_gen_prompt_str)
        fmt_prompt = query_gen_prompt.format(
            num_queries=num_queries, query_str=query_str,
        )

        response = self._llm.complete(fmt_prompt)
        queries = response.text.split("\n")

        # Remove empty strings from the generated queries
        queries = [query.strip() for query in queries if query.strip()]
        
        # Add the category-specific query if the category is available
        if category:
            category_query = f"{category}"
            queries.append(category_query)

        return queries

    
    async def run_queries(self, queries: List[str], retrievers: List[BaseRetriever]) -> dict:
        """Run queries against retrievers."""
        tasks = []
        for query in queries:
            for i, retriever in enumerate(retrievers):
                tasks.append(retriever.aretrieve(query))

        task_results = await asyncio.gather(*tasks)

        results_dict = {}
        for i, (query, query_result) in enumerate(zip(queries, task_results)):
            results_dict[(query, i)] = query_result
        return results_dict

    def fuse_vector_and_bm25_results(self, results_dict, similarity_top_k: int) -> List[NodeWithScore]:
        """Fuse results from Vector and BM25 retrievers."""
        k = 60.0  # `k` is a parameter used to control the impact of outlier rankings.
        fused_scores = {}
        text_to_node = {}

        # Compute reciprocal rank scores for BM25 and Vector retrievers
        for nodes_with_scores in results_dict.values():
            for rank, node_with_score in enumerate(
                sorted(nodes_with_scores, key=lambda x: x.score or 0.0, reverse=True)
            ):
                text = node_with_score.node.get_content()
                text_to_node[text] = node_with_score
                if text not in fused_scores:
                    fused_scores[text] = 0.0
                fused_scores[text] += 1.0 / (rank + k)

        # Sort results by combined scores
        reranked_results = dict(sorted(fused_scores.items(), key=lambda x: x[1], reverse=True))

        # Adjust node scores and prepare final results
        reranked_nodes: List[NodeWithScore] = []
        for text, score in reranked_results.items():
            if text in text_to_node:
                node = text_to_node[text]
                node.score = score
                reranked_nodes.append(node)
            else:
                if self.verbose:
                    print(f"Warning: Text not found in `text_to_node`: {text}")

        return reranked_nodes[:similarity_top_k]

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""
        # Classify the query to determine its category and top-k value
        if self._rewriter:
            category = self.classify_query(query_bundle.query_str)
        if self.verbose:
            print(f"Classified Category: {category}")
        top_k = self.classify_query_and_get_topk(query_bundle.query_str)

        # Initialize the reranker with the correct top_k value
        if self._reranker_model_name:
            self._reranker = SentenceTransformerRerank(model=self._reranker_model_name, top_n=top_k)
            if self.verbose:
                print(f"Initialized reranker with top_n: {top_k}")

        # Determine the number of query rewrites based on classification
        if top_k == 5:
            num_queries = 3
        elif top_k == 7:
            num_queries = 5
        else:
            num_queries = 7

        # Generate query variations if rewriter is True
        if self._rewriter:
            queries = self.generate_queries(query_bundle.query_str, category, num_queries=num_queries)
            if self.verbose:
                print(f"Generated Queries: {queries}")
        else:
            queries = [query_bundle.query_str]

        # Prepare the list of active retrievers
        active_retrievers = []
        if self._vector_retriever:
            active_retrievers.append(self._vector_retriever)
        if self._bm25_retriever:
            active_retrievers.append(self._bm25_retriever)

        # If no active retrievers (BM25 or Vector), handle KG only case
        if not active_retrievers and not self._kg_retriever:
            raise ValueError("No active retriever provided!")

        results = {}
        # Run the queries asynchronously for active retrievers if any
        if active_retrievers:
            results = asyncio.run(self.run_queries(queries, active_retrievers))
            if self.verbose:
                print(f"Fusion Results: {results}")

        # Fuse the results from active retrievers (BM25/Vector)
        final_results = self.fuse_vector_and_bm25_results(results, similarity_top_k=top_k)

        # Combine with KG nodes according to the mode ("AND" or "OR")
        if self._kg_retriever:
            kg_nodes = self._kg_retriever.retrieve(query_bundle)
            if self.verbose:
                print(f"KG Retrieved Nodes: {kg_nodes}")

            vector_ids = {n.node.id_ for n in final_results}
            kg_ids = {n.node.id_ for n in kg_nodes}

            combined_dict = {n.node.id_: n for n in final_results}
            combined_dict.update({n.node.id_: n for n in kg_nodes})

            if self._mode == "AND":
                retrieve_ids = vector_ids.intersection(kg_ids)
            else:
                retrieve_ids = vector_ids.union(kg_ids)

            final_results = [combined_dict[rid] for rid in retrieve_ids]

        # Apply reranker if available
        if self._reranker:
            final_results = self._reranker.postprocess_nodes(final_results, query_bundle)
            if self.verbose:
                print(f"Reranked Results: {final_results}")
        else:
            final_results = final_results[:top_k]

        # Remove duplicates if rewriter is used
        if self._rewriter:
            unique_nodes = {}
            for node in final_results:
                content = node.node.get_content()
                if content not in unique_nodes:
                    unique_nodes[content] = node
            final_results = list(unique_nodes.values())

        if self.verbose:
            print(f"Final Results: {final_results}")
        return final_results


In [13]:
#question = "In the context of a bias audit, what is the definition of 'test data' and when can it be used?"
question = "As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?"
question = "According to the document, what is the definition of 'Candidate for Employment'?"
question = "As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?"

In [14]:
custom_retriever = CustomRetriever(
    vector_retriever=vector_retriever,
    kg_retriever=kg_retriever
)
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")
query_engine = RetrieverQueryEngine(
    retriever=custom_retriever,#,
    response_synthesizer=response_synthesizer,
)

# Define a function to query each engine and display the response
def query_and_display(engine, engine_name):
    response = engine.query(question)
    display(Markdown(f"### {engine_name}"))
    display(Markdown(f"{response}"))
    
query_and_display(query_engine, "custom rewriter query engine")


vector_nodes: [NodeWithScore(node=TextNode(id_='d2946b29-a019-4cbf-a428-a65064bdcd87', embedding=None, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '7'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='10227b3a-a0a4-43a4-bc32-21690e2e022f', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '7'}, hash='94ed1f43d65da0e31e063e7e9d9db7e5a6f4bc82bcf761a11343cc8ea51a5fc3'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='cf4822a8-e795-4a64-bb48-1d031eedbeaf', node_type=<ObjectType.TEXT: '1'>, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '7'}, hash='06fd47c85d4b2d109c14cf277a0ce7d48bb162d6e91fdc4eeb7df9f8d60950c2')}, text='(d) Notwithstanding the requirements of paragraphs (2) and (3) of subdivision (b) and paragraphs (3) and \n(4) of subdivisio

### custom rewriter query engine

A category can be excluded from impact ratio calculations if it represents less than 2% of the data being used for the bias audit.

In [15]:
# create custom retriever
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10)
kg_retriever = KGTableRetriever(index=graph_index, retriever_mode='keyword', include_text=False, max_keywords_per_query=4, max_knowledge_sequence=2)
bm25_retriever = BM25Retriever.from_defaults(
    docstore=graph_index.docstore, similarity_top_k=10
)
from llama_index.core.postprocessor import SentenceTransformerRerank
reranker = SentenceTransformerRerank(model="mixedbread-ai/mxbai-rerank-base-v1", top_n=3)

# Define the custom retriever with query rewriting
custom_retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-3",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=True,
    reranker_model_name="mixedbread-ai/mxbai-rerank-base-v1",
    verbose=True
)

# Define the query engine using the custom retriever
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")
query_engine = RetrieverQueryEngine(
    retriever=custom_retriever,#,
    response_synthesizer=response_synthesizer,
)

# Define a function to query each engine and display the response
def query_and_display(engine, engine_name):
    response = engine.query(question)
    display(Markdown(f"### {engine_name}"))
    display(Markdown(f"{response}"))
    
query_and_display(query_engine, "custom rewriter query engine")

Classified Category: 5-301 Bias Audit
Query Classification: LABEL_0 with score 0.996429979801178
Selected top-k for the query: 5
Initialized reranker with top_n: 5
Generated Queries: ['Sub-Query 1: What is the definition of "impact ratio" in a bias audit?', 'Sub-Query 2: What are the criteria for excluding a category from impact ratio calculations in a bias audit?', 'Sub-Query 3: Can you provide examples of how categories are excluded from impact ratio calculations in a bias audit?', '5-301 Bias Audit']
Fusion Results: {('Sub-Query 1: What is the definition of "impact ratio" in a bias audit?', 0): [NodeWithScore(node=TextNode(id_='5580835d-3226-4ec6-880c-8f12ac8dc20e', embedding=None, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '4'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d6461c19-3cc8-4d79-88ca-597df7255b54', node_type=<ObjectType.DOCUMENT: '4'>, me

### custom rewriter query engine

A category can be excluded from impact ratio calculations if it represents less than 2% of the data being used for the bias audit.

In [95]:
chat_engine = vector_index.as_chat_engine(chat_mode="context")
print(chat_engine.chat("what is a impact ratio"))

An impact ratio is a measure used in a bias audit of an Automated Employment Decision Tool (AEDT) to assess the tool's potential impact on different categories of candidates. It is calculated by dividing the selection rate or scoring rate for a particular category by the selection rate or scoring rate of the most selected or highest scoring category. The resulting ratio indicates the relative impact of the AEDT on different categories of candidates.


In [100]:
test_retriever = graph_index.as_retriever("hybrid")

retrieved_nodes_test = test_retriever.retrieve("what is an impact ratio?")

for node in retrieved_nodes_test:
    print(node.text)

4 
 
iii. 
at any point during the bias audit, has a direct financial interest or a material indirect 
financial interest in an employer or employment agency that seeks to use or continue to 
use the  AEDT or in a vendor that developed or distributed the AEDT. 
 
Impact Ratio. “Impact ratio” means either (1) the selection rate for a category divided by the selection 
rate of the most selected category or (2) the scoring rate for a category divided by the scoring rate for the 
highest scoring category.  
 
 
 
    _________selection rate for a category________  
Impact Ratio =        selection rate of the most selected category  
 
OR 
 
  
 
     ______scoring rate for a category__________  
Impact Ratio =        scoring rate of the highest scoring category 
 
 
Machine learning, statistical modeling, data analytics, or artificial intelligence. “Machine learning, 
statistical modeling, data analytics, or artificial intelligence” means a group of mathematical, computer-
based techniques

In [49]:
cretrieved_nodes = custom_retriever.retrieve(question)

for node in cretrieved_nodes:
    print('='*100)
    print(node)

Classified Category: 5-301 Bias Audit
Query Classification: LABEL_0 with score 0.996429979801178
Selected top-k for the query: 5
Generated Queries: ['Sub-Query 1: What is the definition of "impact ratio" in a bias audit?', 'Sub-Query 2: What are the criteria for excluding a category from impact ratio calculations in a bias audit?', 'Sub-Query 3: Can you provide examples of how categories are excluded from impact ratio calculations in a bias audit?', '5-301 Bias Audit']
Fusion Results: {('Sub-Query 1: What is the definition of "impact ratio" in a bias audit?', 0): [NodeWithScore(node=TextNode(id_='0e5b18cb-21c3-45b4-a39d-0e1ae538c384', embedding=None, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '4'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='96ea126a-43ca-41f2-8d76-3e56478aace1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 10, 'file_pa

In [423]:
kg_retriever = KGTableRetriever(index=graph_index, retriever_mode='hybrid', include_text=True, similarity_top_k=5)

test_nodes = kg_retriever.retrieve("what is bias audit")

print(test_nodes[0].text)

(a) Historical Data. A bias audit conducted pursuant to section 5-301 of this Chapter must use historical 
data of the AEDT. The historical data used to conduct a bias audit may be from one or more employers 
or employment agencies that use the AEDT. However, an individual employer or employment agency 
may rely on a bias audit of an AEDT that uses the historical data of other employers or employment 
agencies only in the following circumstances: if such employer or employment agency provided historical 
data from its own use of the AEDT to the independent auditor conducting the bias audit or if such 
employer or employment agency has never used the AEDT. 
(b) Test Data. Notwithstanding the requirements of subdivision (a) of this section, an employer or 
employment agency may rely on a bias audit that uses test data if insufficient historical data is available 
to conduct a statistically significant bias audit. If a bias audit uses test data, the summary of results of the 
bias audit m

In [23]:
kg_retriever = KGTableRetriever(index=graph_index, retriever_mode='keyword', include_text=True, similarity_top_k=10)


In [370]:
from IPython.display import display, Markdown
from llama_index.core.query_engine import RetrieverQueryEngine, KnowledgeGraphQueryEngine
from llama_index.core.retrievers import KnowledgeGraphRAGRetriever
from llama_index.core import get_response_synthesizer, StorageContext
from transformers import pipeline

question = "As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?"

# Assuming the llm_gpt35 is already defined
Settings.llm = llm_gpt35

# Create storage context (assuming graph_store is defined)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# Create custom retriever with the option to use a classifier
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10)
kg_retriever = KGTableRetriever(index=graph_index, retriever_mode='keyword', include_text=True, similarity_top_k=10)

from llama_index.core.postprocessor import SentenceTransformerRerank
reranker = SentenceTransformerRerank(model="mixedbread-ai/mxbai-rerank-base-v1", top_n=5)

# Define the custom retriever with query rewriting
custom_retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-3",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=True,
    reranker=reranker
)

# Create response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# Create custom query engine with the custom retriever
custom_query_engine = RetrieverQueryEngine(
    retriever=custom_retriever,
    response_synthesizer=response_synthesizer,
)

# Create other query engines (no changes needed here)
vector_query_engine = CitationQueryEngine.from_args(
    vector_index,
    similarity_top_k=10,
    citation_chunk_size=512,
    citation_chunk_overlap=200
)

kg_keyword_query_engine = CitationQueryEngine.from_args(
    graph_index,
    similarity_top_k=10,
    include_text=False,
    retriever_mode="keyword",
    response_mode="tree_summarize"
)

kg_embedding_query_engine = CitationQueryEngine.from_args(
    graph_index,
    similarity_top_k=10,
    include_text=True,
    retriever_mode="embedding",
    response_mode="tree_summarize"
)

kg_hybrid_query_engine = CitationQueryEngine.from_args(
    graph_index,
    similarity_top_k=10,
    include_text=True,
    retriever_mode="hybrid",
    response_mode="tree_summarize",
    explore_global_knowledege=False
)

kg_hybrid_global_query_engine = CitationQueryEngine.from_args(
    graph_index,
    similarity_top_k=10,
    include_text=True,
    retriever_mode="hybrid",
    response_mode="tree_summarize",
    explore_global_knowledege=True
)

kg_query_engine = KnowledgeGraphQueryEngine(
    storage_context=storage_context,
    llm=llm_gpt35,
    verbose=True
)

graph_rag_retriever = KnowledgeGraphRAGRetriever(
    graph_index,
    verbose=True,
)

graph_rag_query_engine = RetrieverQueryEngine.from_args(
    graph_rag_retriever,
)

graph_rag_retriever_with_nl2graphquery = KnowledgeGraphRAGRetriever(
    storage_context=storage_context,
    verbose=True,
    with_nl2graphquery=True,
)

query_engine_with_nl2graphquery = RetrieverQueryEngine.from_args(
    graph_rag_retriever_with_nl2graphquery,
)

# Function to run the query and display the results
def query_and_display(engine, engine_name):
    question = "As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?"
    response = engine.query(question)
    display(Markdown(f"### {engine_name}"))
    display(Markdown(f"{response}"))

# Define the question (adjust the question as needed)
question = "In the context of a bias audit, what is the definition of 'test data' and when can it be used?"

# Run the query through each query engine and display the results
query_and_display(custom_query_engine, "Custom Query Engine")
query_and_display(vector_query_engine, "Vector Query Engine")
query_and_display(kg_keyword_query_engine, "KG Keyword Query Engine")
query_and_display(kg_embedding_query_engine, "KG Embedding Query Engine")
query_and_display(kg_hybrid_query_engine, "KG Hybrid Query Engine")
query_and_display(kg_hybrid_global_query_engine, "KG Hybrid Global Query Engine")
query_and_display(graph_rag_query_engine, "Graph RAG Query Engine")
query_and_display(query_engine_with_nl2graphquery, "Query Engine with NL2GraphQuery")


Classified Category: 5-301 Bias Audit
Query Classification: LABEL_0 with score 0.996429979801178
Selected top-k for the query: 5
Generated Queries: ['Sub-Query 1: What is the definition of "impact ratio" in a bias audit?', 'Sub-Query 2: What are the criteria for excluding a category from impact ratio calculations in a bias audit?', 'Sub-Query 3: Can you provide examples of how categories are excluded from impact ratio calculations in a bias audit?', '5-301 Bias Audit As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?']
Fusion Results: {('Sub-Query 1: What is the definition of "impact ratio" in a bias audit?', 0): [NodeWithScore(node=TextNode(id_='bed98c39-2df9-435b-b1df-c74f9840ead4', embedding=None, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '4'}, excluded_embed_metadata_keys=[], excluded_

### Custom Query Engine

An independent auditor may exclude a category that represents less than 2% of the data being used for the bias audit from the required calculations for impact ratio. If such a category is excluded, the summary of results must include the independent auditor’s justification for the exclusion, as well as the number of applicants and scoring rate or selection rate for the excluded category.

### Vector Query Engine

According to Source 1, an independent auditor may exclude a category that represents less than 2% of the data being used for the bias audit from the required calculations for impact ratio. However, the summary of results must include the independent auditor’s justification for the exclusion, as well as the number of applicants and scoring rate or selection rate for the excluded category [1].

### KG Keyword Query Engine

The provided context information does not contain a specific answer to the query about the threshold for excluding a category from impact ratio calculations during a bias audit. There is no information provided on the percentage of data a category must represent to be excluded from impact ratio calculations. However, the summary of results must include the independent auditor's justification for exclusion, which may provide more information on the criteria used for exclusion.

### KG Embedding Query Engine

The threshold for excluding a category from impact ratio calculations during a bias audit is if it represents less than 2% of the data being used. However, if a category is excluded, the summary of results must include the independent auditor's justification for the exclusion, as well as the number of applicants and scoring rate or selection rate for the excluded category.

### KG Hybrid Query Engine

There is no specific percentage threshold mentioned in the provided context information for excluding a category from impact ratio calculations during a bias audit. It is recommended to consult with experts in the field or refer to industry standards for guidance on determining the appropriate threshold for exclusion.

### KG Hybrid Global Query Engine

I'm sorry, but the provided context information does not directly answer the query about the threshold for excluding a category from impact ratio calculations during a bias audit. It is important to consult relevant guidelines and regulations for conducting a bias audit in your specific industry or location to determine the appropriate threshold for excluding a category.

### Graph RAG Query Engine

Repeat: The original answer still applies and there is no specific information provided about the threshold for excluding a category from impact ratio calculations during a bias audit. However, it is important to note that any individual who was not included in the calculations must be indicated in the bias audit report, as required in Paragraphs (1) and consistent with Section 1607.4 of the EEOC Uniform Guidelines.

template
  none is not an allowed value (type=type_error.none.not_allowed)


### Query Engine with NL2GraphQuery

Repeat: The original answer still applies and there is no specific information provided about the threshold for excluding a category from impact ratio calculations during a bias audit. However, it is important to note that any individual who was not included in the calculations must be indicated in the bias audit report, as required in Paragraphs (1) and consistent with Section 1607.4 of the EEOC Uniform Guidelines.

In [139]:
response = kg_keyword_query_engine.query(question)
display(Markdown(f"<b>{response}</b>"))

<b>The definition of 'test data' in the context of a bias audit is not explicitly provided in the given context information. However, it is clear that 'test data' is used to conduct the audit of an automated employment decision tool and is generated and obtained as described in the summary of results. It is used to calculate the impact ratio for the selection of male candidates versus female candidates. Therefore, 'test data' can be used when conducting a bias audit of an automated employment decision tool.</b>

In [140]:
response = vector_query_engine.query(question)
display(Markdown(f"<b>{response}</b>"))

<b>According to Source 2, "test data" refers to data used to conduct a bias audit when there is insufficient historical data available. An employer or employment agency may rely on a bias audit that uses test data only if historical data is not available to conduct a statistically significant bias audit [2]. However, Source 5 states that an employer may not rely on a bias audit conducted using test data [5].</b>

In [154]:
print(response.source_nodes[5].node.get_text())

9 
 
Example 2: An employment agency has been using an AEDT for 6 months. The bias audit the 
employment agency relied on before its first use of the AEDT was conducted 10 months ago using test 
data. The employment agency will need an updated bias audit if it will continue to use the AEDT once 12 
months have passed since the bias audit it first relied on was conducted. The employment agency’s data 
from 6 months of use of the AEDT is not sufficient on its own to conduct a statistically significant bias 
audit. The employment agency may rely on a bias audit using the historical data of other employers and 
employment agencies if it provides its 6 months of historical data to the independent auditor for use and 
consideration. The employment agency may also rely on a bias audit that uses test data. 
 
Example 3: An employer has been using an AEDT for 3 years and will soon need an updated bias audit. 
The employer has statistically significant data from its 3 years of use of the AEDT. T

In [144]:
response = custom_query_engine.query(question)
display(Markdown(f"<b>{response}</b>"))

vector_nodes: [NodeWithScore(node=TextNode(id_='f5350096-3d47-40ec-8f21-017439aad886', embedding=None, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '5'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ba9fc987-7aa5-44e6-bfad-2bd70e152dc0', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '5'}, hash='0f9cd16b2c0effbe3b745518ea7dae8908b58d7f808567e9262bb352aa97907e'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='e9300dd9-a380-4807-b767-95f1989c0985', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='0bdb3c4ec34275a5c54355c5c1de41c54c95e3aec4ad684eadc1dc6dccaf4ca4')}, text='5 \n \n \nTest data. “Test data” means data used to conduct a bias audit that is not historical data. \n§ 5-301 Bias Audit. \n(a) An employer or employment agency may not use or continue to use an AEDT if m

<b>Test data in the context of a bias audit refers to data that is used to assess the performance of an automated employment decision tool (AEDT) during testing, as opposed to historical data which is collected from actual use of the AEDT. Test data can be used in a bias audit if there is insufficient historical data available to conduct a statistically significant bias audit. However, if a bias audit uses test data, the summary of results of the bias audit must explain why historical data was not used and describe how the test data used was generated and obtained.</b>

In [39]:
vector_query_engine = vector_index.as_query_engine(similarity_top_k=10)
graph_query_engine = graph_index.as_query_engine(similarity_top_k=10, include_text=False)
graph_query_engine2 = graph_index.as_query_engine(similarity_top_k=10, include_text=True, response_mode="tree_summarize", embedding_mode="hybrid")

In [40]:
response_vector = vector_query_engine.query("As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?")
response_graph = graph_query_engine.query("As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?")
response_graph2 = graph_query_engine2.query("As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?")

display(Markdown(f"<i>{response_vector}</i>"))


<i>An independent auditor may exclude a category from impact ratio calculations if it represents less than 2% of the data being used for the bias audit. However, if a category is excluded, the summary of results must include the independent auditor's justification for the exclusion, as well as the number of applicants and scoring rate or selection rate for the excluded category.</i>

In [41]:
display(Markdown(f"<i>{response_graph}</i>"))

<i>The context information does not provide a specific answer to the query about the threshold for excluding a category from impact ratio calculations during a bias audit.</i>

In [42]:
display(Markdown(f"<i>{response_graph2}</i>"))

<i>The knowledge sequence provided does not contain information about the threshold for excluding a category from impact ratio calculations during a bias audit.</i>

In [28]:
Settings.llm = llm_gpt35

from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=8192)

chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    verbose=True
)

from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=8192)

response = chat_engine.chat("As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?")
display(Markdown(f"<i>{response}</i>"))

[1;3;32mExtracted keywords: ['employer', 'represent', 'bias', 'audit', 'excluding', 'category', 'threshold', 'calculations', 'excluding category', 'bias audit', 'data', 'ratio', 'impact ratio', 'impact', 'excluded', 'percentage']
[0m[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
['CALCULATES', 'Impact ratio', 'FOR_SELECTION_OF', 'Hispanic or latino male candidates vs not hispanic or latino black or african american female candidates']
['UNLAWFUL_TO_USE', 'Automated employment decision tool', 'MEANS', 'Using a simplified output to overrule conclusions derived from other factors']
['EVALUATES', 'Automated employment decision tool', 'MEANS', 'Using a simplified output to overrule conclusions derived from other factors']
['USE', 'Automated employment decision tool', 'MEANS', 'Using a simplified output to overrule conclusions derived from other facto

<i>I'm sorry, but the context information and knowledge sequences provided do not contain the specific information you are asking for. It is not clear from the given information what the threshold for excluding a category from impact ratio calculations is. However, it is mentioned that the calculations must separately calculate the impact on race/ethnicity categories and that the independent auditor's justification for exclusion must be included in the summary of results. You may need to consult additional sources or seek legal advice to determine the specific threshold for exclusion in your jurisdiction.</i>

In [395]:
response = chat_engine.chat("what are the penalties?")
display(Markdown(f"<i>{response}</i>"))

2024-08-21 17:20:42,895 pid:33026 MainThread neo4j.io     ERROR    Failed to write data to connection ResolvedIPv4Address(('35.241.237.34', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
2024-08-21 17:20:43,108 pid:33026 MainThread neo4j.io     ERROR    Failed to write data to connection IPv4Address(('2542e8be.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))


<i>The new context does not provide information on the penalties for violating Section 20-871 in the context of conducting a bias audit for an Automated Employment Decision Tool (AEDT). Therefore, the original answer remains the same.</i>

In [397]:
graph_retriever_nl2graphquery = index.as_retriever(verbose=True, with_nl2graphquery=False)

query_engine = RetrieverQueryEngine.from_args(graph_retriever_nl2graphquery)

response = query_engine.query("what is a bias audit")
display(Markdown(f"<i>{response}</i>"))

[1;3;32mExtracted keywords: ['purpose', 'assessment', 'bias', 'audit', 'evaluation', 'fairness', 'definition', 'discrimination', 'methodology', 'process']
[0m[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
['INCLUDE', 'Testing of automated employment decision tool']
('Bias audit', 'Conducted using', 'Test data')
('Bias audit', 'Use', 'Test data')
[0m

<i>A bias audit is a process that must be conducted to assess the potential disparate impact of an Automated Employment Decision Tool (AEDT) on persons of any component 1 category required to be reported by employers pursuant to subsection (c) of section 2000e-8 of title 42 of the United States code as specified in part 1602.7 of title 29 of the code of federal regulations. The bias audit must use historical data of the AEDT, and may be from one or more employers or employment agencies that use the AEDT. If insufficient historical data is available, an employer or employment agency may rely on a bias audit that uses test data. The bias audit should be conducted using test data to ensure that the AEDT is not biased against any race/ethnicity or sex category.</i>

In [13]:
query_engine = index.as_query_engine(
    include_text=True,
    response_mode="tree_summarize",
    embedding_mode="hybrid",
    similarity_top_k=10,
    chat_mode="context"
)

In [14]:
response = query_engine.query("Define impact ratio")
display(Markdown(f"<i>{response}</i>"))

<i>The impact ratio is a term used in the context of conducting a bias audit of an Artificial Intelligence Employment Decision Tool (AEDT). It is calculated separately for each category and can be either the selection rate for a category divided by the selection rate of the most selected category or the scoring rate for a category divided by the scoring rate for the highest scoring category. The impact ratio is used to measure the impact of the AEDT on different categories, such as sex categories, race/ethnicity categories, and intersectional categories. The impact ratios for all categories assessed by the AEDT must be made publicly available by the employer.</i>

In [15]:
response = query_engine.query("Percentage threshold for excluding category from impact ratio calculations")
display(Markdown(f"<i>{response}</i>"))

<i>The context information does not provide a specific percentage threshold for excluding a category from impact ratio calculations. However, it is mentioned that an independent auditor may exclude a category that represents less than 2% of the data being used for the bias audit from the required calculations for impact ratio, as per the information provided in source 7, subdivision (d).</i>

In [10]:
response = query_engine.query("As an employer conducting a bias audit, I want to know the threshold for excluding a category from impact ratio calculations. Can you clarify what percentage of data a category must represent to be excluded?")
display(Markdown(f"<i>{response}</i>"))

<i>The context information does not provide a specific threshold for excluding a category from impact ratio calculations. However, an independent auditor may exclude a category that represents less than 2% of the data being used for the bias audit from the required calculations for impact ratio. If such a category is excluded, the summary of results must include the independent auditor’s justification for the exclusion, as well as the number of applicants and scoring rate or selection rate for the excluded category.</i>

In [31]:
response = query_engine.query("What's a bias audit and what's the key steps to doing one? Also what happens if I don't do one?")
display(Markdown(f"<i>{response}</i>"))

<i>A bias audit is an impartial evaluation aimed at assessing the fairness and impact of an automated employment decision tool (AEDT) on various demographic groups, such as sex and race/ethnicity. The goal is to identify any potential disparities in outcomes that may affect different groups.

Key steps to conducting a bias audit include:

1. **Data Collection**: Gather historical data related to the AEDT, which may involve data from multiple employers if necessary. If historical data is lacking, test data can be utilized.

2. **Calculating Selection Rates**: Determine the selection rates for each demographic category involved in the hiring or promotion process.

3. **Calculating Impact Ratios**: Compute impact ratios for different groups by comparing their selection rates.

4. **Intersectional Analysis**: Analyze the effects on intersectional categories, such as combinations of sex and race/ethnicity.

5. **Reporting**: Summarize the findings, including the number of individuals assessed in unknown categories, and make this information publicly accessible.

If a bias audit is not conducted within the required timeframe (no more than one year prior to the use of the AEDT), employers or employment agencies may face legal repercussions, including being prohibited from using the tool. This non-compliance can lead to civil penalties, which may start at $500 for a first violation and increase for subsequent violations, with additional penalties accruing for each day the AEDT is used without compliance.</i>

In [18]:
retriever = index.as_retriever(similarity_top_k=5, retriever_mode='keyword')

retrieved_nodes = retriever.retrieve("what is the definition of candidate for employment")

for node in retrieved_nodes:
    print(node.get_content())

The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
['MODIFIED', 'Machine learning']
['MODIFIED', 'Machine learning', 'GENERATES', 'Classification']
['MODIFIED', 'Machine learning', 'GENERATES', 'Prediction']
['MODIFIED', 'Machine learning', 'MEANS', 'Group of mathematical techniques']
['MAY_BRING', 'Civil action in court']
['RESIDES_IN', 'City']


In [62]:
text_nodes = index.docstore.docs.values()
knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
knowledge_base_df
knowledge_base = KnowledgeBase(knowledge_base_df)
print(knowledge_base)

<giskard.rag.knowledge_base.KnowledgeBase object at 0x2a8e3a470>


In [35]:
Settings.llm = llm_gpt35

In [17]:
from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=8192)

#chat_engine = index.as_chat_engine(chat_mode='context', similarity_top_k=3, memory=memory)
#chat_engine.reset()
#response = chat_engine.chat("what is a candidate for employment")

In [102]:
print(response)

According to the legal document, a "candidate for employment" means a person who has applied for a specific employment position by submitting the necessary information or items in the format required by the employer or employment agency.


In [16]:
Settings.llm = llm_gpt35

retriever = index.as_retriever(similarity_top_k=5, embedding_mode="hybrid")
retrieved_nodes = retriever.retrieve("what is a candidate for employment")

print(retrieved_nodes)


[NodeWithScore(node=TextNode(id_='b13ca048-6d96-4966-a379-d434c6ce578c', embedding=None, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '9'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a412f3be-e5eb-4b82-9b23-dc4bdf39db73', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '9'}, hash='7d13301f2b16635b6930fcc6dcbdf7e16da31df9ff2d4c60d02de9812da4cb56'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='c2acd5ff-8732-4fdb-8a90-106d7d5ed360', node_type=<ObjectType.TEXT: '1'>, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '9'}, hash='61b7a25a7a0e7c81391f36792b16d0be6f21cf4a9977d06807f3d9ae50ac535e'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='ac42300a-a802-487a-9714-6fff923a3968', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash

In [90]:
for nodes in retrieved_nodes:
    print(nodes.text)

10 
 
(2) Provide notice in a job posting at least 10 business days before use of an AEDT; or, 
(3) Provide notice via U.S. mail or e-mail at least 10 business days before use of an AEDT.  
(d) To comply with § 20-871(b)(3) of the Code, an employer or employment agency must: 
(1) Provide information on the employment section of its website in a clear and conspicuous 
manner about its AEDT data retention policy, the type of data collected for the AEDT, and the 
source of the data; 
(2) Post instructions on the employment section of its website in a clear and conspicuous manner 
for how to make a written request for such information, and if a written request is received, 
provide such information within 30 days; and 
(3) Provide an explanation to a candidate for employment or employee being considered for 
promotion why disclosure of such information would violate local, state, or federal law, or 
interfere with a law enforcement investigation.
Nothing in this subchapter requires an empl

In [79]:
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

n=0
for node in text_nodes:
    n+=1
    print(f"Node No. {n}")
    t = num_tokens_from_string(string=node.text, encoding_name="cl100k_base")
    print(f"Node Length: {len(node.text)}")
    print(f"Token Count: {t}")

Node No. 1
Node Length: 1982
Token Count: 437
Node No. 2
Node Length: 1757
Token Count: 391
Node No. 3
Node Length: 2251
Token Count: 475
Node No. 4
Node Length: 2026
Token Count: 470
Node No. 5
Node Length: 1487
Token Count: 346
Node No. 6
Node Length: 2366
Token Count: 479
Node No. 7
Node Length: 1782
Token Count: 370
Node No. 8
Node Length: 2080
Token Count: 469
Node No. 9
Node Length: 797
Token Count: 186
Node No. 10
Node Length: 1402
Token Count: 482
Node No. 11
Node Length: 335
Token Count: 71
Node No. 12
Node Length: 1567
Token Count: 350
Node No. 13
Node Length: 1507
Token Count: 407
Node No. 14
Node Length: 1755
Token Count: 465
Node No. 15
Node Length: 1348
Token Count: 281
Node No. 16
Node Length: 2107
Token Count: 468
Node No. 17
Node Length: 2117
Token Count: 483
Node No. 18
Node Length: 1011
Token Count: 245
Node No. 19
Node Length: 956
Token Count: 224
Node No. 20
Node Length: 1865
Token Count: 427
Node No. 21
Node Length: 2197
Token Count: 465
Node No. 22
Node Length: 1

In [40]:
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10)
kg_retriever = KGTableRetriever(index=graph_index, retriever_mode='keyword', include_text=False, similarity_top_k=10)
bm25_retriever = BM25Retriever.from_defaults(
    docstore=graph_index.docstore, similarity_top_k=10
)

In [18]:
from llama_index.core.postprocessor import SentenceTransformerRerank
reranker = SentenceTransformerRerank(model="mixedbread-ai/mxbai-rerank-base-v1", top_n=10)
from llama_index.core.chat_engine import CondenseQuestionChatEngine, ContextChatEngine


# Define the custom retriever with query rewriting
retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-3",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=False,
    reranker_model_name=None,
    verbose=False
)

memory = ChatMemoryBuffer.from_defaults(token_limit=8192)

chat_engine = ContextChatEngine.from_defaults(
    retriever=retriever,
    verbose=False,
    chat_mode="context",
    memory_cls=memory,
    memory=memory
)

response = chat_engine.chat('what is Local Law 144 about?')
print(response)

Local Law 144 of 2021 pertains to the regulation of automated employment decision tools (AEDTs) used by employers and employment agencies in New York City. The law includes several key provisions:

1. **Bias Audit Requirement**: Employers and employment agencies are prohibited from using AEDTs unless the tool has undergone a bias audit within one year of its use. The bias audit must be conducted by an independent auditor and assess the tool's disparate impact on various demographic categories, such as race/ethnicity and sex, as required by the U.S. Equal Employment Opportunity Commission (EEOC).

2. **Public Availability of Audit Results**: A summary of the results of the most recent bias audit, along with the distribution date of the tool, must be made publicly available on the employer or employment agency's website before the tool is used.

3. **Notification Requirements**: Employers and employment agencies must notify candidates and employees residing in the city at least ten busin

In [410]:
retrieved_nodes = retriever.retrieve('As an employer, I need to understand the definition of Candidate for Employment as stated in the legal documents. Can you provide me with the specific criteria that determine whether someone is considered a candidate for employment?')
import tiktoken

# Initialize the tokenizer for cl100k_base
tokenizer = tiktoken.get_encoding("cl100k_base")

total_token_count = 0

for node in retrieved_nodes:
    # Compute the token count for the node's text
    tokens = tokenizer.encode(node.node.text)
    token_count = len(tokens)
    
    # Print the token count for this node
    print(f"Node token count: {token_count}")
    
    # Accumulate the total token count
    total_token_count += token_count

# Print the total token count for all nodes
print(f"Total token count for all nodes: {total_token_count}")


Node token count: 245
Node token count: 427
Node token count: 186
Node token count: 370
Node token count: 465
Total token count for all nodes: 1693


In [31]:
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.chat_engine import CondenseQuestionChatEngine, ContextChatEngine
import pandas as pd
import tiktoken

Settings.llm = llm_gpt35

def run_evaluation(
    results_base_path: str,
    test_set_path: str = "../giskard_test_sets/LL144_275_New.jsonl",
    rewriter: bool = False,
    reranker_model_name: str = None,
    classifier_model: str = "rk68/distilbert-q-classifier-3"
):
    
    vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=10)
    #kg_retriever = KGTableRetriever(index=graph_index, retriever_mode='keyword', include_text=False, max_keywords_per_query=4, max_knowledge_sequence=2)
    kg_retriever = KGTableRetriever(
        index=graph_index,
        retriever_mode='hybrid',
        include_text=False,
        max_keywords_per_query=4,  # Reduce to focus on fewer, more relevant keywords
        max_knowledge_sequence=2  # Retrieve only the most directly relevant connections
    )
    bm25_retriever = BM25Retriever.from_defaults(
        docstore=graph_index.docstore, similarity_top_k=10
    )

    # Define the custom retriever with query rewriting
    retriever = CustomRetrieverWithQueryRewriting(
        llm=llm_gpt35,
        vector_retriever=vector_retriever,
        kg_retriever=kg_retriever,
        bm25_retriever=bm25_retriever,
        classifier_model=classifier_model,
        mode="OR",
        rewriter=rewriter,
        reranker_model_name=reranker_model_name,
        verbose=False
    )

    memory = ChatMemoryBuffer.from_defaults(token_limit=8192)
    #chat_engine.reset()
    chat_engine = ContextChatEngine.from_defaults(
        retriever=retriever,
        verbose=False,
        chat_mode="context",
        memory_cls=memory,
        memory=memory
    )

    Settings.llm = llm_gpt35
    splitter = SentenceSplitter(chunk_size=512)
    text_nodes = splitter(graph_index.docstore.docs.values())
    knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
    knowledge_base = KnowledgeBase(knowledge_base_df)

    def answer_fn(question, history=None):
        chat_history = [ChatMessage(role=MessageRole.USER if msg['role'] == 'user' else MessageRole.ASSISTANT, content=msg['content']) for msg in history] if history else []
        
        # Debug: Print chat history and token count
        tokenizer = tiktoken.get_encoding("cl100k_base")
        total_token_count = 0
        for msg in chat_history:
            tokens = tokenizer.encode(msg.content)
            token_count = len(tokens)
            total_token_count += token_count
            print(f"Message: {msg.content}\nToken count: {token_count}")
        
        print(f"Total token count in chat history: {total_token_count}")
        
        return str(chat_engine.chat(question, chat_history=chat_history))

    def get_answer_fn(question: str, history=None) -> str:
        print(f"Question: {question}")
        messages = history if history else []
        messages.append({'role': 'user', 'content': question})
        print(f"Messages: {messages}")
        answer = answer_fn(question, history)
        print(f"Answer: {answer}")
        retrieved_nodes = retriever.retrieve(question)
        
        # Debug: Print retrieved nodes and their token counts
        tokenizer = tiktoken.get_encoding("cl100k_base")
        total_token_count = 0
        for node in retrieved_nodes:
            tokens = tokenizer.encode(node.node.text)
            token_count = len(tokens)
            total_token_count += token_count
            print(f"Node token count: {token_count}")
            print(f"Node Snippet: {node.text[:200]}")

        print(f"Total token count for all nodes: {total_token_count}")
        
        documents = [node.node.text for node in retrieved_nodes]
        return AgentAnswer(message=answer, documents=documents)

    # Load test set
    testset = QATestset.load(test_set_path)

    results_path = f'{results_base_path}'
    report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base, metrics=[ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision])
    results = report.to_pandas()

    csv_path = results_path + '.csv'
    html_path = results_path + '.html'
    results.to_csv(csv_path, index=False)

# Example of how to call the function:
# run_evaluation("path/to/results", "../giskard_test_sets/LL144_275_New.jsonl", rewriter=True, reranker_model="mixedbread-ai/mxbai-rerank-base-v1", classifier_model="your_classifier_model")



In [32]:
run_evaluation(
    results_base_path='HGRAG_3class_base', 
    test_set_path='../giskard_test_sets/LL144_275_New.jsonl', 
    classifier_model="rk68/distilbert-q-classifier-3", 
    rewriter=False, 
    reranker_model_name=None
)

Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

Question: As an employer, I need to understand the definition of 'Candidate for Employment' as stated in the legal documents. Can you provide me with the specific criteria that determine whether someone is considered a candidate for employment?
Messages: [{'role': 'user', 'content': "As an employer, I need to understand the definition of 'Candidate for Employment' as stated in the legal documents. Can you provide me with the specific criteria that determine whether someone is considered a candidate for employment?"}]
Total token count in chat history: 0


ValueError: Initial token count exceeds token limit

In [427]:
run_evaluation(
    results_base_path='HGRAG_2class_base', 
    test_set_path='../giskard_test_sets/LL144_275_New.jsonl', 
    classifier_model="rk68/distilbert-q-classifier-2", 
    rewriter=False, 
    reranker_model=None
)

Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

In [63]:
graph_index.docstore.docs.values()

dict_values([TextNode(id_='8d4b7a99-3300-4280-86f8-ebadd341b803', embedding=None, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0f4137b4-d5ee-4a7e-ba89-b5a7edf0c279', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 10, 'file_path': '../legal_data/LL144/LL144.pdf', 'source': '1'}, hash='4e0c3a4fd055b8cbce9ba46e58440ac8f05a34496ec263530292d2deb64e7096')}, text='1 \n \nNew York City Department of Consumer and Worker Protection \n \nNotice of Adoption of Final Rule \n \nPursuant to the authority vested in the Commissioner of the Department of Consumer and Worker Protection by \nSections 1043 and 2203(f) of the New York City Charter and Section 20-104(b) of the New York City \nAdministrative Code, and in accordance with the requirements of Section 1043 of the New York City Charter, \nthe Department

In [72]:
run_evaluation(
    results_base_path='HGRAG_3class_rewriter', 
    test_set_path='../giskard_test_sets/LL144_275_New.jsonl', 
    classifier_model="rk68/distilbert-q-classifier-3", 
    rewriter=True, 
    reranker_model_name=None
)

Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

ValueError: Initial token count exceeds token limit

In [24]:
run_evaluation(
    results_base_path='HGRAG_2class_rewriter', 
    test_set_path='../giskard_test_sets/LL144_275_New.jsonl', 
    classifier_model="rk68/distilbert-q-classifier-2", 
    rewriter=False, 
    reranker_model_name=None
)

Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

Question: As an employer, I need to understand the definition of 'Candidate for Employment' as stated in the legal documents. Can you provide me with the specific criteria that determine whether someone is considered a candidate for employment?
Messages: [{'role': 'user', 'content': "As an employer, I need to understand the definition of 'Candidate for Employment' as stated in the legal documents. Can you provide me with the specific criteria that determine whether someone is considered a candidate for employment?"}]
Total token count in chat history: 0
Answer: Unfortunately, the specific criteria for determining whether someone is considered a candidate for employment are not explicitly stated in the provided legal documents. However, based on the context of the documents, a candidate for employment can be generally understood as someone who has applied for a position or is being considered for a position within an organization. This includes both individuals who are seeking initial e

ValueError: Initial token count exceeds token limit

In [78]:
chat_engine.reset()

In [420]:
text_nodes = splitter(index.docstore.docs.values())
for node in text_nodes:
    tokens = tokenizer.encode(node.text)
    token_count = len(tokens)
    total_token_count += token_count
    print(f"Node token count: {token_count}")
    #print(f"Node Snippet: {node.text[:200]}")

Node token count: 437
Node token count: 391
Node token count: 475
Node token count: 470
Node token count: 346
Node token count: 479
Node token count: 370
Node token count: 469
Node token count: 186
Node token count: 482
Node token count: 71
Node token count: 350
Node token count: 407
Node token count: 465
Node token count: 281
Node token count: 468
Node token count: 483
Node token count: 245
Node token count: 224
Node token count: 427
Node token count: 465
Node token count: 415
Node token count: 222
Node token count: 114
Node token count: 437
Node token count: 391
Node token count: 475
Node token count: 470
Node token count: 346
Node token count: 479
Node token count: 370
Node token count: 469
Node token count: 186
Node token count: 482
Node token count: 71
Node token count: 350
Node token count: 407
Node token count: 465
Node token count: 281
Node token count: 468
Node token count: 483
Node token count: 245
Node token count: 224
Node token count: 427
Node token count: 465
Node token c

In [384]:

from llama_index.core.postprocessor import SentenceTransformerRerank
reranker = SentenceTransformerRerank(model="mixedbread-ai/mxbai-rerank-base-v1", top_n=10)
from llama_index.core.chat_engine import CondenseQuestionChatEngine, ContextChatEngine


# Define the custom retriever with query rewriting
retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-3",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=False,
    reranker=None,
    verbose=False
)

memory = ChatMemoryBuffer.from_defaults(token_limit=8192)

chat_engine = ContextChatEngine.from_defaults(
    retriever=retriever,
    verbose=False,
    chat_mode="context",
    memory_cls=memory,
    memory=memory
)


Settings.llm = llm_gpt35


#retriever = index.as_retriever(similarity_top_k=top_k)
text_nodes = splitter(index.docstore.docs.values())
#print(text_nodes)
results_base_path="graph_hybrid_rewriter_classifier_3class_large_rerank"
knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
knowledge_base = KnowledgeBase(knowledge_base_df)
#chat_engine = index.as_chat_engine(chat_mode='context', similarity_top_k=top_k, memory=memory)

def answer_fn(question, history=None):
    chat_history = [ChatMessage(role=MessageRole.USER if msg['role'] == 'user' else MessageRole.ASSISTANT, content=msg['content']) for msg in history] if history else []
    #print(f"chat history: {chat_history}")
    return str(chat_engine.chat(question, chat_history=chat_history))

def get_answer_fn(question: str, history=None) -> str:
    messages = history if history else []
    messages.append({'role': 'user', 'content': question})
    answer = answer_fn(question, history)
    #print(answer)
    #print(f"answer: {answer}")
    retrieved_nodes = retriever.retrieve(question)
    #print(f"retrieved nodes {retrieved_nodes}")
    documents = [node.node.text for node in retrieved_nodes]
    return AgentAnswer(message=answer, documents=documents)

# Load test set
testset = QATestset.load('../giskard_test_sets/LL144_275_New.jsonl')

results_path = f'{results_base_path}'
report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base, metrics=[ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision])
results = report.to_pandas()
csv_path = results_path + '.csv'
html_path = results_path + '.html'
results.to_csv(csv_path, index=False)


Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

In [86]:
# Define the custom retriever with query rewriting
retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-2",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=True,
    reranker_model_name=None,
    verbose=False
)

Settings.llm = llm_gpt35
from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=12000)

#retriever = index.as_retriever(similarity_top_k=top_k)
text_nodes = splitter(index.docstore.docs.values())
#print(text_nodes)
results_base_path="graph_hybrid_rewriter_classifier_2class_large_rerank"
knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
knowledge_base = KnowledgeBase(knowledge_base_df)
#chat_engine = index.as_chat_engine(chat_mode='context', similarity_top_k=top_k, memory=memory)

def answer_fn(question, history=None):
    chat_history = [ChatMessage(role=MessageRole.USER if msg['role'] == 'user' else MessageRole.ASSISTANT, content=msg['content']) for msg in history] if history else []
    #print(f"chat history: {chat_history}")
    return str(chat_engine.chat(question, chat_history=chat_history))

def get_answer_fn(question: str, history=None) -> str:
    messages = history if history else []
    messages.append({'role': 'user', 'content': question})
    answer = answer_fn(question, history)
    #print(answer)
    #print(f"answer: {answer}")
    retrieved_nodes = retriever.retrieve(question)
    #print(f"retrieved nodes {retrieved_nodes}")
    documents = [node.node.text for node in retrieved_nodes]
    return AgentAnswer(message=answer, documents=documents)

# Load test set
testset = QATestset.load('../giskard_test_sets/LL144_275_New.jsonl')

results_path = f'{results_base_path}'
report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base, metrics=[ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision])
results = report.to_pandas()
csv_path = results_path + '.csv'
html_path = results_path + '.html'
results.to_csv(csv_path, index=False)


NameError: name 'index' is not defined

In [386]:
# Define the custom retriever with query rewriting
retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-3",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=False,
    reranker=reranker,
    verbose=False
)

Settings.llm = llm_gpt35
from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=12000)

#retriever = index.as_retriever(similarity_top_k=top_k)
text_nodes = splitter(index.docstore.docs.values())
#print(text_nodes)
results_base_path="graph_hybrid_no_rewriter_classifier_3class_large_rerank"
knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
knowledge_base = KnowledgeBase(knowledge_base_df)
#chat_engine = index.as_chat_engine(chat_mode='context', similarity_top_k=top_k, memory=memory)

def answer_fn(question, history=None):
    chat_history = [ChatMessage(role=MessageRole.USER if msg['role'] == 'user' else MessageRole.ASSISTANT, content=msg['content']) for msg in history] if history else []
    #print(f"chat history: {chat_history}")
    return str(chat_engine.chat(question, chat_history=chat_history))

def get_answer_fn(question: str, history=None) -> str:
    messages = history if history else []
    messages.append({'role': 'user', 'content': question})
    answer = answer_fn(question, history)
    #print(answer)
    #print(f"answer: {answer}")
    retrieved_nodes = retriever.retrieve(question)
    #print(f"retrieved nodes {retrieved_nodes}")
    documents = [node.node.text for node in retrieved_nodes]
    return AgentAnswer(message=answer, documents=documents)

# Load test set
testset = QATestset.load('../giskard_test_sets/LL144_275_New.jsonl')

results_path = f'{results_base_path}'
report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base, metrics=[ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision])
results = report.to_pandas()
csv_path = results_path + '.csv'
html_path = results_path + '.html'
results.to_csv(csv_path, index=False)


Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

2024-08-19 07:37:48,098 pid:33026 MainThread neo4j.io     ERROR    Failed to write data to connection ResolvedIPv4Address(('35.241.237.34', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))
2024-08-19 07:37:48,357 pid:33026 MainThread neo4j.io     ERROR    Failed to write data to connection IPv4Address(('2542e8be.databases.neo4j.io', 7687)) (ResolvedIPv4Address(('35.241.237.34', 7687)))


CorrectnessMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]



RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

In [388]:
# Define the custom retriever with query rewriting
retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-2",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=False,
    reranker=reranker,
    verbose=False
)

Settings.llm = llm_gpt35
from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=5000)

#retriever = index.as_retriever(similarity_top_k=top_k)
text_nodes = splitter(index.docstore.docs.values())
#print(text_nodes)
results_base_path="graph_hybrid_no_rewriter_classifier_2class_large_rerank"
knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
knowledge_base = KnowledgeBase(knowledge_base_df)
#chat_engine = index.as_chat_engine(chat_mode='context', similarity_top_k=top_k, memory=memory)

def answer_fn(question, history=None):
    chat_history = [ChatMessage(role=MessageRole.USER if msg['role'] == 'user' else MessageRole.ASSISTANT, content=msg['content']) for msg in history] if history else []
    #print(f"chat history: {chat_history}")
    return str(chat_engine.chat(question, chat_history=chat_history))

def get_answer_fn(question: str, history=None) -> str:
    messages = history if history else []
    messages.append({'role': 'user', 'content': question})
    answer = answer_fn(question, history)
    #print(answer)
    #print(f"answer: {answer}")
    retrieved_nodes = retriever.retrieve(question)
    #print(f"retrieved nodes {retrieved_nodes}")
    documents = [node.node.text for node in retrieved_nodes]
    return AgentAnswer(message=answer, documents=documents)

# Load test set
testset = QATestset.load('../giskard_test_sets/LL144_275_New.jsonl')

results_path = f'{results_base_path}'
report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base, metrics=[ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision])
results = report.to_pandas()
csv_path = results_path + '.csv'
html_path = results_path + '.html'
results.to_csv(csv_path, index=False)


Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/269 [00:00<?, ?it/s]



ValidationError: 2 validation errors for ContextPrecisionVerifications
__root__ -> 1 -> reason
  field required (type=value_error.missing)
__root__ -> 1 -> verdict
  field required (type=value_error.missing)

In [None]:
# Define the custom retriever with query rewriting
retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-3",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=False,
    reranker=None,
    verbose=False
)

Settings.llm = llm_gpt35
from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=5000)

#retriever = index.as_retriever(similarity_top_k=top_k)
text_nodes = splitter(index.docstore.docs.values())
#print(text_nodes)
results_base_path="graph_hybrid_no_rewriter_classifier_3class_no_rerank"
knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
knowledge_base = KnowledgeBase(knowledge_base_df)
#chat_engine = index.as_chat_engine(chat_mode='context', similarity_top_k=top_k, memory=memory)

def answer_fn(question, history=None):
    chat_history = [ChatMessage(role=MessageRole.USER if msg['role'] == 'user' else MessageRole.ASSISTANT, content=msg['content']) for msg in history] if history else []
    #print(f"chat history: {chat_history}")
    return str(chat_engine.chat(question, chat_history=chat_history))

def get_answer_fn(question: str, history=None) -> str:
    messages = history if history else []
    messages.append({'role': 'user', 'content': question})
    answer = answer_fn(question, history)
    #print(answer)
    #print(f"answer: {answer}")
    retrieved_nodes = retriever.retrieve(question)
    #print(f"retrieved nodes {retrieved_nodes}")
    documents = [node.node.text for node in retrieved_nodes]
    return AgentAnswer(message=answer, documents=documents)

# Load test set
testset = QATestset.load('../giskard_test_sets/LL144_275_New.jsonl')

results_path = f'{results_base_path}'
report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base, metrics=[ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision])
results = report.to_pandas()
csv_path = results_path + '.csv'
html_path = results_path + '.html'
results.to_csv(csv_path, index=False)


In [398]:
# Define the custom retriever with query rewriting
retriever = CustomRetrieverWithQueryRewriting(
    llm=llm_gpt35,
    vector_retriever=vector_retriever,#vector_retriever,
    kg_retriever=kg_retriever,#kg_retriever,#kg_retriever,
    bm25_retriever=bm25_retriever,
    classifier_model = "rk68/distilbert-q-classifier-2",
    #similarity_top_k=5,
    mode="OR",  # or "AND" depending on your needs
    rewriter=False,
    reranker=None,
    verbose=False
)

Settings.llm = llm_gpt35
from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=8192)

#retriever = index.as_retriever(similarity_top_k=top_k)
text_nodes = splitter(index.docstore.docs.values())
#print(text_nodes)
results_base_path="graph_hybrid_no_rewriter_classifier_2class_no_rerank"
knowledge_base_df = pd.DataFrame([node.text for node in text_nodes], columns=['text'])
knowledge_base = KnowledgeBase(knowledge_base_df)
#chat_engine = index.as_chat_engine(chat_mode='context', similarity_top_k=top_k, memory=memory)

def answer_fn(question, history=None):
    chat_history = [ChatMessage(role=MessageRole.USER if msg['role'] == 'user' else MessageRole.ASSISTANT, content=msg['content']) for msg in history] if history else []
    #print(f"chat history: {chat_history}")
    return str(chat_engine.chat(question, chat_history=chat_history))

def get_answer_fn(question: str, history=None) -> str:
    messages = history if history else []
    messages.append({'role': 'user', 'content': question})
    answer = answer_fn(question, history)
    #print(answer)
    #print(f"answer: {answer}")
    retrieved_nodes = retriever.retrieve(question)
    #print(f"retrieved nodes {retrieved_nodes}")
    documents = [node.node.text for node in retrieved_nodes]
    return AgentAnswer(message=answer, documents=documents)

# Load test set
testset = QATestset.load('../giskard_test_sets/LL144_275_New.jsonl')

results_path = f'{results_base_path}'
report = evaluate(get_answer_fn, testset=testset, knowledge_base=knowledge_base, metrics=[ragas_context_recall, ragas_faithfulness, ragas_answer_relevancy, ragas_context_precision])
results = report.to_pandas()
csv_path = results_path + '.csv'
html_path = results_path + '.html'
results.to_csv(csv_path, index=False)


Asking questions to the agent:   0%|          | 0/269 [00:00<?, ?it/s]

KeyboardInterrupt: 

### GraphRAG

In [50]:
from llama_index.readers.file import PyMuPDFReader


# Evaluate with Giskard
loader = PyMuPDFReader()
#file_extractor = {".pdf": loader}
documents1 = loader.load(file_path="../legal_data/LL144/LL14421.pdf")
documents2 = loader.load(file_path="../legal_data/LL144/LL144_Definitions.pdf")
documents = documents1 + documents2

In [51]:
print(len(documents))

14


In [55]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(documents)

In [2]:
import pandas as pd
from llama_index.core import Document

import ssl

# Disable SSL verification
ssl._create_default_https_context = ssl._create_unverified_context


news = pd.read_csv(
    "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv"
)[:30]

news.head()



documents = [
    Document(text=f"{row['title']}: {row['text']}")
    for i, row in news.iterrows()
]



In [3]:
print(len(documents))

llm = llm_gpt4o

30


In [57]:
!pip install llama-index graspologic numpy==1.24.4 scipy==1.12.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.6 kB)
INFO: pip is looking at multiple versions of graspologic to determine which version is compatible with other requirements. This could take a while.
Collecting graspologic
  Using cached graspologic-3.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting POT<0.8.0,>=0.7.0 (from graspologic)
  Using cached POT-0.7.0.post1.tar.gz (176 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting beartype<0.11.0,>=0.10.4 (from graspologic)
  Using cached beartype-0.10.4-py3-none-any.whl.metadata (256 kB)
Collecting hyppo<0.4.0,>=0.3.2 (from graspologic)
  Using cached hyppo-0.3.2.tar.gz (84 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting graspologic
  Using cached graspologic-3.3.0-py3-none-any.

In [58]:
!pip install graspologic -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting graspologic
  Using cached graspologic-3.4.1-py3-none-any.whl.metadata (5.8 kB)
Collecting numpy<2.0.0,>=1.26.4 (from graspologic)
  Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached graspologic-3.4.1-py3-none-any.whl (5.2 MB)
Using cached numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy, graspologic
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.4
    Uninstalling numpy-1.24.4:
      Successfully uninstalled numpy-1.24.4
  Attempting uninstall: graspologic
    Found existing installation: graspologic 3.3.0
    Uninstalling graspologic-3.3.0:
      Successfully uninstalled graspologic-3.3.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastembed 0.2.7 requires huggingface-hub<0.21,>=0.20, but you have huggingface-hub 0.23.4 which is 

In [4]:
import asyncio
import nest_asyncio
import graspologic
nest_asyncio.apply()

from typing import Any, List, Callable, Optional, Union, Dict
from IPython.display import Markdown, display

from llama_index.core.async_utils import run_jobs
from llama_index.core.indices.property_graph.utils import (
    default_parse_triplets_fn,
)
from llama_index.core.graph_stores.types import (
    EntityNode,
    KG_NODES_KEY,
    KG_RELATIONS_KEY,
    Relation,
)
from llama_index.core.llms.llm import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.prompts.default_prompts import (
    DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
)
from llama_index.core.schema import TransformComponent, BaseNode
from llama_index.core.bridge.pydantic import BaseModel, Field


class GraphRAGExtractor(TransformComponent):
    """Extract triples from a graph.

    Uses an LLM and a simple prompt + output parsing to extract paths (i.e. triples) and entity, relation descriptions from text.

    Args:
        llm (LLM):
            The language model to use.
        extract_prompt (Union[str, PromptTemplate]):
            The prompt to use for extracting triples.
        parse_fn (callable):
            A function to parse the output of the language model.
        num_workers (int):
            The number of workers to use for parallel processing.
        max_paths_per_chunk (int):
            The maximum number of paths to extract per chunk.
    """

    llm: LLM
    extract_prompt: PromptTemplate
    parse_fn: Callable
    num_workers: int
    max_paths_per_chunk: int

    def __init__(
        self,
        llm: Optional[LLM] = None,
        extract_prompt: Optional[Union[str, PromptTemplate]] = None,
        parse_fn: Callable = default_parse_triplets_fn,
        max_paths_per_chunk: int = 10,
        num_workers: int = 4,
    ) -> None:
        """Init params."""
        from llama_index.core import Settings

        if isinstance(extract_prompt, str):
            extract_prompt = PromptTemplate(extract_prompt)

        super().__init__(
            llm=llm or Settings.llm,
            extract_prompt=extract_prompt or DEFAULT_KG_TRIPLET_EXTRACT_PROMPT,
            parse_fn=parse_fn,
            num_workers=num_workers,
            max_paths_per_chunk=max_paths_per_chunk,
        )

    @classmethod
    def class_name(cls) -> str:
        return "GraphExtractor"

    def __call__(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes."""
        return asyncio.run(
            self.acall(nodes, show_progress=show_progress, **kwargs)
        )

    async def _aextract(self, node: BaseNode) -> BaseNode:
        """Extract triples from a node."""
        assert hasattr(node, "text")

        text = node.get_content(metadata_mode="llm")
        try:
            llm_response = await self.llm.apredict(
                self.extract_prompt,
                text=text,
                max_knowledge_triplets=self.max_paths_per_chunk,
            )
            entities, entities_relationship = self.parse_fn(llm_response)
        except ValueError:
            entities = []
            entities_relationship = []

        existing_nodes = node.metadata.pop(KG_NODES_KEY, [])
        existing_relations = node.metadata.pop(KG_RELATIONS_KEY, [])
        metadata = node.metadata.copy()
        for entity, entity_type, description in entities:
            metadata[
                "entity_description"
            ] = description  # Not used in the current implementation. But will be useful in future work.
            entity_node = EntityNode(
                name=entity, label=entity_type, properties=metadata
            )
            existing_nodes.append(entity_node)

        metadata = node.metadata.copy()
        for triple in entities_relationship:
            subj, rel, obj, description = triple
            subj_node = EntityNode(name=subj, properties=metadata)
            obj_node = EntityNode(name=obj, properties=metadata)
            metadata["relationship_description"] = description
            rel_node = Relation(
                label=rel,
                source_id=subj_node.id,
                target_id=obj_node.id,
                properties=metadata,
            )

            existing_nodes.extend([subj_node, obj_node])
            existing_relations.append(rel_node)

        node.metadata[KG_NODES_KEY] = existing_nodes
        node.metadata[KG_RELATIONS_KEY] = existing_relations
        return node

    async def acall(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """Extract triples from nodes async."""
        jobs = []
        for node in nodes:
            jobs.append(self._aextract(node))

        return await run_jobs(
            jobs,
            workers=self.num_workers,
            show_progress=show_progress,
            desc="Extracting paths from text",
        )
    

import re
from llama_index.core.graph_stores import SimplePropertyGraphStore
import networkx as nx
from graspologic.partition import hierarchical_leiden

from llama_index.core.llms import ChatMessage


class GraphRAGStore(SimplePropertyGraphStore):
    community_summary = {}
    max_cluster_size = 5

    def generate_community_summary(self, text):
        """Generate summary for a given text using an LLM."""
        messages = [
            ChatMessage(
                role="system",
                content=(
                    "You are provided with a set of relationships from a knowledge graph, each represented as "
                    "entity1->entity2->relation->relationship_description. Your task is to create a summary of these "
                    "relationships. The summary should include the names of the entities involved and a concise synthesis "
                    "of the relationship descriptions. The goal is to capture the most critical and relevant details that "
                    "highlight the nature and significance of each relationship. Ensure that the summary is coherent and "
                    "integrates the information in a way that emphasizes the key aspects of the relationships."
                ),
            ),
            ChatMessage(role="user", content=text),
        ]
        response = OpenAI().chat(messages)
        clean_response = re.sub(r"^assistant:\s*", "", str(response)).strip()
        return clean_response

    def build_communities(self):
        """Builds communities from the graph and summarizes them."""
        nx_graph = self._create_nx_graph()
        community_hierarchical_clusters = hierarchical_leiden(
            nx_graph, max_cluster_size=self.max_cluster_size
        )
        community_info = self._collect_community_info(
            nx_graph, community_hierarchical_clusters
        )
        self._summarize_communities(community_info)

    def _create_nx_graph(self):
        """Converts internal graph representation to NetworkX graph."""
        nx_graph = nx.Graph()
        for node in self.graph.nodes.values():
            nx_graph.add_node(str(node))
        for relation in self.graph.relations.values():
            nx_graph.add_edge(
                relation.source_id,
                relation.target_id,
                relationship=relation.label,
                description=relation.properties["relationship_description"],
            )
        return nx_graph

    def _collect_community_info(self, nx_graph, clusters):
        """Collect detailed information for each node based on their community."""
        community_mapping = {item.node: item.cluster for item in clusters}
        community_info = {}
        for item in clusters:
            cluster_id = item.cluster
            node = item.node
            if cluster_id not in community_info:
                community_info[cluster_id] = []

            for neighbor in nx_graph.neighbors(node):
                if community_mapping[neighbor] == cluster_id:
                    edge_data = nx_graph.get_edge_data(node, neighbor)
                    if edge_data:
                        detail = f"{node} -> {neighbor} -> {edge_data['relationship']} -> {edge_data['description']}"
                        community_info[cluster_id].append(detail)
        return community_info

    def _summarize_communities(self, community_info):
        """Generate and store summaries for each community."""
        for community_id, details in community_info.items():
            details_text = (
                "\n".join(details) + "."
            )  # Ensure it ends with a period
            self.community_summary[
                community_id
            ] = self.generate_community_summary(details_text)

    def get_community_summaries(self):
        """Returns the community summaries, building them if not already done."""
        if not self.community_summary:
            self.build_communities()
        return self.community_summary
    

from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.llms import LLM


class GraphRAGQueryEngine(CustomQueryEngine):
    graph_store: GraphRAGStore
    llm: LLM

    def custom_query(self, query_str: str) -> str:
        """Process all community summaries to generate answers to a specific query."""
        community_summaries = self.graph_store.get_community_summaries()
        community_answers = [
            self.generate_answer_from_summary(community_summary, query_str)
            for _, community_summary in community_summaries.items()
        ]

        final_answer = self.aggregate_answers(community_answers)
        return final_answer

    def generate_answer_from_summary(self, community_summary, query):
        """Generate an answer from a community summary based on a given query using LLM."""
        prompt = (
            f"Given the community summary: {community_summary}, "
            f"how would you answer the following query? Query: {query}"
        )
        messages = [
            ChatMessage(role="system", content=prompt),
            ChatMessage(
                role="user",
                content="I need an answer based on the above information.",
            ),
        ]
        response = self.llm.chat(messages)
        cleaned_response = re.sub(r"^assistant:\s*", "", str(response)).strip()
        return cleaned_response

    def aggregate_answers(self, community_answers):
        """Aggregate individual community answers into a final, coherent response."""
        # intermediate_text = " ".join(community_answers)
        prompt = "Combine the following intermediate answers into a final, concise response."
        messages = [
            ChatMessage(role="system", content=prompt),
            ChatMessage(
                role="user",
                content=f"Intermediate answers: {community_answers}",
            ),
        ]
        final_response = self.llm.chat(messages)
        cleaned_final_response = re.sub(
            r"^assistant:\s*", "", str(final_response)
        ).strip()
        return cleaned_final_response

In [5]:
KG_TRIPLET_EXTRACT_TMPL = """
-Goal-
Given a text document, identify all entities and their entity types from the text and all relationships among the identified entities.
Given the text, extract up to {max_knowledge_triplets} entity-relation triplets.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: Type of the entity
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity")

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relation: relationship between source_entity and target_entity
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other

Format each relationship as ("relationship")

3. When finished, output.

-Real Data-
######################
text: {text}
######################
output:"""


entity_pattern = r'"entity"$$$$"(.+?)"$$$$"(.+?)"$$$$"(.+?)""entity"$$$$"(.+?)"$$$$"(.+?)"$$$$"(.+?)""entity"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"'
relationship_pattern = r'"relationship"$$$$"(.+?)"$$$$"(.+?)"$$$$"(.+?)"$$$$"(.+?)""relationship"$$$$"(.+?)"$$$$"(.+?)"$$$$"(.+?)"$$$$"(.+?)""relationship"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"\$\$\$\$"(.+?)"'


def parse_fn(response_str: str) -> Any:
    entities = re.findall(entity_pattern, response_str)
    relationships = re.findall(relationship_pattern, response_str)
    return entities, relationships


kg_extractor = GraphRAGExtractor(
    llm=llm,
    extract_prompt=KG_TRIPLET_EXTRACT_TMPL,
    max_paths_per_chunk=2,
    parse_fn=parse_fn,
)

In [7]:
from llama_index.core import PropertyGraphIndex



from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(documents)



index = PropertyGraphIndex(
    nodes=nodes,
    property_graph_store=GraphRAGStore(),
    kg_extractors=[kg_extractor],
    show_progress=True,
)

Extracting paths from text:   0%|          | 0/30 [00:00<?, ?it/s]

Extracting paths from text: 100%|██████████| 30/30 [01:30<00:00,  3.01s/it]
Generating embeddings: 100%|██████████| 3/3 [00:16<00:00,  5.66s/it]
Generating embeddings: 0it [00:00, ?it/s]


In [8]:
list(index.property_graph_store.graph.nodes.values())[-1]

ChunkNode(label='text_chunk', embedding=[0.0024235264863818884, 0.013167659752070904, 0.0072385878302156925, 0.019178122282028198, 0.0010461017955094576, 0.01909640058875084, 0.001598485978320241, 0.009496637620031834, 0.03447521850466728, 0.04272036254405975, 0.06363996118307114, -0.0016113902674987912, -0.0008026427240110934, -0.008253972977399826, -0.058976661413908005, 0.019031547009944916, -0.010623959824442863, -0.04250356927514076, -0.0014436853816732764, 0.005331028252840042, -0.040690381079912186, -0.024544252082705498, -0.048615969717502594, 0.007665549870580435, 0.007321249693632126, 0.02348478138446808, 0.01695089414715767, -0.007167942356318235, 0.11247008293867111, 0.05967866629362106, -0.0023573441430926323, -0.032052356749773026, -0.001038963790051639, -0.05069563537836075, -0.014188705943524837, 0.014905848540365696, -0.012721233069896698, -0.009602385573089123, -0.00665656803175807, -0.01519862562417984, 0.019763682037591934, -0.010990634560585022, 0.06027193740010261

In [9]:
for value in index.property_graph_store.graph.nodes.values():
    print(value)

Chevron: Best Of Breed: JHVEPhoto Like many companies in the O&G sector, the stock of Chevron (NYSE:CVX) has declined about 10% over the past 90-days despite the fact that Q2 consensus earnings estimates have risen sharply (~25%) during that same time frame. Over the years, Chevron has kept a very strong balance sheet. That allowed the...
FirstEnergy (NYSE:FE) Posts Earnings Results: FirstEnergy (NYSE:FE – Get Rating) posted its earnings results on Tuesday. The utilities provider reported $0.53 earnings per share for the quarter, topping the consensus estimate of $0.52 by $0.01, RTT News reports. FirstEnergy had a net margin of 10.85% and a return on equity of 17.17%. During the same period...
If the content contained herein violates any of your rights, including those of copyright, you are requested to immediately notify us using via the following email address operanews-external(at)opera.com
Top News
Dáil almost suspended after Sinn Féin TD put pager in front of Minister during firef

In [14]:
print(list(index.property_graph_store.graph.relations.values()))

[]


In [65]:
list(index.property_graph_store.graph.relations.values())[0].properties[
    "relationship_description"
]

IndexError: list index out of range

In [66]:
index.property_graph_store.build_communities()

EmptyNetworkError: EmptyNetworkError

In [15]:
query_engine = GraphRAGQueryEngine(
    graph_store=index.property_graph_store, llm=llm
)

response = query_engine.query(
    "What are the main news discussed in the document?"
)
display(Markdown(f"{response.response}"))

EmptyNetworkError: EmptyNetworkError

In [16]:

graph_store = Neo4jPropertyGraphStore(
    username="neo4j",
    password="tfSoF1h2DqMDBe91hxcqYF7REiR65HZcDkIVuZ74NMs",
    url="neo4j+ssc://2542e8be.databases.neo4j.io",
)

# load from existing graph/vector store
index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    # optional, neo4j also supports vectors directly
    #vector_store=vector_store,
    embed_kg_nodes=True,
)



In [17]:
retriever = index.as_retriever(similarity_top_k=5)

nodes = retriever.retrieve("what are the key takeaways from LL144?")
print(nodes)
print(len(nodes))



ValidationError: 1 validation error for EntityNode
name
  none is not an allowed value (type=type_error.none.not_allowed)

