# Setting up the environment

In [1]:
import nest_asyncio

# Patch asyncio to allow nested event-loop use. The fusion retriever built later in
# this notebook is created with use_async=True, which presumably needs this when run
# from inside a notebook kernel — confirm if the async mode is ever changed.
nest_asyncio.apply()

In [2]:
import os
import sys

# Make the parent directory importable so the project-level `config` module resolves.
# The membership check keeps the cell idempotent: re-running it no longer appends a
# duplicate '..' entry to sys.path each time.
if '..' not in sys.path:
    sys.path.append('..')
from config import Config

# Publish the key through the environment; the OpenAI LLM and embedding objects
# created below are constructed without an explicit api_key argument.
os.environ["OPENAI_API_KEY"] = Config.OPENAI_API_KEY

In [3]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# LLM and embedding model handed to PropertyGraphIndex.from_existing further down.
llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# Graph Store

In [4]:
from graph_rag_store import GraphRAGStore

# Note: used to be `Neo4jPGStore`
# Connection settings can be overridden with the NEO4J_USERNAME, NEO4J_PASSWORD and
# NEO4J_URL environment variables so real credentials need not be stored in the
# notebook. The fallbacks are the previous local-development values.
graph_store = GraphRAGStore(
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "admin123"),
    url=os.environ.get("NEO4J_URL", "bolt://localhost:7687"),
)

  from .autonotebook import tqdm as notebook_tqdm


# Query

In [33]:
from typing import Any

from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.schema import QueryBundle

# Value passed as `num_queries` to QueryFusionRetriever. get_queries asks the LLM for
# NUM_QUERIES - 1 variations (presumably the original question counts as one query).
NUM_QUERIES = 5

In [12]:
# Prompt template for query expansion. get_queries fills the {num_queries} and {query}
# placeholders with str.format, and splits the LLM response on newlines, so the
# closing instruction to answer one variation per line matters.
query_gen_prompt = """You are an AI language model assistant specializing in query expansion. Your task is to generate {num_queries} diverse versions of the given user question. These variations will be used to retrieve relevant documents from a vector database, helping to overcome limitations of distance-based similarity search.

Original question: {query}

Instructions:
1. Create {num_queries} unique variations of the original question.
2. Ensure each variation maintains the core intent of the original question.
3. Use different phrasings, synonyms, or perspectives for each variation.
4. Consider potential context or implications not explicitly stated in the original question.
5. Avoid introducing new topics or drastically changing the meaning of the question.

Please provide your {num_queries} question variations, each on a new line:
"""

# Module-level slot for the most recently generated variations: written by
# get_queries and returned to the caller by retrieve_documents.
query_variations = None

In [13]:
def get_queries(self, original_query: str) -> list[QueryBundle]:  # noqa: ANN001
    """
    Ask the LLM for rephrasings of ``original_query`` and wrap them as query bundles.

    Written to be bound as ``QueryFusionRetriever._get_queries``, so ``self`` is the
    retriever instance; it reads ``query_gen_prompt``, ``num_queries``, ``_llm`` and
    ``_verbose`` from it. The variations that are actually used are also stored in
    the module-level ``query_variations`` so callers can inspect them afterwards.

    Args:
    ----
        original_query (str): The user's question to generate variations of.

    Returns:
    -------
        list[QueryBundle]: One bundle per variation, at most ``self.num_queries - 1``.

    """
    global query_variations  # noqa: PLW0603

    # Number of variations requested from the LLM, never negative.
    requested = max(self.num_queries - 1, 0)
    prompt_str = self.query_gen_prompt.format(
        num_queries=requested,
        query=original_query,
    )
    response = self._llm.complete(prompt_str)

    # Assume the LLM put each query on its own line; blank lines are dropped.
    queries = [line.strip() for line in response.text.split("\n") if line.strip()]

    # The LLM often returns more queries than we asked for, so trim the list.
    # Without this cap every extra line would be sent to retrieval as well.
    queries = queries[:requested]

    # Publish exactly the variations that will be used for retrieval.
    query_variations = queries

    if self._verbose:
        queries_str = "\n".join(queries)
        print(f"Generated queries:\n{queries_str}")

    return [QueryBundle(q) for q in queries]

# Class-level monkey patch: every QueryFusionRetriever created after this cell runs
# uses the get_queries function defined above as its `_get_queries` method.
QueryFusionRetriever._get_queries = get_queries

In [24]:
def get_retriever(retriever, top_n: int) -> QueryFusionRetriever:
    """
    Wrap a single base retriever in a QueryFusionRetriever.

    Args:
    ----
        retriever: The underlying retriever; it is passed as the only entry in the
            fusion retriever's list of retrievers.
        top_n (int): Forwarded as ``similarity_top_k``.

    Returns:
    -------
        QueryFusionRetriever: A retriever in "simple" mode, asynchronous and verbose,
            using the notebook's ``NUM_QUERIES`` and ``query_gen_prompt``.

    """
    fusion_options = {
        "similarity_top_k": top_n,
        "num_queries": NUM_QUERIES,
        "mode": "simple",
        "use_async": True,
        "verbose": True,
        "query_gen_prompt": query_gen_prompt,
    }
    return QueryFusionRetriever([retriever], **fusion_options)

In [15]:
def retrieve_documents(question: str, retriever: Any, top_n: int) -> "tuple[list[str] | None, Any]":
    """
    Run fusion retrieval for a question and report the query variations that were used.

    Args:
    ----
        question (str): The question to retrieve documents for.
        retriever: The base retriever to wrap with ``get_retriever``.
        top_n (int): The number of top documents to retrieve.

    Returns:
    -------
        tuple: ``(query_variations, docs)``. ``query_variations`` is whatever
            ``get_queries`` stored in the module-level variable during this call, or
            ``None`` if no variations were generated. ``docs`` is the result of the
            fusion retriever's ``retrieve`` call.

    """
    global query_variations  # noqa: PLW0603

    # Clear anything left over from an earlier call, so that a run which generates
    # no variations does not report the previous question's variations.
    query_variations = None

    fusion_retriever = get_retriever(retriever, top_n)
    docs = fusion_retriever.retrieve(question)
    return query_variations, docs

# Retriever

In [66]:
# Passed to retrieve_documents, which forwards it to the fusion retriever as
# similarity_top_k.
top_n = 20

In [67]:
from llama_index.core import PropertyGraphIndex

# Attach to the graph that already exists in the store (nothing is re-indexed here),
# then derive the base retriever from that index.
# NOTE(review): include_text is given to from_existing, not to as_retriever —
# confirm it has the intended effect on the retrieved node text.
graph_index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm,
    embed_model=embed_model,
    show_progress=True,
    include_text=True,
)
retriever = graph_index.as_retriever()

In [74]:
# Example question used for the retrieval run and written to the JSON export below.
question = "How does the islamic concept of freedom relate to divine justice and why there is suffering in the world?"

In [75]:
# Run fusion retrieval. The first element of the result is the list of generated
# query variations, which is rebound here to the module-level name of the same name.
query_variations, nodes = retrieve_documents(question=question, retriever=retriever, top_n=top_n)

Generated queries:
1. In what ways does the Islamic belief in freedom intersect with the idea of divine justice and the existence of suffering in the world?
2. How is the Islamic principle of freedom connected to the concept of divine justice and the presence of suffering in the world?
3. What is the relationship between the Islamic notion of freedom, divine justice, and the existence of suffering in the world?
4. How does the Islamic perspective on freedom influence the understanding of divine justice and the reason behind suffering in the world?


In [76]:
# Sanity check: number of results and the type of the first one (needs a non-empty list).
len(nodes), type(nodes[0])

(11, llama_index.core.schema.NodeWithScore)

In [77]:
# Score of the first returned node.
nodes[0].score

0.7316093444824219

In [78]:
from IPython.display import Markdown, display

# Render the first node's text as Markdown. `display` is imported explicitly instead
# of relying on the name IPython injects into notebook sessions, and the text is
# passed directly because wrapping it in an f-string added nothing.
display(Markdown(nodes[0].text))

George Sarton -> Advocate -> RELIGIOUS FAITH

In [79]:
import json
from pathlib import Path

output_path = Path('output/retrieved_nodes.json')

# Collect the question, the generated variations and one record per retrieved node.
output_data = {
    'original_question': question,
    'query_variations': query_variations,
    'nodes': [
        {
            'text': node.node.text,
            # float(None) would raise, so a missing score is kept as JSON null.
            'score': float(node.score) if node.score is not None else None,
            'metadata': node.node.metadata,
        }
        for node in nodes
    ],
}

# Create the target directory on first use; opening the file used to fail when
# `output/` did not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

# default=str stringifies metadata values that json cannot serialize natively
# instead of aborting the export part-way through.
with output_path.open('w', encoding='utf-8') as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False, default=str)