# Setup

In [1]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from dotenv import load_dotenv
import os
import nest_asyncio

nest_asyncio.apply()

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

embed_model = OpenAIEmbedding(model_name="text-embedding-ada-002", api_key=OPENAI_API_KEY)
llm = OpenAI(api_key=OPENAI_API_KEY, model_name="gpt-4o-mini", temperature=0.1)

In [2]:
from vectorstore import get_vectorstore

vectorstore = get_vectorstore()

In [3]:
from index import get_index

index = get_index(embed_model=embed_model)

In [4]:
import json

def store_docs(docs, filename):
    docs_to_store = []

    for node in docs:
        docs_to_store.append({
            "text": node.text,
            "metadata": node.metadata,
            "score": node.score
        })

    with open(f"./output/{filename}", "w") as f:
        json.dump(docs_to_store, f, indent=4)

# Retrieving

In [5]:
query_gen_prompt = """You are an AI language model assistant specializing in query expansion. Your task is to generate {num_queries} diverse versions of the given user question. These variations will be used to retrieve relevant documents from a vector database, helping to overcome limitations of distance-based similarity search.

Original question: {query}

Instructions:
1. Create {num_queries} unique variations of the original question.
2. Ensure each variation maintains the core intent of the original question.
3. Use different phrasings, synonyms, or perspectives for each variation.
4. Consider potential context or implications not explicitly stated in the original question.
5. Avoid introducing new topics or drastically changing the meaning of the question.

Please provide your {num_queries} question variations, each on a new line:
"""

In [26]:
top_n = 50
num_queries = 5
question = "what are the signs of Imam Mehdi's reappearance?"

## Existing Implementation

In [18]:
vector_retriever = index.as_retriever(similarity_top_k=top_n)

In [19]:
from llama_index.core.retrievers import QueryFusionRetriever

retriever = QueryFusionRetriever(
    [vector_retriever],
    similarity_top_k=top_n,
    num_queries=num_queries,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
    query_gen_prompt=query_gen_prompt,
    llm=llm,
)

In [27]:
# it will genereate n - 1 queries since the original query is also included
nodes_with_scores = retriever.retrieve(question)

Generated queries:
1. What indications signal the return of Imam Mehdi?
2. How can one recognize the signs of Imam Mehdi's emergence?
3. What are the manifestations that Imam Mehdi is about to reappear?
4. What are the clues that Imam Mehdi's reappearance is imminent?


In [28]:
from llm_score_docs import BatchProcessingConfig, batch_score_documents

config = BatchProcessingConfig()
scored_docs = batch_score_documents(nodes_with_scores, question, config)

Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Raw result: ```json
[
    {"reason": "Discusses signs of Imam Mehdi's reappearance.", "score": 9},
    {"reason": "Describes conditions before Imam's appearance.", "score": 8},
    {"reason": "Emphasizes importance of recognizing the Imam.", "score": 6},
    {"reason": "Mentions the brightness of the Imam's reappearance.", "score": 5},
    {"reason": "Speaks on the certainty of the Imam's future appearance.", "score": 9},
    {"reason": "Explains the occultation and its significance for believers.", "score": 7},
    {"reason": "References actions of the Imam during his reign.", "score": 6},
    {"reason": "Mentions signs preceding the Imam's reappearance.", "score": 9},
    {"reason": "Discusses conditions related to the Imam's appearance.", "score": 7},
    {"reason": "Mentions the call from Heaven about the Imam.", "score": 8},
    {"reason": "Focuses on signs and conditions linked to the reappearance.", "sco

In [30]:
relevance_threshold = 4

filtered_docs = [
    doc for doc in scored_docs
    if doc.score >= relevance_threshold
]

len(filtered_docs)

50

In [31]:
store_docs([doc.doc for doc in filtered_docs], "baseline_docs.json")

## Cross Encoder

In [41]:
from llama_index.core.postprocessor import SentenceTransformerRerank

sentence_transformer_rerank = SentenceTransformerRerank(
    model='cross-encoder/ms-marco-MiniLM-L-6-v2',
    top_n=top_n
)

In [33]:
cross_encoder_retriever = index.as_retriever(similarity_top_k=top_n, postprocessor=sentence_transformer_rerank) # This will retrieve and rerank the top_n documents

In [34]:
docs = cross_encoder_retriever.retrieve(question)

In [None]:
# a = index.as_retriever(similarity_top_k=top_n, postprocessors=[sentence_transformer_rerank])
# b = a.retrieve(question)

In [35]:
len(docs)

50

In [36]:
store_docs(docs, "cross_encoder_docs.json")

## Cross Encoder with Query Fusion

In [38]:
from llama_index.core.retrievers import QueryFusionRetriever

cross_encoder_retriever_with_query_fusion = QueryFusionRetriever(
    [cross_encoder_retriever],
    similarity_top_k=top_n,
    num_queries=num_queries,  # set this to 1 to disable query generation
    use_async=True,
    verbose=True,
    query_gen_prompt=query_gen_prompt,
    llm=llm,
)

In [39]:
docs2 = cross_encoder_retriever_with_query_fusion.retrieve(question)
len(docs2)

Generated queries:
1. What indications signal the return of Imam Mehdi?
2. How can one recognize the signs of Imam Mehdi's emergence?
3. What are the indicators that Imam Mehdi is about to reappear?
4. What are the telltale signs of Imam Mehdi's imminent return?


50

In [40]:
store_docs(docs2, "cross_encoder_retriever_with_query_fusion.json")