# Setup

In [10]:
import nest_asyncio

# NOTE(review): presumably required because the notebook kernel already runs an event
# loop and the fusion retriever further down is created with use_async=True — confirm.
nest_asyncio.apply()

In [1]:
from dotenv import load_dotenv
import os

# Load environment variables (typically from a local .env file) so the key is not hardcoded here.
load_dotenv()
# os.getenv returns None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [2]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# The OpenAI LLM wrapper selects the model through the `model` keyword. The previous
# `model_name=` keyword is not a field of that class, so the requested model was not applied.
llm = OpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini", temperature=0.1)
# Embedding model used both for indexing and for embedding queries at retrieval time.
embed_model = OpenAIEmbedding(model_name="text-embedding-ada-002", api_key=OPENAI_API_KEY)

In [3]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex


# The name "IndexedVectorStore" emphasizes that the class handles both the vector store and the index

class IndexedVectorStore:
    """Chroma-backed vector store bundled with the LlamaIndex index built on top of it.

    All parameters default to the locations this notebook has always used, so
    ``IndexedVectorStore()`` behaves exactly as before.

    Args:
        path: Directory handed to ``chromadb.PersistentClient``.
        collection_name: Name of the Chroma collection to open (created if missing).

    Attributes:
        db: The persistent Chroma client.
        chroma_collection: The opened Chroma collection.
        vector_store: ``ChromaVectorStore`` wrapping the collection.
        index: ``VectorStoreIndex`` built from ``vector_store``.
    """

    def __init__(
        self,
        path: str = "../the-server/vectorstore",
        collection_name: str = "transcription_project",
    ):
        self.db = chromadb.PersistentClient(path=path)
        self.chroma_collection = self.db.get_or_create_collection(collection_name)
        self.vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)
        # `embed_model` is the notebook-level embedding model created in the setup cell.
        self.index = VectorStoreIndex.from_vector_store(
            self.vector_store,
            embed_model=embed_model,
        )

    def add_documents(self, documents: list, persist_dir: str = "./db") -> None:
        """Insert each document into the index, then persist the storage context.

        Args:
            documents: Documents to insert, one ``index.insert`` call per item.
            persist_dir: Directory passed to ``storage_context.persist``.
        """
        for document in documents:
            self.index.insert(document)
        self.index.storage_context.persist(persist_dir=persist_dir)

In [5]:
# Notebook-wide store instance; later cells and helpers (e.g. group_nodes_by_source) read it as a global.
vectorstore = IndexedVectorStore()

# Retrieving

In [6]:
# Query-expansion prompt template passed to QueryFusionRetriever as `query_gen_prompt`.
# It contains two placeholders: {num_queries} and {query}.
query_gen_prompt = """You are an AI language model assistant specializing in query expansion. Your task is to generate {num_queries} diverse versions of the given user question. These variations will be used to retrieve relevant documents from a vector database, helping to overcome limitations of distance-based similarity search.

Original question: {query}

Instructions:
1. Create {num_queries} unique variations of the original question.
2. Ensure each variation maintains the core intent of the original question.
3. Use different phrasings, synonyms, or perspectives for each variation.
4. Consider potential context or implications not explicitly stated in the original question.
5. Avoid introducing new topics or drastically changing the meaning of the question.

Please provide your {num_queries} question variations, each on a new line:
"""

In [19]:
# Retrieval settings shared by the next cells.
top_n = 20  # used as similarity_top_k for both the vector retriever and the fusion retriever
num_queries = 5  # total number of queries, counting the original question
question = "Who is Prophet Muhammad (SAW)?"

# Plain vector-similarity retriever over the index; the fusion retriever below wraps it.
vector_retriever = vectorstore.index.as_retriever(similarity_top_k=top_n)

In [20]:
from llama_index.core.retrievers import QueryFusionRetriever

# Fusion retriever: expands the question into several queries, runs each through the
# vector retriever and fuses the rankings with reciprocal rank fusion.
retriever = QueryFusionRetriever(
    [vector_retriever],
    # Pass the LLM configured in the setup cell explicitly; without this argument the
    # retriever falls back to the library-wide default LLM and `llm` is never used.
    llm=llm,
    similarity_top_k=top_n,
    num_queries=num_queries,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True,
    query_gen_prompt=query_gen_prompt,
)

In [61]:
# This generates num_queries - 1 new queries, since the original query is also included
nodes_with_scores = retriever.retrieve(question)

Generated queries:
1. Can you provide information about Prophet Muhammad (SAW)?
2. What can you tell me about Prophet Muhammad (SAW)?
3. Who is the historical figure known as Prophet Muhammad (SAW)?
4. Could you share details about Prophet Muhammad (SAW) with me?


In [62]:
# Number of nodes returned by the fusion retriever
len(nodes_with_scores)

20

In [63]:
# Show the top-ranked node: its book metadata first, then the chunk text
top_node = nodes_with_scores[0]
book_info = top_node.metadata

print("------- BOOK INFO -------")
print(f"Book Title: {book_info['title']}")
print(f"Book ID   : {book_info['id']}")
print(f"Author    : {book_info['author']}")

print("\n------- TEXT -------")
print(top_node.text)

------- BOOK INFO -------
Book Title: Prophet Muhammad (SAW) 
Book ID   : epub-319e117c70ba47faaa1abacf1429ce9d
Author    : Sakina Hasan Askari - XKP

------- TEXT -------
The Prophet wept when he heard about this. Ali, fearless, strong and
brave put so wonderful a fight that Jibrail praised Ali's action with the
celebrated words : "la fata illa Ali" (there is no man save Al and "la sayf
illa zulfiqar" (there is no sword save zulfiqar).

The scene again changed and the three thousand proud warriors of Arabia ran
from the battlefield like frightened rats, with Abu Sufyan in the front. The
news of a possible pursuit by Ali ibn Abu Talib broke his spirit and he at
once took the road to Mecca.

Next the enemy clans united and planned to invade Medina itself. The Holy
Prophet consulted the Muslims. One of his companions, Salman suggested that
they dig a ditch Kandaq, a trench around the north boundary of Medina. This
would prevent the enemy horses jumping through. The Prophet Muhammad SAW
a

We've retrieved some document chunks. Let's find the start and end positions of each chunk within its source document.

In [64]:
# Print the character span recorded in the vector store for every retrieved node.
for node in nodes_with_scores:
    # The source identifier is stored under "video_id" or, failing that, under "id".
    source_id = node.metadata.get("video_id") or node.metadata.get("id")

    # Look the node up in the vector store by its ID to read its stored offsets.
    stored_node = vectorstore.vector_store.get_nodes([node.node_id])[0]
    node_info = stored_node.node_info
    start, end = node_info["start"], node_info["end"]

    # Nodes without recorded offsets are skipped.
    if start is not None and end is not None:
        print(f"Start: {start}, End: {end}, Length: {end - start}, Source ID: {source_id}")

Start: 21901, End: 23426, Length: 1525, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 44039, End: 45414, Length: 1375, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 12749, End: 14254, Length: 1505, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 35320, End: 36969, Length: 1649, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 6657, End: 8134, Length: 1477, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 11186, End: 12748, Length: 1562, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 26462, End: 28033, Length: 1571, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 39831, End: 41322, Length: 1491, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 18936, End: 20546, Length: 1610, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 29472, End: 31075, Length: 1603, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 14256, End: 15851, Length: 1595, Source ID: epub-319e117c70ba47faaa1abacf1429ce9d
Start: 5133, End: 6656,

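How do we know that these values are correct? As a sanity check, we can compare the length of each chunk's text with the span `end - start` recorded for it: if the two match, the offsets are consistent with the chunk. Note that this only checks the lengths; a complete check would slice the original source text with the start and end positions and compare the result with the chunk text.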

In [65]:
# Sanity-check the recorded offsets: each chunk's text length must equal its span (end - start).
for node in nodes_with_scores:
    # Look the node up in the vector store by its ID to read its stored offsets.
    stored_node = vectorstore.vector_store.get_nodes([node.node_id])[0]
    node_info = stored_node.node_info
    start, end = node_info["start"], node_info["end"]

    if start is not None and end is not None:
        span = end - start
        # Compare against the recorded span itself. Comparing the text with a slice of
        # the same text (as done before) can only fail when the text is longer than the span.
        print(len(node.text), span, len(node.text) == span)

1525 1525 True
1375 1375 True
1505 1505 True
1649 1649 True
1477 1477 True
1562 1562 True
1571 1571 True
1491 1491 True
1610 1610 True
1603 1603 True
1595 1595 True
1523 1523 True
1680 1680 True
1326 1326 True
958 958 True
1494 1494 True
1457 1457 True
1329 1329 True
1413 1413 True
1396 1396 True


# Finding Adjacent Chunks

In [66]:
def group_nodes_by_source(chunks, vector_store=None):
    """Group retrieved chunks by their source and annotate them with character offsets.

    For every chunk, the matching node is fetched from the vector store by node ID and
    its ``node_info["start"]`` / ``node_info["end"]`` values are written into
    ``chunk.metadata["start"]`` / ``chunk.metadata["end"]`` (the chunk is modified in place).

    Args:
        chunks: Iterable of retrieved nodes exposing ``node_id`` and a ``metadata`` dict.
        vector_store: Object with a ``get_nodes(node_ids)`` method. Defaults to the
            notebook-level ``vectorstore.vector_store``.

    Returns:
        dict mapping source ID -> list of chunks from that source, in input order.
    """
    store = vectorstore.vector_store if vector_store is None else vector_store

    chunks_by_source = {}
    for chunk in chunks:
        # Use the same source key as the inspection cells above: "video_id" when
        # present, otherwise "id". Grouping by "id" alone puts every chunk that lacks
        # it under None, which would merge chunks from unrelated sources.
        source_id = chunk.metadata.get("video_id") or chunk.metadata.get("id")

        stored_node = store.get_nodes([chunk.node_id])[0]
        node_info = stored_node.node_info
        chunk.metadata["start"] = node_info["start"]
        chunk.metadata["end"] = node_info["end"]

        chunks_by_source.setdefault(source_id, []).append(chunk)

    return chunks_by_source

In [67]:
def find_adjacent_chunks(chunks):
    """Return pairs of chunks from the same source that touch or overlap.

    Chunks are grouped per source and ordered by their ``start`` offset; each chunk is
    then compared with its immediate successor only. A pair is reported when the first
    chunk's ``end`` is greater than or equal to the second chunk's ``start``.

    Returns:
        list of ``(earlier_chunk, later_chunk)`` tuples.
    """
    adjacent_pairs = []

    for same_source in group_nodes_by_source(chunks).values():
        ordered = sorted(same_source, key=lambda item: item.metadata.get("start", 0))

        # Walk over consecutive neighbours in start order.
        for left, right in zip(ordered, ordered[1:]):
            left_end = left.metadata.get("end", 0)
            right_start = right.metadata.get("start", 0)
            if left_end >= right_start:
                adjacent_pairs.append((left, right))

    return adjacent_pairs

In [68]:
# Find touching/overlapping chunk pairs and print both texts of each pair
adjacent_pairs = find_adjacent_chunks(nodes_with_scores)
for first, second in adjacent_pairs:
    print(f"Adjacent chunks found: {first.text} \n******\n {second.text}")

Adjacent chunks found: The
Meccans sent a large army with 1000 m en to Badr, twenty miles from Ma dinah.
The Prophet had only 313 brave soldiers ready to give up their life to defend
Islam . It was Allah alone, who gave victory through the valour of Ali and
Hamza which brought the enemy to their knees. Abu Jahl, the leader of the
Qureish was killed. Abu Sufyan became the new leader and vowed to fight the
Muslims to take revenge. Though they were few in numbers, the Muslims returned
to Madinah successful.

The next year, Abu Sufyan came to fight again with an even larger army. They
met the Muslims at the foot of M ount Uhud, just outside Madinah.

The Prophet went out to defend and put a group of 50 archers to guard a post
by the mountain. As the fight began and the Muslims were winning, these
archers thought that the battle was over and left their position to com e down
to collect the booty.

Khalid bin Walid from the Meccan army attacked the Muslims from behind. There
was chaos. The u

Reading these chunks, we can see that they do overlap. Within a source, two chunks overlap (or are directly adjacent) when the end position of the first chunk is greater than or equal to the start position of the second chunk, so we can find such pairs by comparing these two positions.

In [69]:
# Number of adjacent/overlapping pairs found
len(adjacent_pairs)

4

In [70]:
# List the node IDs of every adjacent/overlapping pair
print("Adjacent chunks IDs:")
for first, second in adjacent_pairs:
    print(f"Chunk 1 ID: {first.node_id}, Chunk 2 ID: {second.node_id}")

Adjacent chunks IDs:
Chunk 1 ID: f596652f-870f-4b9d-8686-ec382ce95475, Chunk 2 ID: f7f211e0-041a-45ee-8d10-c441d59e2259
Chunk 1 ID: 65e10b16-ea5a-404e-9808-3cf0bb46be34, Chunk 2 ID: da608af4-19e2-4995-824f-f17e105c08ca
Chunk 1 ID: 0793aecb-4998-427b-af50-c0db6a09b8ed, Chunk 2 ID: 24b837a1-662c-4d04-9f77-bb4cad5c1233
Chunk 1 ID: d19b242e-964f-4ead-80a6-83c8da5154b7, Chunk 2 ID: a02fb19a-3381-4c5c-927a-9c951c05239f


In [71]:
# Print the IDs of all retrieved nodes, in retrieval order, to cross-reference the pairs above
for node in nodes_with_scores:
    print(node.node_id)

f7f211e0-041a-45ee-8d10-c441d59e2259
d19b242e-964f-4ead-80a6-83c8da5154b7
398949b3-df89-40e9-b313-c338fb058334
24b837a1-662c-4d04-9f77-bb4cad5c1233
c02aeb29-40f9-498a-9b3f-cc42b0ea4b36
f219801d-3d3e-4e4c-83e6-21d39ea0e9a0
88873739-b9fc-4ffb-9599-cd46c6df0ba1
db3b743e-9514-400c-9c7e-abc960ac6522
bd6cbbc5-9a88-4aec-ba7b-0d93210ff49f
65e10b16-ea5a-404e-9808-3cf0bb46be34
ad3bc744-389e-4847-b6a7-5967fa23480e
b1f0586c-082f-45d9-b25d-42bea76483cc
da608af4-19e2-4995-824f-f17e105c08ca
a02fb19a-3381-4c5c-927a-9c951c05239f
8a02e6af-84fc-42db-a078-312c66eec373
3c896245-ba9a-4067-a56c-2ed38f393ed5
26c23673-f3f8-481e-8af0-f157c4157a15
c6089be3-d6cf-440b-93f2-0b6b235ded88
0793aecb-4998-427b-af50-c0db6a09b8ed
f596652f-870f-4b9d-8686-ec382ce95475


In [72]:
from typing import List, Dict, Any


def combine_chunks(chunks: List[Any]) -> List[Any]:
    """Merge retrieved chunks that overlap or touch within the same source.

    Chunks are grouped per source via ``group_nodes_by_source`` (which also writes the
    ``start`` / ``end`` character offsets into each chunk's metadata), ordered by
    ``start``, and merged whenever the running chunk's ``end`` is greater than or equal
    to the next chunk's ``start``.

    Compared with a plain concatenation this function:
      * appends only the part of the next chunk that lies beyond the current end, so
        overlapping text is not duplicated (offsets are treated as character offsets
        with an exclusive end, matching the length check earlier in the notebook);
      * works on deep copies, so the retrieved nodes' text is left untouched and the
        cell can be re-run safely;
      * passes chunks without recorded offsets through unmerged instead of failing
        while sorting.

    Args:
        chunks: Retrieved nodes exposing ``node_id``, ``metadata`` and ``node.text``.

    Returns:
        List of merged chunks. A merged chunk keeps the score and metadata of the
        first chunk of its run, with ``metadata["end"]`` extended to the merged end.
    """
    import copy  # local import keeps this cell self-contained

    chunks_by_source = group_nodes_by_source(chunks)

    combined_chunks = []

    for source_chunks in chunks_by_source.values():
        # Separate chunks that can be ordered from those without offsets.
        positioned = []
        for chunk in source_chunks:
            if chunk.metadata.get("start") is None or chunk.metadata.get("end") is None:
                combined_chunks.append(chunk)
            else:
                positioned.append(chunk)

        if not positioned:
            continue

        positioned.sort(key=lambda item: item.metadata["start"])

        # Copy before merging so the caller's nodes are never modified.
        current = copy.deepcopy(positioned[0])

        for next_chunk in positioned[1:]:
            current_end = current.metadata["end"]
            next_start = next_chunk.metadata["start"]

            if current_end >= next_start:
                # Skip the characters of next_chunk that current already contains.
                overlap = current_end - next_start
                current.node.text += next_chunk.node.text[overlap:]
                current.metadata["end"] = max(current_end, next_chunk.metadata["end"])
            else:
                # Gap between the chunks: close the current run and start a new one.
                combined_chunks.append(current)
                current = copy.deepcopy(next_chunk)

        combined_chunks.append(current)

    return combined_chunks

In [73]:
# Merge the retrieved chunks and print the text of each merged chunk
result = combine_chunks(nodes_with_scores)
for merged in result:
    print(f"Combined chunk: {merged.text}")

Combined chunk: ## Chapter 1 Introduction

Prophet Muhammad (SAW ) was chosen by Allah to bring His message of Islam to
its perfection. The followers of Islam are called Muslims.

He is the final messenger from Allah so is called the seal of the Prophet. He
was free from sin and perfect in every way. He was given the Quran, the word
of Allah. He and his Ahlulbayt are the living examples of the Quran in action,
showing us how to live as Allah wants us to.

From the beginning of time, Allah sent his messengers to the world so that
everybody had a guide. Hadhrat Adam (AS) is the first Prophet and Hadhrat
Muhammad SAW is the last Apostle of Allah.

The religion of all the Prophet is Islam. Each nation had a Prophet who told
them about the Almighty. The five great Prophet were Hadhrat Nooh AS, Hadhrat
Ibraheem AS, Hadhrat M oosa AS, Hadhrat Eesa AS and Hadhrat Muhammad SAW .
Allah sent His books through the Prophet: Sahifatu Nooh, Sahifatu Ibraheem,
Tauraat, Injeel and finally the Holy Qura

In [74]:
# Number of chunks after merging vs. number of chunks retrieved
len(result), len(nodes_with_scores)

(16, 20)