# Auto-merging Retrieval System

## **Overview**
This notebook implements **Auto-merging Retrieval**, an advanced retrieval strategy that combines hierarchical document chunking with intelligent merging at query time. It automatically merges smaller chunks into their larger parent chunks when appropriate, providing better context while maintaining efficiency.

## **Key Components**

### 1. **Hierarchical Node Parser**
- **Chunk Sizes**: [2048, 512, 128] tokens
- **Multi-level hierarchy**: Parent → Child → Leaf nodes
- **Smart merging**: Automatically combines related chunks

### 2. **Auto-merging Retriever**
- **Similarity Search**: Top-12 most relevant chunks
- **Intelligent Merging**: Combines small chunks into meaningful contexts
- **Context Preservation**: Maintains document structure and relationships

### 3. **Re-ranking**
- **SentenceTransformerRerank**: Uses BAAI/bge-reranker-base
- **Top-6 Selection**: Re-ranks and selects best 6 chunks
- **Quality Enhancement**: Improves retrieval precision

## **Use Cases**
- **Long Documents**: Better context for large PDFs/books
- **Structured Content**: Maintains document hierarchy
- **Precision Tasks**: When exact context is crucial
- **Research Papers**: Academic document analysis and similar long-form material

## **Advantages**
- **Better Context**: Merges related information automatically
- **Flexible Retrieval**: Adapts chunk size based on query needs
- **Memory Efficient**: Avoids storing all chunk combinations
- **High Precision**: Re-ranking improves answer quality

In [None]:
# Install required packages if not already installed
import subprocess
import sys

print(f"üêç Using Python: {sys.executable}")
print(f"üìç Python version: {sys.version}")

# Fix NumPy/Pandas binary compatibility issue
print("üîß Ensuring NumPy/Pandas compatibility...")
# Best-effort reinstall of a pinned pandas build. A failure here must not stop
# the notebook, but it must be reported truthfully and must not swallow
# KeyboardInterrupt/SystemExit the way a bare `except:` does.
try:
    subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", "-q", "pandas"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--no-cache-dir", "pandas==2.0.3"])
    print("‚úÖ Pandas reinstalled with NumPy 1.24.3 compatibility")
except (subprocess.CalledProcessError, OSError) as exc:
    # check_call raises CalledProcessError when pip exits non-zero; OSError
    # covers the interpreter itself failing to launch.
    # NOTE: if the uninstall succeeded but the install failed, pandas may now
    # be missing entirely, so surface the real error instead of claiming success.
    print(f"WARNING: pandas reinstall failed ({exc}); continuing with whatever pandas is currently installed")

# Install basic packages
print("üì¶ Installing other packages...")
# Install each extra dependency with its own quiet pip invocation, in this
# order; check_call raises CalledProcessError if any install fails.
for package in ("matplotlib", "docx2txt", "python-dotenv"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

print("‚úÖ Package installation complete!")

# All imports
import warnings
import os
import openai
import matplotlib.pyplot as plt
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# LlamaIndex Core imports
from llama_index.core import SimpleDirectoryReader, Document, VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.settings import Settings

# LlamaIndex Node Parser imports
from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes

# LlamaIndex LLM imports
from llama_index.llms.openai import OpenAI

# LlamaIndex Retriever and Postprocessor imports
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine

# LlamaIndex Response Utils
from llama_index.core.response.notebook_utils import display_response

warnings.filterwarnings('ignore')


In [None]:
# Read the provider credentials from the process environment (the .env file
# was already loaded by load_dotenv() earlier in the notebook). Each name is
# None when the corresponding variable is not set.
PINECONE_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY, ANTHROPIC_API_KEY = map(
    os.getenv,
    ('PINECONE_API_KEY', 'OPENAI_API_KEY', 'GOOGLE_API_KEY', 'ANTHROPIC_API_KEY'),
)

In [None]:
# from llama_index.core import SimpleDirectoryReader  # Already imported at the top

# Load every file found in the ./data directory; the result is indexed and
# iterated as a sequence of documents in the cells that follow.
documents = SimpleDirectoryReader(input_dir="./data").load_data()

# If you want to load a single specific PDF file, uncomment below:
# documents = SimpleDirectoryReader(
#     input_files=["./data/YAPI İŞLERİNDE İŞ SAĞLIĞI VE GÜVENLİĞİ YÖNETMELİĞİ.pdf"]
# ).load_data()

In [None]:
# Sanity-check what was loaded: container type, number of documents, then the
# type and contents of the first document (requires at least one document).
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]), documents[0], sep="\n")

## Auto-merging retrieval setup

In [None]:
# from llama_index.core import Document  # Already imported

# Collapse all loaded documents into one Document, separating their texts with
# a blank line, so the hierarchical parser sees a single continuous text.
document = Document(
    text="\n\n".join(doc.text for doc in documents)
)

In [None]:
# from llama_index.core.node_parser import HierarchicalNodeParser  # Already imported 

# Hierarchical parser with three explicit chunk sizes, listed from the largest
# (parent) level down to the smallest (leaf) level.
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])

In [None]:
# Parse the merged document into nodes for every level of the hierarchy; the
# leaf level is extracted from this list in the next cell.
nodes = node_parser.get_nodes_from_documents([document])

In [None]:
# from llama_index.core.node_parser import get_leaf_nodes  # Already imported

# Keep only the leaf-level nodes; these are the chunks passed to the vector
# index later in the notebook.
leaf_nodes = get_leaf_nodes(nodes)
# Spot-check one leaf chunk. NOTE(review): index 30 is hard-coded, so this
# raises IndexError when the corpus yields fewer than 31 leaf nodes.
print(leaf_nodes[30].text)

In [None]:
# Map node id -> node across all hierarchy levels so a parent can be looked up
# from the reference stored on a leaf.
nodes_by_id = {n.node_id: n for n in nodes}

# Show the parent chunk of the same sample leaf (index 30) for comparison with
# the leaf text printed in the previous cell.
parent_node = nodes_by_id[leaf_nodes[30].parent_node.node_id]
print(parent_node.text)

### Building the index

In [None]:
# from llama_index.llms.openai import OpenAI  # Already imported 

# LLM used by the notebook: OpenAI gpt-3.5-turbo with a low temperature (0.1).
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [None]:
# from llama_index.core import ServiceContext  # Deprecated, using Settings instead

# Configure the LLM and embedding model through Settings (the replacement for
# the deprecated ServiceContext) rather than passing them to each component.
Settings.llm = llm
# NOTE(review): "bge-small-en-v1.5" looks like an English-oriented embedding
# model while the sample query below is Turkish; confirm this is intended.
Settings.embed_model = "local:BAAI/bge-small-en-v1.5"

In [None]:
# from llama_index.core import VectorStoreIndex, StorageContext  # Already imported at the top

# The docstore receives every node from all hierarchy levels, while only the
# leaf nodes are handed to the vector index.
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

automerging_index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)

# Persist the index to ./merging_index so later cells can reload it.
automerging_index.storage_context.persist(persist_dir="./merging_index")

In [None]:
# This cell is optional: it checks whether a persisted index already exists.
# If it does, the index is loaded from disk;
# if not, the index is rebuilt and persisted.

# import os  # Already imported 
# from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage  # Already imported

if os.path.exists("./merging_index"):
    # A persisted index is already on disk: reload it instead of rebuilding.
    automerging_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./merging_index")
    )
else:
    # Nothing persisted yet: store all nodes in the docstore, index the leaf
    # nodes, and save the result for next time.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(leaf_nodes, storage_context=storage_context)

    automerging_index.storage_context.persist(persist_dir="./merging_index")

### Defining the retriever and running the query engine

In [None]:
# from llama_index.core.postprocessor import SentenceTransformerRerank  # Already imported at the top
# from llama_index.core.retrievers import AutoMergingRetriever  # Already imported at the top
# from llama_index.core.query_engine import RetrieverQueryEngine  # Already imported at the top

# Base retriever: similarity search over the index, returning the top 12 chunks.
automerging_retriever = automerging_index.as_retriever(
    similarity_top_k=12
)

# Wrap the base retriever in an AutoMergingRetriever, giving it the index's
# storage context so it can merge retrieved chunks into their parent chunks.
retriever = AutoMergingRetriever(
    automerging_retriever,
    automerging_index.storage_context,
    verbose=True
)

# Cross-encoder re-ranker that keeps the 6 best chunks after retrieval.
rerank = SentenceTransformerRerank(top_n=6, model="BAAI/bge-reranker-base")

# The engine must be built on `retriever` (the AutoMergingRetriever). Passing
# the base `automerging_retriever` here would bypass auto-merging entirely and
# leave `retriever` unused.
auto_merging_engine = RetrieverQueryEngine.from_args(
    retriever, node_postprocessors=[rerank]
)

In [None]:
# Sample query in Turkish (written without diacritics). Roughly: "According to
# the construction works regulation, above how many metres does work count as
# working at height?"
auto_merging_response = auto_merging_engine.query(
    "Yapi isleri yonetmeligine gore kac metreden sonra yuksekte calisma sayilir?"
)

In [None]:
# from llama_index.core.response.notebook_utils import display_response  # Already imported   
# Show the response to the sample query in the notebook output.
display_response(auto_merging_response)

## Putting it all Together

In [None]:
# import os  # Already imported at the top
# from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage  # Already imported 
# from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes  # Already imported 
# from llama_index.core.retrievers import AutoMergingRetriever  # Already imported 
# from llama_index.core.postprocessor import SentenceTransformerRerank  # Already imported 
# from llama_index.core.query_engine import RetrieverQueryEngine  # Already imported 


def build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    """Build an auto-merging index, or load it from ``save_dir`` if it exists.

    Args:
        documents: Documents to parse hierarchically when building the index.
        llm: LLM assigned to ``Settings.llm``.
        embed_model: Embedding model assigned to ``Settings.embed_model``.
        save_dir: Directory the index is persisted to / loaded from.
        chunk_sizes: Chunk sizes for the hierarchical parser, one per level;
            defaults to ``[2048, 512, 128]`` when omitted or empty.

    Returns:
        The freshly built index, or the index loaded from ``save_dir``.

    Note:
        When ``save_dir`` already exists the persisted index is returned as-is;
        ``documents`` and ``chunk_sizes`` are not used in that case. Delete the
        directory to force a rebuild after changing either of them.
    """
    # Use Settings instead of deprecated ServiceContext. This is set on both
    # paths because loading and querying rely on the same models as building.
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Check for a persisted index first so the hierarchical parsing below is
    # skipped when its result would be discarded anyway.
    if os.path.exists(save_dir):
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir)
        )

    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    # The docstore keeps every node (all levels); only leaves go to the index.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(
        leaf_nodes, storage_context=storage_context
    )
    automerging_index.storage_context.persist(persist_dir=save_dir)
    return automerging_index


def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=6,
):
    """Create a query engine that retrieves, auto-merges and re-ranks chunks.

    Args:
        automerging_index: Index to retrieve from; its storage context is also
            handed to the AutoMergingRetriever.
        similarity_top_k: Number of chunks the base retriever returns.
        rerank_top_n: Number of chunks kept by the re-ranker.

    Returns:
        A RetrieverQueryEngine built on the AutoMergingRetriever with a
        ``BAAI/bge-reranker-base`` re-ranking postprocessor.
    """
    leaf_retriever = automerging_index.as_retriever(
        similarity_top_k=similarity_top_k
    )
    merging_retriever = AutoMergingRetriever(
        leaf_retriever,
        automerging_index.storage_context,
        verbose=True,
    )
    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n,
        model="BAAI/bge-reranker-base",
    )
    return RetrieverQueryEngine.from_args(
        merging_retriever,
        node_postprocessors=[reranker],
    )

In [None]:
# from llama_index.llms.openai import OpenAI  # Already imported

# Build the index from the merged document, or load it when "./merging_index"
# is already present on disk (build_automerging_index checks for that path).
index = build_automerging_index(
    documents=[document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    save_dir="./merging_index",
)

In [None]:
# Query engine over the index with 6 retrieved chunks. NOTE(review): with
# similarity_top_k=6 and the default rerank_top_n=6 the re-ranker has at most
# 6 candidates, so it can only reorder them; confirm this is intended.
query_engine = get_automerging_query_engine(
    automerging_index=index,
    similarity_top_k=6,
)