# Enhanced RAG System with Hybrid Search and Metadata

This notebook demonstrates improvements to our RAG system. The changes include:

- A more reliable embedding model (`all-mpnet-base-v2`)
- A sophisticated, sentence-based text chunking strategy that attaches metadata
- A sparse (TF-IDF) index for keyword search
- A hybrid search combining dense (FAISS) and sparse (TF-IDF) scoring
- Optional metadata filtering of chunks

In [6]:
import re
import os
import numpy as np
import faiss
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Use a more powerful embedding model
embedding_model = SentenceTransformer('all-mpnet-base-v2')

from google import genai
from dotenv import load_dotenv
load_dotenv()
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))

In [7]:
def sophisticated_chunk_text(text, min_sentences=3, overlap=1):
    """
    Splits text into chunks using sentence tokenization. Each chunk contains at least
    min_sentences, and overlaps with the previous chunk by the last 'overlap' sentences.
    Returns a list of dictionaries with 'chunk' and 'metadata'.
    """
    sentences = sent_tokenize(text)
    chunks = []
    i = 0
    while i < len(sentences):
        chunk_sentences = sentences[i:i+min_sentences]
        if not chunk_sentences:
            break
        chunk_text = " ".join(chunk_sentences)
        metadata = {"start_sentence": i, "end_sentence": i + len(chunk_sentences) - 1}
        chunks.append({"chunk": chunk_text, "metadata": metadata})
        # Move forward by min_sentences - overlap sentences
        i += max(min_sentences - overlap, 1)
    return chunks

def create_faiss_index(embeddings):
    """
    Create a FAISS index for dense vector retrieval from the provided embeddings.
    """
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings.astype(np.float32))
    return index

def create_sparse_index(chunks):
    """
    Create a sparse TF-IDF index from the list of chunk dictionaries.
    """
    documents = [item["chunk"] for item in chunks]
    vectorizer = TfidfVectorizer()
    sparse_matrix = vectorizer.fit_transform(documents)
    return vectorizer, sparse_matrix

def hybrid_search(query, dense_index, sparse_vectorizer, sparse_matrix, chunks, top_k=3, alpha=0.5):
    """
    Retrieve relevant chunks using both dense (FAISS) and sparse (TF-IDF) search.
    The hybrid score is computed as a weighted sum of:
      - Dense score: inverse of the Euclidean distance
      - Sparse score: TF-IDF cosine similarity score
    alpha weights the dense score (0 <= alpha <= 1).
    """
    # Dense search
    query_embedding = embedding_model.encode([query])
    distances, dense_indices = dense_index.search(query_embedding.astype(np.float32), top_k)

    # Sparse search
    query_sparse = sparse_vectorizer.transform([query])
    sparse_scores = (sparse_matrix * query_sparse.T).toarray().ravel()
    sparse_indices = np.argsort(-sparse_scores)[:top_k]

    # Combine candidate indices from both searches
    combined_indices = set(dense_indices[0]).union(set(sparse_indices))
    combined_scores = {}

    # Compute a combined score for each candidate
    for idx in combined_indices:
        # For dense score, approximate by inverse Euclidean distance
        chunk_embedding = embedding_model.encode([chunks[idx]['chunk']])
        dense_score = 1 / (1 + np.linalg.norm(query_embedding - chunk_embedding))
        sparse_score = sparse_scores[idx]
        combined_scores[idx] = alpha * dense_score + (1 - alpha) * sparse_score

    # Rank candidates by the combined score
    ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return [chunks[idx]['chunk'] for idx, _ in ranked]

def filter_chunks_by_metadata(chunks, condition):
    """
    Filter chunks based on a metadata condition. The condition is a function
    that takes a metadata dictionary and returns True or False.
    """
    return [item for item in chunks if condition(item["metadata"])]

def semantic_retrieval_hybrid(query, dense_index, sparse_vectorizer, sparse_matrix, chunks, top_k=3):
    """
    Use the hybrid search for retrieval.
    """
    return hybrid_search(query, dense_index, sparse_vectorizer, sparse_matrix, chunks, top_k)

In [8]:
# For demonstration, we use a sample document.
import requests
from bs4 import BeautifulSoup

# Fetch a sample webpage (Alice's Adventures in Wonderland from Gutenberg)
url = "https://www.gutenberg.org/cache/epub/11/pg11-images.html"
response = requests.get(url)
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')
for script in soup(["script", "style"]):
    script.decompose()

document = soup.get_text(separator=" ", strip=True)
clean_text = re.sub('\s+', ' ', document).strip()
nltk.download('punkt_tab')
# Create chunks using the sophisticated sentence-based function
chunks = sophisticated_chunk_text(clean_text, min_sentences=5, overlap=2)
print(f"Total chunks created: {len(chunks)}")
print("Example chunk:", chunks[0])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\myhom\AppData\Roaming\nltk_data...


Total chunks created: 373
Example chunk: {'chunk': "Alice’s Adventures in Wonderland | Project Gutenberg The Project Gutenberg eBook of Alice's Adventures in Wonderland This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org . If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title : Alice's Adventures in Wonderland Author : Lewis Carroll Release date : June 27, 2008 [eBook #11] Most recently updated: November 10, 2024 Language : English Credits : Arthur DiBianca and David Widger *** START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND *** Alice’s Adventures in Wonderland by Lewis Carroll THE MILLENNIUM FULCRUM EDITION 3.0 Contents CHAPTER

[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [9]:
# Build dense and sparse indexes from the chunks
chunk_texts = [item["chunk"] for item in chunks]

# Generate dense embeddings
chunk_embeddings = embedding_model.encode(chunk_texts)
dense_index = create_faiss_index(chunk_embeddings)

# Build the sparse (TF-IDF) index
sparse_vectorizer, sparse_matrix = create_sparse_index(chunks)
print(f"Dense index built with {chunk_embeddings.shape[0]} items")

Dense index built with 373 items


In [22]:
# Define a sample query
query = "what did the queen shouted at the top of her voice"

# Optionally filter chunks by metadata, for example, only consider chunks starting after sentence 50
filtered_chunks = filter_chunks_by_metadata(chunks, lambda m: m["start_sentence"] > 50)

# if filtered_chunks and len(filtered_chunks) < len(chunks):
#     print(f"Using metadata-filtered chunks: {len(filtered_chunks)} items")
#     filtered_texts = [item["chunk"] for item in filtered_chunks]
#     filtered_embeddings = embedding_model.encode(filtered_texts)
#     dense_index = create_faiss_index(filtered_embeddings)
#     sparse_vectorizer, sparse_matrix = create_sparse_index(filtered_chunks)
#     results = semantic_retrieval_hybrid(query, dense_index, sparse_vectorizer, sparse_matrix, filtered_chunks)
# else:
results = semantic_retrieval_hybrid(query, dense_index, sparse_vectorizer, sparse_matrix, chunks)


In [23]:

print("\nRetrieved Chunks:")
for res in results:
    print("-", res[:200], "...")

# Generate an answer using Gemini LLM
context = "\n".join(results)
prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt
)

print("\nGenerated Answer:")
print(response.text)


Retrieved Chunks:
- “Nothing whatever? ” persisted the King. “Nothing whatever,” said Alice. “That’s very important,” the King said, turning to the jury. They were just beginning to write this down on their slates, when  ...
- “I want a clean cup,” interrupted the Hatter: “let’s all move one place on.” He moved on as he spoke, and the Dormouse followed him: the March Hare moved into the Dormouse’s place, and Alice rather un ...
- However, he consented to go on. “And so these three little sisters—they were learning to draw, you know—” “What did they draw?” said Alice, quite forgetting her promise. “Treacle,” said the Dormouse,  ...

Generated Answer:
The provided text does not include any mention of the Queen shouting. The characters in the text include the King, Alice, the White Rabbit, the Hatter, the Dormouse, and the March Hare.

