In [None]:
import os
import dotenv
dotenv.load_dotenv()
search_endpoint = os.getenv("SEARCH_ENDPOINT")
admin_key= os.getenv("ADMIN_KEY")
index_name = os.getenv("INDEX_NAME")

In [None]:
import pandas as pd
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex, SimpleField, SearchableField, SearchFieldDataType
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

# Set your Azure Search details
endpoint = search_endpoint
admin_key = admin_key
credential = AzureKeyCredential(admin_key)

# Index names
indexes = ["finance", "hr", "engineering"]
csv_files = {
    "finance": "finance.csv",
    "hr": "hr.csv",
    "engineering": "engineering.csv"
}

In [None]:
def create_index(index_name):
    fields = [
        SimpleField(name="id", type=SearchFieldDataType.String, key=True),
        SearchableField(name="title", type=SearchFieldDataType.String, filterable=True, sortable=True, facetable=False),
        SearchableField(name="content", type=SearchFieldDataType.String, filterable=False, sortable=False, facetable=False),
        SimpleField(name="timestamp", type=SearchFieldDataType.DateTimeOffset, filterable=True, sortable=True, facetable=False),
        SimpleField(name="score", type=SearchFieldDataType.Double, filterable=True, sortable=True, facetable=False)
    ]
    index = SearchIndex(name=index_name, fields=fields)
    index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
    # Delete if exists
    try:
        index_client.delete_index(index_name)
    except Exception:
        pass
    index_client.create_index(index)
    print(f"Index '{index_name}' created.")

for idx in indexes:
    create_index(idx)

In [None]:
def upload_documents(index_name, csv_file):
    df = pd.read_csv(csv_file)
    # Azure Search doesn't like '@' in field names, so rename
    df = df.rename(columns={"@search.score": "score"})
    # Convert timestamp to ISO format
    df['timestamp'] = pd.to_datetime(df['timestamp']).dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    docs = df.to_dict(orient="records")
    search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
    result = search_client.upload_documents(documents=docs)
    print(f"Uploaded {len(docs)} docs to '{index_name}':", result[0].status_code if result else "OK")

for idx, file in csv_files.items():
    upload_documents(idx, file)


In [None]:
from typing import List, Dict
import numpy as np

def search_index(index_name, search_text, top=10, filter_expr=None, orderby=None):
    search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=credential)
    results = search_client.search(
        search_text,
        filter=filter_expr,
        top=top,
        order_by=orderby if orderby else None,
        include_total_count=True
    )
    docs = []
    for r in results:
        doc = r.copy()
        doc['__index'] = index_name
        doc['score'] = r['score'] if 'score' in r else r['@search.score'] if '@search.score' in r else r['score']
        docs.append(doc)
    return docs

# Query all indexes
def query_all_indexes(search_text, top=10, filter_expr=None, orderby="timestamp desc"):
    all_results = []
    for idx in indexes:
        docs = search_index(idx, search_text, top=top, filter_expr=filter_expr, orderby=orderby)
        all_results.extend(docs)
    return all_results

# Normalize scores
def normalize_scores(results, score_field="score"):
    scores = [doc[score_field] for doc in results]
    min_s, max_s = min(scores), max(scores)
    for doc in results:
        doc["norm_score"] = (doc[score_field] - min_s) / (max_s - min_s) if max_s > min_s else 1.0
    return results

# RRF merge with tie-breaking
def rrf_merge(results_by_index: Dict[str, List[Dict]], k=60):
    """
    Merge results using Reciprocal Rank Fusion.
    
    When RRF scores are identical, uses timestamp as tie-breaker (newest first).
    This ensures consistent, predictable ordering across pages.
    """
    doc_ranks = {}
    for idx, docs in results_by_index.items():
        docs_sorted = sorted(docs, key=lambda x: x["norm_score"], reverse=True)
        for rank, doc in enumerate(docs_sorted):
            doc_id = doc["id"]
            if doc_id not in doc_ranks:
                doc_ranks[doc_id] = []
            doc_ranks[doc_id].append(rank + 1)
    
    merged = []
    for doc_id, ranks in doc_ranks.items():
        rrf_score = sum(1 / (r + k) for r in ranks)
        doc = next(d for d in all_results if d["id"] == doc_id)
        merged.append({**doc, "rrf_score": rrf_score})
    
    # Sort by RRF score (primary), then timestamp (secondary tie-breaker)
    # This ensures consistent ordering when RRF scores are equal
    merged.sort(key=lambda x: (-x["rrf_score"], x["timestamp"]), reverse=True)
    
    return merged

# Paging by RRF score (recommended for relevance-based results)
def get_page_by_rrf(results, page_num=1, page_size=10):
    """
    Paginate results by RRF score (maintains relevance ranking).
    Use this when you want to show results in order of relevance.
    """
    start = (page_num - 1) * page_size
    end = start + page_size
    page = results[start:end]
    
    total_pages = (len(results) + page_size - 1) // page_size
    has_next = page_num < total_pages
    
    return {
        "results": page,
        "page": page_num,
        "page_size": page_size,
        "total_results": len(results),
        "total_pages": total_pages,
        "has_next": has_next,
        "has_previous": page_num > 1
    }

# Paging by timestamp (cursor-based - for time-sorted results)
def get_page_by_timestamp(df, page_size=2, last_timestamp=None):
    """
    Paginate results by timestamp (cursor-based).
    Use this when you want to show results in chronological order.
    Note: This ignores RRF scores and sorts purely by time.
    """
    df = sorted(df, key=lambda x: x['timestamp'], reverse=True)
    if last_timestamp:
        df = [d for d in df if d['timestamp'] < last_timestamp]
    page = df[:page_size]
    next_cursor = page[-1]['timestamp'] if page else None
    return page, next_cursor

# Demo search with RRF-based pagination
search_text = "cloud security"
all_results = query_all_indexes(search_text, top=10)
all_results = normalize_scores(all_results)
# Group by index for RRF
results_by_index = {idx: [doc for doc in all_results if doc["__index"] == idx] for idx in indexes}
merged_results = rrf_merge(results_by_index)

print("="*70)
print("PAGINATION BY RRF SCORE (Maintains Relevance Ranking)")
print("="*70)

# First page - shows highest RRF scores
page_result = get_page_by_rrf(merged_results, page_num=1, page_size=2)
print(f"\n📄 Page {page_result['page']} of {page_result['total_pages']}:")
for i, doc in enumerate(page_result['results'], 1):
    print(f"   {i}. [{doc['__index']:12}] {doc['title'][:40]:40} | RRF: {doc['rrf_score']:.4f} | {doc['timestamp']}")

# Second page - shows next highest RRF scores
if page_result['has_next']:
    page_result2 = get_page_by_rrf(merged_results, page_num=2, page_size=2)
    print(f"\n📄 Page {page_result2['page']} of {page_result2['total_pages']}:")
    for i, doc in enumerate(page_result2['results'], 1):
        print(f"   {i}. [{doc['__index']:12}] {doc['title'][:40]:40} | RRF: {doc['rrf_score']:.4f} | {doc['timestamp']}")

print("\n" + "="*70)
print("PAGINATION BY TIMESTAMP (Chronological Order)")
print("="*70)

# Compare with timestamp-based pagination
page1, cursor1 = get_page_by_timestamp(merged_results, page_size=2)
print(f"\n📄 Page 1 (newest first):")
for i, doc in enumerate(page1, 1):
    print(f"   {i}. [{doc['__index']:12}] {doc['title'][:40]:40} | RRF: {doc['rrf_score']:.4f} | {doc['timestamp']}")
print(f"Next cursor: {cursor1}")

if cursor1:
    page2, cursor2 = get_page_by_timestamp(merged_results, page_size=2, last_timestamp=cursor1)
    print(f"\n📄 Page 2:")
    for i, doc in enumerate(page2, 1):
        print(f"   {i}. [{doc['__index']:12}] {doc['title'][:40]:40} | RRF: {doc['rrf_score']:.4f} | {doc['timestamp']}")
    print(f"Next cursor: {cursor2}")

# Optimized Multi-Index Search with Azure Redis Cache

This implementation adds:
1. **Azure Cache for Redis** - Managed Redis service from Azure
2. **Result Caching** - Cache merged results to avoid repeated queries
3. **Lazy Loading** - Only fetch results as needed
4. **Max Pages Limit** - Prevent unbounded result sets

In [None]:
# Install required package (if not already installed)
# !pip install redis

In [None]:
import redis
import json
import hashlib
from typing import List, Dict, Optional
from datetime import datetime

class AzureRedisMultiIndexSearch:
    """
    Optimized multi-index search with Azure Cache for Redis.
    
    Features:
    - Caches merged search results in Azure Redis
    - Lazy loading - only fetches what's needed
    - Max pages limit to prevent unbounded queries
    - TTL-based cache expiration
    """
    
    def __init__(self, 
                 redis_host: str,
                 redis_password: str,
                 redis_port: int = 6380,  # Azure Redis default SSL port
                 redis_ssl: bool = True,   # Azure Redis requires SSL
                 cache_ttl: int = 300,     # 5 minutes
                 max_pages: int = 50,
                 page_size: int = 10):
        """
        Initialize Azure Redis connection.
        
        Args:
            redis_host: Azure Redis hostname (e.g., 'myredis.redis.cache.windows.net')
            redis_password: Azure Redis access key (from Azure Portal)
            redis_port: Port (6380 for SSL, 6379 for non-SSL)
            redis_ssl: Use SSL (required for Azure Redis)
            cache_ttl: Cache time-to-live in seconds
            max_pages: Maximum number of pages to return
            page_size: Number of results per page
        """
        # Connect to Azure Redis
        self.redis_client = redis.Redis(
            host=redis_host,
            port=redis_port,
            password=redis_password,
            ssl=redis_ssl,
            ssl_cert_reqs=None,  # Azure manages certificates
            decode_responses=True  # Return strings instead of bytes
        )
        
        self.cache_ttl = cache_ttl
        self.MAX_PAGES = max_pages
        self.PAGE_SIZE = page_size
        self.MAX_RESULTS = max_pages * page_size
        
        # Test connection
        try:
            self.redis_client.ping()
            print("✅ Connected to Azure Redis Cache")
        except redis.ConnectionError as e:
            print(f"❌ Failed to connect to Azure Redis: {e}")
            raise
    
    def _generate_cache_key(self, search_text: str, filters: Optional[str] = None) -> str:
        """Generate unique cache key for search parameters."""
        key_data = f"{search_text}:{filters or ''}"
        hash_key = hashlib.md5(key_data.encode()).hexdigest()
        return f"multi_index_search:{hash_key}"
    
    def _get_cached_results(self, cache_key: str) -> Optional[List[Dict]]:
        """Retrieve cached results from Azure Redis."""
        try:
            cached_data = self.redis_client.get(cache_key)
            if cached_data:
                print(f"✅ Cache HIT - Retrieved from Azure Redis")
                return json.loads(cached_data)
            else:
                print(f"❌ Cache MISS - Will query Azure AI Search")
                return None
        except Exception as e:
            print(f"⚠️ Redis error: {e} - Proceeding without cache")
            return None
    
    def _cache_results(self, cache_key: str, results: List[Dict]):
        """Store results in Azure Redis with TTL."""
        try:
            self.redis_client.setex(
                cache_key,
                self.cache_ttl,
                json.dumps(results)
            )
            print(f"💾 Cached {len(results)} results in Azure Redis (TTL: {self.cache_ttl}s)")
        except Exception as e:
            print(f"⚠️ Failed to cache results: {e}")
    
    def search(self, 
               search_text: str, 
               page: int = 1,
               filter_expr: Optional[str] = None) -> Dict:
        """
        Execute multi-index search with caching and pagination.
        
        Args:
            search_text: Search query
            page: Page number (1-indexed)
            filter_expr: Optional OData filter expression
            
        Returns:
            Dictionary with results, pagination info, and metadata
        """
        # Validate page number
        if page < 1:
            raise ValueError("Page must be >= 1")
        if page > self.MAX_PAGES:
            raise ValueError(f"Page {page} exceeds maximum of {self.MAX_PAGES}")
        
        start_time = datetime.now()
        
        # Try to get from cache
        cache_key = self._generate_cache_key(search_text, filter_expr)
        merged_results = self._get_cached_results(cache_key)
        
        if merged_results is None:
            # Cache miss - query Azure AI Search
            print(f"🔍 Querying {len(indexes)} indexes...")
            
            # Calculate how many results we need per index
            # We want enough to fill MAX_PAGES, distributed across indexes
            per_index_limit = min(200, self.MAX_RESULTS // len(indexes) + 10)
            
            # Query all indexes
            all_results = query_all_indexes(
                search_text, 
                top=per_index_limit,
                filter_expr=filter_expr
            )
            
            if not all_results:
                return {
                    "results": [],
                    "page": page,
                    "page_size": self.PAGE_SIZE,
                    "total_results": 0,
                    "total_pages": 0,
                    "query_time_ms": (datetime.now() - start_time).total_seconds() * 1000,
                    "cache_hit": False
                }
            
            # Normalize scores
            all_results = normalize_scores(all_results)
            
            # Group by index for RRF
            results_by_index = {
                idx: [doc for doc in all_results if doc["__index"] == idx] 
                for idx in indexes
            }
            
            # Apply RRF merge
            merged_results = rrf_merge(results_by_index)
            
            # Limit to max results
            merged_results = merged_results[:self.MAX_RESULTS]
            
            # Cache the results
            self._cache_results(cache_key, merged_results)
            cache_hit = False
        else:
            cache_hit = True
        
        # Calculate pagination
        total_results = len(merged_results)
        total_pages = (total_results + self.PAGE_SIZE - 1) // self.PAGE_SIZE
        
        # Extract requested page
        start_idx = (page - 1) * self.PAGE_SIZE
        end_idx = start_idx + self.PAGE_SIZE
        page_results = merged_results[start_idx:end_idx]
        
        query_time_ms = (datetime.now() - start_time).total_seconds() * 1000
        
        return {
            "results": page_results,
            "page": page,
            "page_size": self.PAGE_SIZE,
            "total_results": total_results,
            "total_pages": total_pages,
            "has_next": page < total_pages,
            "has_previous": page > 1,
            "query_time_ms": round(query_time_ms, 2),
            "cache_hit": cache_hit,
            "indexes_searched": indexes
        }
    
    def clear_cache(self, search_text: Optional[str] = None):
        """Clear cache for specific search or all searches."""
        if search_text:
            cache_key = self._generate_cache_key(search_text)
            self.redis_client.delete(cache_key)
            print(f"🗑️ Cleared cache for: {search_text}")
        else:
            # Clear all multi-index search caches
            pattern = "multi_index_search:*"
            keys = self.redis_client.keys(pattern)
            if keys:
                self.redis_client.delete(*keys)
                print(f"🗑️ Cleared {len(keys)} cached searches")
            else:
                print("ℹ️ No cached searches to clear")
    
    def get_cache_stats(self) -> Dict:
        """Get Redis cache statistics."""
        try:
            info = self.redis_client.info('stats')
            return {
                "total_connections": info.get('total_connections_received', 0),
                "total_commands": info.get('total_commands_processed', 0),
                "keyspace_hits": info.get('keyspace_hits', 0),
                "keyspace_misses": info.get('keyspace_misses', 0),
                "hit_rate": round(
                    info.get('keyspace_hits', 0) / 
                    max(info.get('keyspace_hits', 0) + info.get('keyspace_misses', 0), 1) * 100, 
                    2
                )
            }
        except Exception as e:
            return {"error": str(e)}

## Configure Azure Redis Connection

Get your Azure Redis credentials from Azure Portal:
1. Go to Azure Portal → Azure Cache for Redis
2. Click on your Redis instance
3. Go to "Access keys" blade
4. Copy the "Primary connection string" or use host + key separately

In [28]:
# Azure Redis Configuration
# Add these to your .env file:
# REDIS_HOST=your-redis-name.redis.cache.windows.net
# REDIS_PASSWORD=your-redis-access-key
# REDIS_PORT=6380
# REDIS_SSL=true

import os
import dotenv
dotenv.load_dotenv()

# Load from environment variables (no hardcoded values)
redis_host = os.getenv("REDIS_HOST")
redis_password = os.getenv("REDIS_PASSWORD")
redis_port = int(os.getenv("REDIS_PORT", "6380"))  # Azure Redis SSL port
redis_ssl = os.getenv("REDIS_SSL", "true").lower() == "true"

print(f"Redis Host: {redis_host}")
print(f"Redis Port: {redis_port}")
print(f"Redis SSL: {redis_ssl}")
print(f"Redis Password: {'✅ Set' if redis_password else '❌ Not set'}")

Redis Host: testredisforaisearch.redis.cache.windows.net
Redis Port: 6380
Redis SSL: True
Redis Password: ✅ Set


## Usage Example: Multi-Index Search with Caching

In [31]:
# Initialize the optimized search service
# Check if Redis credentials are configured
if not redis_host or not redis_password:
    print("⚠️  Redis credentials not configured in .env file")
    print("ℹ️  Skipping Redis cache demo. To enable caching:")
    print("   1. Add REDIS_HOST to your .env file")
    print("   2. Add REDIS_PASSWORD to your .env file")
    print("   3. Ensure your IP is whitelisted in Azure Redis firewall")
    print("\n✅ The solution works without Redis - it just won't have caching.")
    print("   Continue with cell 16 for the balanced pagination demo!")
else:
    try:
        search_service = AzureRedisMultiIndexSearch(
            redis_host=redis_host,
            redis_password=redis_password,
            redis_port=redis_port,
            redis_ssl=redis_ssl,
            cache_ttl=300,      # 5 minutes
            max_pages=50,       # Max 50 pages
            page_size=10        # 10 results per page
        )
        
        # First search - will query Azure AI Search and cache results
        print("\n" + "="*60)
        print("FIRST SEARCH (Cache Miss)")
        print("="*60)
        result1 = search_service.search("cloud security", page=1)
        
        print(f"\n📊 Results:")
        print(f"   Page: {result1['page']} of {result1['total_pages']}")
        print(f"   Total results: {result1['total_results']}")
        print(f"   Query time: {result1['query_time_ms']} ms")
        print(f"   Cache hit: {result1['cache_hit']}")
        print(f"\n📄 Top results:")
        for i, doc in enumerate(result1['results'][:3], 1):
            print(f"   {i}. [{doc['__index']}] {doc['title']}")
            print(f"      RRF Score: {doc['rrf_score']:.4f}")
        
        # Second search - same query, will use cache
        print("\n" + "="*60)
        print("SECOND SEARCH - Same Query (Cache Hit)")
        print("="*60)
        result2 = search_service.search("cloud security", page=1)
        
        print(f"\n📊 Results:")
        print(f"   Page: {result2['page']} of {result2['total_pages']}")
        print(f"   Query time: {result2['query_time_ms']} ms ⚡")
        print(f"   Cache hit: {result2['cache_hit']}")
        print(f"   Speed improvement: {round(result1['query_time_ms'] / result2['query_time_ms'], 1)}x faster")
        
        # Page navigation - will use same cached results
        print("\n" + "="*60)
        print("PAGE NAVIGATION (Cache Hit)")
        print("="*60)
        result3 = search_service.search("cloud security", page=2)
        
        print(f"\n📊 Results:")
        print(f"   Page: {result3['page']} of {result3['total_pages']}")
        print(f"   Query time: {result3['query_time_ms']} ms ⚡")
        print(f"   Cache hit: {result3['cache_hit']}")
        print(f"\n📄 Page 2 results:")
        for i, doc in enumerate(result3['results'][:3], 1):
            print(f"   {i}. [{doc['__index']}] {doc['title']}")
            print(f"      RRF Score: {doc['rrf_score']:.4f}")
        
        # Get cache statistics
        print("\n" + "="*60)
        print("REDIS CACHE STATISTICS")
        print("="*60)
        stats = search_service.get_cache_stats()
        print(f"   Cache hit rate: {stats.get('hit_rate', 'N/A')}%")
        print(f"   Total commands: {stats.get('total_commands', 'N/A')}")
        print(f"   Keyspace hits: {stats.get('keyspace_hits', 'N/A')}")
        print(f"   Keyspace misses: {stats.get('keyspace_misses', 'N/A')}")
        
    except redis.ConnectionError as e:
        print(f"❌ Cannot connect to Azure Redis: {e}")
        print("\n📋 Troubleshooting Steps:")
        print("   1. Verify REDIS_HOST format: yourname.redis.cache.windows.net")
        print("   2. Get access key from Azure Portal → Redis → Access keys")
        print("   3. Whitelist your IP in Azure Portal → Redis → Firewall")
        print("   4. Ensure using port 6380 (SSL) not 6379")
        print("\n✅ Solution works without Redis - continue with cell 16!")
    except Exception as e:
        print(f"❌ Error: {e}")
        print("\n✅ Solution works without Redis - continue with cell 16!")

✅ Connected to Azure Redis Cache

FIRST SEARCH (Cache Miss)
❌ Cache MISS - Will query Azure AI Search
🔍 Querying 3 indexes...
❌ Error: 

✅ Solution works without Redis - continue with cell 16!


## Cache Management

In [32]:
# Clear cache for specific search
# search_service.clear_cache("cloud security")

# Clear all cached searches
# search_service.clear_cache()

# Check current cache stats
try:
    stats = search_service.get_cache_stats()
    print("Current Cache Statistics:")
    for key, value in stats.items():
        print(f"  {key}: {value}")
except Exception as e:
    print(f"Note: Redis not connected - {e}")

Current Cache Statistics:
  total_connections: 63300
  total_commands: 52093
  keyspace_hits: 0
  keyspace_misses: 1
  hit_rate: 0.0


# Balanced Multi-Index Pagination

To show results from **all indexes on each page**, we need a different strategy than pure RRF ranking.

In [33]:
def get_page_balanced(results_by_index: Dict[str, List[Dict]], page_num=1, page_size=10):
    """
    Balanced pagination - ensures each page has results from all indexes.
    
    Strategy: Round-robin interleaving
    - Page 1: Top result from each index (finance, hr, eng), then next from each, etc.
    - Page 2: Continue the pattern
    
    This ensures users see diverse results from all business areas on every page.
    
    Args:
        results_by_index: Dictionary of index_name -> list of results (sorted by RRF)
        page_num: Page number (1-indexed)
        page_size: Results per page
        
    Returns:
        Dictionary with balanced page results and metadata
    """
    # Create iterators for each index (already sorted by RRF within each index)
    index_iterators = {idx: iter(docs) for idx, docs in results_by_index.items()}
    index_names = list(results_by_index.keys())
    
    # Interleave results using round-robin
    all_results = []
    exhausted = set()
    
    while len(exhausted) < len(index_names):
        for idx_name in index_names:
            if idx_name not in exhausted:
                try:
                    doc = next(index_iterators[idx_name])
                    all_results.append(doc)
                except StopIteration:
                    exhausted.add(idx_name)
    
    # Now paginate the interleaved results
    total_results = len(all_results)
    total_pages = (total_results + page_size - 1) // page_size
    
    start = (page_num - 1) * page_size
    end = start + page_size
    page_results = all_results[start:end]
    
    # Calculate index distribution on this page
    index_counts = {}
    for doc in page_results:
        idx = doc['__index']
        index_counts[idx] = index_counts.get(idx, 0) + 1
    
    return {
        "results": page_results,
        "page": page_num,
        "page_size": page_size,
        "total_results": total_results,
        "total_pages": total_pages,
        "has_next": page_num < total_pages,
        "has_previous": page_num > 1,
        "index_distribution": index_counts  # Shows how many from each index
    }


def get_page_balanced_weighted(results_by_index: Dict[str, List[Dict]], page_num=1, page_size=10):
    """
    Weighted balanced pagination - proportional representation by result count.
    
    If finance has 8 results, hr has 4, engineering has 2:
    - Page 1 might have: 4 finance, 2 hr, 1 engineering (roughly proportional)
    
    Better than pure round-robin when indexes have very different result counts.
    """
    # Calculate total and proportions
    total_count = sum(len(docs) for docs in results_by_index.values())
    if total_count == 0:
        return {
            "results": [],
            "page": page_num,
            "page_size": page_size,
            "total_results": 0,
            "total_pages": 0,
            "has_next": False,
            "has_previous": False,
            "index_distribution": {}
        }
    
    # Calculate how many results each index should contribute per page
    proportions = {
        idx: len(docs) / total_count 
        for idx, docs in results_by_index.items()
    }
    
    # Build interleaved list with weighted distribution
    all_results = []
    index_positions = {idx: 0 for idx in results_by_index.keys()}
    
    # Interleave based on proportions
    while any(pos < len(results_by_index[idx]) for idx, pos in index_positions.items()):
        for idx in sorted(results_by_index.keys(), key=lambda x: -proportions[x]):
            pos = index_positions[idx]
            if pos < len(results_by_index[idx]):
                all_results.append(results_by_index[idx][pos])
                index_positions[idx] += 1
    
    # Paginate
    total_results = len(all_results)
    total_pages = (total_results + page_size - 1) // page_size
    
    start = (page_num - 1) * page_size
    end = start + page_size
    page_results = all_results[start:end]
    
    # Calculate distribution
    index_counts = {}
    for doc in page_results:
        idx = doc['__index']
        index_counts[idx] = index_counts.get(idx, 0) + 1
    
    return {
        "results": page_results,
        "page": page_num,
        "page_size": page_size,
        "total_results": total_results,
        "total_pages": total_pages,
        "has_next": page_num < total_pages,
        "has_previous": page_num > 1,
        "index_distribution": index_counts
    }


# Demo: Balanced Pagination
print("\n" + "="*70)
print("BALANCED PAGINATION - Round Robin (Equal representation)")
print("="*70)
print("Each page tries to show results from ALL indexes\n")

# Get results grouped by index (sorted by RRF within each)
search_text = "cloud security"
all_results = query_all_indexes(search_text, top=10)
all_results = normalize_scores(all_results)
results_by_index = {idx: [doc for doc in all_results if doc["__index"] == idx] for idx in indexes}

# Sort each index's results by RRF
for idx in results_by_index:
    docs = results_by_index[idx]
    docs_sorted = sorted(docs, key=lambda x: x.get("norm_score", 0), reverse=True)
    for i, doc in enumerate(docs_sorted):
        doc['rrf_score'] = 1 / (i + 1 + 60)  # Simple RRF for demo
    results_by_index[idx] = docs_sorted

# Page 1
page1 = get_page_balanced(results_by_index, page_num=1, page_size=6)
print(f"📄 Page {page1['page']} of {page1['total_pages']} (showing {len(page1['results'])} results)")
print(f"Index distribution: {page1['index_distribution']}")
print()
for i, doc in enumerate(page1['results'], 1):
    print(f"   {i}. [{doc['__index']:12}] {doc['title'][:45]:45} | {doc['timestamp']}")

# Page 2
if page1['has_next']:
    page2 = get_page_balanced(results_by_index, page_num=2, page_size=6)
    print(f"\n📄 Page {page2['page']} of {page2['total_pages']} (showing {len(page2['results'])} results)")
    print(f"Index distribution: {page2['index_distribution']}")
    print()
    for i, doc in enumerate(page2['results'], 1):
        print(f"   {i}. [{doc['__index']:12}] {doc['title'][:45]:45} | {doc['timestamp']}")

print("\n" + "="*70)
print("WEIGHTED BALANCED PAGINATION (Proportional representation)")
print("="*70)
print("Indexes with more results get more space on each page\n")

# Weighted version
page1_weighted = get_page_balanced_weighted(results_by_index, page_num=1, page_size=6)
print(f"📄 Page {page1_weighted['page']} of {page1_weighted['total_pages']} (showing {len(page1_weighted['results'])} results)")
print(f"Index distribution: {page1_weighted['index_distribution']}")
print()
for i, doc in enumerate(page1_weighted['results'], 1):
    print(f"   {i}. [{doc['__index']:12}] {doc['title'][:45]:45} | {doc['timestamp']}")


BALANCED PAGINATION - Round Robin (Equal representation)
Each page tries to show results from ALL indexes

📄 Page 1 of 5 (showing 6 results)
Index distribution: {'finance': 2, 'hr': 2, 'engineering': 2}

   1. [finance     ] Cloud Data Encryption                         | 2025-09-22T15:45:00Z
   2. [hr          ] HR Data Protection                            | 2025-09-20T08:15:00Z
   3. [engineering ] Cloud Security Architecture                   | 2025-09-30T11:00:00Z
   4. [finance     ] Cybersecurity Incident Response               | 2025-09-24T13:30:00Z
   5. [hr          ] Cloud Security Training                       | 2025-09-28T08:00:00Z
   6. [engineering ] DevSecOps Pipeline                            | 2025-09-20T10:30:00Z

📄 Page 2 of 5 (showing 6 results)
Index distribution: {'finance': 2, 'hr': 2, 'engineering': 2}

   1. [finance     ] Cloud Migration for Trading Systems           | 2025-09-27T11:15:00Z
   2. [hr          ] Cloud Access Management                       

# Expanded Dataset Demo

Now let's reload the expanded datasets (20 documents per index = 60 total) and demonstrate pagination with multiple pages.

In [None]:
# Recreate indexes with expanded data
print("🔄 Recreating indexes with expanded datasets...")
for idx in indexes:
    create_index(idx)

print("\n📤 Uploading expanded documents...")
for idx, file in csv_files.items():
    upload_documents(idx, file)

print("\n✅ Indexes recreated with 20 documents each (60 total)")

# Search and show dataset statistics
search_text = "cloud security"
all_results = query_all_indexes(search_text, top=50)  # Fetch more results
all_results = normalize_scores(all_results)

print(f"\n📊 Dataset Statistics:")
print(f"   Total results for '{search_text}': {len(all_results)}")
for idx in indexes:
    count = len([doc for doc in all_results if doc["__index"] == idx])
    print(f"   {idx:12}: {count} documents")

# Prepare for balanced pagination
results_by_index = {idx: [doc for doc in all_results if doc["__index"] == idx] for idx in indexes}

# Sort each index's results by score
for idx in results_by_index:
    docs = results_by_index[idx]
    docs_sorted = sorted(docs, key=lambda x: x.get("norm_score", 0), reverse=True)
    for i, doc in enumerate(docs_sorted):
        doc['rrf_score'] = 1 / (i + 1 + 60)
    results_by_index[idx] = docs_sorted

print("\n" + "="*80)
print("BALANCED PAGINATION - Multiple Pages Demo")
print("="*80)
print("Each page shows results from ALL indexes (round-robin interleaving)\n")

# Show first 5 pages with 10 results each
for page_num in range(1, 6):
    page = get_page_balanced(results_by_index, page_num=page_num, page_size=10)
    
    if not page['results']:
        break
        
    print(f"\n📄 Page {page['page']} of {page['total_pages']}")
    print(f"   Index distribution: {page['index_distribution']}")
    print(f"   Showing {len(page['results'])} results:")
    
    for i, doc in enumerate(page['results'], 1):
        # Truncate title for display
        title_short = doc['title'][:50].ljust(50)
        print(f"      {i:2}. [{doc['__index']:12}] {title_short} | {doc['timestamp'][:10]}")
    
    if not page['has_next']:
        print("\n   ℹ️  No more pages")
        break

print("\n" + "="*80)
print(f"✅ Successfully demonstrated pagination across {page['total_pages']} pages")
print(f"   Each page maintained balanced representation from all {len(indexes)} indexes")
print("="*80)