### 1. Environment Setup & Advanced RAG Imports

In [None]:
import os, yaml, json
import weaviate
from weaviate.classes.init import Auth
import weaviate.classes.config as wvc
from dotenv import load_dotenv

# Load environment variables (Weaviate Cloud and Hugging Face credentials)
# from a local .env file into the process environment.
load_dotenv()

# Load the centralized project configuration.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's locale default, which could mis-decode non-ASCII characters.
with open("../src/config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty file; fail here with a clear
# message instead of a TypeError on the first config[...] lookup later on.
if not isinstance(config, dict):
    raise ValueError(
        "../src/config/config.yaml must contain a top-level mapping, "
        f"got {type(config).__name__}"
    )

print("‚úÖ Environment initialized for Advanced RAG.")

### 2. Weaviate Cloud Connection

In [None]:
# Connection settings are read from the process environment.
weaviate_url = os.getenv("WEAVIATE_URL")
weaviate_key = os.getenv("WEAVIATE_API_KEY")
hf_key = os.getenv("HF_TOKEN")  # Used for server-side vectorization

# Fail fast with a readable message: passing None for any of these values
# would otherwise surface as an obscure error inside the client library.
missing_env = [
    env_name
    for env_name, env_value in (
        ("WEAVIATE_URL", weaviate_url),
        ("WEAVIATE_API_KEY", weaviate_key),
        ("HF_TOKEN", hf_key),
    )
    if not env_value
]
if missing_env:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_env)
    )

# Connect to Weaviate Cloud; the Hugging Face key is forwarded as a header
# so the cluster can call the vectorizer on our behalf.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_key),
    headers={"X-HuggingFace-Api-Key": hf_key}
)

if client.is_ready():
    print(f"‚úÖ Librarian connected to Weaviate Cloud at: {weaviate_url}")
else:
    # Previously an unready cluster was ignored silently and later cells
    # failed with unrelated errors. Release the connection and stop here.
    client.close()
    raise ConnectionError(f"Weaviate Cloud at {weaviate_url} is not ready.")

### 3. Collection Schema & Indexing Configuration

In [None]:
class_name = config['weaviate']['class_name']

# Start every indexing run from scratch: drop a previous collection of the
# same name if one is present.
if client.collections.exists(class_name):
    client.collections.delete(class_name)
    print(f"üóëÔ∏è Cleaned existing collection: {class_name}")

# Server-side vectorization through the Hugging Face module; the model name
# is taken from the project configuration.
huggingface_vectorizer = wvc.Configure.Vectorizer.text2vec_huggingface(
    model=config['weaviate']['vectorizer_model'],
    wait_for_model=True,
)

# Schema of a stored chunk: its text, its position, and its source label.
chunk_properties = [
    wvc.Property(name=property_name, data_type=property_type)
    for property_name, property_type in (
        ("content", wvc.DataType.TEXT),
        ("chunk_index", wvc.DataType.INT),
        ("source", wvc.DataType.TEXT),
    )
]

# Create the collection with the vectorizer and schema defined above.
client.collections.create(
    name=class_name,
    vectorizer_config=huggingface_vectorizer,
    properties=chunk_properties,
)

print(f"‚úÖ Collection '{class_name}' configured for Hybrid Search.")

### 4. Data Ingestion & Batch Indexing

In [None]:
data_path = config['paths']['train_data']

print(f"üìÇ Loading chunks from: {data_path}")

# The training file is read as JSON Lines (one JSON object per line).
# Blank lines (e.g. a trailing newline at end of file) are skipped so they
# do not crash json.loads, and the encoding is pinned to UTF-8 so decoding
# does not depend on the platform locale.
with open(data_path, "r", encoding="utf-8") as f:
    training_data = [json.loads(line) for line in f if line.strip()]

collection = client.collections.get(class_name)

# Perform Dynamic Batch Upload.
# Each record's "input" field becomes the chunk text; chunk_index is the
# record's position in the file.
with collection.batch.dynamic() as batch:
    for idx, item in enumerate(training_data):
        batch.add_object(
            properties={
                "content": item["input"],
                "chunk_index": idx,
                "source": "Uber-2024-Annual-Report"
            }
        )

# Per the Weaviate client documentation, per-object failures are collected on
# the batch rather than raised, so inspect them before reporting success.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(
        f"WARNING: {len(failed_objects)} of {len(training_data)} chunks "
        f"failed to index. First failure: {failed_objects[0]}"
    )

indexed_count = len(training_data) - len(failed_objects)
print(f"‚úÖ Indexed {indexed_count} financial chunks into Non-Parametric Memory.")

### 5. Advanced Hybrid RAG Pipeline (The Librarian)

In [None]:
from sentence_transformers import CrossEncoder
from typing import List, Dict, Tuple

# Load the Cross-Encoder reranker model once at module level.
# The model identifier comes from config['weaviate']['reranker_model']; the
# resulting object is the one query_librarian calls via reranker.predict(...).
reranker = CrossEncoder(config['weaviate']['reranker_model'])

def reciprocal_rank_fusion(
    ranked_lists: List[List[Tuple[str, object]]],
    k: int = 60,
) -> Dict[str, dict]:
    """
    Combine several ranked result lists with Reciprocal Rank Fusion (RRF).

    RRF Score Formula: RRF(d) = Σ_i 1 / (k + rank_i(d))
    where:
        - d is a document
        - rank_i(d) is the 1-based rank of document d in ranked list i
        - k is a smoothing constant (typically 60) that damps the influence
          of the very top ranks

    A document that is absent from a list receives no contribution from
    that list.

    Args:
        ranked_lists: One list per retriever, best match first. Each list
                     contains tuples of (doc_id, document_object).
        k: RRF constant parameter (default: 60)

    Returns:
        Dictionary mapping doc_id to a dict with two keys:
            - 'score': the accumulated RRF score (float)
            - 'doc':   the document object from the first list in which
                       the doc_id was encountered
    """
    rrf_scores: Dict[str, dict] = {}

    # Process each retrieval system's results
    for ranked_list in ranked_lists:
        # enumerate(..., start=1) yields the 1-based rank directly
        for position, (doc_id, doc_obj) in enumerate(ranked_list, start=1):
            # First sighting of this id: remember the object, start at zero
            if doc_id not in rrf_scores:
                rrf_scores[doc_id] = {'score': 0.0, 'doc': doc_obj}

            # Add this retriever's RRF contribution
            rrf_scores[doc_id]['score'] += 1.0 / (k + position)

    return rrf_scores

def query_librarian(question: str) -> str:
    """
    Return the single best-matching passage for a question.

    Pipeline:
        1. Dense vector search (near_text) for semantic matches
        2. BM25 keyword search for lexical matches
        3. Reciprocal Rank Fusion of the two rankings
        4. Cross-encoder reranking of the fused top candidates

    Retrieval sizes and the RRF constant are read from
    config['weaviate']['retrieval'], with built-in defaults when absent.

    Args:
        question: User query

    Returns:
        The "content" text of the highest-scoring candidate, or a fixed
        "no relevant information" message when nothing is retrieved.
    """
    collection = client.collections.get(config['weaviate']['class_name'])

    # Tunable retrieval parameters (defaults apply when the config is silent)
    settings = config['weaviate'].get('retrieval', {})
    dense_top_k = settings.get('dense_top_k', 20)
    bm25_top_k = settings.get('bm25_top_k', 20)
    rrf_k = settings.get('rrf_k', 60)
    final_top_k = settings.get('final_top_k', 10)

    # Step 1: semantic retrieval via embeddings
    dense_hits = collection.query.near_text(
        query=question, limit=dense_top_k
    ).objects

    # Step 2: lexical retrieval - matters for exact figures such as amounts
    keyword_hits = collection.query.bm25(
        query=question, limit=bm25_top_k
    ).objects

    # Nothing came back from either retriever
    if not (dense_hits or keyword_hits):
        return "No relevant information found in the annual report."

    # Step 3: fuse both rankings, keyed by object UUID
    fused = reciprocal_rank_fusion(
        ranked_lists=[
            [(str(hit.uuid), hit) for hit in dense_hits],
            [(str(hit.uuid), hit) for hit in keyword_hits],
        ],
        k=rrf_k,
    )

    # Keep the final_top_k entries with the highest fused score
    strongest_entries = sorted(
        fused.values(),
        key=lambda entry: entry['score'],
        reverse=True,
    )[:final_top_k]
    shortlisted = [entry['doc'] for entry in strongest_entries]

    if not shortlisted:
        return "No relevant information found after RRF fusion."

    # Step 4: score each (question, passage) pair with the cross-encoder
    passages = [hit.properties["content"] for hit in shortlisted]
    relevance = reranker.predict([[question, passage] for passage in passages])

    # The passage with the highest reranker score wins
    return passages[relevance.argmax()]

# ========== Test Run ==========
# Smoke test: echo the pipeline settings that will be used, then run a single
# query end to end and show the first 400 characters of the answer.
test_q = "What are the primary risks associated with the new debt instruments?"
print(f"‚ùì Query: {test_q}")
print(f"\nüîç Running Advanced RAG Pipeline:")

# Look the retrieval settings up once instead of once per printed line
retrieval_settings = config['weaviate'].get('retrieval', {})
for report_line in (
    f"   Method: {retrieval_settings.get('method', 'explicit_rrf')}",
    f"   Step 1: Dense Vector Search (top_k={retrieval_settings.get('dense_top_k', 20)})",
    f"   Step 2: BM25 Keyword Search (top_k={retrieval_settings.get('bm25_top_k', 20)})",
    f"   Step 3: Reciprocal Rank Fusion (k={retrieval_settings.get('rrf_k', 60)})",
    f"   Step 4: Cross-Encoder Reranking (final={retrieval_settings.get('final_top_k', 10)})",
):
    print(report_line)

result = query_librarian(test_q)
print(f"\nüìö Librarian Citation (Top Result after RRF + Reranking):\n{result[:400]}...")