# 13 - Retrieval-Augmented Systems

This notebook covers retrieval-augmented generation and hybrid systems.

## Topics Covered:
- Embedding models
- Vector databases
- Similarity search
- Retrieval pipelines
- RAG architectures

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict
from collections import defaultdict

# Seed NumPy's global RNG so values drawn through np.random (e.g. the
# randomly initialised token embeddings) are reproducible from run to run.
np.random.seed(42)

## 1. Embedding Models and Vector Search

In [None]:
class EmbeddingModel:
    """Toy sentence encoder: mean-pooled, L2-normalised token embeddings.

    Token vectors are drawn from NumPy's global RNG at construction time.
    Tokenisation is lower-casing followed by whitespace splitting, so
    punctuation stays attached to the neighbouring word.
    """

    def __init__(self, vocab_size: int, d_model: int):
        """Create the embedding table and the token -> row vocabulary.

        Args:
            vocab_size: Number of rows in the embedding table.
            d_model: Dimensionality of every embedding vector.
        """
        self.vocab_size = vocab_size
        self.d_model = d_model

        # Token embeddings
        self.token_embeddings = np.random.randn(vocab_size, d_model) * 0.1

        # Simple vocabulary: generic placeholders plus a few real words that
        # share the first rows of the table.
        self.vocab = {f'token_{i}': i for i in range(vocab_size)}
        named_words = {
            'the': 0, 'cat': 1, 'dog': 2, 'sat': 3, 'ran': 4,
            'on': 5, 'in': 6, 'park': 7, 'mat': 8, 'quick': 9
        }
        # Only register words whose row actually exists; otherwise a small
        # vocab_size would make encode_text index past the embedding table.
        self.vocab.update(
            {word: row for word, row in named_words.items() if row < vocab_size}
        )

        # One persistent vector per out-of-vocabulary token, so the same
        # unknown word always maps to the same embedding.
        self._oov_embeddings: Dict[str, np.ndarray] = {}

    def _embed_token(self, token: str) -> np.ndarray:
        """Return the embedding for one token, creating an OOV vector if needed."""
        row = self.vocab.get(token)
        if row is not None:
            return self.token_embeddings[row]

        vector = self._oov_embeddings.get(token)
        if vector is None:
            # Drawn once per unknown token and cached, which keeps
            # encode_text deterministic for repeated inputs.
            vector = np.random.randn(self.d_model) * 0.1
            self._oov_embeddings[token] = vector
        return vector

    def encode_text(self, text: str) -> np.ndarray:
        """Encode text to a unit-length embedding vector.

        Args:
            text: Input string; it is lower-cased and split on whitespace.

        Returns:
            A vector of length ``d_model``: the mean of the token embeddings
            scaled to unit L2 norm, or all zeros when ``text`` has no tokens.
        """
        tokens = text.lower().split()
        if not tokens:
            return np.zeros(self.d_model)

        # Mean pooling
        sentence_embedding = np.mean(
            [self._embed_token(token) for token in tokens], axis=0
        )

        # Normalize (skipped for an all-zero mean to avoid dividing by zero)
        norm = np.linalg.norm(sentence_embedding)
        if norm > 0:
            sentence_embedding = sentence_embedding / norm

        return sentence_embedding

class VectorDatabase:
    """Simple in-memory vector store with brute-force similarity search."""

    def __init__(self, d_model: int):
        """Create an empty store for vectors of dimension ``d_model``."""
        self.d_model = d_model
        self.vectors = []
        self.metadata = []

    def add_vector(self, vector: np.ndarray, metadata: Dict):
        """Append a vector and its metadata; the two lists stay index-aligned."""
        self.vectors.append(vector)
        self.metadata.append(metadata)

    def similarity_search(self, query_vector: np.ndarray, top_k: int = 5,
                         metric: str = 'cosine') -> List[Tuple[int, float, Dict]]:
        """Return the ``top_k`` stored vectors most similar to the query.

        Args:
            query_vector: Query of the same dimension as the stored vectors.
            top_k: Maximum number of hits; values <= 0 yield an empty list.
            metric: ``'cosine'`` (normalised dot product), ``'euclidean'``
                (negated distance, so larger is still better); any other
                value falls back to the raw dot product.

        Returns:
            ``(index, score, metadata)`` tuples sorted by descending score.
            Ties keep insertion order.
        """
        if top_k <= 0 or not self.vectors:
            return []

        query = np.asarray(query_vector, dtype=float)
        matrix = np.asarray(self.vectors, dtype=float)

        if metric == 'euclidean':
            scores = -np.linalg.norm(matrix - query, axis=1)
        else:
            scores = matrix @ query
            if metric == 'cosine':
                # Divide by both norms so the score does not depend on vector
                # length; zero-norm vectors get a similarity of 0 instead of
                # triggering a division by zero.
                denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
                scores = np.divide(
                    scores, denom, out=np.zeros_like(scores), where=denom > 0
                )

        # Stable sort on the negated scores: descending order, ties resolved
        # by insertion index.
        order = np.argsort(-scores, kind='stable')[:top_k]
        return [(int(i), float(scores[i]), self.metadata[i]) for i in order]

    def get_statistics(self) -> Dict:
        """Return size, average L2 norm, dimension and memory use in MiB.

        The same four keys are present whether or not the store is empty.
        """
        if not self.vectors:
            return {'size': 0, 'avg_norm': 0, 'dimension': self.d_model,
                    'memory_mb': 0}

        vectors_array = np.array(self.vectors)

        return {
            'size': len(self.vectors),
            'avg_norm': float(np.mean(np.linalg.norm(vectors_array, axis=1))),
            'dimension': self.d_model,
            'memory_mb': vectors_array.nbytes / (1024 ** 2)
        }

class RAGPipeline:
    """Minimal RAG pipeline: embed, retrieve, then emit a placeholder answer."""

    def __init__(self, embedding_model: EmbeddingModel, vector_db: VectorDatabase):
        """Store the encoder and the vector store the pipeline operates on."""
        self.embedding_model = embedding_model
        self.vector_db = vector_db

    def index_documents(self, documents: List[str]):
        """Embed each document and add it to the vector database.

        Each entry's metadata holds ``doc_id``, the original ``text`` and its
        whitespace-token ``length``. Ids continue from the number of vectors
        already stored, so indexing several batches never reuses a ``doc_id``.
        """
        first_id = len(self.vector_db.vectors)
        for doc_id, doc in enumerate(documents, start=first_id):
            embedding = self.embedding_model.encode_text(doc)
            metadata = {'doc_id': doc_id, 'text': doc, 'length': len(doc.split())}
            self.vector_db.add_vector(embedding, metadata)

    def retrieve_and_generate(self, query: str, top_k: int = 3) -> Dict:
        """Retrieve the ``top_k`` closest documents and build a stub response.

        Args:
            query: Natural-language question.
            top_k: Number of documents to retrieve.

        Returns:
            Dict with ``query``, ``retrieved_docs`` (texts, best first),
            ``similarities`` (matching scores), ``context`` (the texts joined
            with spaces) and ``response`` (a fixed template; no real
            generation takes place).
        """
        # Encode query
        query_embedding = self.embedding_model.encode_text(query)

        # Retrieve similar documents
        results = self.vector_db.similarity_search(query_embedding, top_k)

        # Split the hits into parallel lists of texts and scores
        retrieved_texts = [metadata['text'] for _, _, metadata in results]
        similarities = [score for _, score, _ in results]

        # Combine context
        context = " ".join(retrieved_texts)

        # Simulate generation (simplified): the template does not use context
        response = f"Based on retrieved context, {query.lower()} can be answered as: [Generated response using context]"

        return {
            'query': query,
            'retrieved_docs': retrieved_texts,
            'similarities': similarities,
            'context': context,
            'response': response
        }

def _plot_document_similarity(vector_db):
    """Panel 1: heat map of pairwise dot products between stored vectors."""
    plt.subplot(2, 3, 1)

    doc_vectors = np.array(vector_db.vectors)
    pairwise = doc_vectors @ doc_vectors.T

    plt.imshow(pairwise, cmap='Blues', aspect='auto')
    plt.title('Document Similarity Matrix')
    plt.xlabel('Document Index')
    plt.ylabel('Document Index')
    plt.colorbar()


def _plot_retrieval_scores(results):
    """Panel 2: similarity score by rank, one line per query."""
    plt.subplot(2, 3, 2)

    for number, outcome in enumerate(results, start=1):
        scores = outcome['similarities']
        plt.plot(range(len(scores)), scores, 'o-',
                 label=f'Q{number}', alpha=0.7)

    plt.xlabel('Retrieved Document Rank')
    plt.ylabel('Similarity Score')
    plt.title('Retrieval Quality by Query')
    plt.legend()
    plt.grid(True, alpha=0.3)


def _plot_search_scalability():
    """Panel 3: simulated search time vs. database size (log-log axes)."""
    plt.subplot(2, 3, 3)

    db_sizes = [100, 1000, 10000, 100000, 1000000]

    # Simulated timings, not measurements: linear for brute force,
    # logarithmic for an indexed search.
    curves = (
        (np.array(db_sizes) / 1000, 'r-', 'Brute Force'),
        (np.log(db_sizes) * 0.1, 'b-', 'Indexed Search'),
    )
    for timings, line_style, name in curves:
        plt.loglog(db_sizes, timings, line_style, label=name, linewidth=2)

    plt.xlabel('Database Size')
    plt.ylabel('Search Time (ms)')
    plt.title('Vector Search Scalability')
    plt.legend()
    plt.grid(True, alpha=0.3)


def _plot_pipeline_latency():
    """Panel 4: per-stage latency bars, each labelled with the running total."""
    plt.subplot(2, 3, 4)

    pipeline_steps = ['Query\nEncoding', 'Vector\nSearch', 'Context\nRanking', 'Response\nGeneration']
    latencies = [5, 15, 8, 50]  # Milliseconds (illustrative values)

    bars = plt.bar(pipeline_steps, latencies, alpha=0.7)
    plt.ylabel('Latency (ms)')
    plt.title('RAG Pipeline Latency Breakdown')

    # The label above each bar is the cumulative latency up to that stage.
    for bar, running_total in zip(bars, np.cumsum(latencies)):
        centre_x = bar.get_x() + bar.get_width()/2
        plt.text(centre_x, bar.get_height() + 1,
                 f'{running_total}ms', ha='center', fontsize=8)


def _plot_dimension_tradeoffs():
    """Panel 5: simulated accuracy and speed vs. embedding size (twin y-axes)."""
    plt.subplot(2, 3, 5)

    # Simulated metrics: accuracy rises and speed falls as dimension grows.
    embedding_dims = [64, 128, 256, 512, 1024]
    retrieval_accuracy = [0.65, 0.72, 0.78, 0.82, 0.84]
    search_speed = [100, 80, 60, 40, 20]

    accuracy_axis = plt.gca()
    accuracy_axis.plot(embedding_dims, retrieval_accuracy, 'g-o', label='Accuracy', linewidth=2)
    accuracy_axis.set_xlabel('Embedding Dimension')
    accuracy_axis.set_ylabel('Retrieval Accuracy', color='g')
    accuracy_axis.tick_params(axis='y', labelcolor='g')

    speed_axis = accuracy_axis.twinx()
    speed_axis.plot(embedding_dims, search_speed, 'r-s', label='Speed', linewidth=2)
    speed_axis.set_ylabel('Search Speed (relative)', color='r')
    speed_axis.tick_params(axis='y', labelcolor='r')

    plt.title('Embedding Dimension Trade-offs')


def _plot_rag_vs_standard_lm():
    """Panel 6: grouped bars comparing illustrative scores for LM and RAG."""
    plt.subplot(2, 3, 6)

    metrics = ['Factual\nAccuracy', 'Response\nLatency', 'Knowledge\nCoverage', 'Consistency']
    standard_lm = [60, 90, 70, 85]
    rag_system = [85, 60, 95, 80]

    x = np.arange(len(metrics))
    width = 0.35

    plt.bar(x - width/2, standard_lm, width, label='Standard LM', alpha=0.7)
    plt.bar(x + width/2, rag_system, width, label='RAG System', alpha=0.7)

    plt.xlabel('Metric')
    plt.ylabel('Score')
    plt.title('RAG vs Standard LM')
    plt.xticks(x, metrics)
    plt.legend()


def _print_component_summary():
    """Print the closing text overview of RAG components and trade-offs."""
    sections = (
        ("\n1. Embedding Models:", (
            "   - Convert text to dense vectors",
            "   - Capture semantic similarity",
            "   - Enable efficient similarity search",
        )),
        ("\n2. Vector Databases:", (
            "   - Store and index embeddings",
            "   - Fast approximate nearest neighbor search",
            "   - Scalable to millions/billions of vectors",
        )),
        ("\n3. Retrieval Pipeline:", (
            "   - Query encoding",
            "   - Similarity search",
            "   - Result ranking and filtering",
            "   - Context preparation",
        )),
        ("\n4. RAG Advantages:", (
            "   + Access to external knowledge",
            "   + Better factual accuracy",
            "   + Updatable knowledge without retraining",
            "   + Reduced hallucination",
        )),
        ("\n5. RAG Challenges:", (
            "   - Additional latency from retrieval",
            "   - Quality depends on retrieval accuracy",
            "   - Context integration complexity",
            "   - Maintaining embedding quality",
        )),
    )

    print("\nRAG System Components:")
    for heading, bullets in sections:
        print(heading)
        for bullet in bullets:
            print(bullet)


def demonstrate_rag_systems():
    """Run the end-to-end RAG demo.

    Builds the pipeline, indexes a ten-sentence knowledge base, runs three
    sample queries and prints their top hits, draws a 2x3 figure of
    illustrative charts, then prints a text summary.
    """
    print("Retrieval-Augmented Generation Systems:")

    # Initialize components
    encoder = EmbeddingModel(vocab_size=1000, d_model=128)
    vector_db = VectorDatabase(d_model=128)
    rag_pipeline = RAGPipeline(encoder, vector_db)

    # Sample knowledge base
    knowledge_base = [
        "Transformers use self-attention to process sequences in parallel.",
        "BERT is an encoder-only model trained with masked language modeling.",
        "GPT models are decoder-only and generate text autoregressively.",
        "Attention mechanisms allow models to focus on relevant input parts.",
        "Large language models are trained on billions of text tokens.",
        "Fine-tuning adapts pre-trained models to specific tasks.",
        "RAG combines retrieval with generation for factual accuracy.",
        "Vector databases enable efficient similarity search at scale.",
        "Embeddings represent text as dense numerical vectors.",
        "Context windows limit the amount of text models can process."
    ]
    rag_pipeline.index_documents(knowledge_base)

    test_queries = [
        "How do transformers work?",
        "What is the difference between BERT and GPT?",
        "How does RAG improve language models?"
    ]

    print(f"\nKnowledge Base: {len(knowledge_base)} documents indexed")
    print(f"Vector Database Stats: {vector_db.get_statistics()}")

    # Run every query, keeping the outcomes for the plots further down.
    results = []
    for query in test_queries:
        outcome = rag_pipeline.retrieve_and_generate(query, top_k=3)
        results.append(outcome)

        print(f"\nQuery: {query}")
        print("Top retrieved documents:")
        ranked_hits = zip(outcome['retrieved_docs'], outcome['similarities'])
        for rank, (doc, sim) in enumerate(ranked_hits, start=1):
            print(f"  {rank}. {doc} (similarity: {sim:.3f})")

    # Visualizations: six panels drawn in subplot order on one figure.
    plt.figure(figsize=(15, 10))
    _plot_document_similarity(vector_db)
    _plot_retrieval_scores(results)
    _plot_search_scalability()
    _plot_pipeline_latency()
    _plot_dimension_tradeoffs()
    _plot_rag_vs_standard_lm()
    plt.tight_layout()
    plt.show()

    _print_component_summary()

# Run the demo when this cell executes: prints retrieval results, shows the
# figure and prints the component summary.
demonstrate_rag_systems()