# RAG Practical Examples - Linking Concepts Together

This notebook demonstrates key RAG concepts with practical implementations.

## 1. Setup and Dependencies

In [None]:
# Install required packages
%pip install numpy pandas scikit-learn sentence-transformers faiss-cpu matplotlib seaborn

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import faiss
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple
import re
import time

## 2. Document Processing and Chunking

In [None]:
# Toy corpus shared by every later section: one sentence per ML topic
documents = [
    "Machine learning is a subset of artificial intelligence that focuses on algorithms that can learn from data.",
    "Deep learning uses neural networks with multiple layers to model complex patterns in data.",
    "Natural language processing enables computers to understand and generate human language.",
    "Computer vision allows machines to interpret and understand visual information from images.",
    "Reinforcement learning trains agents to make decisions through trial and error in an environment.",
    "Supervised learning uses labeled data to train models for prediction tasks.",
    "Unsupervised learning finds patterns in data without labeled examples.",
    "Transfer learning leverages pre-trained models for new but related tasks."
]

print(f"Total documents: {len(documents)}")
# Number the listing from 1 so it reads naturally in the output
for position, text in enumerate(documents, start=1):
    print(f"{position}. {text}")

In [None]:
class DocumentChunker:
    """Collection of simple text-chunking strategies.

    All methods are static. The word-based strategies tokenise with
    ``str.split()`` (any whitespace) and re-join words with single spaces.
    """

    @staticmethod
    def fixed_chunking(text: str, chunk_size: int = 50) -> List[str]:
        """Split ``text`` into consecutive, non-overlapping word windows.

        Args:
            text: Input text.
            chunk_size: Maximum number of words per chunk; must be positive.

        Returns:
            Chunks in document order. The final chunk may be shorter than
            ``chunk_size``. Empty or whitespace-only text yields ``[]``.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")

        words = text.split()
        return [
            ' '.join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)
        ]

    @staticmethod
    def sliding_window_chunking(text: str, chunk_size: int = 50, overlap: int = 10) -> List[str]:
        """Split ``text`` into word windows that share ``overlap`` words.

        Consecutive windows start ``chunk_size - overlap`` words apart.

        Args:
            text: Input text.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by neighbouring chunks; must
                satisfy ``0 <= overlap < chunk_size`` so the window advances.

        Returns:
            Chunks in document order. Empty or whitespace-only text yields
            ``[]``; any non-empty text yields at least one chunk.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                outside ``[0, chunk_size)``.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
            )

        words = text.split()
        stride = chunk_size - overlap
        chunks = []

        for start in range(0, len(words), stride):
            window = words[start:start + chunk_size]
            # A trailing window with at most `overlap` words is fully contained
            # in the previous chunk, so it is dropped. The first window has no
            # predecessor and must always be kept, otherwise short texts vanish.
            if start == 0 or len(window) > overlap:
                chunks.append(' '.join(window))
        return chunks

    @staticmethod
    def sentence_chunking(text: str) -> List[str]:
        """Split ``text`` on runs of ``.``, ``!`` or ``?``.

        The terminal punctuation is consumed by the split and does not appear
        in the returned sentences. Surrounding whitespace is stripped and
        empty fragments are discarded.
        """
        fragments = (fragment.strip() for fragment in re.split(r'[.!?]+', text))
        return [fragment for fragment in fragments if fragment]

# Run every chunking strategy on the concatenated corpus and compare chunk counts
sample_text = " ".join(documents)
chunker = DocumentChunker()

print("Original text length:", len(sample_text.split()), "words")
print("\nChunking Results:")

fixed_chunks = chunker.fixed_chunking(sample_text, chunk_size=20)
sliding_chunks = chunker.sliding_window_chunking(sample_text, chunk_size=20, overlap=5)
sentence_chunks = chunker.sentence_chunking(sample_text)

# One report line per strategy, in the order the strategies were introduced
chunking_report = (
    ("Fixed chunking (20 words)", fixed_chunks),
    ("Sliding window (20 words, 5 overlap)", sliding_chunks),
    ("Sentence chunking", sentence_chunks),
)
for strategy_label, strategy_chunks in chunking_report:
    print(f"{strategy_label}: {len(strategy_chunks)} chunks")

## 3. Embedding Generation and Vector Mathematics

In [None]:
# Encode the corpus once; later sections reuse `embedding_model` and `document_embeddings`
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
document_embeddings = embedding_model.encode(documents)

print(f"Embedding shape: {document_embeddings.shape}")
print(f"Each document is represented as a {document_embeddings.shape[1]}-dimensional vector")

# Three side-by-side diagnostics: value histogram, per-document norm, pairwise similarity
fig, (ax_values, ax_norms, ax_similarity) = plt.subplots(1, 3, figsize=(12, 4))

ax_values.hist(document_embeddings.flatten(), bins=50, alpha=0.7)
ax_values.set_title('Distribution of Embedding Values')
ax_values.set_xlabel('Value')
ax_values.set_ylabel('Frequency')

norms = np.linalg.norm(document_embeddings, axis=1)
ax_norms.bar(range(len(norms)), norms)
ax_norms.set_title('Vector Magnitudes')
ax_norms.set_xlabel('Document Index')
ax_norms.set_ylabel('L2 Norm')

similarity_matrix = cosine_similarity(document_embeddings)
sns.heatmap(similarity_matrix, annot=True, cmap='coolwarm', center=0, ax=ax_similarity)
ax_similarity.set_title('Document Similarity Matrix')

fig.tight_layout()
plt.show()

## 4. Distance Metrics Implementation

In [None]:
class DistanceMetrics:
    """Reference implementations of vector similarity and distance measures.

    Every method accepts two equal-length 1-D array-likes and returns a plain
    Python ``float``.
    """

    @staticmethod
    def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
        """Cosine of the angle between ``v1`` and ``v2``.

        Returns:
            ``dot(v1, v2) / (|v1| * |v2|)``. When either vector has zero
            length the angle is undefined and ``0.0`` is returned instead of
            NaN, so downstream ranking code never has to deal with NaN.
        """
        a, b = np.asarray(v1), np.asarray(v2)
        norm_product = np.linalg.norm(a) * np.linalg.norm(b)
        if norm_product == 0:
            return 0.0
        return float(np.dot(a, b) / norm_product)

    @staticmethod
    def euclidean_distance(v1: np.ndarray, v2: np.ndarray) -> float:
        """Straight-line (L2) distance between ``v1`` and ``v2``."""
        difference = np.asarray(v1) - np.asarray(v2)
        return float(np.sqrt(np.sum(difference ** 2)))

    @staticmethod
    def manhattan_distance(v1: np.ndarray, v2: np.ndarray) -> float:
        """Sum of absolute coordinate differences (L1 distance)."""
        difference = np.asarray(v1) - np.asarray(v2)
        return float(np.sum(np.abs(difference)))

    @staticmethod
    def dot_product(v1: np.ndarray, v2: np.ndarray) -> float:
        """Unnormalised inner product of ``v1`` and ``v2``."""
        return float(np.dot(np.asarray(v1), np.asarray(v2)))

# Score the first two corpus sentences against each other with every metric
metrics = DistanceMetrics()
v1, v2 = document_embeddings[0], document_embeddings[1]

print("Comparing first two documents:")
for doc_number, doc_text in enumerate(documents[:2], start=1):
    # Only a 50-character preview is shown to keep the output compact
    print(f"Document {doc_number}: {doc_text[:50]}...")
print()

metric_table = (
    ("Cosine Similarity", metrics.cosine_similarity),
    ("Euclidean Distance", metrics.euclidean_distance),
    ("Manhattan Distance", metrics.manhattan_distance),
    ("Dot Product", metrics.dot_product),
)
for metric_label, metric_fn in metric_table:
    print(f"{metric_label}: {metric_fn(v1, v2):.4f}")

## 5. Vector Database and Indexing

In [None]:
class SimpleVectorDB:
    """Minimal in-memory vector store backed by a flat FAISS inner-product index.

    Vectors are L2-normalised both on insertion and at query time, so the
    inner product reported by the index equals cosine similarity.
    """

    def __init__(self, dimension: int):
        """Create an empty index for vectors of length ``dimension``."""
        self.dimension = dimension
        self.index = faiss.IndexFlatIP(dimension)  # Inner Product (for cosine similarity)
        self.documents = []

    @staticmethod
    def _l2_normalize(matrix: np.ndarray) -> np.ndarray:
        """Scale each row of a 2-D array to unit length.

        All-zero rows are left as zeros rather than turned into NaN.
        """
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    def add_documents(self, embeddings: np.ndarray, documents: List[str]):
        """Append documents and their embeddings to the store.

        Args:
            embeddings: 2-D array with one row per document.
            documents: Texts in the same order as the rows of ``embeddings``.

        Raises:
            ValueError: If the number of rows and the number of documents
                differ, which would misalign index positions and texts.
        """
        embeddings = np.asarray(embeddings)
        if len(embeddings) != len(documents):
            raise ValueError(
                f"got {len(embeddings)} embeddings for {len(documents)} documents"
            )

        normalized_embeddings = self._l2_normalize(embeddings)
        self.index.add(np.ascontiguousarray(normalized_embeddings, dtype='float32'))
        self.documents.extend(documents)

    def search(self, query_embedding: np.ndarray, k: int = 3) -> List[Tuple[float, str]]:
        """Return up to ``k`` stored documents most similar to the query.

        Args:
            query_embedding: 1-D query vector of length ``dimension``.
            k: Maximum number of hits. It is capped at the number of indexed
                vectors; a non-positive value yields ``[]``.

        Returns:
            ``(score, document)`` pairs, best match first, where ``score`` is
            the cosine similarity as a Python ``float``.
        """
        k = min(k, self.index.ntotal)
        if k <= 0:
            return []

        query = self._l2_normalize(np.asarray(query_embedding).reshape(1, -1))
        query = np.ascontiguousarray(query, dtype='float32')

        scores, indices = self.index.search(query, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS reports a missing hit as label -1. Without the lower bound
            # a negative label would index from the end of the Python list and
            # return the wrong document.
            if 0 <= idx < len(self.documents):
                results.append((float(score), self.documents[int(idx)]))

        return results

# Build the index from the embeddings computed in the previous section
embedding_dim = document_embeddings.shape[1]
vector_db = SimpleVectorDB(embedding_dim)
vector_db.add_documents(document_embeddings, documents)

# Report what was stored so a dimension or count mismatch is visible immediately
db_summary = [
    f"Vector database created with {len(documents)} documents",
    f"Index dimension: {vector_db.dimension}",
    f"Total vectors in index: {vector_db.index.ntotal}",
]
print("\n".join(db_summary))

## 6. Query Processing and Retrieval

In [None]:
class RAGRetriever:
    """Retrieval front-end offering dense, sparse (TF-IDF) and hybrid search.

    All three methods return ``(score, document)`` pairs ordered best-first.
    """

    def __init__(self, vector_db: "SimpleVectorDB", embedding_model, documents: List[str]):
        """Store the dense components and fit a TF-IDF model on ``documents``.

        Args:
            vector_db: Store used for dense search; it must expose
                ``search(query_embedding, k)``.
            embedding_model: Object whose ``encode(list_of_str)`` returns one
                vector per input string.
            documents: Corpus for sparse retrieval.
        """
        self.vector_db = vector_db
        self.embedding_model = embedding_model
        self.documents = documents

        # Create TF-IDF vectorizer for sparse retrieval
        self.tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
        self.tfidf_matrix = self.tfidf.fit_transform(documents)

    def dense_retrieval(self, query: str, k: int = 3) -> List[Tuple[float, str]]:
        """Embed ``query`` and return the ``k`` nearest documents from the vector store."""
        query_embedding = self.embedding_model.encode([query])[0]
        return self.vector_db.search(query_embedding, k)

    def sparse_retrieval(self, query: str, k: int = 3) -> List[Tuple[float, str]]:
        """Rank documents by TF-IDF cosine similarity to ``query``.

        Returns:
            At most ``k`` pairs, best first. Documents with a similarity of
            zero can appear when fewer than ``k`` documents share a term with
            the query. A non-positive ``k`` yields ``[]``.
        """
        # Guard first: `[-0:]` would select the whole array, not nothing
        if k <= 0:
            return []

        query_vector = self.tfidf.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # argsort is ascending, so take the last k and reverse for best-first
        top_indices = similarities.argsort()[-k:][::-1]
        return [(float(similarities[i]), self.documents[i]) for i in top_indices]

    def hybrid_retrieval(self, query: str, k: int = 3, alpha: float = 0.7) -> List[Tuple[float, str]]:
        """Blend dense and sparse scores into a single ranking.

        Each retriever contributes its top ``2 * k`` candidates. A document's
        final score is ``alpha * dense + (1 - alpha) * sparse``; a document
        found by only one retriever receives only that retriever's share.
        Candidates are keyed by document text, so identical texts are merged.

        Args:
            query: Search text.
            k: Number of results to return; non-positive yields ``[]``.
            alpha: Weight of the dense score, expected in ``[0, 1]``.
        """
        if k <= 0:
            return []

        combined_scores = {}
        for score, doc in self.dense_retrieval(query, k * 2):
            combined_scores[doc] = alpha * float(score)
        for score, doc in self.sparse_retrieval(query, k * 2):
            combined_scores[doc] = combined_scores.get(doc, 0.0) + (1 - alpha) * float(score)

        ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
        return [(score, doc) for doc, score in ranked[:k]]

# Run a single query through each retrieval strategy and print the ranked hits
retriever = RAGRetriever(vector_db, embedding_model, documents)

test_query = "What is neural network learning?"
print(f"Query: '{test_query}'\n")

retrieval_strategies = (
    ("Dense", retriever.dense_retrieval),
    ("Sparse", retriever.sparse_retrieval),
    ("Hybrid", retriever.hybrid_retrieval),
)

ranked_by_strategy = {}
for strategy_name, retrieve in retrieval_strategies:
    print(f"=== {strategy_name} Retrieval Results ===")
    ranked_hits = retrieve(test_query, k=3)
    ranked_by_strategy[strategy_name] = ranked_hits
    for rank, (score, doc) in enumerate(ranked_hits, 1):
        print(f"{rank}. Score: {score:.4f}")
        print(f"   Document: {doc}\n")

# Keep the per-strategy result lists available under their usual names
dense_results = ranked_by_strategy["Dense"]
sparse_results = ranked_by_strategy["Sparse"]
hybrid_results = ranked_by_strategy["Hybrid"]

## 7. Evaluation Metrics Implementation

In [None]:
class RAGEvaluator:
    """Ranking-quality metrics for retrieval results with binary relevance.

    ``retrieved`` is always a ranked list (best first) and ``relevant`` is the
    collection of ground-truth items; items must be hashable.
    """

    @staticmethod
    def precision_at_k(retrieved: List[str], relevant: List[str], k: int) -> float:
        """Fraction of the top-``k`` retrieved items that are relevant.

        The denominator is the number of items actually present in the top
        ``k`` (which may be fewer than ``k``). Returns ``0.0`` when nothing
        was retrieved.
        """
        retrieved_k = retrieved[:max(k, 0)]
        if not retrieved_k:
            return 0.0
        hits = len(set(retrieved_k) & set(relevant))
        return hits / len(retrieved_k)

    @staticmethod
    def recall_at_k(retrieved: List[str], relevant: List[str], k: int) -> float:
        """Fraction of the distinct relevant items found in the top ``k``.

        Returns ``0.0`` when there are no relevant items.
        """
        relevant_set = set(relevant)
        if not relevant_set:
            return 0.0
        hits = len(set(retrieved[:max(k, 0)]) & relevant_set)
        return hits / len(relevant_set)

    @staticmethod
    def mean_reciprocal_rank(retrieved_lists: List[List[str]], relevant_lists: List[List[str]]) -> float:
        """Mean Reciprocal Rank (MRR) over several queries.

        For each query the reciprocal of the 1-based rank of the first
        relevant item is taken (``0.0`` if none is retrieved). Queries are
        paired positionally with ``zip``, so surplus entries in the longer
        argument are ignored. Returns ``0.0`` when there are no queries.
        """
        reciprocal_ranks = []

        for retrieved, relevant in zip(retrieved_lists, relevant_lists):
            relevant_set = set(relevant)
            first_hit_rank = next(
                (rank for rank, doc in enumerate(retrieved, 1) if doc in relevant_set),
                None,
            )
            reciprocal_ranks.append(1.0 / first_hit_rank if first_hit_rank else 0.0)

        if not reciprocal_ranks:
            return 0.0
        return sum(reciprocal_ranks) / len(reciprocal_ranks)

    @staticmethod
    def ndcg_at_k(retrieved: List[str], relevant: List[str], k: int) -> float:
        """Normalized Discounted Cumulative Gain at ``k`` with binary gains.

        DCG discounts the gain at 0-based position ``i`` by ``log2(i + 2)``.
        The ideal DCG places ``min(len(relevant), k)`` relevant items at the
        top of the list, i.e. it is derived from the ground truth rather than
        from what happened to be retrieved, so missing relevant items lower
        the score. A relevant item repeated in ``retrieved`` is credited only
        once, which keeps the result within ``[0, 1]``.
        """
        def dcg(relevances: List[int]) -> float:
            return sum(rel / np.log2(i + 2) for i, rel in enumerate(relevances))

        cutoff = max(k, 0)
        relevant_set = set(relevant)

        gains = []
        credited = set()
        for doc in retrieved[:cutoff]:
            is_new_hit = doc in relevant_set and doc not in credited
            gains.append(1 if is_new_hit else 0)
            if is_new_hit:
                credited.add(doc)

        ideal_gains = [1] * min(len(relevant_set), cutoff)
        idcg_score = dcg(ideal_gains)

        return float(dcg(gains) / idcg_score) if idcg_score > 0 else 0.0

# Worked example: score one dense-retrieval run against hand-labelled ground truth
evaluator = RAGEvaluator()

query = "machine learning algorithms"
# Ground truth picked by hand: the "machine learning" and "supervised learning" sentences
relevant_docs = [documents[0], documents[5]]
retrieved_docs = [hit[1] for hit in retriever.dense_retrieval(query, k=5)]

print(f"Query: {query}")
print(f"Retrieved documents: {len(retrieved_docs)}")
print(f"Relevant documents: {len(relevant_docs)}")
print()

# All metrics are computed at the same cut-off so they are comparable
precision_3 = evaluator.precision_at_k(retrieved_docs, relevant_docs, 3)
recall_3 = evaluator.recall_at_k(retrieved_docs, relevant_docs, 3)
ndcg_3 = evaluator.ndcg_at_k(retrieved_docs, relevant_docs, 3)

print(f"Precision@3: {precision_3:.4f}")
print(f"Recall@3: {recall_3:.4f}")
print(f"NDCG@3: {ndcg_3:.4f}")

# F1 is the harmonic mean of precision and recall; defined as 0 when both are 0
precision_recall_sum = precision_3 + recall_3
if precision_recall_sum > 0:
    f1_score = 2 * (precision_3 * recall_3) / precision_recall_sum
else:
    f1_score = 0
print(f"F1@3: {f1_score:.4f}")

## 8. Performance Analysis and Optimization

In [None]:
class PerformanceAnalyzer:
    """Latency and chunk-size diagnostics for a retriever."""

    def __init__(self, retriever: "RAGRetriever"):
        """Keep a reference to the retriever under test.

        The retriever must expose ``dense_retrieval``, ``sparse_retrieval``
        and ``hybrid_retrieval`` (each called as ``(query, k)``) and an
        ``embedding_model`` attribute with an ``encode`` method.
        """
        self.retriever = retriever

    def benchmark_retrieval_methods(self, queries: List[str], k: int = 3) -> dict:
        """Time every retrieval strategy on every query.

        Args:
            queries: Query strings to run.
            k: Number of results requested from each strategy.

        Returns:
            ``{'dense' | 'sparse' | 'hybrid': {'times': [...], 'results': [...]}}``
            where ``times`` holds elapsed seconds and ``results`` the raw
            retriever output, both in query order.
        """
        strategies = {
            'dense': self.retriever.dense_retrieval,
            'sparse': self.retriever.sparse_retrieval,
            'hybrid': self.retriever.hybrid_retrieval,
        }
        results = {name: {'times': [], 'results': []} for name in strategies}

        for query in queries:
            for name, retrieve in strategies.items():
                # perf_counter is monotonic and high-resolution, unlike the
                # wall clock, which matters for sub-millisecond retrievals
                started = time.perf_counter()
                hits = retrieve(query, k)
                elapsed = time.perf_counter() - started

                results[name]['times'].append(elapsed)
                results[name]['results'].append(hits)

        return results

    def analyze_chunk_size_impact(self, text: str, chunk_sizes: List[int]) -> dict:
        """Summarise how fixed-size chunking behaves for several chunk sizes.

        Args:
            text: Text to chunk with ``DocumentChunker.fixed_chunking``.
            chunk_sizes: Chunk sizes (in words) to evaluate.

        Returns:
            Mapping from chunk size to a dict with ``num_chunks``,
            ``avg_chunk_length`` (words), ``embedding_variance`` (variance
            over all embedding values) and ``memory_usage`` (size of the
            embedding array in MB). Sizes that produce no chunks report
            zeros for every field.
        """
        chunker = DocumentChunker()
        results = {}

        for size in chunk_sizes:
            chunks = chunker.fixed_chunking(text, size)
            if not chunks:
                # Nothing to embed; avoid NaN means and encoding an empty batch
                results[size] = {
                    'num_chunks': 0,
                    'avg_chunk_length': 0.0,
                    'embedding_variance': 0.0,
                    'memory_usage': 0.0,
                }
                continue

            embeddings = self.retriever.embedding_model.encode(chunks)

            results[size] = {
                'num_chunks': len(chunks),
                'avg_chunk_length': float(np.mean([len(chunk.split()) for chunk in chunks])),
                'embedding_variance': float(np.var(embeddings)),
                'memory_usage': embeddings.nbytes / (1024 * 1024)  # MB
            }

        return results

# Time every retrieval strategy on a few queries, then relate chunk size to chunk count
analyzer = PerformanceAnalyzer(retriever)

# Test queries
test_queries = [
    "machine learning algorithms",
    "neural networks",
    "computer vision",
    "natural language processing"
]

# Benchmark retrieval methods
benchmark_results = analyzer.benchmark_retrieval_methods(test_queries)

print("=== Retrieval Method Performance ===")
for method, data in benchmark_results.items():
    avg_time = np.mean(data['times'])
    std_time = np.std(data['times'])
    print(f"{method.capitalize()} Retrieval:")
    # The plus-minus sign is written as an escape so it survives re-encoding of the file
    print(f"  Average time: {avg_time:.4f}s (\u00b1{std_time:.4f}s)")
    print()

# Visualize performance: explicit axes keep each panel's settings together
fig, (ax_latency, ax_chunks) = plt.subplots(1, 2, figsize=(12, 4))

methods = list(benchmark_results.keys())
avg_times = [np.mean(benchmark_results[method]['times']) for method in methods]
ax_latency.bar(methods, avg_times)
ax_latency.set_title('Average Retrieval Time by Method')
ax_latency.set_ylabel('Time (seconds)')

chunk_analysis = analyzer.analyze_chunk_size_impact(" ".join(documents), [10, 20, 30, 50])
sizes = list(chunk_analysis.keys())
num_chunks = [chunk_analysis[size]['num_chunks'] for size in sizes]
ax_chunks.plot(sizes, num_chunks, marker='o')
ax_chunks.set_title('Number of Chunks vs Chunk Size')
ax_chunks.set_xlabel('Chunk Size (words)')
ax_chunks.set_ylabel('Number of Chunks')

fig.tight_layout()
plt.show()

print("\n=== Chunk Size Analysis ===")
# The loop variable is deliberately not called `metrics`: that name is bound
# to the DistanceMetrics instance in an earlier cell and must not be clobbered.
for size, chunk_stats in chunk_analysis.items():
    print(f"Chunk size {size} words:")
    print(f"  Number of chunks: {chunk_stats['num_chunks']}")
    print(f"  Average chunk length: {chunk_stats['avg_chunk_length']:.1f} words")
    print(f"  Memory usage: {chunk_stats['memory_usage']:.2f} MB")
    print()

## 9. Complete RAG Pipeline Example

In [None]:
class SimpleRAGPipeline:
    """End-to-end RAG pipeline: embed, index, retrieve, then build a prompt.

    Generation is simulated; replace the placeholder response in ``query``
    with a call to a language model using the returned prompt.
    """

    # Retrieval strategies accepted by `query`
    _METHODS = ('dense', 'sparse', 'hybrid')

    def __init__(self, documents: List[str], embedding_model_name: str = 'all-MiniLM-L6-v2'):
        """Embed ``documents`` and build the vector store and retriever.

        Args:
            documents: Corpus to index.
            embedding_model_name: Model name passed to ``SentenceTransformer``.
        """
        self.documents = documents
        self.embedding_model = SentenceTransformer(embedding_model_name)

        # Process documents
        self.embeddings = self.embedding_model.encode(documents)

        # Create vector database
        self.vector_db = SimpleVectorDB(self.embeddings.shape[1])
        self.vector_db.add_documents(self.embeddings, documents)

        # Create retriever
        self.retriever = RAGRetriever(self.vector_db, self.embedding_model, documents)

    def query(self, question: str, k: int = 3, method: str = 'hybrid') -> dict:
        """Retrieve context for ``question`` and assemble a (simulated) answer.

        Args:
            question: User question.
            k: Number of documents to retrieve.
            method: ``'dense'``, ``'sparse'`` or ``'hybrid'``.

        Returns:
            Dict with ``question``, ``retrieved_documents``,
            ``retrieval_scores``, ``context``, ``prompt`` (the text that
            would be sent to a language model), ``response``,
            ``retrieval_time`` and ``total_time`` (seconds) and
            ``method_used``.

        Raises:
            ValueError: If ``method`` is not one of the supported strategies.
                Previously any unknown value silently ran hybrid retrieval
                while reporting the unknown name in ``method_used``.
        """
        if method not in self._METHODS:
            raise ValueError(
                f"Unknown retrieval method {method!r}; expected one of {self._METHODS}"
            )
        retrieve = getattr(self.retriever, f"{method}_retrieval")

        # perf_counter is monotonic and high-resolution, so short intervals are reliable
        start_time = time.perf_counter()
        results = retrieve(question, k)
        retrieval_time = time.perf_counter() - start_time

        retrieved_documents = [doc for _, doc in results]
        retrieval_scores = [score for score, _ in results]

        # Prepare context for generation
        context = "\n".join(retrieved_documents)

        # Prompt an LLM would receive; returned so callers can plug in a real model
        prompt = (
            "Based on the following context, answer the question:\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {question}\n\n"
            "Answer:"
        )

        # Simulated response (replace with actual LLM call)
        response = f"Based on the retrieved documents, here's what I found about '{question}': {context[:200]}..."

        total_time = time.perf_counter() - start_time

        return {
            'question': question,
            'retrieved_documents': retrieved_documents,
            'retrieval_scores': retrieval_scores,
            'context': context,
            'prompt': prompt,
            'response': response,
            'retrieval_time': retrieval_time,
            'total_time': total_time,
            'method_used': method
        }

# Build the pipeline once, then push a few questions through hybrid retrieval
rag_pipeline = SimpleRAGPipeline(documents)

test_questions = [
    "What is machine learning?",
    "How does deep learning work?",
    "What are the types of learning in AI?"
]

print("=== Complete RAG Pipeline Results ===")
for question in test_questions:
    result = rag_pipeline.query(question, k=2, method='hybrid')

    # Header: what was asked, how it was retrieved, and how long it took
    report_header = [
        f"\nQuestion: {result['question']}",
        f"Method: {result['method_used']}",
        f"Retrieval time: {result['retrieval_time']:.4f}s",
        f"Total time: {result['total_time']:.4f}s",
        "\nRetrieved documents:",
    ]
    print("\n".join(report_header))

    ranked_hits = zip(result['retrieved_documents'], result['retrieval_scores'])
    for rank, (doc, score) in enumerate(ranked_hits, start=1):
        print(f"  {rank}. (Score: {score:.4f}) {doc}")

    # Only the first 150 characters of the simulated answer are shown
    response_preview = result['response'][:150]
    print(f"\nResponse: {response_preview}...")
    print("-" * 80)

## 10. Key Takeaways and Best Practices

### Summary of RAG Concepts Demonstrated:

1. **Document Processing**: Different chunking strategies affect retrieval quality
2. **Embeddings**: Vector representations capture semantic meaning
3. **Distance Metrics**: Cosine similarity is the most common choice for comparing text embeddings
4. **Vector Databases**: Enable efficient similarity search at scale
5. **Retrieval Methods**: Dense, sparse, and hybrid approaches each have strengths
6. **Evaluation**: Multiple metrics needed to assess system performance
7. **Performance**: Trade-offs between accuracy, speed, and resource usage

### Best Practices:

- **Chunk Size**: Balance between context preservation and precision
- **Embedding Models**: Choose based on domain and performance requirements
- **Retrieval Strategy**: Hybrid approaches often perform best
- **Evaluation**: Use multiple metrics and real-world testing
- **Optimization**: Monitor latency, throughput, and resource usage
- **Continuous Improvement**: Implement feedback loops for system enhancement

### Mathematical Foundations:

- Vector mathematics enables semantic search
- Distance metrics quantify similarity
- Optimization algorithms improve performance
- Statistical measures ensure quality

This notebook demonstrates how all RAG components work together to create an effective information retrieval and generation system.