In [None]:
# Experiments for RAG.

In [1]:
!pip install -q sentence-transformers langchain chromadb scikit-learn nltk datasets evaluate beautifulsoup4 requests lxml faiss-cpu transformers

In [44]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import faiss
from bs4 import BeautifulSoup
import requests
import xml.etree.ElementTree as ET
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os
from tqdm import tqdm
import json
import ssl

In [45]:
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/robinsingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/robinsingh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [46]:
# Cell 3: Configuration
class Config:
    def __init__(self):
        # Directory structure
        self.models_dir = "models"
        self.indexes_dir = "indexes"
        self.cache_dir = "cache"
        self.raw_data_dir = "raw_data"
        self.processed_dir = "processed_data"
        
        # Model parameters
        self.max_tokens = 512
        self.top_k = 5
        
        # Create necessary directories
        for dir_path in [self.models_dir, self.indexes_dir, self.cache_dir, 
                        self.raw_data_dir, self.processed_dir]:
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

config = Config()

In [47]:
# Cell 4: Content Fetching and Processing
class ContentFetcher:
    def __init__(self, sitemap_url):
        self.sitemap_url = sitemap_url
    
    def fetch_urls(self, limit=10):
        print(f"Fetching up to {limit} URLs from sitemap...")
        response = requests.get(self.sitemap_url)
        root = ET.fromstring(response.content)
        urls = []
        
        for url in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
            if len(urls) < limit:
                urls.append(url.text)
            else:
                break
        
        print(f"Found {len(urls)} URLs")
        return urls
    
    def fetch_and_save_content(self, urls):
        print("Fetching content from URLs...")
        contents = {}
        
        for url in tqdm(urls):
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.content, 'html.parser')
                
                # Remove script and style elements
                for element in soup(["script", "style"]):
                    element.decompose()
                
                # Get text content
                text = soup.get_text()
                
                # Basic cleaning
                text = text.replace('\n', ' ').replace('\r', ' ')
                text = ' '.join(text.split())
                
                contents[url] = text
                
                # Save raw content
                filename = url.split('/')[-1] or 'index'
                filename = f"{filename}.txt"
                with open(os.path.join(config.raw_data_dir, filename), 'w', encoding='utf-8') as f:
                    f.write(text)
                    
            except Exception as e:
                print(f"Error fetching {url}: {str(e)}")
        
        # Save URL to content mapping
        with open(os.path.join(config.raw_data_dir, 'url_mapping.json'), 'w') as f:
            json.dump(contents, f, indent=2)
        
        return contents
    
    def chunk_and_save_text(self, contents):
        print("Chunking text into smaller segments...")
        all_chunks = []
        chunk_mapping = {}
        
        for url, text in contents.items():
            sentences = sent_tokenize(text)
            chunks = []
            current_chunk = []
            current_length = 0
            
            for sentence in sentences:
                sentence_length = len(sentence.split())
                if current_length + sentence_length > config.max_tokens:
                    if current_chunk:
                        chunk_text = " ".join(current_chunk)
                        chunks.append(chunk_text)
                        all_chunks.append(chunk_text)
                    current_chunk = [sentence]
                    current_length = sentence_length
                else:
                    current_chunk.append(sentence)
                    current_length += sentence_length
            
            if current_chunk:
                chunk_text = " ".join(current_chunk)
                chunks.append(chunk_text)
                all_chunks.append(chunk_text)
            
            chunk_mapping[url] = chunks
        
        # Save all chunks and mapping
        with open(os.path.join(config.processed_dir, 'all_chunks.json'), 'w', encoding='utf-8') as f:
            json.dump(all_chunks, f, indent=2)
        
        with open(os.path.join(config.processed_dir, 'chunk_mapping.json'), 'w', encoding='utf-8') as f:
            json.dump(chunk_mapping, f, indent=2)
        
        return all_chunks, chunk_mapping

In [48]:
# Cell 5: Model Management
class ModelManager:
    def __init__(self):
        self.models = {}
        self.tokenizers = {}
        self.model_configs = {
            'e5': 'intfloat/e5-base-v2',
            'sbert': 'sentence-transformers/all-MiniLM-L6-v2'
        }
    
    def download_and_save_models(self):
        print("Downloading and saving models...")
        for model_name, model_path in self.model_configs.items():
            print(f"\nProcessing {model_name} from {model_path}...")
            model_save_path = os.path.join(config.models_dir, model_name)
            
            if not os.path.exists(model_save_path):
                if model_name == 'sbert':
                    model = SentenceTransformer(model_path)
                    model.save(model_save_path)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_path)
                    model = AutoModel.from_pretrained(model_path)
                    
                    tokenizer.save_pretrained(model_save_path)
                    model.save_pretrained(model_save_path)
                print(f"Successfully downloaded and saved {model_name}")
            else:
                print(f"{model_name} already exists in local storage")
    
    def load_models(self):
        print("Loading models...")
        for model_name in self.model_configs.keys():
            model_path = os.path.join(config.models_dir, model_name)
            if not os.path.exists(model_path):
                raise ValueError(f"Model {model_name} not found. Run download_and_save_models first.")
            
            if model_name == 'sbert':
                self.models[model_name] = SentenceTransformer(model_path)
            else:
                self.tokenizers[model_name] = AutoTokenizer.from_pretrained(model_path)
                self.models[model_name] = AutoModel.from_pretrained(model_path)


In [49]:
# Cell 6: Indexing and Vectorization
class VectorIndexer:
    def __init__(self, model_manager):
        self.model_manager = model_manager
        self.indexes = {}
    
    def generate_embeddings(self, text, model_name):
        if model_name == 'sbert':
            return self.model_manager.models[model_name].encode([text])[0]
        else:
            model = self.model_manager.models[model_name]
            tokenizer = self.model_manager.tokenizers[model_name]
            
            inputs = tokenizer(text, padding=True, truncation=True, 
                             return_tensors="pt", max_length=config.max_tokens)
            with torch.no_grad():
                outputs = model(**inputs)
            return outputs.last_hidden_state.mean(dim=1).numpy()[0]
    
    def create_indexes(self, chunks):
        print("Creating indexes...")
        # Create TF-IDF index
        print("\nCreating TF-IDF index...")
        tfidf = TfidfVectorizer()
        tfidf_vectors = tfidf.fit_transform(chunks)
        
        with open(os.path.join(config.indexes_dir, 'tfidf.pkl'), 'wb') as f:
            pickle.dump((tfidf, tfidf_vectors), f)
        
        # Create FAISS indexes
        for model_name in self.model_manager.model_configs.keys():
            print(f"\nCreating FAISS index for {model_name}...")
            embeddings = []
            
            for chunk in tqdm(chunks):
                embedding = self.generate_embeddings(chunk, model_name)
                embeddings.append(embedding)
            
            embeddings = np.array(embeddings)
            index = faiss.IndexFlatL2(embeddings.shape[1])
            index.add(embeddings)
            
            faiss.write_index(index, os.path.join(config.indexes_dir, f"{model_name}.index"))
            print(f"Index saved for {model_name}")

In [50]:
# Cell 7: Main Pipeline Class
class DocumentRetrieval:
    def __init__(self):
        self.content_fetcher = None
        self.model_manager = ModelManager()
        self.vector_indexer = None
        self.chunks = None
        self.tfidf = None
        self.tfidf_vectors = None
        self.indexes = {}
    
    def setup(self, sitemap_url):
        print("\nStep 1: Fetching and processing content...")
        self.content_fetcher = ContentFetcher(sitemap_url)
        urls = self.content_fetcher.fetch_urls()
        contents = self.content_fetcher.fetch_and_save_content(urls)
        self.chunks, _ = self.content_fetcher.chunk_and_save_text(contents)
        
        print("\nStep 2: Downloading and loading models...")
        self.model_manager.download_and_save_models()
        self.model_manager.load_models()
        
        print("\nStep 3: Creating indexes...")
        self.vector_indexer = VectorIndexer(self.model_manager)
        self.vector_indexer.create_indexes(self.chunks)
        
        print("\nSetup complete!")
    
    def load_indexes(self):
        # Load TF-IDF
        with open(os.path.join(config.indexes_dir, 'tfidf.pkl'), 'rb') as f:
            self.tfidf, self.tfidf_vectors = pickle.load(f)
        
        # Load FAISS indexes
        for model_name in self.model_manager.model_configs.keys():
            index_path = os.path.join(config.indexes_dir, f"{model_name}.index")
            self.indexes[model_name] = faiss.read_index(index_path)
    
    def query(self, query_text, k=5):
        if not self.indexes:
            self.load_indexes()
        
        # TF-IDF search
        query_vector = self.tfidf.transform([query_text])
        tfidf_scores = (query_vector * self.tfidf_vectors.T).toarray()[0]
        keyword_indices = np.argsort(tfidf_scores)[-k:][::-1]
        
        # Semantic search
        semantic_results = {}
        for model_name, index in self.indexes.items():
            query_embedding = self.vector_indexer.generate_embeddings(query_text, model_name)
            D, I = index.search(query_embedding.reshape(1, -1), k)
            semantic_results[model_name] = {'indices': I[0], 'distances': D[0]}
        
        # Combine results
        candidate_indices = set(keyword_indices)
        for results in semantic_results.values():
            candidate_indices.update(results['indices'])
        
        # Get final documents and scores
        final_docs = [self.chunks[i] for i in candidate_indices]
        final_scores = [tfidf_scores[i] for i in candidate_indices]
        
        # Sort by score
        sorted_pairs = sorted(zip(final_docs, final_scores), key=lambda x: x[1], reverse=True)
        return [doc for doc, _ in sorted_pairs[:k]], [score for _, score in sorted_pairs[:k]]

In [51]:
# Cell 8: Usage Example
# Initialize the pipeline
retrieval = DocumentRetrieval()

# Run setup (only need to do this once)
retrieval.setup("https://nextjs.org/sitemap.xml")


Step 1: Fetching and processing content...
Fetching up to 10 URLs from sitemap...
Found 10 URLs
Fetching content from URLs...


100%|██████████| 10/10 [00:01<00:00,  5.03it/s]


Chunking text into smaller segments...

Step 2: Downloading and loading models...
Downloading and saving models...

Processing e5 from intfloat/e5-base-v2...
Successfully downloaded and saved e5

Processing sbert from sentence-transformers/all-MiniLM-L6-v2...
Successfully downloaded and saved sbert
Loading models...

Step 3: Creating indexes...
Creating indexes...

Creating TF-IDF index...

Creating FAISS index for e5...


100%|██████████| 41/41 [00:07<00:00,  5.56it/s]


Index saved for e5

Creating FAISS index for sbert...


100%|██████████| 41/41 [00:00<00:00, 42.17it/s]

Index saved for sbert

Setup complete!





In [52]:
# Now you can run multiple queries
def search_docs(query):
    results, scores = retrieval.query(query)
    print("\nQuery:", query)
    print("\nTop 5 Results:")
    for i, (doc, score) in enumerate(zip(results, scores), 1):
        print(f"\n{i}. Score: {score:.4f}")
        print(f"Document: {doc[:200]}...")

In [55]:
search_docs("apple silicon support in nextjs")



Query: apple silicon support in nextjs

Top 5 Results:

1. Score: 0.1940
Document: Next.js 10.1 | Next.jsSkip to contentSearch documentation...Search...⌘KShowcaseDocsBlogTemplatesEnterpriseSearch documentation...Search...⌘KDeployLearnBack to BlogMonday, March 29th 2021Next.js 10.1Po...

2. Score: 0.0942
Document: When the nextjs.org website launched, we would manually keep the documentation changes up to date by periodically copying the content from the Next.js GitHub repository to the website GitHub repositor...

3. Score: 0.0930
Document: Next.js, the React framework for production, enables you to incrementally adopt React.Read MoreOctober 27th, 2020+5Next.js 10We are excited to introduce Next.js 10, featuring: Built-in Image Component...

4. Score: 0.0574
Document: Additional Layouts Based on your feedback, we've added a variety of new layouts and options for next/image: layout=fill: You don't need to provide width and height. (Demo) layout=fixed: Native img beh...

5. Score: 0.050

In [56]:
search_docs("what is nextjs")



Query: what is nextjs

Top 5 Results:

1. Score: 0.1533
Document: When the nextjs.org website launched, we would manually keep the documentation changes up to date by periodically copying the content from the Next.js GitHub repository to the website GitHub repositor...

2. Score: 0.0926
Document: New Next.js Documentation | Next.jsSkip to contentSearch documentation...Search...⌘KShowcaseDocsBlogTemplatesEnterpriseSearch documentation...Search...⌘KDeployLearnBack to BlogThursday, January 9th 20...

3. Score: 0.0639
Document: This is sometimes called the “donut” pattern: The outer part of the donut is a server component that handles data fetching or heavy logic. The hole in the middle is a child component that might have s...

4. Score: 0.0574
Document: Sites that load personalized content or ads may also experience wildly different performance from user to user. An emulated test cannot capture these important signals. Next.js Speed Insights allows y...

5. Score: 0.0499
Document: Some 

In [57]:
import time
from sklearn.metrics import precision_score, recall_score, ndcg_score
import numpy as np

class ModelEvaluator:
    def __init__(self, retrieval_system):
        self.retrieval = retrieval_system
        
    def evaluate_single_model(self, model_name, query, k=5):
        """Evaluate a single model's performance on a query"""
        start_time = time.time()
        
        # Get query embedding
        query_embedding = self.retrieval.vector_indexer.generate_embeddings(query, model_name)
        
        # Search using FAISS
        D, I = self.retrieval.indexes[model_name].search(query_embedding.reshape(1, -1), k)
        
        end_time = time.time()
        response_time = end_time - start_time
        
        # Get the documents
        results = [self.retrieval.chunks[i] for i in I[0]]
        scores = [-d for d in D[0]]  # Convert distances to scores (negative distance)
        
        return results, scores, response_time
    
    def evaluate_models(self, test_queries):
        """
        Evaluate each model separately
        """
        models = ['e5', 'sbert']
        results = {model: {
            "response_times": [],
            "keyword_matches": [],
            "ndcg_scores": []
        } for model in models}
        
        print("\nStarting Model-by-Model Evaluation:")
        for model in models:
            print(f"\n{'-'*20}")
            print(f"Evaluating {model.upper()} Model")
            print(f"{'-'*20}")
            
            for test_case in test_queries:
                query = test_case["query"]
                expected = test_case["expected_keywords"]
                relevance = test_case["relevance_scores"]
                
                # Get results for this model
                docs, scores, response_time = self.evaluate_single_model(model, query)
                
                # Store response time
                results[model]["response_times"].append(response_time)
                
                # Check keyword matches
                keyword_matches = []
                for doc in docs:
                    matches = sum(1 for keyword in expected if keyword.lower() in doc.lower())
                    keyword_matches.append(matches / len(expected))
                results[model]["keyword_matches"].append(max(keyword_matches))
                
                # Calculate NDCG
                if len(relevance) > 0:
                    predicted_scores = scores[:len(relevance)]
                    ndcg = ndcg_score([relevance], [predicted_scores])
                    results[model]["ndcg_scores"].append(ndcg)
                
                # Print detailed results for this query
                print(f"\nQuery: {query}")
                print(f"Response Time: {response_time:.3f} seconds")
                print(f"Top 2 Results:")
                for i, (doc, score) in enumerate(zip(docs[:2], scores[:2]), 1):
                    print(f"\n{i}. Score: {score:.4f}")
                    print(f"Preview: {doc[:150]}...")
                    matches = [kw for kw in expected if kw.lower() in doc.lower()]
                    print(f"Matching keywords: {matches}")
            
            # Print summary for this model
            print(f"\nSummary for {model.upper()}:")
            print(f"Average Response Time: {np.mean(results[model]['response_times']):.3f} seconds")
            print(f"Average Keyword Match Rate: {np.mean(results[model]['keyword_matches']):.2f}")
            print(f"Average NDCG Score: {np.mean(results[model]['ndcg_scores']):.2f}")
        
        return results

In [58]:
test_queries = [
    {
        "query": "How to implement SSR in Next.js?",
        "expected_keywords": [
            "server", "side", "rendering", "getServerSideProps", "SSR",
            "initial props", "data fetching"
        ],
        "relevance_scores": [1, 1, 1, 0, 0]
    },
    {
        "query": "What are API routes in Next.js?",
        "expected_keywords": [
            "api", "routes", "handler", "endpoint", "backend",
            "serverless", "functions"
        ],
        "relevance_scores": [1, 1, 0, 0, 0]
    },
    {
        "query": "How to deploy Next.js?",
        "expected_keywords": [
            "deploy", "vercel", "production", "build", "hosting",
            "environment", "configuration"
        ],
        "relevance_scores": [1, 1, 1, 0, 0]
    },
    {
        "query": "How to optimize Next.js performance?",
        "expected_keywords": [
            "optimize", "performance", "lazy loading", "images",
            "caching", "bundling", "lighthouse"
        ],
        "relevance_scores": [1, 1, 1, 0, 0]
    }
]

In [59]:
def run_model_evaluation():
    # Make sure retrieval system is initialized
    if not hasattr(retrieval, 'indexes') or not retrieval.indexes:
        print("Loading indexes...")
        retrieval.load_indexes()
    
    # Create evaluator
    evaluator = ModelEvaluator(retrieval)
    
    # Run evaluation
    print("\nStarting evaluation...")
    results = evaluator.evaluate_models(test_queries)
    
    # Print comparative analysis
    print("\n" + "="*50)
    print("Comparative Analysis")
    print("="*50)
    
    metrics = ["response_times", "keyword_matches", "ndcg_scores"]
    metric_names = ["Response Time (s)", "Keyword Match Rate", "NDCG Score"]
    
    for metric, metric_name in zip(metrics, metric_names):
        print(f"\n{metric_name}:")
        for model in results:
            avg_value = np.mean(results[model][metric])
            print(f"{model.upper()}: {avg_value:.3f}")
    
    return results

In [60]:
evaluation_results = run_model_evaluation()



Starting evaluation...

Starting Model-by-Model Evaluation:

--------------------
Evaluating E5 Model
--------------------

Query: How to implement SSR in Next.js?
Response Time: 0.075 seconds
Top 2 Results:

1. Score: -53.3164
Preview: Read MoreFebruary 17th, 2022+9Next.js 12.1We're excited to release one of our most requested features with Next.js 12.1: On-demand ISR (Beta) Expanded...
Matching keywords: ['server', 'data fetching']

2. Score: -53.6190
Preview: Next.js 10 | Next.jsSkip to contentSearch documentation...Search...⌘KShowcaseDocsBlogTemplatesEnterpriseSearch documentation...Search...⌘KDeployLearnB...
Matching keywords: ['server', 'side', 'rendering', 'getServerSideProps', 'data fetching']

Query: What are API routes in Next.js?
Response Time: 0.050 seconds
Top 2 Results:

1. Score: -47.5533
Preview: Then, easily revalidate cached data and update your UI in one network roundtrip.Advanced Routing & Nested LayoutsCreate routes using the file system, ...
Matching keywords: ['