In [1]:
import json
import os
import numpy as np
from typing import List, Dict, Tuple
from dataclasses import dataclass
import tiktoken
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import time
import chromadb
from chromadb.config import Settings
import gc
import shutil
import ijson

  from .autonotebook import tqdm as notebook_tqdm


## Document Chunking

In [None]:
@dataclass
class ChunkingStrategy:
    """Configuration for chunking strategy"""
    name: str
    chunk_size: int
    chunk_overlap: int
    
class DocumentChunker:
    """
    Splits documents into chunks for RAG pipeline testing
    Tests multiple chunking strategies
    """
    
    def __init__(self, encoding_name: str = "cl100k_base"):
        """
        Initialize chunker
        encoding_name: tiktoken encoding (cl100k_base for GPT-4, text-embedding-ada-002)
        """
        self.encoding = tiktoken.get_encoding(encoding_name)
        
        # Define chunking strategies to test
        self.strategies = [
            ChunkingStrategy(name="small", chunk_size=256, chunk_overlap=50),
            ChunkingStrategy(name="medium", chunk_size=512, chunk_overlap=100),
            ChunkingStrategy(name="large", chunk_size=768, chunk_overlap=150),
            ChunkingStrategy(name="extra_large", chunk_size=1024, chunk_overlap=200),
        ]
    
    def count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.encoding.encode(text))
    
    def split_by_tokens(self, text: str, chunk_size: int, overlap: int) -> List[str]:
        """
        Split text into chunks by token count with overlap
        
        Args:
            text: Text to split
            chunk_size: Maximum tokens per chunk
            overlap: Number of overlapping tokens between chunks
        
        Returns:
            List of text chunks
        """
        tokens = self.encoding.encode(text)
        chunks = []
        
        start = 0
        while start < len(tokens):
            # Get chunk
            end = start + chunk_size
            chunk_tokens = tokens[start:end]
            
            # Decode back to text
            chunk_text = self.encoding.decode(chunk_tokens)
            chunks.append(chunk_text)
            
            # Move to next chunk with overlap
            start = end - overlap
            
            # Prevent infinite loop
            if start >= len(tokens) - overlap:
                break
        
        return chunks
    
    def split_by_sentences(self, text: str, chunk_size: int, overlap: int) -> List[str]:
        """
        Split text by sentences, respecting token limits
        Better for maintaining semantic coherence
        """
        # Simple sentence splitting (can be improved with spaCy/nltk)
        sentences = []
        current = ""
        
        for char in text:
            current += char
            if char in '.!?' and len(current) > 20:  # Minimum sentence length
                sentences.append(current.strip())
                current = ""
        
        if current.strip():
            sentences.append(current.strip())
        
        # Group sentences into chunks
        chunks = []
        current_chunk = ""
        current_tokens = 0
        
        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)
            
            # If single sentence exceeds chunk size, split it
            if sentence_tokens > chunk_size:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""
                    current_tokens = 0
                
                # Split long sentence by tokens
                token_chunks = self.split_by_tokens(sentence, chunk_size, overlap)
                chunks.extend(token_chunks)
                continue
            
            # Check if adding sentence exceeds chunk size
            if current_tokens + sentence_tokens > chunk_size:
                chunks.append(current_chunk.strip())
                
                # Start new chunk with overlap
                overlap_text = self.get_overlap_text(current_chunk, overlap)
                current_chunk = overlap_text + " " + sentence
                current_tokens = self.count_tokens(current_chunk)
            else:
                current_chunk += " " + sentence
                current_tokens += sentence_tokens
        
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        
        return chunks
    
    def get_overlap_text(self, text: str, overlap_tokens: int) -> str:
        """Get last N tokens from text for overlap"""
        tokens = self.encoding.encode(text)
        if len(tokens) <= overlap_tokens:
            return text
        
        overlap_tokens_list = tokens[-overlap_tokens:]
        return self.encoding.decode(overlap_tokens_list)
    
    def chunk_document(self, document: Dict, strategy: ChunkingStrategy, 
                       method: str = "sentences") -> List[Dict]:
        """
        Chunk a single document using specified strategy
        
        Args:
            document: Document dict with 'content', 'title', etc.
            strategy: ChunkingStrategy configuration
            method: 'sentences' or 'tokens'
        
        Returns:
            List of chunk dicts with metadata
        """
        content = document.get('content', '')
        
        if not content or len(content.strip()) < 50:
            return []
        
        # Choose splitting method
        if method == "sentences":
            chunks = self.split_by_sentences(
                content, 
                strategy.chunk_size, 
                strategy.chunk_overlap
            )
        else:
            chunks = self.split_by_tokens(
                content,
                strategy.chunk_size,
                strategy.chunk_overlap
            )
        
        # Create chunk objects with metadata
        chunk_objects = []
        for i, chunk_text in enumerate(chunks):
            chunk_obj = {
                "chunk_id": f"{document.get('id', 'doc')}_{strategy.name}_chunk_{i}",
                "document_id": document.get('id'),
                "chunk_index": i,
                "total_chunks": len(chunks),
                "text": chunk_text,
                "token_count": self.count_tokens(chunk_text),
                "strategy": strategy.name,
                "chunk_size": strategy.chunk_size,
                "overlap": strategy.chunk_overlap,
                
                # Preserve document metadata
                "document_title": document.get('title', ''),
                "document_type": document.get('type', ''),
                "region": document.get('region', ''),
                "province": document.get('province', ''),
                "metadata": document.get('metadata', {})
            }
            chunk_objects.append(chunk_obj)
        
        return chunk_objects
    
    def process_all_documents(self, documents: List[Dict], 
                             output_dir: str = "data/chunked",
                             method: str = "sentences"):
        """
        Process all documents with all chunking strategies
        
        Args:
            documents: List of document dicts
            output_dir: Directory to save chunked data
            method: Chunking method ('sentences' or 'tokens')
        """
        os.makedirs(output_dir, exist_ok=True)
        
        print(f"üî™ Chunking {len(documents)} documents with {len(self.strategies)} strategies")
        print(f"üìä Method: {method}")
        print("="*60)
        
        # Process each strategy
        for strategy in self.strategies:
            print(f"\n{'='*60}")
            print(f"Strategy: {strategy.name}")
            print(f"  Chunk Size: {strategy.chunk_size} tokens")
            print(f"  Overlap: {strategy.chunk_overlap} tokens")
            print(f"{'='*60}")
            
            all_chunks = []
            
            for doc in tqdm(documents, desc=f"Chunking ({strategy.name})"):
                chunks = self.chunk_document(doc, strategy, method)
                all_chunks.extend(chunks)
            
            # Save chunks for this strategy
            output_file = os.path.join(output_dir, f"chunks_{strategy.name}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_chunks, f, indent=2, ensure_ascii=False)
            
            # Calculate statistics
            total_tokens = sum(c['token_count'] for c in all_chunks)
            avg_tokens = total_tokens / len(all_chunks) if all_chunks else 0
            
            print(f"\nüìä Statistics for {strategy.name}:")
            print(f"  Total Chunks: {len(all_chunks):,}")
            print(f"  Total Tokens: {total_tokens:,}")
            print(f"  Avg Tokens/Chunk: {avg_tokens:.0f}")
            print(f"  Saved to: {output_file}")
        
        # Generate comparison report
        self.generate_comparison_report(output_dir)
        
        print(f"\n{'='*60}")
        print("‚úÖ Chunking Complete!")
        print(f"üìÇ All chunked data saved to: {output_dir}/")
        print(f"{'='*60}")
    
    def generate_comparison_report(self, output_dir: str):
        """Generate a comparison report of all strategies"""
        report = {
            "strategies": [],
            "method": "sentence-based"
        }
        
        for strategy in self.strategies:
            chunk_file = os.path.join(output_dir, f"chunks_{strategy.name}.json")
            
            if os.path.exists(chunk_file):
                with open(chunk_file, 'r', encoding='utf-8') as f:
                    chunks = json.load(f)
                
                strategy_stats = {
                    "name": strategy.name,
                    "chunk_size": strategy.chunk_size,
                    "overlap": strategy.chunk_overlap,
                    "total_chunks": len(chunks),
                    "total_tokens": sum(c['token_count'] for c in chunks),
                    "avg_tokens_per_chunk": sum(c['token_count'] for c in chunks) / len(chunks) if chunks else 0,
                    "file": f"chunks_{strategy.name}.json"
                }
                report["strategies"].append(strategy_stats)
        
        report_file = os.path.join(output_dir, "chunking_report.json")
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        
        print(f"\nüìÑ Comparison report saved to: {report_file}")

def main():
    """Main execution"""
    
    # Load processed documents
    train_file = "data/processed/canada_wilderness_train.json"
    
    if not os.path.exists(train_file):
        print(f"‚ùå Training data not found: {train_file}")
        print("Run combine_canada_data.py first!")
        return
    
    print("üìñ Loading documents...")
    with open(train_file, 'r', encoding='utf-8') as f:
        documents = json.load(f)
    
    print(f"‚úÖ Loaded {len(documents):,} documents")
    
    # Initialize chunker
    chunker = DocumentChunker()
    
    # Process documents
    # Note: For 277k documents, this will take time
    # Consider processing a subset first for testing
    
    # # Option 1: Process subset for testing
    # test_mode = input("\nüß™ Test mode with 1000 documents? (y/n): ").lower().strip()
    
    # if test_mode == 'y':
    #     print("üß™ Running in test mode with 1000 documents")
    #     documents = documents[:1000]
    
    # Process all documents
    chunker.process_all_documents(
        documents=documents,
        output_dir="data/chunked",
        method="sentences"  # Better for semantic coherence
    )

if __name__ == "__main__":
    main()

üìñ Loading documents...
‚úÖ Loaded 233,027 documents
üî™ Chunking 233027 documents with 4 strategies
üìä Method: sentences

Strategy: small
  Chunk Size: 256 tokens
  Overlap: 50 tokens


Chunking (small): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 233027/233027 [00:14<00:00, 15802.80it/s]



üìä Statistics for small:
  Total Chunks: 233,078
  Total Tokens: 9,198,884
  Avg Tokens/Chunk: 39
  Saved to: data/chunked\chunks_small.json

Strategy: medium
  Chunk Size: 512 tokens
  Overlap: 100 tokens


Chunking (medium): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 233027/233027 [00:20<00:00, 11648.04it/s]



üìä Statistics for medium:
  Total Chunks: 233,032
  Total Tokens: 9,196,824
  Avg Tokens/Chunk: 39
  Saved to: data/chunked\chunks_medium.json

Strategy: large
  Chunk Size: 768 tokens
  Overlap: 150 tokens


Chunking (large): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 233027/233027 [00:17<00:00, 13456.94it/s]



üìä Statistics for large:
  Total Chunks: 233,028
  Total Tokens: 9,196,473
  Avg Tokens/Chunk: 39
  Saved to: data/chunked\chunks_large.json

Strategy: extra_large
  Chunk Size: 1024 tokens
  Overlap: 200 tokens


Chunking (extra_large): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 233027/233027 [00:46<00:00, 5018.29it/s]



üìä Statistics for extra_large:
  Total Chunks: 233,027
  Total Tokens: 9,196,323
  Avg Tokens/Chunk: 39
  Saved to: data/chunked\chunks_extra_large.json

üìÑ Comparison report saved to: data/chunked\chunking_report.json

‚úÖ Chunking Complete!
üìÇ All chunked data saved to: data/chunked/


## Embedding Generation

In [None]:
class EmbeddingGenerator:
    """Generate embeddings using only free, local models  - without api keys"""
    
    def __init__(self):
        self.output_dir = "data/embeddings"
        os.makedirs(self.output_dir, exist_ok=True)
        
        self.models = {
            "minilm": "all-MiniLM-L6-v2",  # Fast, 384 dims
            #"bge_large": "BAAI/bge-large-en-v1.5",  # High quality, 1024 dims
        }
        
        self.loaded_models = {}
    
    def load_model(self, model_name: str):
        """Load model lazily"""
        if model_name not in self.loaded_models:
            print(f"  üì• Loading {model_name}...")
            self.loaded_models[model_name] = SentenceTransformer(self.models[model_name])
        return self.loaded_models[model_name]
    
    def generate_embeddings(self, chunks: List[Dict], model_key: str, 
                          batch_size: int = 32):
        """Generate embeddings for chunks"""
        
        model = self.load_model(model_key)
        texts = [chunk['text'] for chunk in chunks]
        
        print(f"  üé® Generating embeddings...")
        all_embeddings = []
        
        for i in tqdm(range(0, len(texts), batch_size)):
            batch = texts[i:i + batch_size]
            embeddings = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
            all_embeddings.extend(embeddings.tolist())
        
        # Add embeddings to chunks
        result = []
        for chunk, embedding in zip(chunks, all_embeddings):
            chunk_copy = chunk.copy()
            chunk_copy['embedding'] = embedding
            chunk_copy['embedding_model'] = model_key
            chunk_copy['embedding_dimension'] = len(embedding)
            result.append(chunk_copy)
        
        return result
    
    def process_strategy(self, strategy_name: str):
        """Process one chunking strategy"""
        
        chunk_file = f"data/chunked/chunks_{strategy_name}.json"
        
        if not os.path.exists(chunk_file):
            print(f"‚ùå Not found: {chunk_file}")
            return
        
        print(f"\n{'='*60}")
        print(f"Processing: {strategy_name}")
        print(f"{'='*60}")
        
        # FIX: Add encoding='utf-8'
        with open(chunk_file, 'r', encoding='utf-8') as f:
            chunks = json.load(f)
        
        print(f"üìä Loaded {len(chunks):,} chunks")
        
        # Process with each model
        for model_key in self.models.keys():
            print(f"\nüîß Model: {model_key}")
            
            chunks_with_emb = self.generate_embeddings(chunks, model_key)
            
            # Save - FIX: Add encoding='utf-8'
            output_file = os.path.join(
                self.output_dir,
                f"{strategy_name}_{model_key}.json"
            )
            
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(chunks_with_emb, f, ensure_ascii=False)
            
            size_mb = os.path.getsize(output_file) / (1024 * 1024)
            print(f"  ‚úÖ Saved: {output_file} ({size_mb:.2f} MB)")

def main():
    print("üÜì Free Embedding Generator (Local Models Only)")
    print("="*60)
    
    generator = EmbeddingGenerator()
    
    # Ask user which strategy to process
    print("\nAvailable chunking strategies:")
    print("  1. small (256 tokens)")
    print("  2. medium (512 tokens)")
    print("  3. large (768 tokens)")
    print("  4. extra_large (1024 tokens)")
    print("  5. all strategies")
    
    choice = input("\nSelect strategy (1-5, default=1): ").strip() or '1'
    
    strategies_map = {
        '1': ['small'],
        '2': ['medium'],
        '3': ['large'],
        '4': ['extra_large'],
        '5': ['small', 'medium', 'large', 'extra_large']
    }
    
    strategies = strategies_map.get(choice, ['small'])
    
    print(f"\nüéØ Processing {len(strategies)} strategy/strategies")
    
    for strategy in strategies:
        generator.process_strategy(strategy)
    
    print("\n" + "="*60)
    print("‚úÖ Embedding generation complete!")
    print(f"üìÇ Embeddings saved to: {generator.output_dir}/")
    print("="*60)

if __name__ == "__main__":
    main()

üÜì Free Embedding Generator (Local Models Only)

Available chunking strategies:
  1. small (256 tokens)
  2. medium (512 tokens)
  3. large (768 tokens)
  4. extra_large (1024 tokens)
  5. all strategies



üéØ Processing 1 strategy/strategies

Processing: small
üìä Loaded 233,078 chunks

üîß Model: minilm
  üì• Loading minilm...
  üé® Generating embeddings...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7284/7284 [56:15<00:00,  2.16it/s]  


  ‚úÖ Saved: data/embeddings\small_minilm.json (2058.80 MB)

üîß Model: bge_large
  üì• Loading bge_large...
  üé® Generating embeddings...


  1%|          | 63/7284 [09:39<18:58:34,  9.46s/it]

In [2]:
#current files in embedding directory
embedding_files = os.listdir("data/embeddings")
print("Current embedding files:", embedding_files)

Current embedding files: ['extra_large_minilm.json', 'medium_minilm.json', 'small_minilm.json']


## Vector Database Generation

In [3]:
class VectorDatabaseManager:
    """
    Manages vector database with organized directory structure
    Each collection gets its own named directory
    """
    
    def __init__(self, base_directory: str = "data/vector_db/"):
        """
        Initialize manager
        
        Args:
            base_directory: Base directory for all vector databases
        """
        self.base_directory = base_directory
        os.makedirs(base_directory, exist_ok=True)
        
        # Track available collections
        self.available_embeddings = self.scan_available_embeddings()
        self.collection_clients = {}  # Store separate clients per collection
    
    def scan_available_embeddings(self) -> List[Dict]:
        """Scan for available embedding files"""
        
        embedding_dir = "data/embeddings"
        available = []
        
        if not os.path.exists(embedding_dir):
            print("‚ö†Ô∏è  No embeddings directory found")
            return available
        
        for filename in os.listdir(embedding_dir):
            if filename.endswith('.json') and not filename.endswith('_report.json'):
                parts = filename.replace('.json', '').split('_')
                
                if len(parts) >= 2:
                    chunk_strategy = parts[0]
                    embedding_model = '_'.join(parts[1:])
                    
                    file_path = os.path.join(embedding_dir, filename)
                    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
                    
                    collection_name = f"{chunk_strategy}_{embedding_model}"
                    
                    available.append({
                        "filename": filename,
                        "filepath": file_path,
                        "chunk_strategy": chunk_strategy,
                        "embedding_model": embedding_model,
                        "collection_name": collection_name,
                        "file_size_mb": round(file_size_mb, 2),
                        "db_directory": os.path.join(self.base_directory, collection_name)
                    })
        
        return available
    
    def get_client_for_collection(self, collection_name: str, db_directory: str):
        """
        Get or create a ChromaDB client for a specific collection
        Each collection gets its own directory
        """
        
        if collection_name not in self.collection_clients:
            # Create directory for this collection
            os.makedirs(db_directory, exist_ok=True)
            
            # Create dedicated client for this collection
            client = chromadb.PersistentClient(path=db_directory)
            self.collection_clients[collection_name] = client
        
        return self.collection_clients[collection_name]
    
    def load_embeddings_file(self, filepath: str) -> List[Dict]:
        """Load embeddings from JSON file"""
        
        print(f"  üìñ Loading: {os.path.basename(filepath)}")
        
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        print(f"  ‚úÖ Loaded {len(data):,} chunks")
        return data
    
    def setup_collection(self, embedding_config: Dict):
        """
        Setup a single collection in its own organized directory
        
        Args:
            embedding_config: Dictionary with embedding file info
        """
        
        collection_name = embedding_config['collection_name']
        db_directory = embedding_config['db_directory']
        
        print(f"\n{'='*70}")
        print(f"Setting up: {collection_name}")
        print(f"{'='*70}")
        print(f"  Chunk Strategy: {embedding_config['chunk_strategy']}")
        print(f"  Embedding Model: {embedding_config['embedding_model']}")
        print(f"  File Size: {embedding_config['file_size_mb']} MB")
        print(f"  Directory: {db_directory}")
        
        # Load embeddings
        chunks = self.load_embeddings_file(embedding_config['filepath'])
        
        if not chunks:
            print("  ‚ùå No chunks loaded, skipping")
            return False
        
        # Get embedding dimension
        embedding_dim = len(chunks[0]['embedding'])
        print(f"  üìä Embedding Dimension: {embedding_dim}")
        
        # Get dedicated client for this collection
        client = self.get_client_for_collection(collection_name, db_directory)
        
        # Delete existing collection if any
        try:
            client.delete_collection(name=collection_name)
            print(f"  üóëÔ∏è  Deleted existing collection")
        except:
            pass
        
        # Create collection
        collection = client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"  ‚úÖ Created collection")
        
        # Add chunks in batches
        print(f"  üíæ Adding {len(chunks):,} chunks...")
        
        batch_size = 1000
        
        for i in tqdm(range(0, len(chunks), batch_size), desc="  Progress"):
            batch = chunks[i:i + batch_size]
            
            ids = []
            embeddings = []
            documents = []
            metadatas = []
            
            for chunk in batch:
                ids.append(chunk['chunk_id'])
                embeddings.append(chunk['embedding'])
                documents.append(chunk['text'])
                
                metadata = {
                    'document_id': chunk.get('document_id', ''),
                    'document_title': chunk.get('document_title', ''),
                    'document_type': chunk.get('document_type', ''),
                    'region': chunk.get('region', ''),
                    'province': chunk.get('province', ''),
                    'chunk_index': chunk.get('chunk_index', 0),
                    'total_chunks': chunk.get('total_chunks', 1),
                    'token_count': chunk.get('token_count', 0),
                    'strategy': chunk.get('strategy', ''),
                    'embedding_model': chunk.get('embedding_model', '')
                }
                
                if 'metadata' in chunk and isinstance(chunk['metadata'], dict):
                    trail_meta = chunk['metadata']
                    metadata['trail_type'] = trail_meta.get('trail_type', '')
                    metadata['difficulty'] = trail_meta.get('difficulty', '')
                    metadata['surface'] = trail_meta.get('surface', '')
                
                metadatas.append(metadata)
            
            try:
                collection.add(
                    ids=ids,
                    embeddings=embeddings,
                    documents=documents,
                    metadatas=metadatas
                )
            except Exception as e:
                print(f"\n  ‚ö†Ô∏è  Error in batch {i//batch_size}: {e}")
                continue
        
        # Verify
        count = collection.count()
        print(f"  ‚úÖ Collection contains {count:,} items")
        
        return True
    
    def setup_all_collections(self):
        """Setup all available collections"""
        
        print("üóÑÔ∏è  Organized Vector Database Setup")
        print("="*70)
        
        if not self.available_embeddings:
            print("‚ùå No embedding files found!")
            return
        
        print(f"\nüìä Found {len(self.available_embeddings)} embedding files:")
        for emb in self.available_embeddings:
            print(f"  - {emb['filename']} ({emb['file_size_mb']} MB)")
        
        print("\n" + "="*70)
        print("Each collection will be stored in:")
        for emb in self.available_embeddings:
            print(f"  üìÅ {emb['db_directory']}")
        
        print("\n" + "="*70)
        proceed = input("\nProceed with setup? (y/n): ").strip().lower()
        
        if proceed != 'y':
            print("Cancelled")
            return
        
        # Setup all collections
        successful = []
        failed = []
        
        for emb_config in self.available_embeddings:
            try:
                if self.setup_collection(emb_config):
                    successful.append(emb_config['collection_name'])
            except Exception as e:
                print(f"\n‚ùå Failed: {emb_config['collection_name']}")
                print(f"   Error: {e}")
                failed.append(emb_config['collection_name'])
        
        # Summary
        print(f"\n{'='*70}")
        print("üìä SETUP SUMMARY")
        print(f"{'='*70}")
        print(f"‚úÖ Successfully setup: {len(successful)} collections")
        for name in successful:
            print(f"   - {name}")
        
        if failed:
            print(f"\n‚ùå Failed: {len(failed)} collections")
            for name in failed:
                print(f"   - {name}")
        
        print(f"\nüìÅ Directory Structure:")
        print(f"{self.base_directory}/")
        for emb in self.available_embeddings:
            if emb['collection_name'] in successful:
                print(f"  ‚îú‚îÄ‚îÄ {emb['collection_name']}/")
        
        print(f"\n{'='*70}")
    
    def list_all_collections(self):
        """List all collections across all directories"""
        
        print("\nüìö All Collections:")
        print("="*70)
        
        if not os.path.exists(self.base_directory):
            print("  No collections found")
            return
        
        total_items = 0
        
        for item in os.listdir(self.base_directory):
            item_path = os.path.join(self.base_directory, item)
            
            if os.path.isdir(item_path):
                try:
                    client = chromadb.PersistentClient(path=item_path)
                    collections = client.list_collections()
                    
                    for collection in collections:
                        count = collection.count()
                        total_items += count
                        print(f"  üìÅ {item}/")
                        print(f"     ‚îî‚îÄ {collection.name}: {count:,} items")
                
                except Exception as e:
                    print(f"  ‚ö†Ô∏è  {item}: Error loading ({e})")
        
        print(f"\n  Total: {total_items:,} items across all collections")
        print("="*70)

def main():
    """Main execution"""
    
    print("\nüóÑÔ∏è  Organized Vector Database Setup")
    print("="*70)
    print("This will create a clean directory structure:")
    print("  data/vector_db/")
    print("    ‚îú‚îÄ‚îÄ small_minilm/")
    print("    ‚îú‚îÄ‚îÄ medium_minilm/")
    print("    ‚îî‚îÄ‚îÄ extra_large_minilm/")
    print("="*70 + "\n")
    
    manager = VectorDatabaseManager()
    
    # Setup collections
    manager.setup_all_collections()
    
    # List all collections
    manager.list_all_collections()

if __name__ == "__main__":
    main()


üóÑÔ∏è  Organized Vector Database Setup
This will create a clean directory structure:
  data/vector_db/
    ‚îú‚îÄ‚îÄ small_minilm/
    ‚îú‚îÄ‚îÄ medium_minilm/
    ‚îî‚îÄ‚îÄ extra_large_minilm/

üóÑÔ∏è  Organized Vector Database Setup

üìä Found 3 embedding files:
  - extra_large_minilm.json (2061.45 MB)
  - medium_minilm.json (2059.05 MB)
  - small_minilm.json (2058.8 MB)

Each collection will be stored in:
  üìÅ data/vector_db/extra_large_minilm
  üìÅ data/vector_db/medium_minilm
  üìÅ data/vector_db/small_minilm


Setting up: extra_large_minilm
  Chunk Strategy: extra
  Embedding Model: large_minilm
  File Size: 2061.45 MB
  Directory: data/vector_db/extra_large_minilm
  üìñ Loading: extra_large_minilm.json
  ‚úÖ Loaded 233,027 chunks
  üìä Embedding Dimension: 384
  ‚úÖ Created collection
  üíæ Adding 233,027 chunks...


  Progress: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 234/234 [18:31<00:00,  4.75s/it]


  ‚úÖ Collection contains 233,027 items

Setting up: medium_minilm
  Chunk Strategy: medium
  Embedding Model: minilm
  File Size: 2059.05 MB
  Directory: data/vector_db/medium_minilm
  üìñ Loading: medium_minilm.json
  ‚úÖ Loaded 233,032 chunks
  üìä Embedding Dimension: 384
  ‚úÖ Created collection
  üíæ Adding 233,032 chunks...


  Progress: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 234/234 [17:45<00:00,  4.55s/it]


  ‚úÖ Collection contains 233,032 items

Setting up: small_minilm
  Chunk Strategy: small
  Embedding Model: minilm
  File Size: 2058.8 MB
  Directory: data/vector_db/small_minilm
  üìñ Loading: small_minilm.json
  ‚úÖ Loaded 233,078 chunks
  üìä Embedding Dimension: 384
  ‚úÖ Created collection
  üíæ Adding 233,078 chunks...


  Progress: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 234/234 [08:08<00:00,  2.09s/it]


  ‚úÖ Collection contains 233,078 items

üìä SETUP SUMMARY
‚úÖ Successfully setup: 3 collections
   - extra_large_minilm
   - medium_minilm
   - small_minilm

üìÅ Directory Structure:
data/vector_db//
  ‚îú‚îÄ‚îÄ extra_large_minilm/
  ‚îú‚îÄ‚îÄ medium_minilm/
  ‚îú‚îÄ‚îÄ small_minilm/


üìö All Collections:
  üìÅ extra_large_minilm/
     ‚îî‚îÄ extra_large_minilm: 233,027 items
  üìÅ medium_minilm/
     ‚îî‚îÄ medium_minilm: 233,032 items
  üìÅ small_minilm/
     ‚îî‚îÄ small_minilm: 233,078 items

  Total: 699,137 items across all collections


## Vector-DB Test

In [6]:
class VectorDBTester:
    """
    Test all 3 MiniLM vector database collections
    Works with organized directory structure
    """
    
    def __init__(self, base_directory: str = "data/vector_db/"):
        """Initialize tester"""
        
        print("üß™ Initializing Vector Database Tester")
        print("="*70)
        
        self.base_directory = base_directory
        
        # Collections we have
        self.collections = [
            "small_minilm",
            "medium_minilm",
            "extra_large_minilm"
        ]
        
        # Store clients for each collection
        self.clients = {}
        
        # Load embedding model (only MiniLM)
        print("üì• Loading embedding model...")
        print("  - Loading all-MiniLM-L6-v2 (384 dims)...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("‚úÖ Model loaded\n")
        
        # Test queries
        self.test_queries = [
            {
                "query": "What are hiking trails in British Columbia?",
                "category": "location",
                "difficulty": "easy"
            },
            {
                "query": "Find trails with concrete surface in Quebec",
                "category": "surface",
                "difficulty": "medium"
            },
            {
                "query": "Are there wheelchair accessible trails in Ontario?",
                "category": "accessibility",
                "difficulty": "easy"
            },
            {
                "query": "What trails allow bicycles?",
                "category": "activity",
                "difficulty": "easy"
            },
            {
                "query": "Tell me about Banff National Park",
                "category": "park",
                "difficulty": "easy"
            },
            {
                "query": "What are challenging mountain trails in Alberta?",
                "category": "difficulty",
                "difficulty": "hard"
            },
            {
                "query": "Find beginner-friendly trails near Toronto",
                "category": "multi-criteria",
                "difficulty": "medium"
            }
        ]
    
    def get_client_for_collection(self, collection_name: str):
        """Get ChromaDB client for a specific collection"""
        
        if collection_name not in self.clients:
            collection_path = os.path.join(self.base_directory, collection_name)
            
            if not os.path.exists(collection_path):
                raise FileNotFoundError(f"Collection directory not found: {collection_path}")
            
            self.clients[collection_name] = chromadb.PersistentClient(path=collection_path)
        
        return self.clients[collection_name]
    
    def list_all_collections(self) -> List[str]:
        """List all available collections"""
        
        print("üìö Available Collections:")
        print("-"*70)
        
        available = []
        
        for collection_name in self.collections:
            collection_path = os.path.join(self.base_directory, collection_name)
            
            if not os.path.exists(collection_path):
                print(f"  ‚ö†Ô∏è  {collection_name}: Directory not found")
                continue
            
            try:
                client = self.get_client_for_collection(collection_name)
                collections = client.list_collections()
                
                for collection in collections:
                    count = collection.count()
                    
                    # Get embedding dimension
                    sample = collection.peek(limit=1)
                    if sample['embeddings'] is not None and len(sample['embeddings']) > 0:
                        dim = len(sample['embeddings'][0])
                    else:
                        dim = 'unknown'
                    
                    print(f"  ‚úì {collection_name}")
                    print(f"     Items: {count:,} | Dims: {dim} | Model: MiniLM")
                    
                    available.append(collection_name)
            
            except Exception as e:
                print(f"  ‚ö†Ô∏è  {collection_name}: Error loading ({e})")
        
        print()
        return available
    
    def test_single_query(self, collection_name: str, query_text: str, 
                         top_k: int = 5) -> Dict:
        """Test a single query"""
        
        try:
            # Get client for this collection
            client = self.get_client_for_collection(collection_name)
            
            # Get collection
            collection = client.get_collection(name=collection_name)
            
            # Encode query
            start_time = time.time()
            query_embedding = self.embedding_model.encode(query_text, convert_to_numpy=True)
            encode_time = time.time() - start_time
            
            # Query collection
            start_time = time.time()
            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k,
                include=['documents', 'metadatas', 'distances']
            )
            query_time = time.time() - start_time
            
            return {
                "success": True,
                "results": results,
                "encode_time": encode_time,
                "query_time": query_time,
                "model_used": "minilm"
            }
            
        except Exception as e:
            return {
                "success": False,
                "error": str(e)
            }
    
    def display_query_results(self, query_text: str, collection_name: str, 
                             result: Dict, show_top_n: int = 3):
        """Display query results"""
        
        print(f"\nüîé Query: '{query_text}'")
        print(f"   Collection: {collection_name}")
        print("-"*70)
        
        if not result['success']:
            print(f"   ‚ùå Error: {result.get('error', 'Unknown error')}\n")
            return
        
        print(f"   ‚è±Ô∏è  Encoding: {result['encode_time']*1000:.2f}ms")
        print(f"   ‚è±Ô∏è  Query: {result['query_time']*1000:.2f}ms")
        print(f"   ‚è±Ô∏è  Total: {(result['encode_time'] + result['query_time'])*1000:.2f}ms")
        print(f"   ‚úÖ Retrieved {len(result['results']['ids'][0])} results\n")
        
        # Show top results
        for i in range(min(show_top_n, len(result['results']['ids'][0]))):
            metadata = result['results']['metadatas'][0][i]
            document = result['results']['documents'][0][i]
            distance = result['results']['distances'][0][i]
            similarity = 1 - distance
            
            print(f"   {i+1}. {metadata.get('document_title', 'Unknown')}")
            print(f"      Similarity: {similarity:.4f}")
            print(f"      Region: {metadata.get('region', 'N/A')}")
            print(f"      Type: {metadata.get('document_type', 'N/A')}")
            
            if metadata.get('difficulty') and metadata.get('difficulty') != 'unknown':
                print(f"      Difficulty: {metadata.get('difficulty')}")
            if metadata.get('surface') and metadata.get('surface') != 'unknown':
                print(f"      Surface: {metadata.get('surface')}")
            
            print(f"      Preview: {document[:120]}...")
            print()
    
    def compare_all_collections(self, query_text: str, top_k: int = 5):
        """Compare same query across all 3 collections"""
        
        print(f"\n{'='*70}")
        print(f"üî¨ CROSS-COLLECTION COMPARISON")
        print(f"{'='*70}")
        print(f"Query: '{query_text}'\n")
        
        collection_names = self.list_all_collections()
        
        if not collection_names:
            print("‚ùå No collections available!")
            return {}
        
        all_results = {}
        
        for collection_name in collection_names:
            result = self.test_single_query(collection_name, query_text, top_k)
            all_results[collection_name] = result
            self.display_query_results(query_text, collection_name, result, show_top_n=2)
        
        # Comparative summary
        print(f"{'='*70}")
        print("üìä COMPARISON SUMMARY")
        print(f"{'='*70}\n")
        
        # Performance comparison
        print("‚è±Ô∏è  Performance Metrics:")
        print("-"*70)
        for name, result in all_results.items():
            if result['success']:
                total_time = (result['encode_time'] + result['query_time']) * 1000
                print(f"  {name:25s} | {total_time:6.2f}ms")
        
        # Accuracy comparison
        print(f"\nüéØ Top Result Similarity:")
        print("-"*70)
        for name, result in all_results.items():
            if result['success'] and result['results']['distances'][0]:
                top_similarity = 1 - result['results']['distances'][0][0]
                top_title = result['results']['metadatas'][0][0].get('document_title', 'Unknown')
                print(f"  {name:25s} | {top_similarity:.4f} | {top_title[:30]}")
        
        # Winners
        print(f"\nüèÜ Winners:")
        print("-"*70)
        
        # Fastest
        fastest = min(all_results.items(), 
                     key=lambda x: (x[1]['encode_time'] + x[1]['query_time']) if x[1]['success'] else float('inf'))
        fastest_time = (fastest[1]['encode_time'] + fastest[1]['query_time']) * 1000
        print(f"  ‚ö° Fastest: {fastest[0]} ({fastest_time:.2f}ms)")
        
        # Most accurate
        highest_sim = max(all_results.items(),
                         key=lambda x: (1 - x[1]['results']['distances'][0][0]) if x[1]['success'] and x[1]['results']['distances'][0] else 0)
        highest_score = 1 - highest_sim[1]['results']['distances'][0][0] if highest_sim[1]['success'] else 0
        print(f"  üéØ Most Accurate: {highest_sim[0]} ({highest_score:.4f})")
        
        print()
        
        return all_results
    
    def run_comprehensive_test_suite(self):
        """Run tests on all collections with all queries"""
        
        print(f"\n{'='*70}")
        print("üß™ COMPREHENSIVE TEST SUITE - ALL 3 COLLECTIONS")
        print(f"{'='*70}\n")
        
        collection_names = self.list_all_collections()
        
        if not collection_names:
            print("‚ùå No collections found!")
            return
        
        # Summary results
        summary = {'queries': {}}
        
        # Test each query
        for i, test_query in enumerate(self.test_queries, 1):
            query_text = test_query['query']
            
            print(f"\n{'='*70}")
            print(f"Test Query {i}/{len(self.test_queries)}")
            print(f"Category: {test_query['category']} | Difficulty: {test_query['difficulty']}")
            print(f"{'='*70}")
            
            query_results = self.compare_all_collections(query_text, top_k=5)
            summary['queries'][query_text] = query_results
            
            time.sleep(0.5)
        
        # Generate final summary
        self.generate_final_summary(summary, collection_names)
    
    def generate_final_summary(self, summary: Dict, collection_names: List[str]):
        """Generate final comprehensive summary"""
        
        print(f"\n{'='*70}")
        print("üìä FINAL COMPREHENSIVE SUMMARY")
        print(f"{'='*70}\n")
        
        # Average performance
        print("‚è±Ô∏è  Average Performance by Collection:")
        print("-"*70)
        
        for collection in collection_names:
            total_time = 0
            count = 0
            
            for query_text, results in summary['queries'].items():
                if collection in results and results[collection]['success']:
                    total_time += (results[collection]['encode_time'] + 
                                 results[collection]['query_time']) * 1000
                    count += 1
            
            avg_time = total_time / count if count > 0 else 0
            print(f"  {collection:25s} | Avg: {avg_time:6.2f}ms | Tests: {count}")
        
        # Average similarity
        print(f"\nüéØ Average Top Result Similarity:")
        print("-"*70)
        
        for collection in collection_names:
            total_sim = 0
            count = 0
            
            for query_text, results in summary['queries'].items():
                if collection in results and results[collection]['success']:
                    if results[collection]['results']['distances'][0]:
                        similarity = 1 - results[collection]['results']['distances'][0][0]
                        total_sim += similarity
                        count += 1
            
            avg_sim = total_sim / count if count > 0 else 0
            print(f"  {collection:25s} | Avg Similarity: {avg_sim:.4f}")
        
        # Recommendations
        print(f"\nüí° Recommendations for Your RAG Pipelines:")
        print("-"*70)
        
        # Speed champion
        fastest_times = {}
        for collection in collection_names:
            total_time = 0
            count = 0
            for query_text, results in summary['queries'].items():
                if collection in results and results[collection]['success']:
                    total_time += (results[collection]['encode_time'] + 
                                 results[collection]['query_time'])
                    count += 1
            fastest_times[collection] = total_time / count if count > 0 else float('inf')
        
        fastest = min(fastest_times.items(), key=lambda x: x[1])
        print(f"\n  üöÄ Pipeline 1 (Speed-Optimized)")
        print(f"     Collection: {fastest[0]}")
        print(f"     Avg time: {fastest[1]*1000:.2f}ms")
        print(f"     ‚Üí Best for: High-volume queries, real-time chat")
        
        # Accuracy champion
        accuracy_scores = {}
        for collection in collection_names:
            total_sim = 0
            count = 0
            for query_text, results in summary['queries'].items():
                if collection in results and results[collection]['success']:
                    if results[collection]['results']['distances'][0]:
                        total_sim += 1 - results[collection]['results']['distances'][0][0]
                        count += 1
            accuracy_scores[collection] = total_sim / count if count > 0 else 0
        
        most_accurate = max(accuracy_scores.items(), key=lambda x: x[1])
        print(f"\n  üéØ Pipeline 2 (Quality-Optimized)")
        print(f"     Collection: {most_accurate[0]}")
        print(f"     Avg similarity: {most_accurate[1]:.4f}")
        print(f"     ‚Üí Best for: Complex queries, detailed answers")
        
        # Balanced
        balanced_scores = {}
        for collection in collection_names:
            if collection in fastest_times and collection in accuracy_scores:
                # Normalize and combine
                norm_time = 1 - (fastest_times[collection] / max(fastest_times.values()))
                norm_acc = accuracy_scores[collection] / max(accuracy_scores.values())
                balanced_scores[collection] = (norm_time + norm_acc) / 2
        
        if balanced_scores:
            balanced = max(balanced_scores.items(), key=lambda x: x[1])
            print(f"\n  ‚öñÔ∏è  Pipeline 3 (Balanced)")
            print(f"     Collection: {balanced[0]}")
            print(f"     Balance score: {balanced[1]:.3f}")
            print(f"     ‚Üí Best for: Production use, general queries")
        
        print(f"\n{'='*70}\n")
        
        # Save results
        self.save_results(summary, fastest_times, accuracy_scores)
    
    def save_results(self, summary: Dict, speed_scores: Dict, accuracy_scores: Dict):
        """Save test results"""
        
        output_dir = "data/evaluation"
        os.makedirs(output_dir, exist_ok=True)
        
        results = {
            "test_date": time.strftime("%Y-%m-%d %H:%M:%S"),
            "collections_tested": list(speed_scores.keys()),
            "total_queries": len(summary['queries']),
            "performance_scores": {
                "speed_ms": {k: round(v*1000, 2) for k, v in speed_scores.items()},
                "accuracy": {k: round(v, 4) for k, v in accuracy_scores.items()}
            },
            "winners": {
                "fastest": min(speed_scores.items(), key=lambda x: x[1])[0],
                "most_accurate": max(accuracy_scores.items(), key=lambda x: x[1])[0]
            }
        }
        
        output_file = os.path.join(output_dir, "vector_db_test_results.json")
        
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        
        print(f"üíæ Test results saved to: {output_file}")

def main():
    """Main execution"""
    
    print("\nüß™ Vector Database Test Suite (3 MiniLM Collections)\n")
    print("Tests organized directory structure:")
    print("  data/vector_db/small_minilm/")
    print("  data/vector_db/medium_minilm/")
    print("  data/vector_db/extra_large_minilm/\n")
    print("Options:")
    print("  1. Run full test suite (all queries, all collections)")
    print("  2. Quick comparison test (1 query, all collections)")
    print("  3. Test custom query on all collections")
    
    choice = input("\nSelect option (1-3, default=2): ").strip() or '2'
    
    tester = VectorDBTester()
    
    if choice == '1':
        # Full test suite
        print("\n‚ö†Ô∏è  This will take ~5 minutes")
        confirm = input("Continue? (y/n): ").strip().lower()
        if confirm == 'y':
            tester.run_comprehensive_test_suite()
        else:
            print("Cancelled")
    
    elif choice == '2':
        # Quick comparison
        query = "What are the best hiking trails in British Columbia?"
        tester.compare_all_collections(query, top_k=5)
    
    elif choice == '3':
        # Custom query
        query = input("\nEnter your query: ").strip()
        if query:
            tester.compare_all_collections(query, top_k=5)
        else:
            print("No query provided")

if __name__ == "__main__":
    main()


üß™ Vector Database Test Suite (3 MiniLM Collections)

Tests organized directory structure:
  data/vector_db/small_minilm/
  data/vector_db/medium_minilm/
  data/vector_db/extra_large_minilm/

Options:
  1. Run full test suite (all queries, all collections)
  2. Quick comparison test (1 query, all collections)
  3. Test custom query on all collections
üß™ Initializing Vector Database Tester
üì• Loading embedding model...
  - Loading all-MiniLM-L6-v2 (384 dims)...
‚úÖ Model loaded


‚ö†Ô∏è  This will take ~5 minutes

üß™ COMPREHENSIVE TEST SUITE - ALL 3 COLLECTIONS

üìö Available Collections:
----------------------------------------------------------------------
  ‚úì small_minilm
     Items: 233,078 | Dims: 384 | Model: MiniLM
  ‚úì medium_minilm
     Items: 233,032 | Dims: 384 | Model: MiniLM
  ‚úì extra_large_minilm
     Items: 233,027 | Dims: 384 | Model: MiniLM


Test Query 1/7
Category: location | Difficulty: easy

üî¨ CROSS-COLLECTION COMPARISON
Query: 'What are hiking tra