# RAG System Component Testing Notebook

## Systematic validation of all modules in the Universal Knowledge Ingestion System


### Setup and Imports

In [1]:
import os
import sys
import time
import shutil
import tempfile
import numpy as np
from pathlib import Path
from datetime import datetime

sys.path.append("../")

from config.settings import settings
from config.settings import get_settings
from utils.text_cleaner import TextCleaner
from config.logging_config import get_logger
from vector_store.bm25_index import BM25Index
from chunking.token_counter import TokenCounter
from chunking.fixed_chunker import FixedChunker
from embeddings.bge_embedder import get_embedder
from vector_store.bm25_index import get_bm25_index
from vector_store.index_builder import IndexBuilder
from chunking.overlap_manager import OverlapManager
from vector_store.faiss_manager import FAISSManager
from embeddings.model_loader import get_model_loader
from chunking.semantic_chunker import SemanticChunker
from vector_store.backup_manager import BackupManager
from vector_store.metadata_store import MetadataStore
from vector_store.index_persister import IndexPersister
from vector_store.faiss_manager import get_faiss_manager
from vector_store.index_builder import get_index_builder
from embeddings.embedding_cache import get_embedding_cache
from embeddings.batch_processor import get_batch_processor
from vector_store.backup_manager import get_backup_manager
from vector_store.metadata_store import get_metadata_store
from vector_store.index_persister import get_index_persister
from chunking.hierarchical_chunker import HierarchicalChunker
from document_parser.parser_factory import get_parser_factory
from chunking.adaptive_selector import AdaptiveChunkingSelector


# Run banner, stamped with the local wall-clock start time
print("=" * 80)
print("RAG PIPELINE - INTEGRATION TESTING")

run_started = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Started: {run_started}\n")

# Candidate input documents keyed by a short type label.
# Only the scanned PDF is enabled; uncomment an entry to include it in the run.
TEST_FILES = {
    # "pdf"       : Path("test.pdf"),
    "scanned_pdf": Path("test_scanned.pdf"),
    # "docx"      : Path("test.docx"),
    # "txt"       : Path("test.txt"),
    # "zip"       : Path("test.zip"),
}

print("\nüìÅ Checking test files...")

# Subset of TEST_FILES whose path exists on disk; later cells iterate this dict
available_files = {}

for file_type, file_path in TEST_FILES.items():
    if not file_path.exists():
        print(f"‚úó {file_type.upper()}: Not found at {file_path}")
        continue

    # Size is reported in MiB (bytes / 1024^2)
    size_mb = file_path.stat().st_size / (1024 * 1024)
    print(f"‚úì {file_type.upper()}: {file_path.name} ({size_mb:.2f} MB)")
    available_files[file_type] = file_path

if available_files:
    print(f"\n‚úì Found {len(available_files)} test files")

else:
    # Nothing usable was found: show the expected shape of TEST_FILES
    missing_files_help = (
        "\n‚ö†Ô∏è  No test files found! Please update TEST_FILES paths.",
        "Example:",
        '  TEST_FILES = {',
        '      "pdf": Path("data/test.pdf"),',
        '      "txt": Path("data/test.txt"),',
        '  }',
    )
    for help_line in missing_files_help:
        print(help_line)

print("=" * 80)


2025-12-03 16:18:14 - root - INFO - Logging configured: level=DEBUG, console=True, file=True
RAG PIPELINE - INTEGRATION TESTING
Started: 2025-12-03 16:18:18


üìÅ Checking test files...
‚úì SCANNED_PDF: test_scanned.pdf (5.16 MB)

‚úì Found 1 test files


### DOCUMENT PARSING

In [None]:
# Parse every available test file and keep text + metadata for the later cells.
import logging

print("\n" + "=" * 80)
print("DOCUMENT PARSING")
print("=" * 80)

# The error path below logs through `logger`; create it here so that a parse
# failure is reported instead of being masked by a NameError.
logger = logging.getLogger(__name__)

# Number of characters shown in the text preview
SAMPLE_PREVIEW_CHARS = 200

parser_factory = get_parser_factory()

print(f"\nüìã Supported extensions: {', '.join(parser_factory.get_supported_extensions())}")

# file_type -> {'text', 'metadata', 'file_path'} for each successfully parsed file
parsed_documents = dict()

for file_type, file_path in available_files.items():
    print(f"\n{'=' * 60}")
    print(f"Parsing: {file_type.upper()} - {file_path.name}")
    print('=' * 60)
    
    try:
        # Parse document
        text, metadata              = parser_factory.parse(file_path        = file_path,
                                                           extract_metadata = True,
                                                           clean_text       = True,
                                                          )
        
        # Store results
        parsed_documents[file_type] = {'text'      : text,
                                       'metadata'  : metadata,
                                       'file_path' : file_path,
                                      }

        # Display info
        print(f"‚úì Parsed successfully!")
        print(f"  Document ID: {metadata.document_id}")
        print(f"  Type: {metadata.document_type.value}")
        print(f"  Text Length: {len(text):,} characters")
        print(f"  File Size: {metadata.file_size_mb:.2f} MB")
        
        # Optional metadata fields are printed only when set
        if metadata.num_pages:
            print(f"  Pages: {metadata.num_pages}")
        
        if metadata.title:
            print(f"  Title: {metadata.title}")
        
        # Show only a short preview, as the label promises, not the whole document
        print(f"\nüìÑ Sample text (first 200 chars):")
        print(f"  {text[:SAMPLE_PREVIEW_CHARS]}")
        
    except Exception as e:
        # Best effort: report the failure and continue with the remaining files
        print(f"‚úó Failed to parse: {e}")
        logger.error(f"Parse error for {file_type}: {e}")

print(f"\n‚úì Successfully parsed {len(parsed_documents)}/{len(available_files)} documents")


### TEXT CLEANING VALIDATION

In [None]:
# Compare length and sentence count of one parsed document before and after
# an explicit TextCleaner.clean() pass.
section_rule = "=" * 80
print("\n" + section_rule)
print("TEXT CLEANING VALIDATION")
print(section_rule)

if parsed_documents:
    # The first parsed document serves as the example
    sample_type = next(iter(parsed_documents))
    sample_text = parsed_documents[sample_type]['text']

    print(f"\nTesting text cleaning on {sample_type.upper()}...")

    # Baseline figures taken from the text exactly as it was stored
    original_length    = len(sample_text)
    original_sentences = len(TextCleaner.extract_sentences(sample_text))

    print(f"  Original: {original_length:,} chars, {original_sentences} sentences")

    # Run the cleaner with the options under test
    cleaning_options = {'remove_html'          : True,
                        'normalize_whitespace' : True,
                        'preserve_structure'   : True,
                       }
    cleaned_text      = TextCleaner.clean(text = sample_text, **cleaning_options)

    cleaned_length    = len(cleaned_text)
    cleaned_sentences = len(TextCleaner.extract_sentences(cleaned_text))

    # Percentage of characters removed; an empty input counts as 0
    if original_length > 0:
        reduction = (original_length - cleaned_length) / original_length * 100
    else:
        reduction = 0

    print(f"  Cleaned: {cleaned_length:,} chars, {cleaned_sentences} sentences")
    print(f"  Reduction: {reduction:.1f}%")

    print(f"\nüìÑ Cleaned sample:")
    print(f"  {cleaned_text}")


### TOKEN COUNTING

In [None]:
# Collect token statistics for every parsed document; the chunking cells read
# them back from `token_stats`.
section_rule = "=" * 80
print("\n" + section_rule)
print("TOKEN COUNTING")
print(section_rule)

token_counter = TokenCounter(tokenizer_type = "cl100k_base")

print(f"Tokenizer: {token_counter.tokenizer_type}")

# file_type -> statistics dict returned by TokenCounter.get_token_stats
token_stats = {}

# (label, key in the stats dict, format spec) in display order
token_report_fields = (("Tokens",      'tokens',          ","),
                       ("Characters",  'characters',      ","),
                       ("Words",       'words',           ","),
                       ("Chars/Token", 'chars_per_token', ".2f"),
                       ("Tokens/Word", 'tokens_per_word', ".2f"),
                      )

for file_type, doc_data in parsed_documents.items():
    text                   = doc_data['text']
    stats                  = token_counter.get_token_stats(text = text)
    token_stats[file_type] = stats

    print(f"\nüìä {file_type.upper()}:")
    for field_label, field_key, field_spec in token_report_fields:
        print(f"  {field_label}: {stats[field_key]:{field_spec}}")



In [None]:
# Inspect the parsed record ({'text', 'metadata', 'file_path'}) of the scanned PDF.
# The key exists only when that file was found and parsed, so use .get() and
# explain the situation rather than failing with a KeyError.
ocr_doc = parsed_documents.get('scanned_pdf')

if ocr_doc is None:
    print("No parsed 'scanned_pdf' document available; enable it in TEST_FILES and re-run the parsing cell.")

# Bare expression: the notebook renders the record as the cell result
ocr_doc

### CHUNKING STRATEGIES

#### Test 1: Fixed Chunker

In [None]:
# Fixed-size chunking, run against the first parsed document.
# `fixed_chunks` is reused by the overlap and embedding cells further down.
if parsed_documents:
    # Take the FIRST key: indexing position 1 raises IndexError whenever only
    # one document was parsed, and the other chunking cells all use the first.
    test_type     = next(iter(parsed_documents))
    test_data     = parsed_documents[test_type]
    test_text     = test_data['text']
    test_metadata = test_data['metadata']
    
    print(f"\nüî™ Testing chunking on: {test_type.upper()}")
    print(f"   Document: {test_metadata.filename}")
    print(f"   Tokens: {token_stats[test_type]['tokens']:,}")
    
    print(f"\n{'=' * 60}")
    print("TEST 1: Fixed Chunking")
    print('=' * 60)
    
    fixed_chunker = FixedChunker(chunk_size                  = 512,
                                 overlap                     = 50,
                                 respect_sentence_boundaries = True
                                )
    
    fixed_chunks  = fixed_chunker.chunk_text(text     = test_text, 
                                             metadata = test_metadata,
                                            )
    
    fixed_stats   = fixed_chunker.get_chunk_statistics(chunks = fixed_chunks)
    
    print(f"‚úì Created {len(fixed_chunks)} chunks")
    print(f"  Total Tokens: {fixed_stats['total_tokens']:,}")
    print(f"  Avg Tokens/Chunk: {fixed_stats['avg_tokens_per_chunk']:.1f}")
    print(f"  Min Tokens: {fixed_stats['min_tokens']}")
    print(f"  Max Tokens: {fixed_stats['max_tokens']}")
    
    # Dump every chunk (id, token count, full text) for manual inspection
    if fixed_chunks:
        for i, chunk in enumerate(fixed_chunks):
            print(f"\nüìÑ Chunk sample: {i}")
            print(f"  ID: {chunk.chunk_id}")
            print(f"  Tokens: {chunk.token_count}")
            print(f"  Text: {chunk.text}")


#### Test 2: Semantic Chunker

In [None]:
# Semantic chunking, run against the first parsed document
if parsed_documents:
    test_type     = next(iter(parsed_documents))
    test_data     = parsed_documents[test_type]
    test_text     = test_data['text']
    test_metadata = test_data['metadata']

    print(f"\nüî™ Testing chunking on: {test_type.upper()}")
    print(f"   Document: {test_metadata.filename}")
    print(f"   Tokens: {token_stats[test_type]['tokens']:,}")

    test_rule = '=' * 60
    print(f"\n{test_rule}")
    print("TEST 2: Semantic Chunking")
    print(test_rule)

    # The whole test is best-effort: any failure is reported and the notebook
    # carries on with the remaining cells.
    try:
        semantic_chunker = SemanticChunker(chunk_size           = 512,
                                           overlap              = 50,
                                           similarity_threshold = 0.95,
                                          )

        semantic_chunks  = semantic_chunker.chunk_text(text = test_text, metadata = test_metadata)
        semantic_stats   = semantic_chunker.get_chunk_statistics(chunks = semantic_chunks)

        print(f"‚úì Created {len(semantic_chunks)} semantic chunks")
        print(f"  Total Tokens: {semantic_stats['total_tokens']:,}")
        print(f"  Avg Tokens/Chunk: {semantic_stats['avg_tokens_per_chunk']:.1f}")

        # Dump every chunk with its token count and full text
        if semantic_chunks:
            for chunk_index, chunk in enumerate(semantic_chunks):
                print(f"\nüìÑ Semantic chunk: {chunk_index}")
                print(f"  Tokens: {chunk.token_count}")
                print(f"  Text: {chunk.text}")

    except Exception as chunking_error:
        print(f"‚ö†Ô∏è  Semantic chunking unavailable: {chunking_error}")
        print("   (Embedding model may need to be downloaded)")
    

#### Test 3: Hierarchical Chunker

In [None]:
# Hierarchical (parent/child) chunking, run against the first parsed document
if parsed_documents:
    test_type     = next(iter(parsed_documents))
    test_data     = parsed_documents[test_type]
    test_text     = test_data['text']
    test_metadata = test_data['metadata']

    print(f"\nüî™ Testing chunking on: {test_type.upper()}")
    print(f"   Document: {test_metadata.filename}")
    print(f"   Tokens: {token_stats[test_type]['tokens']:,}")

    test_rule = '=' * 60
    print(f"\n{test_rule}")
    print("TEST 3: Hierarchical Chunking")
    print(test_rule)

    hierarchical_chunker = HierarchicalChunker(parent_chunk_size = 2048,
                                               child_chunk_size  = 512,
                                               overlap           = 50,
                                              )

    hierarchical_chunks  = hierarchical_chunker.chunk_text(text = test_text, metadata = test_metadata)
    hierarchical_stats   = hierarchical_chunker.get_chunk_statistics(chunks = hierarchical_chunks)

    print(f"‚úì Created {len(hierarchical_chunks)} child chunks")
    print(f"  Total Tokens: {hierarchical_stats['total_tokens']:,}")
    print(f"  Avg Tokens/Chunk: {hierarchical_stats['avg_tokens_per_chunk']:.1f}")

    # The length of the relationship structure is reported as the parent count
    relationships = hierarchical_chunker.get_parent_child_relationships(chunks = hierarchical_chunks)
    print(f"  Parent Chunks: {len(relationships)}")

    # Show the first child only, including the parent id stored in its metadata
    if hierarchical_chunks:
        print(f"\nüìÑ First child chunk:")
        chunk     = hierarchical_chunks[0]
        parent_id = chunk.metadata.get('parent_chunk_id', 'N/A')
        print(f"  ID: {chunk.chunk_id}")
        print(f"  Parent: {parent_id}")
        print(f"  Tokens: {chunk.token_count}")
        print(f"  Text: {chunk.text}")
        

#### Test 4: Adaptive Selector

In [None]:
# Adaptive strategy selection, run against the first parsed document
if parsed_documents:
    test_type     = next(iter(parsed_documents))
    test_data     = parsed_documents[test_type]
    test_text     = test_data['text']
    test_metadata = test_data['metadata']

    print(f"\nüî™ Testing chunking on: {test_type.upper()}")
    print(f"   Document: {test_metadata.filename}")
    print(f"   Tokens: {token_stats[test_type]['tokens']:,}")

    test_rule = '=' * 60
    print(f"\n{test_rule}")
    print("TEST 4: Adaptive Chunking Selector")
    print(test_rule)

    adaptive_selector = AdaptiveChunkingSelector()

    # Step 1: document analysis (calls the selector's private helper directly)
    analysis = adaptive_selector._analyze_document(text = test_text, metadata = test_metadata)

    print(f"\nüìä Document Analysis:")
    print(f"  Total Tokens: {analysis['total_tokens']:,}")
    print(f"  Size Category: {analysis['document_size_category']}")
    print(f"  Sentences: {analysis['sentence_count']:,}")
    print(f"  Paragraphs: {analysis['paragraph_count']:,}")

    # Step 2: which strategy the selector picks, and why
    strategy, analysis_result = adaptive_selector.select_chunking_strategy(text     = test_text,
                                                                           metadata = test_metadata,
                                                                          )

    print(f"\nüéØ Recommended Strategy: {strategy.value.upper()}")
    print(f"   Reason: {analysis_result['selection_reason']}")

    # Step 3: chunk through the selector itself
    adaptive_chunks = adaptive_selector.chunk_text(text = test_text, metadata = test_metadata)

    # Statistics are computed with the selector's fixed_chunker instance
    stats_provider  = adaptive_selector.fixed_chunker
    adaptive_stats  = stats_provider.get_chunk_statistics(chunks = adaptive_chunks)

    print(f"\n‚úì Adaptive chunking completed")
    print(f"  Chunks Created: {len(adaptive_chunks)}")
    print(f"  Total Tokens: {adaptive_stats['total_tokens']:,}")


### OVERLAP MANAGEMENT

In [None]:
# Report overlap statistics for the chunks produced by the fixed chunker cell.
section_rule = "=" * 80
print("\n" + section_rule)
print("OVERLAP MANAGEMENT")
print(section_rule)

if parsed_documents and fixed_chunks:
    overlap_manager = OverlapManager(overlap_tokens = 50)

    # Statistics dict computed over the fixed-size chunks
    overlap_stats   = overlap_manager.get_overlap_statistics(chunks = fixed_chunks)

    # (label, key in overlap_stats, format spec, suffix) in display order
    overlap_report_fields = (("Number of Chunks",   'num_chunks',             "",    ""),
                             ("Number of Overlaps", 'num_overlaps',           "",    ""),
                             ("Avg Overlap Tokens", 'avg_overlap_tokens',     ".1f", ""),
                             ("Min Overlap Tokens", 'min_overlap_tokens',     "",    ""),
                             ("Max Overlap Tokens", 'max_overlap_tokens',     "",    ""),
                             ("Avg Overlap %",      'avg_overlap_percentage', ".1f", "%"),
                            )

    print(f"\nüìä Overlap Statistics:")
    for field_label, field_key, field_spec, field_suffix in overlap_report_fields:
        print(f"  {field_label}: {overlap_stats[field_key]:{field_spec}}{field_suffix}")


### EMBEDDINGS MODULE TESTING

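#### Tests 1–7: Model Loader, Batch Processor, BGE Embedder, Embedding Cache, Chunk Integration, Benchmarking, Error Handling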

In [None]:
# ============================================================================
# EMBEDDINGS MODULE TESTING
# ============================================================================
# Opens the embeddings test cell: prints the section banner, then checks that
# the embeddings entry points can be imported before any test runs.

section_rule = "=" * 80
print("\n" + section_rule)
print("EMBEDDINGS MODULE TESTING")
print(section_rule)

import time
import numpy as np

# Import embeddings module; an ImportError is reported and then re-raised so
# the rest of the cell does not run against missing modules.
try:
    from embeddings.bge_embedder import get_embedder
    from embeddings.embedding_cache import get_embedding_cache
    from embeddings.model_loader import get_model_loader
    from embeddings.batch_processor import get_batch_processor
except ImportError as import_error:
    print(f"‚úó Failed to import embeddings modules: {import_error}")
    raise
else:
    print("‚úì Embeddings modules imported successfully")

# ----------------------------------------------------------------------------
# 1. TEST MODEL LOADER
# ----------------------------------------------------------------------------
# Walks the loader through load -> report -> unload -> reload.
test_rule = "-" * 60
print("\n" + test_rule)
print("TEST 1: Model Loader")
print(test_rule)

try:
    model_loader = get_model_loader()
    print("‚úì Model loader initialized")

    # Load explicitly on CPU and ask the loader to describe what it holds
    model      = model_loader.load_model(device="cpu")
    model_info = model_loader.get_model_info()

    print(f"‚úì Model loaded successfully:")
    # (label, key in model_info); missing keys are shown as 'unknown'
    loader_report_fields = (("Model",               'model_name'),
                            ("Device",              'device'),
                            ("Embedding dimension", 'embedding_dimension'),
                            ("Cache size",          'cache_size'),
                           )
    for field_label, field_key in loader_report_fields:
        print(f"  {field_label}: {model_info.get(field_key, 'unknown')}")

    # Unload once to exercise that path
    model_loader.unload_model()
    print("‚úì Model unloaded successfully")

    # Load again (no explicit device this time) for the tests that follow
    model = model_loader.load_model()

except Exception as loader_error:
    # Report, show the traceback, then stop the cell
    print(f"‚úó Model loader test failed: {loader_error}")
    import traceback
    traceback.print_exc()
    raise

# ----------------------------------------------------------------------------
# 2. TEST BATCH PROCESSOR
# ----------------------------------------------------------------------------
# Splits 50 synthetic texts into batches and prints the processor's stats.
test_rule = "-" * 60
print("\n" + test_rule)
print("TEST 2: Batch Processor")
print(test_rule)

try:
    batch_processor = get_batch_processor()
    print("‚úì Batch processor initialized")

    # 50 short texts, split with a requested batch size of 10
    test_texts = [f"Sample text {i}" for i in range(50)]
    batches    = batch_processor.split_into_optimal_batches(texts=test_texts, target_batch_size=10)

    # Only the first five batch sizes are displayed
    batch_sizes = [len(batch) for batch in batches]

    print(f"‚úì Batch splitting successful:")
    print(f"  Input texts: {len(test_texts)}")
    print(f"  Created batches: {len(batches)}")
    print(f"  Batch sizes: {batch_sizes[:5]}...")

    # Whatever counters the processor currently reports
    stats = batch_processor.get_processing_stats()
    print(f"  Processing stats: {stats}")

except Exception as batch_error:
    # Report, show the traceback, then stop the cell
    print(f"‚úó Batch processor test failed: {batch_error}")
    import traceback
    traceback.print_exc()
    raise

# ----------------------------------------------------------------------------
# 3. TEST BGE EMBEDDER
# ----------------------------------------------------------------------------
# Covers single-text embedding, batch embedding, validation and cosine similarity.
test_rule = "-" * 60
print("\n" + test_rule)
print("TEST 3: BGE Embedder")
print(test_rule)

try:
    embedder = get_embedder()
    print("‚úì BGE embedder initialized")

    # What the embedder reports about itself
    model_info = embedder.get_model_info()
    print(f"‚úì Embedder info:")
    # (label, key in model_info, fallback shown when the key is absent)
    embedder_report_fields = (("Model",          'model_name',     'unknown'),
                              ("Dimension",      'embedding_dim',  'unknown'),
                              ("Device",         'device',         'unknown'),
                              ("Supports batch", 'supports_batch', False),
                             )
    for field_label, field_key, field_default in embedder_report_fields:
        print(f"  {field_label}: {model_info.get(field_key, field_default)}")

    # --- single text ---------------------------------------------------------
    print("\nTesting single text embedding...")
    test_text        = "This is a test document for embedding generation"
    single_embedding = embedder.embed_text(test_text, normalize=True)
    embedding_norm   = np.linalg.norm(single_embedding)

    print(f"‚úì Single embedding generated:")
    print(f"  Text length: {len(test_text)} chars")
    print(f"  Embedding shape: {single_embedding.shape}")
    print(f"  Embedding dtype: {single_embedding.dtype}")
    print(f"  Embedding norm: {embedding_norm:.6f}")

    # Ask the embedder's own validator about the vector
    is_valid = embedder.validate_embedding(single_embedding)
    print(f"  Embedding valid: {is_valid}")

    # --- batch ---------------------------------------------------------------
    print("\nTesting batch embedding...")
    test_texts_batch = ["First document for testing embeddings",
                        "Second document with different content",
                        "Third document for comprehensive testing",
                        "Fourth document to verify batch processing",
                        "Fifth document to complete the test set",
                       ]

    # Wall-clock duration of the batch call, in milliseconds
    start_time       = time.time()
    batch_embeddings = embedder.embed_texts(texts=test_texts_batch, batch_size=2, normalize=True)
    batch_time       = (time.time() - start_time) * 1000

    print(f"‚úì Batch embedding successful:")
    print(f"  Texts embedded: {len(batch_embeddings)}")
    print(f"  Time taken: {batch_time:.0f}ms")
    print(f"  Avg time per text: {batch_time/len(batch_embeddings):.0f}ms")

    # Count the embeddings the validator accepts
    valid_count = len([emb for emb in batch_embeddings if embedder.validate_embedding(emb)])
    print(f"  Valid embeddings: {valid_count}/{len(batch_embeddings)}")

    # --- cosine similarity -----------------------------------------------------
    print("\nTesting cosine similarity...")
    if len(batch_embeddings) >= 2:
        first_embedding, second_embedding = batch_embeddings[0], batch_embeddings[1]

        similarity = embedder.cosine_similarity(first_embedding, second_embedding)
        print(f"‚úì Cosine similarity between first two embeddings: {similarity:.4f}")

        # Comparing a vector with itself is the sanity check
        self_similarity = embedder.cosine_similarity(first_embedding, first_embedding)
        print(f"  Self-similarity (should be ~1.0): {self_similarity:.6f}")

except Exception as embedder_error:
    # Report, show the traceback, then stop the cell
    print(f"‚úó BGE embedder test failed: {embedder_error}")
    import traceback
    traceback.print_exc()
    raise

# ----------------------------------------------------------------------------
# 4. TEST EMBEDDING CACHE
# ----------------------------------------------------------------------------
# Covers single and batch set/get, the get-or-compute path, stats and clear().
# Relies on `embedder` from the BGE embedder test for the vector dimension.
print("\n" + "-" * 60)
print("TEST 4: Embedding Cache")
print("-" * 60)

try:
    cache = get_embedding_cache()
    print("‚úì Embedding cache initialized")
    
    # Get initial stats
    initial_stats = cache.get_stats()
    print(f"‚úì Initial cache stats:")
    print(f"  Cache size: {initial_stats.get('cache_size', 0)}")
    print(f"  Max size: {initial_stats.get('max_size', 0)}")
    print(f"  Hits: {initial_stats.get('hits', 0)}")
    print(f"  Misses: {initial_stats.get('misses', 0)}")
    
    # Test cache operations
    print("\nTesting cache operations...")
    
    # Random float32 vectors stand in for real embeddings in the cache tests
    embedding_dim = embedder.get_embedding_dimension()
    
    # Test single embedding caching
    cache_text = "This text will be cached"
    cache_embedding = np.random.randn(embedding_dim).astype(np.float32)
    
    # Store in cache
    cache.set_embedding(cache_text, cache_embedding)
    
    # Retrieve from cache
    retrieved = cache.get_embedding(cache_text)
    
    if retrieved is not None:
        print(f"‚úì Single embedding caching successful")
        print(f"  Retrieved shape: {retrieved.shape}")
        print(f"  Arrays equal: {np.allclose(cache_embedding, retrieved, atol=1e-6)}")
    else:
        # A lookup miss right after a store is a failure and must be visible
        print("‚úó Single embedding caching failed: stored embedding was not returned")
    
    # Test batch caching
    print("\nTesting batch caching...")
    batch_texts = ["Batch text " + str(i) for i in range(3)]
    batch_embeddings = [np.random.randn(embedding_dim).astype(np.float32) for _ in range(3)]
    
    cache.batch_set_embeddings(batch_texts, batch_embeddings)
    
    cached_batch, missing = cache.batch_get_embeddings(batch_texts)
    
    print(f"‚úì Batch caching successful:")
    print(f"  Texts cached: {len(batch_texts)}")
    print(f"  Retrieved: {sum(1 for emb in cached_batch if emb is not None)}")
    print(f"  Missing: {len(missing)}")
    
    # Test smart caching with embed function
    print("\nTesting smart caching with embed function...")
    
    def mock_embed_function(texts, batch_size=None):
        """Return one fresh random float32 vector per input text (batch_size is ignored)."""
        return [np.random.randn(embedding_dim).astype(np.float32) for _ in texts]
    
    # Use texts that none of the steps above stored. `batch_texts` was just
    # written by batch_set_embeddings, so with it the "first call" could
    # never be a miss.
    smart_texts = ["Smart cache text " + str(i) for i in range(3)]
    
    # First call should miss cache. perf_counter is used because these calls
    # can finish below the resolution of time.time().
    start_time = time.perf_counter()
    embeddings1 = cache.get_cached_embeddings(
        texts=smart_texts,
        embed_function=mock_embed_function,
        batch_size=2
    )
    time1 = (time.perf_counter() - start_time) * 1000
    
    # Second call should hit cache
    start_time = time.perf_counter()
    embeddings2 = cache.get_cached_embeddings(
        texts=smart_texts,
        embed_function=mock_embed_function,
        batch_size=2
    )
    time2 = (time.perf_counter() - start_time) * 1000
    
    print(f"‚úì Smart caching test:")
    print(f"  First call time: {time1:.2f}ms (should be slower)")
    print(f"  Second call time: {time2:.2f}ms (should be faster)")
    # Guard the ratio: a zero-length second measurement must not abort the test
    if time2 > 0:
        print(f"  Speedup: {time1/time2:.1f}x")
    else:
        print("  Speedup: n/a (second call below timer resolution)")
    
    # The mock returns random vectors, so the two results agree only when the
    # second call was served from the cache
    all_same = all(np.allclose(e1, e2, atol=1e-6) for e1, e2 in zip(embeddings1, embeddings2))
    print(f"  Cached embeddings consistent: {all_same}")
    
    # Get final stats
    final_stats = cache.get_stats()
    hit_rate = final_stats.get('hit_rate_percentage', 0)
    print(f"\n‚úì Final cache stats:")
    print(f"  Cache size: {final_stats.get('cache_size', 0)}")
    print(f"  Hits: {final_stats.get('hits', 0)}")
    print(f"  Misses: {final_stats.get('misses', 0)}")
    print(f"  Hit rate: {hit_rate:.1f}%")
    print(f"  Embeddings generated: {final_stats.get('embeddings_generated', 0)}")
    
    # Test cache clearing
    cache.clear()
    after_clear_stats = cache.get_stats()
    print(f"  After clear - Cache size: {after_clear_stats.get('cache_size', 0)}")
    
except Exception as e:
    # Report, show the traceback, then stop the cell
    print(f"‚úó Embedding cache test failed: {e}")
    import traceback
    traceback.print_exc()
    raise

# ----------------------------------------------------------------------------
# 5. TEST INTEGRATION WITH CHUNKS
# ----------------------------------------------------------------------------
# Embeds the first chunks produced by the fixed chunker cell and checks whether
# a repeated call gets faster. Relies on `embedder` and `cache` from the
# earlier tests in this cell.
print("\n" + "-" * 60)
print("TEST 5: Integration with Document Chunks")
print("-" * 60)

if 'fixed_chunks' in locals() and fixed_chunks:
    try:
        # Use actual chunks from previous testing
        test_chunks = fixed_chunks[:3]  # Use first 3 chunks
        
        print(f"Testing embedding generation on {len(test_chunks)} document chunks...")
        
        # Generate embeddings for chunks (perf_counter: monotonic, high resolution)
        start_time = time.perf_counter()
        embedded_chunks = embedder.embed_chunks(
            chunks=test_chunks,
            batch_size=2,
            normalize=True
        )
        embedding_time = (time.perf_counter() - start_time) * 1000
        
        print(f"‚úì Document chunk embedding successful:")
        print(f"  Chunks embedded: {len(embedded_chunks)}")
        print(f"  Time taken: {embedding_time:.0f}ms")
        # Skip the average when nothing came back, to avoid dividing by zero
        if len(embedded_chunks) > 0:
            print(f"  Avg time per chunk: {embedding_time/len(embedded_chunks):.0f}ms")
        
        # Verify chunks have embeddings
        chunks_with_embeddings = sum(1 for chunk in embedded_chunks if hasattr(chunk, 'embedding') and chunk.embedding is not None)
        print(f"  Chunks with embeddings: {chunks_with_embeddings}/{len(embedded_chunks)}")
        
        if chunks_with_embeddings > 0:
            # Show first chunk details
            first_chunk = embedded_chunks[0]
            has_embedding = hasattr(first_chunk, 'embedding') and first_chunk.embedding is not None
            print(f"\nüìÑ First chunk details:")
            print(f"  Chunk ID: {first_chunk.chunk_id}")
            print(f"  Text length: {len(first_chunk.text)} chars")
            print(f"  Has embedding: {has_embedding}")
            
            if has_embedding:
                # np.array() accepts the embedding whether it is stored as a list or an array
                embedding_array = np.array(first_chunk.embedding)
                print(f"  Embedding shape: {embedding_array.shape}")
                print(f"  Embedding dtype: {embedding_array.dtype}")
                print(f"  Embedding norm: {np.linalg.norm(embedding_array):.6f}")
        
        # Test with cache integration
        print("\nTesting chunk embedding with cache...")
        
        # Clear cache for clean test
        cache.clear()
        
        # First embedding (should miss cache)
        start_time = time.perf_counter()
        embedded_chunks_1 = embedder.embed_chunks(test_chunks, batch_size=2)
        time_1 = (time.perf_counter() - start_time) * 1000
        
        # Second embedding (should hit cache if using cache)
        start_time = time.perf_counter()
        embedded_chunks_2 = embedder.embed_chunks(test_chunks, batch_size=2)
        time_2 = (time.perf_counter() - start_time) * 1000
        
        print(f"  First embedding time: {time_1:.0f}ms")
        print(f"  Second embedding time: {time_2:.0f}ms")
        
        if time_2 < time_1:
            # time_2 may be 0.0 on a coarse clock; report "n/a" rather than dividing by zero
            speedup = f"{time_1/time_2:.1f}x" if time_2 > 0 else "n/a"
            print(f"  Cache likely working (speedup: {speedup})")
        else:
            print(f"  Note: No speedup observed (cache may not be integrated in embed_chunks)")
        
    except Exception as e:
        # Report, show the traceback, then stop the cell
        print(f"‚úó Chunk integration test failed: {e}")
        import traceback
        traceback.print_exc()
        raise
else:
    print("‚ö†Ô∏è  No chunks available for integration test")
    print("   Run chunking tests first or create test chunks")

# ----------------------------------------------------------------------------
# 6. PERFORMANCE BENCHMARKING
# ----------------------------------------------------------------------------
# Times embed_texts for growing input sizes and prints one table row per size.
# Relies on `embedder` and `cache` from the earlier tests in this cell.
print("\n" + "-" * 60)
print("TEST 6: Performance Benchmarking")
print("-" * 60)

try:
    # Create test data of varying sizes; each size uses a prefix of one shared list
    test_sizes = [1, 5, 10, 20, 50]
    test_texts_varying = [
        f"Test document number {i} with some content to embed. " * 10
        for i in range(max(test_sizes))
    ]
    
    print("Benchmarking embedding performance...")
    print(f"{'Texts':<10} {'Time (ms)':<12} {'Time/Text (ms)':<15} {'Memory':<10}")
    print("-" * 50)
    
    for size in test_sizes:
        texts = test_texts_varying[:size]
        
        # Clear cache for fair benchmark
        cache.clear()
        
        # Time embedding with the monotonic high-resolution clock
        start_time = time.perf_counter()
        embeddings = embedder.embed_texts(texts, batch_size=min(32, size))
        elapsed = (time.perf_counter() - start_time) * 1000
        
        # Calculate metrics
        per_text = elapsed / size if size > 0 else 0
        
        # Rough memory estimate. Test emptiness via len(): a bare truth test
        # raises ValueError if embed_texts returns a multi-element NumPy array.
        if embeddings is not None and len(embeddings) > 0:
            # Bytes of the first embedding times the number of embeddings
            memory_kb = (embeddings[0].nbytes * len(embeddings)) / 1024
        else:
            memory_kb = 0
        
        print(f"{size:<10} {elapsed:<12.1f} {per_text:<15.2f} {memory_kb:<10.1f}KB")
    
    print("\n‚úì Performance benchmarking complete")
    
except Exception as e:
    # Benchmarking is informational: report the failure but let the cell continue
    print(f"‚úó Performance benchmarking failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# 7. ERROR HANDLING TESTING
# ----------------------------------------------------------------------------
# Feeds invalid input to the embedder and expects every call to raise.
# Relies on `embedder` from the BGE embedder test.
print("\n" + "-" * 60)
print("TEST 7: Error Handling")
print("-" * 60)

try:
    print("Testing error scenarios...")
    
    # (call expected to raise, message if it does NOT raise, message prefix if it does)
    error_scenarios = [
        (lambda: embedder.embed_text(""),
         "‚úó Should have raised error for empty text",
         "‚úì Correctly raised error for empty text"),
        (lambda: embedder.embed_text(None),
         "‚úó Should have raised error for None text",
         "‚úì Correctly raised error for None text"),
        (lambda: embedder.embed_texts(["test"], batch_size=0),
         "‚úó Should have handled invalid batch size",
         "‚úì Correctly handled invalid batch size"),
    ]
    
    # Count scenarios that unexpectedly succeeded, so the verdict reflects them
    unexpected_successes = 0
    
    for trigger, not_raised_message, raised_message in error_scenarios:
        try:
            trigger()
        except Exception as e:
            print(f"{raised_message}: {type(e).__name__}")
        else:
            print(not_raised_message)
            unexpected_successes += 1
    
    # Report success only when every scenario actually raised
    if unexpected_successes == 0:
        print("‚úì Error handling tests passed")
    else:
        print(f"‚úó Error handling tests failed: {unexpected_successes} scenario(s) did not raise")
    
except Exception as e:
    # Safety net for failures in the harness itself; the cell continues
    print(f"‚úó Error handling test failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# SUMMARY
# ----------------------------------------------------------------------------
# Prints the closing report for the embeddings module: one statistics section
# per component, followed by the completion banner.
summary_rule = "=" * 80
print("\n" + summary_rule)
print("EMBEDDINGS MODULE TEST SUMMARY")
print(summary_rule)

# Get final component stats
try:
    # All four getters run before the first report line is printed.
    model_stats = model_loader.get_model_info()
    batch_stats = batch_processor.get_processing_stats()
    cache_stats = cache.get_stats()
    embedder_info = embedder.get_model_info()

    # One entry per component: (heading, stats mapping, rows), where each row
    # is (label, key, fallback, format spec, suffix).
    report_layout = (
        ("Model Loader", model_stats, (
            ("Model", 'model_name', 'unknown', "", ""),
            ("Loaded", 'loaded', False, "", ""),
            ("Cache size", 'cache_size', 0, "", ""),
        )),
        ("Batch Processor", batch_stats, (
            ("Total batches", 'total_batches', 0, "", ""),
            ("Total texts", 'total_texts', 0, "", ""),
            ("Failed batches", 'failed_batches', 0, "", ""),
            ("Success rate", 'success_rate', 0, ".1f", "%"),
        )),
        ("Embedder", embedder_info, (
            ("Model", 'model_name', 'unknown', "", ""),
            ("Dimension", 'embedding_dim', 'unknown', "", ""),
            ("Device", 'device', 'unknown', "", ""),
        )),
        ("Embedding Cache", cache_stats, (
            ("Cache size", 'cache_size', 0, "", ""),
            ("Max size", 'max_size', 0, "", ""),
            ("Hits", 'hits', 0, "", ""),
            ("Misses", 'misses', 0, "", ""),
            ("Hit rate", 'hit_rate_percentage', 0, ".1f", "%"),
            ("Embeddings generated", 'embeddings_generated', 0, "", ""),
        )),
    )

    print("\nüìä FINAL COMPONENT STATISTICS:")
    for heading, component_stats, rows in report_layout:
        print(f"\n{heading}:")
        for label, key, fallback, spec, suffix in rows:
            # Values are looked up row by row, at print time.
            value = component_stats.get(key, fallback)
            print(f"  {label}: {format(value, spec)}{suffix}")

except Exception as e:
    print(f"Error getting final stats: {e}")

print("\n‚úÖ EMBEDDINGS MODULE TESTING COMPLETE")
print("   Next step: Vector Store Module Testing")
print(summary_rule)

### VECTOR STORE MODULE TESTING

In [None]:

# Scratch directory used by the vector-store tests below; the CLEANUP step at
# the end of this cell removes it.
test_temp_dir = Path(tempfile.mkdtemp(prefix="vector_store_test_"))
print(f"\nüìÅ Using temporary directory: {test_temp_dir}")

# ----------------------------------------------------------------------------
# 1. TEST BACKUP MANAGER
# ----------------------------------------------------------------------------
# Walks BackupManager through create -> list -> verify -> stats -> cleanup ->
# auto-backup check against placeholder index files.
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 1: Backup Manager")
print(section_rule)

try:
    # Lay out a fake vector store plus a separate backup destination.
    test_vector_store = test_temp_dir / "vector_store"
    test_backup_dir = test_temp_dir / "backups"
    for required_dir in (test_vector_store, test_backup_dir):
        required_dir.mkdir(exist_ok=True)

    # Manager under test
    backup_manager = BackupManager(
        backup_dir=test_backup_dir,
        vector_store_dir=test_vector_store
    )
    print("‚úì Backup manager initialized")

    # Placeholder index files so there is something to back up.
    test_files = ["faiss.index", "bm25_index.pkl", "metadata.db"]
    for file_name in test_files:
        test_file = test_vector_store / file_name
        test_file.write_text(f"Test content for {file_name}")

    # Backup creation
    print("\nTesting backup creation...")
    backup_path = backup_manager.create_backup(
        backup_name="test_backup",
        description="Test backup for integration testing"
    )
    created_name = Path(backup_path).name
    print(f"‚úì Backup created: {created_name}")

    # Backup listing
    print("\nTesting backup listing...")
    backups = backup_manager.list_backups()
    backup_count = len(backups)
    print(f"‚úì Found {backup_count} backups")

    if backups:
        backup_info = backups[0]
        print(f"  Latest backup: {backup_info['name']}")
        print(f"  Size: {backup_info['size_mb']:.2f} MB")

    # Backup verification
    print("\nTesting backup verification...")
    is_valid = backup_manager.verify_backup(Path(backup_path))
    verification_label = 'Valid' if is_valid else 'Invalid'
    print(f"‚úì Backup verification: {verification_label}")

    # Backup statistics
    print("\nTesting backup statistics...")
    backup_stats = backup_manager.get_backup_stats()
    print("‚úì Backup stats:")
    print(f"  Total backups: {backup_stats.get('total_backups', 0)}")
    print(f"  Total size: {backup_stats.get('total_size_mb', 0):.2f} MB")
    print(f"  Auto backup: {backup_stats.get('auto_backup', False)}")

    # Retention cleanup
    print("\nTesting backup cleanup...")
    cleanup_result = backup_manager.cleanup_old_backups(keep_count=2, keep_days=1)
    print(f"‚úì Backup cleanup: {cleanup_result.get('message', 'Unknown')}")
    print(f"  Deleted: {cleanup_result.get('deleted', 0)}")
    print(f"  Kept: {cleanup_result.get('kept', 0)}")

    # Auto backup check: a truthy result is treated as the path of a new backup.
    print("\nTesting auto backup check...")
    auto_backup_result = backup_manager.auto_backup_check(documents_processed=50)
    if not auto_backup_result:
        print("‚úì Auto backup not triggered (interval not reached)")
    else:
        print(f"‚úì Auto backup triggered: {Path(auto_backup_result).name}")

except Exception as e:
    print(f"‚úó Backup manager test failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# 2. TEST INDEX PERSISTER
# ----------------------------------------------------------------------------
# Exercises IndexPersister on an empty directory: existence check, per-file
# info, size statistics, a metadata save/load round trip, and cleanup.
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 2: Index Persister")
print(section_rule)

try:
    # Dedicated directory for this test
    test_persister_dir = test_temp_dir / "persister_test"
    test_persister_dir.mkdir(exist_ok=True)

    # Persister under test
    index_persister = IndexPersister(vector_store_dir=test_persister_dir)
    print("‚úì Index persister initialized")

    # Existence check
    print("\nTesting index file existence...")
    files_exist = index_persister.index_files_exist()
    print(f"‚úì Index files exist: {files_exist}")

    # Per-file information
    print("\nTesting file information...")
    files_info = index_persister.get_index_files_info()
    print(f"‚úì File info retrieved for {len(files_info)} files")

    for file_name, info in files_info.items():
        exists = info.get('exists', False)
        presence_label = 'Exists' if exists else 'Missing'
        print(f"  {file_name}: {presence_label}")

    # Persistence statistics
    print("\nTesting persistence statistics...")
    persistence_stats = index_persister.get_persistence_stats()
    print("‚úì Persistence stats:")
    print(f"  Total size: {persistence_stats.get('total_size_mb', 0):.2f} MB")
    print(f"  File count: {persistence_stats.get('file_count', 0)}")

    # Metadata round trip: save a small dict, then read it back.
    print("\nTesting metadata operations...")
    test_metadata = dict(
        test_key="test_value",
        timestamp="2024-01-01T12:00:00",
        count=42,
    )

    save_success = index_persister.save_index_metadata(test_metadata, "test_metadata.json")
    save_label = 'Success' if save_success else 'Failed'
    print(f"‚úì Metadata save: {save_label}")

    loaded_metadata = index_persister.load_index_metadata("test_metadata.json")
    print(f"‚úì Metadata load: {len(loaded_metadata)} keys loaded")

    if loaded_metadata:
        print(f"  Loaded key 'test_key': {loaded_metadata.get('test_key', 'Not found')}")

    # Basic cleanup
    print("\nTesting index cleanup...")
    cleanup_result = index_persister.cleanup_old_indexes(keep_latest=True)
    print(f"‚úì Index cleanup: {cleanup_result.get('message', 'Unknown')}")

except Exception as e:
    print(f"‚úó Index persister test failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# 3. TEST METADATA STORE
# ----------------------------------------------------------------------------
# Stores five synthetic chunks in a fresh MetadataStore, reads them back by
# chunk, by document and in bulk, prints statistics, then clears the store.
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 3: Metadata Store")
print(section_rule)

try:
    # Database file lives inside the scratch directory.
    test_db_path = test_temp_dir / "test_metadata.db"

    # Store under test
    metadata_store = MetadataStore(db_path=test_db_path)
    print("‚úì Metadata store initialized")

    # Readiness probe
    print("\nTesting database readiness...")
    is_ready = metadata_store.is_ready()
    print(f"‚úì Database ready: {is_ready}")

    # Synthetic chunks
    print("\nCreating test chunks...")
    from config.models import DocumentChunk
    from datetime import datetime

    test_chunks = []
    for i in range(5):
        # Keyword arguments are collected first, then passed on unchanged.
        chunk_fields = dict(
            chunk_id=f"chunk_test_{i}",
            document_id=f"doc_test_{i // 2}",  # two consecutive chunks share a document id
            text=f"This is test chunk {i} with some content for metadata testing.",
            embedding=[0.1 * j for j in range(384)],  # mock embedding, not model output
            chunk_index=i,
            start_char=i * 100,
            end_char=(i + 1) * 100,
            page_number=1,
            section_title=f"Section {i}",
            token_count=50,
            metadata={
                "source": "test",
                "timestamp": datetime.now().isoformat(),
                "test_field": f"value_{i}"
            },
        )
        chunk = DocumentChunk(**chunk_fields)
        test_chunks.append(chunk)

    print(f"‚úì Created {len(test_chunks)} test chunks")

    # Storage
    print("\nTesting chunk storage...")
    storage_result = metadata_store.store_chunks(test_chunks, rebuild=True)
    print("‚úì Chunk storage result:")
    print(f"  Stored chunks: {storage_result.get('stored_chunks', 0)}")
    print(f"  Stored documents: {storage_result.get('stored_documents', 0)}")

    # Single-chunk lookup
    print("\nTesting chunk metadata retrieval...")
    if test_chunks:
        first_chunk_id = test_chunks[0].chunk_id
        chunk_metadata = metadata_store.get_chunk_metadata(first_chunk_id)
        print(f"‚úì Chunk metadata retrieved: {chunk_metadata is not None}")
        if chunk_metadata:
            print(f"  Chunk ID: {chunk_metadata.get('chunk_id', 'Unknown')}")
            print(f"  Document ID: {chunk_metadata.get('document_id', 'Unknown')}")
            print(f"  Text preview: {chunk_metadata.get('text', '')[:50]}...")

    # Lookup by owning document
    print("\nTesting chunks by document...")
    if test_chunks:
        doc_chunks = metadata_store.get_chunks_by_document(test_chunks[0].document_id)
        print(f"‚úì Retrieved {len(doc_chunks)} chunks for document")

    # Bulk retrieval
    print("\nTesting all chunks retrieval...")
    all_chunks = metadata_store.get_all_chunks()
    print(f"‚úì Retrieved {len(all_chunks)} total chunks")

    # Document-level metadata
    print("\nTesting document metadata...")
    if test_chunks:
        doc_metadata = metadata_store.get_document_metadata(test_chunks[0].document_id)
        print(f"‚úì Document metadata retrieved: {doc_metadata is not None}")
        if doc_metadata:
            print(f"  Document ID: {doc_metadata.get('document_id', 'Unknown')}")
            print(f"  Filename: {doc_metadata.get('filename', 'Unknown')}")
            print(f"  Document type: {doc_metadata.get('document_type', 'Unknown')}")

    # Statistics
    print("\nTesting metadata store statistics...")
    stats = metadata_store.get_stats()
    print("‚úì Metadata store stats:")
    print(f"  Documents: {stats.get('documents', 0)}")
    print(f"  Chunks: {stats.get('chunks', 0)}")
    print(f"  Database size: {stats.get('database_size_mb', 0):.2f} MB")

    # Size information
    print("\nTesting size information...")
    size_info = metadata_store.get_size()
    print("‚úì Size info:")
    print(f"  Disk usage: {size_info.get('disk_mb', 0):.2f} MB")

    # Clearing
    print("\nTesting metadata clearing...")
    metadata_store.clear()
    print("‚úì Metadata cleared")

except Exception as e:
    print(f"‚úó Metadata store test failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# 4. TEST FAISS MANAGER
# ----------------------------------------------------------------------------
# Builds a FAISS index from seeded random unit vectors, searches it, appends
# more vectors, then checks optimization, size and readiness reporting.
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 4: FAISS Manager")
print(section_rule)

try:
    # Dedicated directory for this test
    test_faiss_dir = test_temp_dir / "faiss_test"
    test_faiss_dir.mkdir(exist_ok=True)

    # Manager under test
    faiss_manager = FAISSManager(vector_store_dir=test_faiss_dir)
    print("‚úì FAISS manager initialized")

    # Synthetic embeddings and matching chunk IDs
    print("\nCreating test embeddings...")
    embedding_dim = 384  # BGE-small dimension
    num_vectors = 100

    # Seed NumPy's global generator before drawing the vectors.
    np.random.seed(42)
    test_embeddings = np.random.randn(num_vectors, embedding_dim).astype('float32')

    # Scale every row to unit length (as BGE embedder would do).
    norms = np.linalg.norm(test_embeddings, axis=1, keepdims=True)
    test_embeddings = test_embeddings / norms

    test_chunk_ids = [f"chunk_faiss_{i}" for i in range(num_vectors)]

    # Index building
    print("\nTesting FAISS index building...")
    build_stats = faiss_manager.build_index(
        embeddings=test_embeddings,
        chunk_ids=test_chunk_ids,
        rebuild=True
    )

    print("‚úì FAISS index built:")
    print(f"  Vectors: {build_stats.get('vectors', 0)}")
    print(f"  Build time: {build_stats.get('build_time_seconds', 0):.2f}s")
    print(f"  Index type: {build_stats.get('index_type', 'Unknown')}")

    # Index statistics
    print("\nTesting FAISS index statistics...")
    index_stats = faiss_manager.get_index_stats()
    print("‚úì FAISS index stats:")
    print(f"  Built: {index_stats.get('built', False)}")
    print(f"  Vector count: {index_stats.get('vector_count', 0)}")
    print(f"  Embedding dim: {index_stats.get('embedding_dim', 0)}")
    print(f"  Search count: {index_stats.get('search_count', 0)}")

    # Search with a random unit-length query vector
    print("\nTesting FAISS search...")
    query_embedding = np.random.randn(embedding_dim).astype('float32')
    query_embedding = query_embedding / np.linalg.norm(query_embedding)

    search_results = faiss_manager.search(
        query_embedding=query_embedding,
        top_k=5
    )

    print("‚úì FAISS search completed:")
    print(f"  Results returned: {len(search_results)}")
    if search_results:
        # The first two fields of a result are printed as identifier and score.
        best_match = search_results[0]
        print(f"  Top result: {best_match[0]} with score {best_match[1]:.4f}")

    # Incremental add
    print("\nTesting adding to FAISS index...")
    new_embeddings = np.random.randn(10, embedding_dim).astype('float32')
    norms = np.linalg.norm(new_embeddings, axis=1, keepdims=True)
    new_embeddings = new_embeddings / norms
    new_chunk_ids = [f"chunk_new_{i}" for i in range(10)]

    add_stats = faiss_manager.add_to_index(new_embeddings, new_chunk_ids)
    print("‚úì Added to FAISS index:")
    print(f"  Added vectors: {add_stats.get('added', 0)}")
    print(f"  New total: {add_stats.get('new_total', 0)}")

    # Optimization
    print("\nTesting FAISS index optimization...")
    opt_stats = faiss_manager.optimize_index()
    print(f"‚úì FAISS optimization: {opt_stats.get('message', 'Unknown')}")

    # Size report
    print("\nTesting FAISS index size...")
    size_info = faiss_manager.get_index_size()
    print("‚úì FAISS size info:")
    print(f"  Memory: {size_info.get('memory_mb', 0):.2f} MB")
    print(f"  Disk: {size_info.get('disk_mb', 0):.2f} MB")
    print(f"  Vector count: {size_info.get('vector_count', 0)}")

    # Readiness
    print("\nTesting index readiness...")
    is_built = faiss_manager.is_index_built()
    print(f"‚úì Index built: {is_built}")

except Exception as e:
    print(f"‚úó FAISS manager test failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# 5. TEST BM25 INDEX (if available)
# ----------------------------------------------------------------------------
# Runs only when BM25_AVAILABLE is truthy: builds a BM25 index over ten short
# sentences, runs keyword queries, appends documents, and checks term
# statistics, optimization, size and readiness reporting.
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 5: BM25 Index")
print(section_rule)

if not BM25_AVAILABLE:
    print("‚ö†Ô∏è  BM25 tests skipped (rank_bm25 not available)")
else:
    try:
        # Index under test
        bm25_index = BM25Index()
        print("‚úì BM25 index initialized")

        # Corpus and matching chunk IDs
        print("\nCreating test texts...")
        test_texts = [
            "This is a test document about artificial intelligence and machine learning.",
            "Machine learning algorithms can learn from data and make predictions.",
            "Artificial intelligence is transforming many industries today.",
            "Deep learning is a subset of machine learning using neural networks.",
            "Natural language processing helps computers understand human language.",
            "Computer vision enables machines to interpret visual information.",
            "Reinforcement learning involves agents learning from interactions.",
            "Supervised learning uses labeled datasets for training models.",
            "Unsupervised learning finds patterns in unlabeled data.",
            "Transfer learning applies knowledge from one domain to another."
        ]

        test_bm25_chunk_ids = [f"chunk_bm25_{i}" for i in range(len(test_texts))]

        # Index building
        print("\nTesting BM25 index building...")
        build_stats = bm25_index.build_index(
            texts=test_texts,
            chunk_ids=test_bm25_chunk_ids,
            rebuild=True
        )

        print("‚úì BM25 index built:")
        print(f"  Documents: {build_stats.get('documents', 0)}")
        print(f"  Vocabulary size: {build_stats.get('vocabulary_size', 0)}")
        print(f"  Build time: {build_stats.get('build_time_seconds', 0):.2f}s")

        # Index statistics
        print("\nTesting BM25 index statistics...")
        index_stats = bm25_index.get_index_stats()
        print("‚úì BM25 index stats:")
        print(f"  Built: {index_stats.get('built', False)}")
        print(f"  Document count: {index_stats.get('document_count', 0)}")
        print(f"  Vocabulary size: {index_stats.get('vocabulary_size', 0)}")
        print(f"  Search count: {index_stats.get('search_count', 0)}")

        # Keyword search
        print("\nTesting BM25 search...")
        test_queries = [
            "machine learning",
            "artificial intelligence",
            "neural networks",
            "computer vision"
        ]

        for query in test_queries:
            results = bm25_index.search(query=query, top_k=3)
            print(f"  Query: '{query}' -> {len(results)} results")
            if results:
                # The first two fields of a hit are printed as identifier and score.
                top_hit = results[0]
                print(f"    Top: {top_hit[0]} (score: {top_hit[1]:.4f})")

        # Incremental add
        print("\nTesting adding to BM25 index...")
        new_texts = [
            "New document about AI ethics and responsible innovation.",
            "Another document discussing future trends in AI."
        ]
        new_chunk_ids = [f"chunk_new_{i}" for i in range(len(new_texts))]

        add_stats = bm25_index.add_to_index(new_texts, new_chunk_ids)
        print("‚úì Added to BM25 index:")
        print(f"  Added documents: {add_stats.get('added', 0)}")
        print(f"  New total: {add_stats.get('new_total', 0)}")

        # Term statistics; terms with a falsy result are silently omitted.
        print("\nTesting term statistics...")
        test_terms = ["learning", "artificial", "computer"]
        for term in test_terms:
            term_stats = bm25_index.get_term_stats(term)
            if term_stats:
                term_frequency = term_stats.get('term_frequency', 0)
                document_frequency = term_stats.get('document_frequency', 0)
                print(f"  Term '{term}': freq={term_frequency}, doc_freq={document_frequency}")

        # Optimization
        print("\nTesting BM25 index optimization...")
        opt_stats = bm25_index.optimize_index()
        print(f"‚úì BM25 optimization: {opt_stats.get('message', 'Unknown')}")

        # Size report
        print("\nTesting BM25 index size...")
        size_info = bm25_index.get_index_size()
        print("‚úì BM25 size info:")
        print(f"  Memory: {size_info.get('memory_mb', 0):.2f} MB")
        print(f"  Disk: {size_info.get('disk_mb', 0):.2f} MB")
        print(f"  Document count: {size_info.get('document_count', 0)}")

        # Readiness
        print("\nTesting BM25 index readiness...")
        is_built = bm25_index.is_index_built()
        print(f"‚úì BM25 index built: {is_built}")

    except Exception as e:
        print(f"‚úó BM25 index test failed: {e}")
        import traceback
        traceback.print_exc()

# ----------------------------------------------------------------------------
# 6. TEST INDEX BUILDER (Integration Test)
# ----------------------------------------------------------------------------
# Feeds twenty synthetic chunks with random embeddings through IndexBuilder,
# prints the per-component build statistics, then clears the indexes and
# checks the readiness flag again.
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 6: Index Builder (Integration)")
print(section_rule)

try:
    # Dedicated directory for this test
    test_index_builder_dir = test_temp_dir / "index_builder_test"
    test_index_builder_dir.mkdir(exist_ok=True)

    # Builder under test
    index_builder = IndexBuilder(vector_store_dir=test_index_builder_dir)
    print("‚úì Index builder initialized")

    # Synthetic chunks with embeddings
    print("\nCreating test chunks with embeddings...")
    from config.models import DocumentChunk

    test_integration_chunks = []
    num_test_chunks = 20

    for i in range(num_test_chunks):
        # Random float32 vector converted element-wise to plain Python floats.
        embedding = [float(x) for x in np.random.randn(384).astype('float32')]

        chunk = DocumentChunk(
            chunk_id=f"chunk_integration_{i}",
            document_id=f"doc_integration_{i // 5}",  # five consecutive chunks share a document id
            text=f"This is integration test chunk {i} for the index builder. It contains text about various topics for testing purposes.",
            embedding=embedding,
            chunk_index=i,
            start_char=i * 100,
            end_char=(i + 1) * 100,
            page_number=1 + (i // 10),
            section_title=f"Section {i % 3}",
            token_count=75,
            metadata={
                "source": "integration_test",
                "timestamp": "2024-01-01T12:00:00",
                "test_id": i
            }
        )
        test_integration_chunks.append(chunk)

    print(f"‚úì Created {len(test_integration_chunks)} test chunks")

    # Build every index in one call
    print("\nTesting index building (FAISS + BM25 + Metadata)...")
    build_stats = index_builder.build_indexes(
        chunks=test_integration_chunks,
        rebuild=True
    )

    print("‚úì Index building completed:")
    print(f"  Total chunks: {build_stats.get('total_chunks', 0)}")
    print(f"  Build time: {build_stats.get('build_time_seconds', 0):.2f}s")
    print(f"  Chunks/sec: {build_stats.get('chunks_per_second', 0):.1f}")

    # Per-component sections of the build report (empty dict when absent)
    faiss_stats = build_stats.get('faiss', {})
    bm25_stats = build_stats.get('bm25', {})
    metadata_stats = build_stats.get('metadata', {})

    print(f"\n  FAISS: {faiss_stats.get('vectors', 0)} vectors")
    print(f"  BM25: {bm25_stats.get('documents', 0)} documents")
    print(f"  Metadata: {metadata_stats.get('stored_chunks', 0)} chunks stored")

    # Aggregate statistics
    print("\nTesting comprehensive index statistics...")
    index_stats = index_builder.get_index_stats()
    print("‚úì Index statistics retrieved")

    total_chunks = index_stats.get('total_chunks_indexed', 0)
    print(f"  Total chunks indexed: {total_chunks}")

    # Readiness
    print("\nTesting index readiness...")
    is_built = index_builder.is_index_built()
    print(f"‚úì All indexes built: {is_built}")

    # Optimization (result is kept but not inspected)
    print("\nTesting index optimization...")
    opt_stats = index_builder.optimize_indexes()
    print("‚úì Index optimization completed")

    # Size report
    print("\nTesting index size information...")
    size_info = index_builder.get_index_size()
    print("‚úì Index size info:")
    print(f"  Total memory: {size_info.get('total_memory_mb', 0):.2f} MB")
    print(f"  Total disk: {size_info.get('total_disk_mb', 0):.2f} MB")

    # Reach into the builder's own sub-components.
    # NOTE: this rebinds the notebook-level name `faiss_manager`.
    print("\nTesting integration with FAISS manager...")
    faiss_manager = index_builder.faiss_manager
    if faiss_manager:
        faiss_integration_stats = faiss_manager.get_index_stats()
        print(f"  FAISS integration: {faiss_integration_stats.get('vector_count', 0)} vectors")

    print("\nTesting integration with metadata store...")
    metadata_integration_stats = index_builder.metadata_store.get_stats()
    print(f"  Metadata integration: {metadata_integration_stats.get('chunks', 0)} chunks")

    # Clearing
    print("\nTesting index clearing...")
    index_builder.clear_indexes()
    print("‚úì Indexes cleared")

    # Re-check the readiness flag after clearing
    is_still_built = index_builder.is_index_built()
    state_after_clear = 'Built' if is_still_built else 'Cleared'
    print(f"  Indexes after clear: {state_after_clear}")

except Exception as e:
    print(f"‚úó Index builder test failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# 7. TEST GLOBAL INSTANCES
# ----------------------------------------------------------------------------
# Calls each module-level getter once and reports whether it returned a
# non-None object.
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 7: Global Instances")
print(section_rule)


def _report_global_instance(label, instance):
    """Print whether the getter for *label* returned something other than None."""
    print(f"‚úì {label} global instance: {instance is not None}")


try:
    print("Testing global instance getters...")

    # get_backup_manager
    backup_manager_global = get_backup_manager()
    _report_global_instance("Backup manager", backup_manager_global)

    # get_index_persister
    index_persister_global = get_index_persister()
    _report_global_instance("Index persister", index_persister_global)

    # get_metadata_store
    metadata_store_global = get_metadata_store()
    _report_global_instance("Metadata store", metadata_store_global)

    # get_index_builder
    index_builder_global = get_index_builder()
    _report_global_instance("Index builder", index_builder_global)

    # get_faiss_manager
    faiss_manager_global = get_faiss_manager()
    _report_global_instance("FAISS manager", faiss_manager_global)

    # get_bm25_index (only when BM25 support is flagged as available)
    if BM25_AVAILABLE:
        bm25_index_global = get_bm25_index()
        _report_global_instance("BM25 index", bm25_index_global)

    print("‚úì All global instances retrieved successfully")

except Exception as e:
    print(f"‚úó Global instances test failed: {e}")
    import traceback
    traceback.print_exc()

# ----------------------------------------------------------------------------
# 8. TEST INTEGRATION WITH EMBEDDINGS MODULE
# ----------------------------------------------------------------------------
# Embeds five sentences with the embedder left over from the embeddings cell,
# indexes them through a fresh IndexBuilder, and queries the result through
# FAISS (and BM25 when available).
section_rule = "-" * 60
print("\n" + section_rule)
print("TEST 8: Integration with Embeddings Module")
print(section_rule)

# The test needs names created by earlier cells/tests in this notebook session.
have_prerequisites = 'embedder' in locals() and 'test_chunks' in locals()

if not have_prerequisites:
    print("‚ö†Ô∏è  Embedding integration test skipped (run embedding tests first)")
else:
    try:
        print("Testing integration between embeddings and vector store...")

        # Dedicated directory for this test
        test_integration_dir = test_temp_dir / "embedding_vector_integration"
        test_integration_dir.mkdir(exist_ok=True)

        # The embedder itself is reused, not re-created.
        print("  Using embedder from previous tests...")

        # Sentences to embed
        print("  Creating chunks with actual embeddings...")
        test_texts_for_embedding = [
            "Artificial intelligence is revolutionizing many industries.",
            "Machine learning algorithms require large datasets for training.",
            "Natural language processing enables human-computer interaction.",
            "Computer vision systems can identify objects in images.",
            "Deep learning uses neural networks with multiple layers."
        ]

        # Real embeddings from the embedder
        embeddings_list = embedder.embed_texts(
            texts=test_texts_for_embedding,
            batch_size=2,
            normalize=True
        )

        print(f"  Generated {len(embeddings_list)} embeddings")

        # Wrap each (text, embedding) pair in a DocumentChunk
        from config.models import DocumentChunk

        integration_chunks = []
        text_embedding_pairs = zip(test_texts_for_embedding, embeddings_list)
        for i, (text, embedding) in enumerate(text_embedding_pairs):
            chunk = DocumentChunk(
                chunk_id=f"integration_chunk_{i}",
                document_id="integration_doc_1",
                text=text,
                embedding=embedding.tolist(),
                chunk_index=i,
                start_char=i * 100,
                end_char=(i + 1) * 100,
                page_number=1,
                section_title=f"AI Topic {i}",
                token_count=len(text.split()),  # whitespace-separated word count
                metadata={
                    "source": "embedding_integration_test",
                    "embedding_dim": len(embedding)
                }
            )
            integration_chunks.append(chunk)

        print(f"  Created {len(integration_chunks)} chunks with embeddings")

        # Index the embedded chunks
        print("  Building indexes with embedded chunks...")
        integration_index_builder = IndexBuilder(vector_store_dir=test_integration_dir)
        integration_stats = integration_index_builder.build_indexes(
            chunks=integration_chunks,
            rebuild=True
        )

        print("  Indexes built successfully:")
        print(f"    Chunks indexed: {integration_stats.get('total_chunks', 0)}")
        print(f"    FAISS vectors: {integration_stats.get('faiss', {}).get('vectors', 0)}")
        print(f"    BM25 documents: {integration_stats.get('bm25', {}).get('documents', 0)}")

        # Retrieval
        print("  Testing retrieval integration...")

        # Embed the query with the same embedder
        query_text = "machine learning artificial intelligence"
        query_embedding = embedder.embed_text(query_text, normalize=True)

        # Vector search through the builder's FAISS manager
        faiss_results = integration_index_builder.faiss_manager.search(
            query_embedding=query_embedding,
            top_k=3
        )

        print(f"  FAISS search results: {len(faiss_results)} matches")
        if faiss_results:
            for i, (chunk_id, score) in enumerate(faiss_results[:3]):
                rank = i + 1
                print(f"    Result {rank}: {chunk_id} (score: {score:.4f})")

        # Keyword search through BM25 (if available)
        if BM25_AVAILABLE:
            bm25_results = integration_index_builder.bm25_index.search(
                query=query_text,
                top_k=3
            )
            print(f"  BM25 search results: {len(bm25_results)} matches")

        print("‚úì Embedding-vector store integration test successful")

    except Exception as e:
        print(f"‚úó Integration test failed: {e}")
        import traceback
        traceback.print_exc()

# ----------------------------------------------------------------------------
# CLEANUP
# ----------------------------------------------------------------------------
# Deletes the scratch directory created at the top of this cell. Any failure
# is reported as a warning instead of aborting the cell.
section_rule = "-" * 60
print("\n" + section_rule)
print("CLEANUP")
print(section_rule)

try:
    # Nothing is removed (or printed) when the directory is already gone.
    scratch_dir_present = test_temp_dir.exists()
    if scratch_dir_present:
        shutil.rmtree(test_temp_dir)
        print(f"‚úì Cleaned up temporary directory: {test_temp_dir}")
except Exception as e:
    print(f"‚ö†Ô∏è  Cleanup warning: {e}")

# ----------------------------------------------------------------------------
# SUMMARY
# ----------------------------------------------------------------------------
# Title banner for the vector store report.
print("", "=" * 80, "VECTOR STORE MODULE TEST SUMMARY", "=" * 80, sep="\n")

# Items 1-4 are fixed text: nothing is inspected at runtime for them.
print(
    "\nüìä TEST RESULTS:",
    "1. ‚úÖ Backup Manager: Tested backup creation, listing, verification, and cleanup",
    "2. ‚úÖ Index Persister: Tested file persistence, metadata operations, and cleanup",
    "3. ‚úÖ Metadata Store: Tested chunk storage, retrieval, statistics, and database operations",
    "4. ‚úÖ FAISS Manager: Tested index building, search, optimization, and statistics",
    sep="\n",
)

# Item 5 depends on the BM25_AVAILABLE flag, which is set outside this block.
if BM25_AVAILABLE:
    status_badge = "‚úÖ"
    status_detail = "Tested index building, search, term statistics, and optimization"
else:
    status_badge = "‚ö†Ô∏è "
    status_detail = "Skipped (rank_bm25 not available)"
print(f"5. {status_badge} BM25 Index: {status_detail}")

# Items 6-7 are fixed text again.
print(
    "6. ‚úÖ Index Builder: Tested comprehensive index building with all components",
    "7. ‚úÖ Global Instances: Verified all global instance getters work correctly",
    sep="\n",
)

# Item 8 only checks whether a name `embedder` exists in the current scope;
# it does not look at the outcome of the integration test itself.
# NOTE: locals() must be called here at top level so it inspects this scope.
if 'embedder' in locals():
    status_badge = "‚úÖ"
    status_detail = "Tested integration with embeddings module"
else:
    status_badge = "‚ö†Ô∏è "
    status_detail = "Run embedding tests first"
print(f"8. {status_badge} Embedding Integration: {status_detail}")

# Static roadmap of the remaining modules.
print(
    "\nüéØ NEXT STEPS:",
    "1. ‚úÖ Vector Store Module - Testing Complete",
    "2. ‚è≥ Retrieval Module - Ready for testing",
    "3. ‚è≥ Generation Module - Ready for testing",
    "4. ‚è≥ RAGAS Module - Ready for testing",
    "5. ‚è≥ End-to-End Pipeline - Ready for final integration test",
    sep="\n",
)

# Static follow-up advice.
print(
    "\nüí° RECOMMENDATIONS:",
    "1. Ensure rank_bm25 is installed for full BM25 functionality",
    "2. Test with larger datasets to validate performance",
    "3. Verify persistence by restarting and reloading indexes",
    sep="\n",
)

# Closing banner.
print(
    "\n‚úÖ VECTOR STORE MODULE TESTING COMPLETE",
    "   Next step: Retrieval Module Testing",
    "=" * 80,
    sep="\n",
)

### SUMMARY REPORT

In [None]:
# Title banner for the step 8 report.
print("", "=" * 80, "STEP 8: INTEGRATION TEST SUMMARY", "=" * 80, sep="\n")

# The pipeline counts as working when `parsed_documents` (built outside this
# block) is non-empty.
pipeline_status = 'WORKING' if parsed_documents else 'NEEDS ATTENTION'
print(f"\n‚úÖ PIPELINE STATUS: {pipeline_status}")

# One line per parsed file type, upper-cased.
print("\nüìã Files Processed:")
for file_type in parsed_documents:
    print(f"  ‚úì {file_type.upper()}")

print("\nüìä Statistics:")
print(f"  Documents Parsed: {len(parsed_documents)}")

# Token total is reported only when `token_stats` is non-empty; each of its
# values is indexed with the 'tokens' key.
if token_stats:
    total_tokens = sum([entry['tokens'] for entry in token_stats.values()])
    print(f"  Total Tokens: {total_tokens:,}")

# Chunk counts are optional: a strategy is listed only if its result variable
# exists in the current scope (i.e. the corresponding cell was executed).
# NOTE: locals() must be called here at top level so it inspects this scope.
notebook_scope = locals()
for chunk_strategy, chunk_variable in (
    ("Fixed", "fixed_chunks"),
    ("Semantic", "semantic_chunks"),
    ("Hierarchical", "hierarchical_chunks"),
):
    if chunk_variable in notebook_scope:
        print(f"  Chunks Created ({chunk_strategy}): {len(notebook_scope[chunk_variable])}")

# Static checklist of finished and upcoming stages.
print(
    "\nüéØ Next Steps:",
    "  1. ‚úì Configuration - Working",
    "  2. ‚úì Document Parsing - Working",
    "  3. ‚úì Text Cleaning - Working",
    "  4. ‚úì Token Counting - Working",
    "  5. ‚úì Chunking - Working",
    "  6. ‚è≥ Embedding Generation - Ready for next phase",
    "  7. ‚è≥ Vector Indexing - Ready for next phase",
    "  8. ‚è≥ Retrieval Testing - Ready for next phase",
    sep="\n",
)

# Closing banner stamped with the current local time.
print("", "=" * 80, f"Completed: {datetime.now():%Y-%m-%d %H:%M:%S}", "=" * 80, sep="\n")

print("\nüí° TIP: Update TEST_FILES paths at the top to test with your actual files!")

# ----------------------------------------------------------------------------
# SUMMARY
# ----------------------------------------------------------------------------
# Title banner for the embeddings report.
print("", "=" * 80, "EMBEDDINGS MODULE TEST SUMMARY", "=" * 80, sep="\n")

# Pull a stats snapshot from each embeddings component and print it.
# `model_loader`, `batch_processor`, `cache` and `embedder` are created outside
# this block; if any is missing or a call/format fails, the error is reported
# once and whatever has not been printed yet is skipped.
try:
    # All four snapshots are fetched before anything is printed.
    model_stats = model_loader.get_model_info()
    batch_stats = batch_processor.get_processing_stats()
    cache_stats = cache.get_stats()
    embedder_info = embedder.get_model_info()

    print("\nüìä FINAL COMPONENT STATISTICS:")

    print("\nModel Loader:")
    for stat_label, stat_key, stat_default in (
        ("Model", "model_name", "unknown"),
        ("Loaded", "loaded", False),
        ("Cache size", "cache_size", 0),
    ):
        print(f"  {stat_label}: {model_stats.get(stat_key, stat_default)}")

    print("\nBatch Processor:")
    for stat_label, stat_key, stat_default in (
        ("Total batches", "total_batches", 0),
        ("Total texts", "total_texts", 0),
        ("Failed batches", "failed_batches", 0),
    ):
        print(f"  {stat_label}: {batch_stats.get(stat_key, stat_default)}")
    # Percentages are rendered with one decimal place.
    print(f"  Success rate: {batch_stats.get('success_rate', 0):.1f}%")

    print("\nEmbedder:")
    for stat_label, stat_key, stat_default in (
        ("Model", "model_name", "unknown"),
        ("Dimension", "embedding_dim", "unknown"),
        ("Device", "device", "unknown"),
    ):
        print(f"  {stat_label}: {embedder_info.get(stat_key, stat_default)}")

    print("\nEmbedding Cache:")
    for stat_label, stat_key, stat_default in (
        ("Cache size", "cache_size", 0),
        ("Max size", "max_size", 0),
        ("Hits", "hits", 0),
        ("Misses", "misses", 0),
    ):
        print(f"  {stat_label}: {cache_stats.get(stat_key, stat_default)}")
    print(f"  Hit rate: {cache_stats.get('hit_rate_percentage', 0):.1f}%")
    print(f"  Embeddings generated: {cache_stats.get('embeddings_generated', 0)}")

except Exception as stats_error:
    print(f"Error getting final stats: {stats_error}")

# Closing banner.
print(
    "\n‚úÖ EMBEDDINGS MODULE TESTING COMPLETE",
    "   Next step: Vector Store Module Testing",
    "=" * 80,
    sep="\n",
)