# FAISS Index Tests

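This notebook tests FAISS index functionality:
1. Load an existing FAISS index and print its statistics
2. Search for the nearest neighbors of a stored embedding
3. Search with a synthetic query embedding (the mean of several stored embeddings)
4. Compare FAISS search results with graph neighbors from the database
5. Benchmark search performance
6. Run a batch search (multiple queries in one call)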


In [None]:
# Setup and imports
import numpy as np
from memory import (
    load_faiss_index,
    search_faiss_index,
    get_index_stats,
    load_embedding,
    get_node,
    count_sentences,
    load_sentence,
)
import os

print("All imports successful")


## Test 1: Load FAISS Index


In [None]:
# Test 1: load the FAISS index from disk and print its statistics.
# `data_dir` and `faiss_index_path` are reused by the later cells.
data_dir = "data"
faiss_index_path = os.path.join(data_dir, "faiss.index")

if os.path.exists(faiss_index_path):
    faiss_index = load_faiss_index(faiss_index_path)
    stats = get_index_stats(faiss_index)

    print("FAISS Index Statistics:")
    # (label, stats key) pairs that are reported for every index type.
    for label, key in (
        ("Type", "index_type"),
        ("Number of vectors", "num_vectors"),
        ("Embedding dimension", "embedding_dim"),
        ("Is trained", "is_trained"),
    ):
        print(f"  {label}: {stats[key]}")

    # An 'ivf-pq' index additionally reports nlist, m and nbits.
    if stats["index_type"] == "ivf-pq":
        for key in ("nlist", "m", "nbits"):
            print(f"  {key}: {stats[key]}")

    print("\n[OK] FAISS index loaded successfully")
else:
    # Nothing to test without an index file; tell the user how to build one.
    print(f"FAISS index not found at {faiss_index_path}")
    print("Run process_text.py first to generate the index")


## Test 2: Search for Similar Embeddings


In [None]:
# Test 2: nearest-neighbor search using a stored embedding as the query.
# `embeddings_path` and `sentences_path` are reused by the later cells.
embeddings_path = os.path.join(data_dir, "embeddings.npy")
sentences_path = os.path.join(data_dir, "sentences.jsonl")

if not (os.path.exists(embeddings_path) and os.path.exists(faiss_index_path)):
    print("[SKIP] Required files not found")
else:
    # Memory-map the embeddings so the whole file is not read into RAM.
    embeddings = np.load(embeddings_path, mmap_mode='r')
    faiss_index = load_faiss_index(faiss_index_path)

    # The first stored embedding serves as the query vector.
    k = 10
    query_embedding = embeddings[0]

    print(f"Searching for top {k} neighbors of first sentence...")
    distances, indices = search_faiss_index(faiss_index, query_embedding, k=k)

    print(f"\nTop {k} nearest neighbors:")
    print("-" * 60)
    for rank, (idx, dist) in enumerate(zip(indices, distances), start=1):
        sentence = load_sentence(sentences_path, idx)
        if not sentence:
            # Still report the hit so the listing always shows every result.
            print(f"{rank}. Index {idx}: distance={dist:.4f} (sentence not found)")
            continue
        text = sentence.get('text', '')[:60]
        print(f"{rank}. Index {idx}: distance={dist:.4f}")
        print(f"   Text: {text}...")

    print("\n[OK] FAISS search test passed")


## Test 3: Search with Query Embedding


In [None]:
# Test 3: search with a query vector that is not itself stored in the index.
# The query is the mean of the first 5 stored embeddings.
if os.path.exists(embeddings_path) and os.path.exists(faiss_index_path):
    embeddings = np.load(embeddings_path, mmap_mode='r')
    faiss_index = load_faiss_index(faiss_index_path)

    # Create query embedding as average of first 5 sentences, cast to
    # float32 before it is handed to the search helper.
    query_emb = np.mean(embeddings[:5], axis=0).astype(np.float32)

    print("Searching with average embedding of first 5 sentences...")
    k = 10
    distances, indices = search_faiss_index(faiss_index, query_emb, k=k)

    print(f"\nTop {k} results:")
    print("-" * 60)
    for rank, (idx, dist) in enumerate(zip(indices, distances), start=1):
        # FAISS pads its result with label -1 when it finds fewer than k
        # neighbors. Presumably search_faiss_index passes that through
        # unchanged (TODO confirm); never look such a label up as a sentence.
        if idx < 0:
            print(f"{rank}. (no neighbor returned)")
            continue

        sentence = load_sentence(sentences_path, idx)
        if sentence:
            text = sentence.get('text', '')[:60]
            print(f"{rank}. Index {idx}: distance={dist:.4f}")
            print(f"   Text: {text}...")
        else:
            # Report the hit anyway (same as Test 2) so a short listing is
            # never mistaken for a short search result.
            print(f"{rank}. Index {idx}: distance={dist:.4f} (sentence not found)")

    print("\n[OK] Query embedding search test passed")
else:
    print("[SKIP] Required files not found")


## Test 4: Compare FAISS Search with Database


In [None]:
# Test 4: for one node, compare its FAISS nearest neighbors (embedding
# similarity) with its graph neighbors stored in the database (edges).
from memory import get_node, init_db

if os.path.exists(embeddings_path) and os.path.exists(faiss_index_path):
    embeddings = np.load(embeddings_path, mmap_mode='r')
    faiss_index = load_faiss_index(faiss_index_path)
    db_path = os.path.join(data_dir, "memory.db")
    init_db(db_path)

    # Search for neighbors of node 1
    test_node_id = 1
    node = get_node(test_node_id, sentences_path=sentences_path,
                    embeddings_path=embeddings_path, db_path=db_path)

    # Check the embedding explicitly instead of relying on its truthiness:
    # `bool(array)` raises ValueError for a multi-element numpy array, so
    # `node and node['embedding']` only works when the embedding is a list.
    embedding = node['embedding'] if node else None
    if embedding is not None and len(embedding) > 0:
        query_emb = np.array(embedding, dtype=np.float32)

        # FAISS search
        k = 5
        distances, indices = search_faiss_index(faiss_index, query_emb, k=k)

        print(f"Node {test_node_id}: {node['sentence']['text'][:50]}...")
        print(f"\nFAISS top {k} neighbors (by embedding similarity):")
        for rank, (idx, dist) in enumerate(zip(indices, distances), start=1):
            # FAISS pads with label -1 when it finds fewer than k neighbors
            # (presumably passed through by search_faiss_index — TODO
            # confirm). Without this guard -1 would become node id 0.
            if idx < 0:
                print(f"  {rank}. (no neighbor returned)")
                continue

            # Convert embedding index to node ID (assuming 1-based).
            # int() turns a numpy integer label into a plain Python int
            # before it is used as a lookup key.
            neighbor_node_id = int(idx) + 1
            neighbor_node = get_node(neighbor_node_id, sentences_path=sentences_path,
                                     embeddings_path=embeddings_path, db_path=db_path)
            if neighbor_node:
                print(f"  {rank}. Node {neighbor_node_id}: distance={dist:.4f}")
                print(f"     Text: {neighbor_node['sentence']['text'][:50]}...")
            else:
                # Surface the miss: it would indicate that the index-to-node
                # mapping assumed above is wrong for this dataset.
                print(f"  {rank}. Node {neighbor_node_id}: distance={dist:.4f} (node not found)")

        # Database graph neighbors
        print("\nDatabase graph neighbors (by edge connections):")
        for rank, neighbor in enumerate(node['neighbors'][:k], start=1):
            neighbor_node = get_node(neighbor['v'], sentences_path=sentences_path,
                                     embeddings_path=embeddings_path, db_path=db_path)
            if neighbor_node:
                print(f"  {rank}. Node {neighbor['v']}: weight={neighbor['weight']:.2f}, type={neighbor['edge_type']}")
                print(f"     Text: {neighbor_node['sentence']['text'][:50]}...")

        print("\n[OK] Comparison test completed")
    else:
        print("[ERROR] Could not retrieve test node")
else:
    print("[SKIP] Required files not found")


## Test 5: Performance Benchmark


In [None]:
# Test 5: time a fixed number of single-query FAISS searches and report
# total time, mean latency and throughput.
import time

if os.path.exists(embeddings_path) and os.path.exists(faiss_index_path):
    embeddings = np.load(embeddings_path, mmap_mode='r')
    faiss_index = load_faiss_index(faiss_index_path)

    num_queries = 100
    k = 10

    if len(embeddings) == 0:
        # Without at least one vector there is nothing to query with; the
        # warm-up and the modulo below would otherwise raise.
        print("[SKIP] No embeddings available to benchmark")
    else:
        print(f"Running {num_queries} searches with k={k}...")

        # Warm up: one untimed search so one-off costs are not measured.
        search_faiss_index(faiss_index, embeddings[0], k=k)

        # Benchmark. perf_counter() is monotonic and high-resolution, unlike
        # time.time(), whose coarse ticks can yield a zero elapsed time for
        # a fast run.
        start_time = time.perf_counter()
        for i in range(num_queries):
            # Cycle through the stored embeddings if there are fewer than
            # num_queries of them.
            query_idx = i % len(embeddings)
            query_emb = embeddings[query_idx]
            distances, indices = search_faiss_index(faiss_index, query_emb, k=k)
        end_time = time.perf_counter()

        total_time = end_time - start_time
        avg_time = total_time / num_queries
        # Guard the throughput division against a zero elapsed time.
        queries_per_second = num_queries / total_time if total_time > 0 else float("inf")

        print("\nPerformance Results:")
        print(f"  Total time: {total_time:.4f} seconds")
        print(f"  Average time per query: {avg_time*1000:.2f} ms")
        print(f"  Queries per second: {queries_per_second:.1f}")
        print(f"  Index size: {faiss_index.ntotal} vectors")

        print("\n[OK] Performance benchmark completed")
else:
    print("[SKIP] Required files not found")


## Test 6: Batch Search


In [None]:
# Test 6: batch search — several query vectors answered by a single
# `faiss_index.search` call.
if os.path.exists(embeddings_path) and os.path.exists(faiss_index_path):
    embeddings = np.load(embeddings_path, mmap_mode='r')
    faiss_index = load_faiss_index(faiss_index_path)

    # Create batch of query embeddings: up to `batch_size` rows, copied out
    # of the memory map as a C-ordered float32 array for FAISS.
    batch_size = 5
    query_batch = np.array(embeddings[:batch_size], dtype=np.float32, order='C')
    # The slice yields fewer rows when the file holds fewer than batch_size
    # embeddings, so everything below uses the actual row count.
    num_queries = len(query_batch)
    k = 3

    if num_queries == 0:
        print("[SKIP] No embeddings available for batch search")
    else:
        print(f"Batch search: {num_queries} queries, k={k}")

        # FAISS supports batch search natively: one result row per query.
        distances, indices = faiss_index.search(query_batch, k)

        print("\nBatch search results:")
        print("-" * 60)
        for query_idx, (row_indices, row_distances) in enumerate(zip(indices, distances)):
            print(f"\nQuery {query_idx + 1}:")
            for rank, (idx, dist) in enumerate(zip(row_indices, row_distances), start=1):
                # FAISS fills unused result slots with label -1 when it
                # finds fewer than k neighbors; those are not sentences.
                if idx < 0:
                    print(f"  {rank}. (no neighbor returned)")
                    continue

                # Labels come back as numpy integers; pass a plain int on.
                sentence = load_sentence(sentences_path, int(idx))
                if sentence:
                    text = sentence.get('text', '')[:50]
                    print(f"  {rank}. Index {idx}: distance={dist:.4f} - {text}...")
                else:
                    # Report the hit anyway (same as Test 2) rather than
                    # silently shortening the listing.
                    print(f"  {rank}. Index {idx}: distance={dist:.4f} (sentence not found)")

        print("\n[OK] Batch search test passed")
else:
    print("[SKIP] Required files not found")


## Summary

FAISS index tests completed. The index provides fast similarity search for embeddings.
