# Embedding Quality Test

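Sanity-check the embedding model: report its configuration, compare the semantic similarity of sample sentences, and verify that repeated queries are served from the cache.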

In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd()))

from src.indexing.embedding_models import HuggingFaceEmbedder, CachedEmbedder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Initialize Embedding Model

In [None]:
# Build the embedder that the similarity and caching cells below rely on.
# It is configured with a model name and a cache directory.
embedder = CachedEmbedder(
    model_name="all-MiniLM-L6-v2",
    cache_dir=Path("./data/cache"),
)

# Report the model's basic configuration as a quick sanity check.
print(
    f"Embedding dimension: {embedder.get_embedding_dimension()}",
    f"Model: {embedder.model_name}",
    sep="\n",
)

## Test Semantic Similarity

In [None]:
# Semantic similarity check: embed a handful of sentences, compute every
# pairwise cosine similarity, and report the closest pair.
sentences = [
    "The attention mechanism improves transformer performance",
    "Transformers use attention to weight different tokens",
    "Deep learning models require large datasets",
    "Neural networks have many parameters",
]

# Embed all sentences in one call and stack the result into an array.
# NOTE(review): assumes embed_documents returns one equal-length vector per
# sentence, so the array is 2-D (n_sentences, dim) -- confirm.
embeddings = np.array(embedder.embed_documents(sentences))

# Pairwise similarities: entry [i][j] compares sentence i with sentence j.
similarity_matrix = cosine_similarity(embeddings)

print("Semantic Similarity Matrix:")
print(similarity_matrix)

# Find the most similar pair of distinct sentences.
# Only the strict upper triangle (i < j) is scanned, so each unordered pair is
# considered once and the diagonal (a sentence against itself) is excluded.
# Taking the argmax over those entries -- instead of comparing against a
# starting score of 0 -- still reports a real pair and its real score when
# every similarity is zero or negative. On ties the first pair in row-major
# order wins.
row_idx, col_idx = np.triu_indices(len(sentences), k=1)
best = int(np.argmax(similarity_matrix[row_idx, col_idx]))
max_pair = (int(row_idx[best]), int(col_idx[best]))
max_sim = similarity_matrix[max_pair]

print("\nMost similar:")
print(f"  1. {sentences[max_pair[0]]}")
print(f"  2. {sentences[max_pair[1]]}")
print(f"  Similarity: {max_sim:.4f}")

## Test Caching

In [None]:
import time

# Cache check: embed the same text twice, time both calls, and confirm the
# two results are identical.
test_text = "Testing cache performance with repeated embeddings"

# First embedding (expected to be computed, not cached).
# NOTE(review): if the cache under cache_dir persists between runs, this call
# may already be a cache hit on a re-run -- confirm against CachedEmbedder.
# perf_counter is used because it is a monotonic, high-resolution clock meant
# for measuring short durations.
start = time.perf_counter()
emb1 = embedder.embed_query(test_text)
time_first = time.perf_counter() - start

# Second embedding of the same text (expected to be served from the cache).
start = time.perf_counter()
emb2 = embedder.embed_query(test_text)
time_second = time.perf_counter() - start

print(f"First call: {time_first*1000:.2f}ms")
print(f"Second call (cached): {time_second*1000:.2f}ms")
# Guard the ratio: a cached call can be fast enough to measure as zero.
if time_second > 0:
    print(f"Speedup: {time_first/time_second:.1f}x")
else:
    print("Speedup: n/a (second call too fast to measure)")
# array_equal yields a single boolean whether the embeddings are lists or
# NumPy arrays; a plain == on arrays would print an element-wise array.
print(f"Cache working: {np.array_equal(emb1, emb2)}")