# Product Quantization Demo

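This notebook demonstrates the complete Product Quantization pipeline: loading the data, generating embeddings, training the quantizer, indexing documents for retrieval, running a sample search, and evaluating recall.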

In [None]:
# Import required modules
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Load environment variables from .env file.
# This runs before the project imports below, preserving the original
# ordering (presumably so those modules can read the variables at import
# time -- confirm against data_processor.py / pq.py).
# load_dotenv() returns False when no .env file was found or the file
# defined no variables, so keep the result instead of assuming success.
env_loaded = load_dotenv()

from data_processor import DataProcessor, SimpleRetrievalSystem
from pq import ProductQuantizer, generate_random_embeddings

print("Modules imported successfully!")
if env_loaded:
    print("Environment variables loaded from .env file")
else:
    # Not fatal: the demo below uses random embeddings and needs no API key.
    print("No variables loaded from a .env file (file missing or empty)")

In [None]:
# Build the data processor and read the dataset it manages
processor = DataProcessor()
df = processor.load_data()

# Report how many rows came back
sentence_count = len(df)
print(f"Loaded {sentence_count} sentences")

# Keep this as the final expression so the notebook renders the preview
df.head()

In [None]:
# Produce one embedding per text. Random vectors stand in for real ones in
# this demo - replace with Cohere embeddings if available.
embedding_dim = 384
num_texts = len(processor.texts)
embeddings = generate_random_embeddings(num_texts, embedding_dim)

# Attach the vectors to the processor so both references share the same object
processor.embeddings = embeddings
print(f"Generated embeddings shape: {embeddings.shape}")

In [None]:
# Train Product Quantizer
# M and K are forwarded unchanged to ProductQuantizer; presumably M is the
# number of sub-vectors and K the number of centroids per sub-codebook --
# confirm against pq.py.
pq = ProductQuantizer(M=8, K=256, verbose=True)
pq.fit(embeddings)

# Show compression statistics for the number of vectors we just trained on
memory_stats = pq.get_memory_usage(len(embeddings))
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\nCompression Analysis:")
for key, value in memory_stats.items():
    if 'ratio' in key:
        # Entries whose key mentions "ratio" are printed as a multiplier.
        print(f"{key}: {value:.1f}x")
    else:
        # Every other entry is divided by 1024 and labelled KB
        # (assumes get_memory_usage reports bytes -- TODO confirm).
        print(f"{key}: {value/1024:.1f} KB")

In [None]:
# Set up the retrieval system with the same M/K used for the quantizer
retrieval_system = SimpleRetrievalSystem(M=8, K=256)

# Swap in the quantizer that was already fitted above
retrieval_system.pq = pq

# Index every embedding together with its text and sentiment
documents = processor.texts
labels = processor.sentiments
retrieval_system.index_documents(embeddings, documents, labels)

print("Retrieval system ready!")

In [None]:
# Demo search
# The query is the first indexed embedding, so this is a self-retrieval
# sanity check rather than a held-out query.
query_embedding = embeddings[0]  # Use first embedding as query
results, distances, sentiments = retrieval_system.search(query_embedding, k=5)

# Longest text preview to print per result
preview_limit = 100

print("Search Results:")
for rank, (text, dist, sentiment) in enumerate(
    zip(results, distances, sentiments), start=1
):
    # Only add the ellipsis when the text was actually cut; short texts are
    # shown in full without a misleading truncation marker.
    if len(text) > preview_limit:
        preview = text[:preview_limit] + "..."
    else:
        preview = text
    print(f"{rank}. [{sentiment}] (distance: {dist:.3f})")
    print(f"   {preview}\n")

In [None]:
# Measure recall using a small slice of the indexed embeddings as queries
num_test_queries = 10
test_queries = embeddings[:num_test_queries]
recall_results = retrieval_system.evaluate_recall(test_queries)

# One line per entry of the returned mapping
print("Recall Performance:")
for cutoff, score in recall_results.items():
    print(f"Recall@{cutoff}: {score:.3f}")