In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
import json
import time
from tqdm import tqdm
import os
from typing import List, Dict, Any

# For EmbeddingGemma model
from transformers import AutoModel, AutoTokenizer
import torch

# For HuggingFace authentication (if using gated models)
try:
    from huggingface_hub import login
    HF_LOGIN_AVAILABLE = True
except ImportError:
    HF_LOGIN_AVAILABLE = False
    print("Note: huggingface_hub not installed. Install with: uv add huggingface_hub")

import chromadb
from chromadb.config import Settings

In [8]:
# ----------- Paths Configuration -----------
# Get the project root directory
# In Jupyter, we detect the project root by looking for common project files
current_dir = os.getcwd()
if os.path.basename(current_dir) == "notebooks":
    # If we're in the notebooks directory, go up one level
    PROJECT_ROOT = os.path.dirname(current_dir)
else:
    # If we're in the project root, use current directory
    PROJECT_ROOT = current_dir

# Data file path
DF_PATH = os.path.join(PROJECT_ROOT, "data", "homes_preprocessed_data.csv")
# Alternative simple path (if running from project root): DF_PATH = "data/homes_preprocessed_data.csv"

# Vector store path
VS_PATH = os.path.join(PROJECT_ROOT, "vector_store_1")
COLL_NAME = "agentic_voice_assistant_vdb"

print(f"Project root: {PROJECT_ROOT}")
print(f"Data path: {DF_PATH}")

# ----------- Embedding Model Configuration -----------
# Based on token analysis: avg=265, max=919, 95th percentile=546
# 
# Option 1: EmbeddingGemma (GATED - requires HuggingFace access)
# - Model: "google/embedding-gemma-300m"
# - Context window: 2048 tokens (perfect for your data!)
# - Embedding dimension: 768 (supports MRL: can truncate to 512/256/128)
# - 308M parameters, efficient for on-device use
# - NOTE: Requires HuggingFace account and access request
#
# Option 2: Open alternatives (no authentication needed)
# - "BAAI/bge-large-en-v1.5" (1024 dims, 512 tokens) - State-of-the-art, open
# - "sentence-transformers/all-mpnet-base-v2" (384 dims, 512 tokens) - Good quality
# - "sentence-transformers/all-MiniLM-L6-v2" (384 dims, 512 tokens) - Fast, small

USE_EMBEDDING_GEMMA = True  # Set to False to use open alternative

if USE_EMBEDDING_GEMMA:
    EMBEDDING_MODEL_NAME = "google/embedding-gemma-300m"
    EMBEDDING_DIM = 768
    MAX_SEQ_LENGTH = 2048
    print("Using EmbeddingGemma (gated model - requires HuggingFace access)")
    print("If you get access errors, set USE_EMBEDDING_GEMMA = False to use an open alternative")
else:
    # Open alternative with large context
    EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"  # 1024 dims, 512 tokens
    EMBEDDING_DIM = 1024
    MAX_SEQ_LENGTH = 512
    print("Using open alternative: BAAI/bge-large-en-v1.5")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# ----------- Load data -----------
def must_exist(p):
    assert os.path.exists(p), f"Missing file: {p}"

must_exist(DF_PATH)
os.makedirs(VS_PATH, exist_ok=True)

df = pd.read_csv(DF_PATH)
print(f"Loaded {len(df)} products from {DF_PATH}")

# Check if embedding_text column exists
if "embedding_text" not in df.columns:
    raise ValueError("DataFrame must have 'embedding_text' column. Run data_preprocessing.ipynb first.")

print(f"Found 'embedding_text' column with {df['embedding_text'].notna().sum()} non-null values\n")

# ----------- Reconnect to vector store -----------
client = chromadb.PersistentClient(
    path=VS_PATH,
    settings=Settings(anonymized_telemetry=False)
)
collection = client.get_or_create_collection(name=COLL_NAME)
print(f"Vector store connected! Current embeddings: {collection.count()}\n")


Project root: /Users/brunamedeiros/Documents/GitHub/agentic-voice-assistant
Data path: /Users/brunamedeiros/Documents/GitHub/agentic-voice-assistant/data/homes_preprocessed_data.csv
Using EmbeddingGemma (gated model - requires HuggingFace access)
If you get access errors, set USE_EMBEDDING_GEMMA = False to use an open alternative
Using device: cpu
Loaded 708 products from /Users/brunamedeiros/Documents/GitHub/agentic-voice-assistant/data/homes_preprocessed_data.csv
Found 'embedding_text' column with 708 non-null values

Vector store connected! Current embeddings: 0



In [11]:
# ----------- HuggingFace Authentication (for gated models) -----------
if USE_EMBEDDING_GEMMA:
    # Check if we need to authenticate
    hf_token = os.getenv("HF_TOKEN")
    
    if not hf_token and HF_LOGIN_AVAILABLE:
        print("="*60)
        print("HUGGINGFACE AUTHENTICATION REQUIRED")
        print("="*60)
        print("EmbeddingGemma is a gated model. You need to:")
        print("1. Request access at: https://huggingface.co/google/embedding-gemma-300m")
        print("2. Get your token from: https://huggingface.co/settings/tokens")
        print("3. Either:")
        print("   - Run: huggingface-cli login")
        print("   - Or set environment variable: export HUGGING_FACE_HUB_TOKEN=your_token")
        print("   - Or login in code below")
        print("="*60)
        
        # Try to login if token is available
        try:
            if HF_LOGIN_AVAILABLE:
                print("\nAttempting to login to HuggingFace...")
                print("If you haven't logged in, you'll need to:")
                print("1. Get token from https://huggingface.co/settings/tokens")
                print("2. Run: huggingface-cli login")
                print("   OR set HUGGING_FACE_HUB_TOKEN environment variable")
                # Uncomment the line below and add your token if needed:
                # login(token="your_token_here")
        except Exception as e:
            print(f"Login attempt failed: {e}")
            print("Continuing anyway - will fail if model is gated...")

# ----------- Load Embedding Model -----------
print(f"\nLoading model: {EMBEDDING_MODEL_NAME}")
print("This may take a few minutes on first run (downloading model)...")

try:
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
    embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, device_map=device)
    embedding_model.eval()  # Set to evaluation mode
    
    print(f"✓ Model loaded on {device}")
    
    # Get actual max sequence length from model config
    if hasattr(embedding_model, 'config'):
        actual_max_length = getattr(embedding_model.config, 'max_position_embeddings', MAX_SEQ_LENGTH)
        max_seq_length = min(actual_max_length, MAX_SEQ_LENGTH)
    else:
        max_seq_length = MAX_SEQ_LENGTH
    
    print(f"Model max sequence length: {max_seq_length} tokens")
    print(f"Your data: avg=265 tokens, max=919 tokens, 95th percentile=546 tokens")
    
    if max_seq_length >= 1024:
        print(f"✓ Model can handle all products without chunking! ({max_seq_length} > 919)")
    else:
        print(f"⚠️  Some products ({919} tokens) may exceed model limit ({max_seq_length}).")
        print("   Will use chunking for longer texts.")
    print()
    
except Exception as e:
    if "gated" in str(e).lower() or "access" in str(e).lower():
        print("\n" + "="*60)
        print("ACCESS DENIED - Model is gated")
        print("="*60)
        print("To fix this:")
        print("1. Request access: https://huggingface.co/google/embedding-gemma-300m")
        print("2. Login: huggingface-cli login")
        print("   OR set: export HUGGING_FACE_HUB_TOKEN=your_token")
        print("\nAlternatively, set USE_EMBEDDING_GEMMA = False to use an open model")
        print("="*60)
    raise

# ----------- Embedding Functions -----------

def get_text_embedding(text: str) -> List[float]:
    """
    Get embedding for a single text using the loaded model.
    
    Args:
        text: Text to embed
        
    Returns:
        List of embedding values
    """
    if not text or not text.strip():
        # Return zero vector if empty
        return [0.0] * EMBEDDING_DIM
    
    # Tokenize text
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=max_seq_length
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate embeddings
    with torch.no_grad():
        outputs = embedding_model(**inputs)
        
        # Different models return embeddings differently
        if hasattr(outputs, 'last_hidden_state'):
            # For models like EmbeddingGemma: use mean pooling
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
        elif hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
            # For BERT-like models: use pooler output
            embeddings = outputs.pooler_output.squeeze()
        elif hasattr(outputs, 'embeddings'):
            # Some models have embeddings directly
            embeddings = outputs.embeddings.squeeze()
        else:
            # Fallback: mean pooling of last hidden state
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    
    # Convert to numpy array then list
    if isinstance(embeddings, torch.Tensor):
        embedding_list = embeddings.cpu().numpy().tolist()
    else:
        embedding_list = embeddings.tolist()
    
    # Handle single dimension vs multi-dimension
    if isinstance(embedding_list[0], list):
        embedding_list = embedding_list[0]
    
    # Truncate to desired dimension if needed (for MRL or consistency)
    if len(embedding_list) > EMBEDDING_DIM:
        embedding_list = embedding_list[:EMBEDDING_DIM]
    elif len(embedding_list) < EMBEDDING_DIM:
        # Pad with zeros if needed (shouldn't happen, but safety check)
        embedding_list.extend([0.0] * (EMBEDDING_DIM - len(embedding_list)))
    
    return embedding_list


def create_overlapping_chunks(text: str, max_tokens: int = None, overlap_tokens: int = 50) -> List[str]:
    """
    Split text into overlapping chunks based on token count.
    Uses EmbeddingGemma's tokenizer to ensure chunks fit within token limits.
    
    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk (defaults to model's max_seq_length)
        overlap_tokens: Number of tokens to overlap between chunks
        
    Returns:
        List of text chunks
    """
    if not text or not text.strip():
        return []
    
    if max_tokens is None:
        max_tokens = max_seq_length
    
    # Tokenize the full text (without special tokens for accurate counting)
    tokens = tokenizer.encode(text, add_special_tokens=False)
    
    # If text fits in one chunk, return as-is
    if len(tokens) <= max_tokens:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(tokens):
        # Get chunk of tokens
        end = min(start + max_tokens, len(tokens))
        chunk_tokens = tokens[start:end]
        
        # Decode back to text
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
        
        # Move start position with overlap
        if end >= len(tokens):
            break
        start = end - overlap_tokens
    
    return chunks


def get_text_embedding_from_chunk(text_chunk: str, product_id: str, chunk_number: int):
    """
    Get xAI/Grok text embedding + metadata for a chunk.
    
    Args:
        text_chunk: Text chunk to embed
        product_id: Product ID
        chunk_number: Chunk number for this product
        
    Returns:
        Tuple of (embedding, metadata) or (None, None) if error
    """
    try:
        if text_chunk and text_chunk.strip():
            embedding = get_text_embedding(text_chunk)
            
            metadata = {
                "product_id": str(product_id),
                "type": "text",
                "chunk": chunk_number
            }
            return embedding, metadata
        return None, None
    except Exception as e:
        print(f"ERROR processing chunk {chunk_number} for {product_id}: {e}")
        return None, None


print("Embedding functions defined!")


NameError: name 'HF_LOGIN_AVAILABLE' is not defined

In [6]:
print("="*60)
print("TESTING TEXT EMBEDDING")
print("="*60)

test_df = unique_df.head(3)

embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []
documents_to_store = []


print("-"*60)
print("EMBEDDING")
print("-"*60)
for i in range(len(test_df)):
    row = test_df.iloc[i]
    
    if pd.notna(row['Uniq Id']) and pd.notna(row['text_to_embed']):
        uniq_id = row['Uniq Id']
        full_text = str(row['text_to_embed'])
        
        # Get chunks for this product
        chunks = create_overlapping_chunks(full_text)
        
        # Process each chunk
        for chunk_num, text_chunk in enumerate(chunks, 1):
            
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{uniq_id}_{chunk_num}"

            # Skip if chunk was already embedded
            try:
                existing = collection.get(ids=[unique_chroma_id])
                if existing['ids']: 
                    print(f"Skipping {unique_chroma_id} - already exists")
                    continue
            except:
                pass
            
            # Get embedding and metadata
            result = get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_num)
            if result is not None:
                embedding, metadata = result
                if embedding is not None:
                    embeddings_to_store.append(embedding.tolist())
                    metadatas_to_store.append(metadata)
                    ids_to_store.append(unique_chroma_id)
                    documents_to_store.append(text_chunk)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []
                documents_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text vs image embeddings
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
image_embeddings = [id for id in all_data['ids'] if id.startswith('img_')]
print(f"Text embeddings: {len(text_embeddings)}")
print(f"Image embeddings: {len(image_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data['metadatas'] if meta['type'] == 'text']
text_uniq_ids = [meta['uniq_id'] for meta in text_metadata]
unique_text_uniq_ids = set(text_uniq_ids)
print(f"Products with text embeddings: {len(unique_text_uniq_ids)}")

# Show chunks per product
from collections import Counter
chunks_per_product = Counter(text_uniq_ids)
print(f"Average chunks per product: {len(text_uniq_ids) / len(unique_text_uniq_ids):.1f}")
print(f"Max chunks for one product: {max(chunks_per_product.values())}")

TESTING TEXT EMBEDDING


NameError: name 'unique_df' is not defined

In [None]:
# FINAL VERIFICATION CHECK
print("="*60)
print("FINAL VECTOR STORE VERIFICATION")
print("="*60)

# Check collection details
print(f"Collection name: {collection.name}")
print(f"Total embeddings: {collection.count()}")

# Get all data
all_data = collection.get()

print(f"Documents stored: {all_data['documents'] is not None}")
if all_data['documents']:
    print(f"Sample text document: {all_data['documents'][0][:100]}...")
    print(f"Sample image document: {[doc for doc in all_data['documents'] if doc and doc.startswith('http')][0][:50]}...")

# Count by type
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
image_embeddings = [id for id in all_data['ids'] if id.startswith('img_')]

print(f"\nImage embeddings: {len(image_embeddings)}")
print(f"Text embeddings: {len(text_embeddings)}")

# Check they have the same uniq_ids (products)
text_uniq_ids = set([meta['uniq_id'] for meta in all_data['metadatas'] if meta['type'] == 'text'])
image_uniq_ids = set([meta['uniq_id'] for meta in all_data['metadatas'] if meta['type'] == 'image'])

print(f"\nProducts with text embeddings: {len(text_uniq_ids)}")
print(f"Products with image embeddings: {len(image_uniq_ids)}")
print(f"Products with BOTH text and images: {len(text_uniq_ids & image_uniq_ids)}")

# Sample IDs to verify format
print(f"\nSample image IDs: {image_embeddings[:3] if image_embeddings else 'None'}")
print(f"Sample text IDs: {text_embeddings[:3] if text_embeddings else 'None'}")

# Quick search test to make sure both work
try:
    query_text = "longboard skateboard"
    query_embedding = get_text_embedding(query_text)  # 512 dims

    results = collection.query(
        query_embeddings=[query_embedding.tolist()],  # 512 dims
        n_results=5
    )

    result_types = [meta['type'] for meta in results['metadatas'][0]]
    print(f"\nSearch test - found types: {set(result_types)}")
    print("Both text and image embeddings are searchable!")
except Exception as e:
    print(f"Search test failed: {e}")

print("\n" + "="*60)

FINAL VECTOR STORE VERIFICATION
Collection name: amazon_products_exploded_v3
Total embeddings: 33997
Documents stored: True
Sample text document: https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg...
Sample image document: https://images-na.ssl-images-amazon.com/images/I/5...

Image embeddings: 33975
Text embeddings: 22

Products with text embeddings: 3
Products with image embeddings: 9980
Products with BOTH text and images: 3

Sample image IDs: ['img_4c69b61db1fc16e7013b43fc926e502d_0', 'img_4c69b61db1fc16e7013b43fc926e502d_1', 'img_4c69b61db1fc16e7013b43fc926e502d_2']
Sample text IDs: ['text_4c69b61db1fc16e7013b43fc926e502d_1', 'text_4c69b61db1fc16e7013b43fc926e502d_2', 'text_4c69b61db1fc16e7013b43fc926e502d_3']

Search test - found types: {'text'}
Both text and image embeddings are searchable!



In [None]:
print("="*60)
print("FULL DATASET TEXT EMBEDDING")
print("="*60)

BATCH_SIZE = 100
embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []
documents_to_store = []


print("-"*60)
print("EMBEDDING")
for i in range(len(unique_df)): 
    #row = test_df.iloc[i]
    row = unique_df.iloc[i]  
    
    if pd.notna(row['Uniq Id']) and pd.notna(row['text_to_embed']):
        uniq_id = row['Uniq Id']
        full_text = str(row['text_to_embed'])
        
        # Get chunks for this product
        chunks = create_overlapping_chunks(full_text)
        
        # Process each chunk
        for chunk_num, text_chunk in enumerate(chunks, 1):
            
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{uniq_id}_{chunk_num}"

            # Skip if chunk was already embedded
            try:
                existing = collection.get(ids=[unique_chroma_id])
                if existing['ids']: 
                    print(f"Skipping {unique_chroma_id} - already exists")
                    continue
            except:
                pass
            
            # Get embedding and metadata
            result = get_text_embedding_from_chunk(text_chunk, uniq_id, chunk_num)
            if result is not None:
                embedding, metadata = result
                if embedding is not None:
                    embeddings_to_store.append(embedding.tolist())
                    metadatas_to_store.append(metadata)
                    ids_to_store.append(unique_chroma_id)
                    documents_to_store.append(text_chunk)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []
                documents_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text vs image embeddings
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
image_embeddings = [id for id in all_data['ids'] if id.startswith('img_')]
print(f"Text embeddings: {len(text_embeddings)}")
print(f"Image embeddings: {len(image_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data['metadatas'] if meta['type'] == 'text']
text_uniq_ids = [meta['uniq_id'] for meta in text_metadata]
unique_text_uniq_ids = set(text_uniq_ids)
print(f"Products with text embeddings: {len(unique_text_uniq_ids)}")

# Show chunks per product
from collections import Counter
chunks_per_product = Counter(text_uniq_ids)
print(f"Average chunks per product: {len(text_uniq_ids) / len(unique_text_uniq_ids):.1f}")
print(f"Max chunks for one product: {max(chunks_per_product.values())}")

FULL DATASET TEXT EMBEDDING
------------------------------------------------------------
EMBEDDING
Skipping text_4c69b61db1fc16e7013b43fc926e502d_1 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_2 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_3 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_4 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_5 - already exists
Skipping text_4c69b61db1fc16e7013b43fc926e502d_6 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_1 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_2 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_3 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_4 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_5 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_6 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5957_7 - already exists
Skipping text_66d49bbed043f5be260fa9f7fbff5

In [None]:
# Verify no duplicate chunk IDs
text_ids = [id for id in all_data['ids'] if id.startswith('text_')]
unique_text_ids = set(text_ids)
print(f"Text embeddings: {len(text_ids)}")
print(f"Unique text IDs: {len(unique_text_ids)}")
print(f"Duplicates: {len(text_ids) - len(unique_text_ids)}")

Text embeddings: 52668
Unique text IDs: 52668
Duplicates: 0


In [None]:
# ----------- Generate and Store Embeddings -----------

print("="*60)
print("FULL DATASET TEXT EMBEDDING")
print("="*60)

BATCH_SIZE = 100  # Store embeddings in batches of 100

# Filter out rows without embedding_text
df_valid = df[df["embedding_text"].notna() & (df["embedding_text"] != "")].copy()
print(f"Processing {len(df_valid)} products with valid embedding text\n")

# Initialize batch storage lists
embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []
documents_to_store = []

print("-"*60)
print("EMBEDDING")
print("-"*60)

# Process each product
for i in tqdm(range(len(df_valid)), desc="Processing products"):
    row = df_valid.iloc[i]
    
    if pd.notna(row["product_id"]) and pd.notna(row["embedding_text"]):
        product_id = str(row["product_id"])
        full_text = str(row["embedding_text"])
        
        # Get chunks for this product
        # With EmbeddingGemma's 2048 token context, most products won't need chunking
        chunks = create_overlapping_chunks(full_text, max_tokens=max_seq_length, overlap_tokens=50)
        
        # Process each chunk
        for chunk_num, text_chunk in enumerate(chunks, 1):
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{product_id}_{chunk_num}"
            
            # Skip if chunk was already embedded
            try:
                existing = collection.get(ids=[unique_chroma_id])
                if existing["ids"]:
                    continue  # Skip if already exists
            except:
                pass
            
            # Get embedding and metadata
            result = get_text_embedding_from_chunk(text_chunk, product_id, chunk_num)
            if result is not None:
                embedding, metadata = result
                if embedding is not None:
                    embeddings_to_store.append(embedding)
                    metadatas_to_store.append(metadata)
                    ids_to_store.append(unique_chroma_id)
                    documents_to_store.append(text_chunk)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(
                    embeddings=embeddings_to_store,
                    metadatas=metadatas_to_store,
                    ids=ids_to_store,
                    documents=documents_to_store
                )
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []
                documents_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(
        embeddings=embeddings_to_store,
        metadatas=metadatas_to_store,
        ids=ids_to_store,
        documents=documents_to_store
    )
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")

print()
print("-"*60)
print("Checking results")
print("-"*60)

all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text embeddings
text_embeddings = [id for id in all_data["ids"] if id.startswith("text_")]
print(f"Text embeddings: {len(text_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data["metadatas"] if meta.get("type") == "text"]
text_product_ids = [meta["product_id"] for meta in text_metadata]
unique_text_product_ids = set(text_product_ids)
print(f"Products with text embeddings: {len(unique_text_product_ids)}")

# Show chunks per product
from collections import Counter
chunks_per_product = Counter(text_product_ids)
if len(unique_text_product_ids) > 0:
    print(f"Average chunks per product: {len(text_product_ids) / len(unique_text_product_ids):.1f}")
    print(f"Max chunks for one product: {max(chunks_per_product.values())}")

# Verify no duplicate chunk IDs
unique_text_ids = set(text_embeddings)
print(f"Text embeddings: {len(text_embeddings)}")
print(f"Unique text IDs: {len(unique_text_ids)}")
print(f"Duplicates: {len(text_embeddings) - len(unique_text_ids)}")

print(f"\nTotal embeddings in vector store: {collection.count()}")


In [None]:
# ----------- Test Retrieval -----------

print("="*60)
print("TESTING TEXT EMBEDDING RETRIEVAL")
print("="*60)

# Test query
test_query = "coffee maker under $50"
print(f"Testing retrieval with query: '{test_query}'\n")

# Get embedding for query
query_embedding = get_text_embedding(test_query)

# Search in ChromaDB
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

print("Top 5 results:")
for i, (chunk_id, distance, metadata, document) in enumerate(zip(
    results["ids"][0],
    results["distances"][0],
    results["metadatas"][0],
    results["documents"][0] if results.get("documents") else [None] * len(results["ids"][0])
), 1):
    print(f"\n{i}. Product ID: {metadata.get('product_id', 'N/A')}")
    print(f"   Chunk ID: {chunk_id}")
    print(f"   Chunk Number: {metadata.get('chunk', 'N/A')}")
    print(f"   Type: {metadata.get('type', 'N/A')}")
    print(f"   Distance: {distance:.4f}")
    if document:
        print(f"   Document preview: {document[:100]}...")

print("\n" + "="*60)
print("FINAL VECTOR STORE VERIFICATION")
print("="*60)

# Check collection details
print(f"Collection name: {collection.name}")
print(f"Total embeddings: {collection.count()}")

# Get all data
all_data = collection.get()
print(f"Documents stored: {all_data.get('documents') is not None}")

if all_data.get("documents"):
    sample_docs = [doc for doc in all_data["documents"] if doc]
    if sample_docs:
        print(f"Sample text document: {sample_docs[0][:100]}...")

# Count by type
text_embeddings = [id for id in all_data["ids"] if id.startswith("text_")]
print(f"\nText embeddings: {len(text_embeddings)}")

# Check products
text_metadata = [meta for meta in all_data["metadatas"] if meta.get("type") == "text"]
text_product_ids = set([meta["product_id"] for meta in text_metadata])
print(f"Products with text embeddings: {len(text_product_ids)}")

# Sample IDs to verify format
print(f"\nSample text IDs: {text_embeddings[:3] if text_embeddings else 'None'}")

# Quick search test
try:
    query_text = "kitchen appliance"
    query_embedding = get_text_embedding(query_text)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5
    )
    result_types = [meta.get("type") for meta in results["metadatas"][0]]
    print(f"\nSearch test - found types: {set(result_types)}")
    print("Text embeddings are searchable!")
except Exception as e:
    print(f"Search test failed: {e}")

print("\n" + "="*60)
