In [2]:
import pandas as pd
import numpy as np
from IPython.display import display
import json
import time
from tqdm import tqdm
import os
from typing import List, Dict, Any

# For BAAI/bge-large-en-v1.5 model
from transformers import AutoModel, AutoTokenizer
import torch
from huggingface_hub import login

# Vector database
import chromadb
from chromadb.config import Settings

In [4]:
# # DELETE ALL EMBEDDINGS - RUN THIS ONCE
# client.delete_collection(name=COLL_NAME)
# collection = client.create_collection(name=COLL_NAME)
# print(f"✓ DELETED ALL EMBEDDINGS")
# print(f"✓ Current count: {collection.count()}")

In [5]:
# ----------- Paths Configuration -----------
# Get the project root directory
# In Jupyter, we detect the project root by looking for common project files
current_dir = os.getcwd()
if os.path.basename(current_dir) == "notebooks":
    # If we're in the notebooks directory, go up one level
    PROJECT_ROOT = os.path.dirname(current_dir)
else:
    # If we're in the project root, use current directory
    PROJECT_ROOT = current_dir

# Data file path
DF_PATH = os.path.join(PROJECT_ROOT, "data", "homes_preprocessed_data.csv") # Alternative simple path (if running from project root): DF_PATH = "data/homes_preprocessed_data.csv"

# Vector store path
VECTOR_STORE_VERSION = "v1"
VS_PATH = os.path.join(PROJECT_ROOT, f"vector_store_{VECTOR_STORE_VERSION}")
COLL_NAME = "agentic_voice_assistant_vdb"

print(f"Project root: {PROJECT_ROOT}")
print(f"Data path: {DF_PATH}")

# ----------- Embedding Model Configuration -----------

EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"  
EMBEDDING_DIM = 1024
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 100
print("Using open alternative: BAAI/bge-large-en-v1.5")



device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# ----------- Load data -----------
def must_exist(p):
    assert os.path.exists(p), f"Missing file: {p}"

must_exist(DF_PATH)
os.makedirs(VS_PATH, exist_ok=True)

df = pd.read_csv(DF_PATH)
print(f"Loaded {len(df)} products from {DF_PATH}")

# Check if embedding_text column exists
if "embedding_text" not in df.columns:
    raise ValueError("DataFrame must have 'embedding_text' column. Run data_preprocessing.ipynb first.")

print(f"Found 'embedding_text' column with {df['embedding_text'].notna().sum()} non-null values\n")

# ----------- Reconnect to vector store -----------
client = chromadb.PersistentClient(
    path=VS_PATH,
    settings=Settings(anonymized_telemetry=False)
)
collection = client.get_or_create_collection(name=COLL_NAME)
print(f"Vector store connected! Current embeddings: {collection.count()}\n")


Project root: /Users/brunamedeiros/Documents/GitHub/agentic-voice-assistant
Data path: /Users/brunamedeiros/Documents/GitHub/agentic-voice-assistant/data/homes_preprocessed_data.csv
Using open alternative: BAAI/bge-large-en-v1.5
Using device: cpu
Loaded 708 products from /Users/brunamedeiros/Documents/GitHub/agentic-voice-assistant/data/homes_preprocessed_data.csv
Found 'embedding_text' column with 708 non-null values

Vector store connected! Current embeddings: 0



In [6]:
# ----------- Load Embedding Model -----------
print(f"\nLoading model: {EMBEDDING_MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, device_map=device)
embedding_model.eval()  # Set to evaluation mode
print(f"✓ Model loaded on {device}")

# Get actual max sequence length from model config
if hasattr(embedding_model, 'config'):
    actual_max_length = getattr(embedding_model.config, 'max_position_embeddings', MAX_SEQ_LENGTH)
    max_seq_length = min(actual_max_length, MAX_SEQ_LENGTH)
else:
    max_seq_length = MAX_SEQ_LENGTH

print(f"Model max sequence length: {max_seq_length} tokens")
print(f"Your data: avg=265 tokens, max=919 tokens, 95th percentile=546 tokens")

if max_seq_length >= 1024:
    print(f"✓ Model can handle all products without chunking! ({max_seq_length} > 919)")
else:
    print(f"  Some products (919 tokens) may exceed model limit ({max_seq_length}).")
    print("   Use chunking for longer texts.")
print()

# Calculate max tokens from your data
max_tokens_in_data = df['embedding_text'].apply(
    lambda x: len(tokenizer.encode(str(x), add_special_tokens=False)) if pd.notna(x) else 0
).max()

print(f"Max tokens in dataset: {max_tokens_in_data}")
print(f"Model max context: {max_seq_length}")

# Use for chunking
CHUNK_SIZE = max_seq_length  # 512 for BGE
OVERLAP = 50



# ----------- Embedding Functions -----------

def get_text_embedding(text: str) -> List[float]:
    """
    Get embedding for a single text using the loaded model.
    
    Args:
        text: Text to embed
        
    Returns:
        List of normalized embedding values
    """
    if not text or not text.strip():
        # Return zero vector if empty
        return [0.0] * EMBEDDING_DIM
    
    # Tokenize text
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=max_seq_length
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate embeddings
    with torch.no_grad():
        outputs = embedding_model(**inputs)
        
        # For BGE models: use mean pooling of last hidden state
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    
    # Convert to numpy
    import numpy as np
    embedding_array = embeddings.cpu().numpy()
    
    # CRITICAL: Normalize the embedding for cosine similarity
    norm = np.linalg.norm(embedding_array)
    if norm > 0:
        embedding_array = embedding_array / norm
    
    # Convert to list
    embedding_list = embedding_array.tolist()
    
    # Ensure correct dimension
    if len(embedding_list) > EMBEDDING_DIM:
        embedding_list = embedding_list[:EMBEDDING_DIM]
    elif len(embedding_list) < EMBEDDING_DIM:
        embedding_list.extend([0.0] * (EMBEDDING_DIM - len(embedding_list)))
    
    return embedding_list


def create_overlapping_chunks(text: str, max_tokens: int = None, overlap_tokens: int = 50) -> List[str]:
    """
    Split text into overlapping chunks based on token count.
    Uses EmbeddingGemma's tokenizer to ensure chunks fit within token limits.
    
    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk (defaults to model's max_seq_length)
        overlap_tokens: Number of tokens to overlap between chunks
        
    Returns:
        List of text chunks
    """
    if not text or not text.strip():
        return []
    
    if max_tokens is None:
        max_tokens = max_seq_length
    
    # Tokenize the full text (without special tokens for accurate counting)
    tokens = tokenizer.encode(text, add_special_tokens=False)
    
    # If text fits in one chunk, return as-is
    if len(tokens) <= max_tokens:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(tokens):
        # Get chunk of tokens
        end = min(start + max_tokens, len(tokens))
        chunk_tokens = tokens[start:end]
        
        # Decode back to text
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        chunks.append(chunk_text)
        
        # Move start position with overlap
        if end >= len(tokens):
            break
        start = end - overlap_tokens
    
    return chunks


def get_text_embedding_from_chunk(text_chunk: str, product_id: str, chunk_number: int):
    """
    Get xAI/Grok text embedding + metadata for a chunk.
    
    Args:
        text_chunk: Text chunk to embed
        product_id: Product ID
        chunk_number: Chunk number for this product
        
    Returns:
        Tuple of (embedding, metadata) or (None, None) if error
    """
    try:
        if text_chunk and text_chunk.strip():
            embedding = get_text_embedding(text_chunk)
            
            metadata = {
                "product_id": str(product_id),
                "type": "text",
                "chunk": chunk_number
            }
            return embedding, metadata
        return None, None
    except Exception as e:
        print(f"ERROR processing chunk {chunk_number} for {product_id}: {e}")
        return None, None


print("Embedding functions defined!")



Loading model: BAAI/bge-large-en-v1.5


Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors


✓ Model loaded on cpu
Model max sequence length: 512 tokens
Your data: avg=265 tokens, max=919 tokens, 95th percentile=546 tokens
  Some products (919 tokens) may exceed model limit (512).
   Use chunking for longer texts.

Max tokens in dataset: 920
Model max context: 512
Embedding functions defined!


In [7]:
print("="*60)
print("TESTING TEXT EMBEDDING")
print("="*60)

test_df = df.head(3)

embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []
documents_to_store = []


print("-"*60)
print("EMBEDDING")
print("-"*60)
for i in range(len(test_df)):
    row = test_df.iloc[i]
    
    if pd.notna(row['product_id']) and pd.notna(row['embedding_text']):
        product_id = row['product_id']
        full_text = str(row['embedding_text'])
        
        # Get chunks for this product
        chunks = create_overlapping_chunks(full_text)
        
        # Process each chunk
        for chunk_num, text_chunk in enumerate(chunks, 1):
            
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{product_id}_{chunk_num}"

            # Skip if chunk was already embedded
            try:
                existing = collection.get(ids=[unique_chroma_id])
                if existing['ids']: 
                    print(f"Skipping {unique_chroma_id} - already exists")
                    continue
            except:
                pass
            
            # Get embedding and metadata
            result = get_text_embedding_from_chunk(text_chunk, product_id, chunk_num)
            if result is not None:
                embedding, metadata = result
                if embedding is not None:
                    embeddings_to_store.append(embedding)
                    metadatas_to_store.append(metadata)
                    ids_to_store.append(unique_chroma_id)
                    documents_to_store.append(text_chunk)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []
                documents_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text embeddings
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
print(f"Text embeddings: {len(text_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data['metadatas'] if meta['type'] == 'text']
text_product_ids = [meta['product_id'] for meta in text_metadata]
unique_text_product_ids = set(text_product_ids)
print(f"Products with text embeddings: {len(unique_text_product_ids)}")

# Show chunks per product
from collections import Counter
chunks_per_product = Counter(text_product_ids)
print(f"Average chunks per product: {len(text_product_ids) / len(unique_text_product_ids):.1f}")
print(f"Max chunks for one product: {max(chunks_per_product.values())}")

TESTING TEXT EMBEDDING
------------------------------------------------------------
EMBEDDING
------------------------------------------------------------
Stored final batch of 3 text embeddings

------------------------------------------------------------
Checking results
------------------------------------------------------------

Total embeddings: 3
Text embeddings: 3
Products with text embeddings: 3
Average chunks per product: 1.0
Max chunks for one product: 1


In [8]:
# FINAL VERIFICATION CHECK
print("="*60)
print("FINAL VECTOR STORE VERIFICATION")
print("="*60)

# Check collection details
print(f"Collection name: {collection.name}")
print(f"Total embeddings: {collection.count()}")

# Get all data
all_data = collection.get()

print(f"Documents stored: {all_data['documents'] is not None}")
if all_data['documents']:
    print(f"Sample text document: {all_data['documents'][0][:100]}...")

# Count by type
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
print(f"Text embeddings: {len(text_embeddings)}")

# Check they have the same product_ids (products)
text_product_ids = set([meta['product_id'] for meta in all_data['metadatas'] if meta['type'] == 'text'])
print(f"\nProducts with text embeddings: {len(text_product_ids)}")

# Sample IDs to verify format
print(f"Sample text IDs: {text_embeddings[:3] if text_embeddings else 'None'}")

# Quick search test to make sure both work
try:
    query_text = "longboard skateboard"
    prefixed_query = f"Represent this sentence for searching relevant passages: {query_text}"
    query_embedding = get_text_embedding(prefixed_query)

    results = collection.query(
        query_embeddings=[query_embedding.tolist()],  # 512 dims
        n_results=5
    )

    result_types = [meta['type'] for meta in results['metadatas'][0]]
    print(f"\nSearch test - found types: {set(result_types)}")
    print("Text embeddings are searchable!")
except Exception as e:
    print(f"Search test failed: {e}")

print("\n" + "="*60)

FINAL VECTOR STORE VERIFICATION
Collection name: agentic_voice_assistant_vdb
Total embeddings: 3
Documents stored: True
Sample text document: ARTSCAPE Etched Glass 24" x 36" Window Film, 24-by-36-Inch Brand: ARTSCAPE Etched Glass Category: ho...
Text embeddings: 3

Products with text embeddings: 3
Sample text IDs: ['text_cc2083338a16c3fe2f7895289d2e98fe_1', 'text_39f1b8a2129315da0288cd058b6b6086_1', 'text_a11d9462309527143094a0f68bce0a58_1']
Search test failed: 'list' object has no attribute 'tolist'



In [9]:
print("="*60)
print("FULL DATASET TEXT EMBEDDING")
print("="*60)

BATCH_SIZE = 100
embeddings_to_store = []
metadatas_to_store = []
ids_to_store = []
documents_to_store = []


print("-"*60)
print("EMBEDDING")
for i in range(len(df)): 
    #row = test_df.iloc[i]
    row = df.iloc[i]  
    
    if pd.notna(row['product_id']) and pd.notna(row['embedding_text']):
        product_id = row['product_id']
        full_text = str(row['embedding_text'])
        
        # Get chunks for this product
        chunks = create_overlapping_chunks(full_text)
        
        # Process each chunk
        for chunk_num, text_chunk in enumerate(chunks, 1):
            
            # Create unique ChromaDB ID for each chunk
            unique_chroma_id = f"text_{product_id}_{chunk_num}"

            # Skip if chunk was already embedded
            try:
                existing = collection.get(ids=[unique_chroma_id])
                if existing['ids']: 
                    print(f"Skipping {unique_chroma_id} - already exists")
                    continue
            except:
                pass
            
            # Get embedding and metadata
            result = get_text_embedding_from_chunk(text_chunk, product_id, chunk_num)
            if result is not None:
                embedding, metadata = result
                if embedding is not None:
                    embeddings_to_store.append(embedding)
                    metadatas_to_store.append(metadata)
                    ids_to_store.append(unique_chroma_id)
                    documents_to_store.append(text_chunk)
            
            # Store every BATCH_SIZE embeddings
            if len(embeddings_to_store) >= BATCH_SIZE:
                collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
                print(f"Stored batch of {len(embeddings_to_store)} text embeddings")
                # Clear lists for next batch
                embeddings_to_store = []
                metadatas_to_store = []
                ids_to_store = []
                documents_to_store = []

# Store final batch
if embeddings_to_store:
    collection.add(embeddings=embeddings_to_store, metadatas=metadatas_to_store, ids=ids_to_store, documents=documents_to_store)
    print(f"Stored final batch of {len(embeddings_to_store)} text embeddings")

print()
print("-"*60)
print("Checking results")
print("-"*60)
all_data = collection.get()
print(f"\nTotal embeddings: {len(all_data['ids'])}")

# Count text  embeddings
text_embeddings = [id for id in all_data['ids'] if id.startswith('text_')]
print(f"Text embeddings: {len(text_embeddings)}")

# Check text chunks per product
text_metadata = [meta for meta in all_data['metadatas'] if meta['type'] == 'text']
text_product_ids = [meta['product_id'] for meta in text_metadata]
unique_text_product_ids = set(text_product_ids)
print(f"Products with text embeddings: {len(unique_text_product_ids)}")

# Show chunks per product
from collections import Counter
chunks_per_product = Counter(text_product_ids)
print(f"Average chunks per product: {len(text_product_ids) / len(unique_text_product_ids):.1f}")
print(f"Max chunks for one product: {max(chunks_per_product.values())}")

FULL DATASET TEXT EMBEDDING
------------------------------------------------------------
EMBEDDING
Skipping text_cc2083338a16c3fe2f7895289d2e98fe_1 - already exists
Skipping text_39f1b8a2129315da0288cd058b6b6086_1 - already exists
Skipping text_a11d9462309527143094a0f68bce0a58_1 - already exists
Stored batch of 100 text embeddings
Stored batch of 100 text embeddings
Stored batch of 100 text embeddings
Stored batch of 100 text embeddings
Stored batch of 100 text embeddings
Stored batch of 100 text embeddings
Stored batch of 100 text embeddings
Stored final batch of 50 text embeddings

------------------------------------------------------------
Checking results
------------------------------------------------------------

Total embeddings: 753
Text embeddings: 753
Products with text embeddings: 708
Average chunks per product: 1.1
Max chunks for one product: 2


In [10]:
# Verify no duplicate chunk IDs
text_ids = [id for id in all_data['ids'] if id.startswith('text_')]
unique_text_ids = set(text_ids)
print(f"Text embeddings: {len(text_ids)}")
print(f"Unique text IDs: {len(unique_text_ids)}")
print(f"Duplicates: {len(text_ids) - len(unique_text_ids)}")

Text embeddings: 753
Unique text IDs: 753
Duplicates: 0


In [11]:
# Test the embedding function
test_text = "coffee maker"
test_embedding = get_text_embedding(test_text)

print(f"Embedding length: {len(test_embedding)}")
print(f"First 5 values: {test_embedding[:5]}")
print(f"Embedding magnitude: {sum(x**2 for x in test_embedding)**0.5:.4f}")
print(f"Min value: {min(test_embedding):.4f}")
print(f"Max value: {max(test_embedding):.4f}")

Embedding length: 1024
First 5 values: [-0.030896326526999474, -0.00982965249568224, -0.021214906126260757, -0.017253097146749496, -0.0065686460584402084]
Embedding magnitude: 1.0000
Min value: -0.1024
Max value: 0.1886


In [15]:
# ============================================================
# TESTING TEXT EMBEDDING RETRIEVAL
# ============================================================

print("="*60)
print("TESTING TEXT EMBEDDING RETRIEVAL")
print("="*60)

# Test query
test_query = "curtains or drapes"
print(f"Testing retrieval with query: '{test_query}'\n")

# Get embedding for query (with BGE prefix for better results)
prefixed_query = f"Represent this sentence for searching relevant passages: {test_query}"
query_embedding = get_text_embedding(prefixed_query)

# Search in ChromaDB
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=20,  # Get more candidates to filter
    include=["metadatas", "documents", "distances"]
)

# Filter by distance threshold
DISTANCE_THRESHOLD = 0.6  # Start with 1.0, adjust if needed

filtered_results = []
seen_products = set()

for chunk_id, distance, metadata, doc in zip(
    results["ids"][0],
    results["distances"][0],
    results["metadatas"][0],
    results["documents"][0]
):
    # Only keep results under threshold
    if distance > DISTANCE_THRESHOLD:
        continue
    
    product_id = metadata.get('product_id')
    
    # Deduplicate: only show each product once (best chunk)
    if product_id in seen_products:
        continue
    
    seen_products.add(product_id)
    filtered_results.append({
        "id": chunk_id,
        "product_id": product_id,
        "distance": distance,
        "relevance": f"{(1 - distance/2)*100:.1f}%",
        "chunk": metadata.get('chunk', 1),
        "document": doc
    })

print(f"Found {len(filtered_results)} unique products under threshold {DISTANCE_THRESHOLD}\n")

# Print the FILTERED results
if filtered_results:
    print("Top results:")
    for i, result in enumerate(filtered_results[:10], 1):
        print(f"\n{i}. Product ID: {result['product_id'][:20]}...")
        print(f"   Relevance: {result['relevance']}")
        print(f"   Distance: {result['distance']:.4f}")
        print(f"   Chunk: {result['chunk']}")
        print(f"   Preview: {result['document'][:150]}...")
else:
    print("❌ No results found under the threshold!")
    print(f"Try increasing DISTANCE_THRESHOLD (current: {DISTANCE_THRESHOLD})")
    print("\nShowing top 5 results regardless of threshold:")
    for i, (chunk_id, distance, metadata, doc) in enumerate(zip(
        results["ids"][0][:5],
        results["distances"][0][:5],
        results["metadatas"][0][:5],
        results["documents"][0][:5]
    ), 1):
        print(f"\n{i}. Distance: {distance:.4f}")
        print(f"   Preview: {doc[:100]}...")

print("\n" + "="*60)

TESTING TEXT EMBEDDING RETRIEVAL
Testing retrieval with query: 'curtains or drapes'

Found 3 unique products under threshold 0.6

Top results:

1. Product ID: cd23e01d89bafecec05b...
   Relevance: 73.2%
   Distance: 0.5352
   Chunk: 1
   Preview: Pairs to Go Tiago Window Panel Pair, 60x95, Citron Brand: Pairs Category: home & kitchen Category path: Home & Kitchen Home Décor Kids' Room Décor Win...

2. Product ID: 3727a15da01bb547b9ee...
   Relevance: 71.7%
   Distance: 0.5653
   Chunk: 1
   Preview: AmazonBasics Kids Room Darkening Blackout Window Curtain Set with Grommets - 42" x 63", True Red Brand: AmazonBasics Kids Room Darkening Blackout Wind...

3. Product ID: 737fa6ad2a852bebd118...
   Relevance: 71.1%
   Distance: 0.5773
   Chunk: 1
   Preview: AmazonBasics Kids Room Darkening Blackout Window Curtain Set with Grommets - 42" x 84", Navy Galaxy Brand: AmazonBasics Kids Room Darkening Blackout W...



In [None]:
{
  "sku": product_id,
  "title": product_name,
  "price": price,
  "rating": None (none in your dataset),
  "brand": brand,
  "ingredients": ingredients,
  "doc_id": text chunk id,
  'shipping_weight_lbs': shipping_weight_lbs, 
  'model_number': model_number
}


In [None]:
1. collection.name is  agentic_voice_assistant_vdb
2. <yes/no/changes>
3. <groq model name>