In [7]:
import pandas as pd
import numpy as np
import lancedb
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle
import time
from scipy.sparse import load_npz
import pyarrow as pa
import tempfile
import shutil

import sys
sys.path.append("../")
from src.evaluation.metrics import calculate_hits_at_k, calculate_mrr, evaluate_search_system

In [2]:
# Read the cleaned baseline table and keep a working copy in `df`, so that
# `df_baseline` stays untouched for the search and evaluation cells.
df_baseline = pd.read_parquet("../data/processed/df_baseline_clean.parquet")
df = df_baseline.copy()

# Quick sanity summary of what was loaded.
unique_queries = pd.unique(df['query'])
column_names = df.columns.tolist()
print(f"Df shape: {df.shape}")
print(f"Unique queries: {len(unique_queries)}")
print(f"Columns: {column_names}")

Df shape: (519, 11)
Unique queries: 53
Columns: ['example_id', 'query', 'query_id', 'product_id', 'split', 'product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color', 'combined_text']


In [3]:
# Pre-computed transformer embeddings for the baseline rows
embeddings_path = "../src/embeddings/baseline/transformer_embeddings.npy"
transformer_embeddings = np.load(embeddings_path)
print(f"\nTransformer embeddings shape: {transformer_embeddings.shape}")

# The model name is stored next to the embeddings as plain text
with open("../src/embeddings/baseline/transformer_model_name.txt", "r") as name_file:
    model_name = name_file.read().strip()
print(f"Model used: {model_name}")

# TF-IDF matrix and fitted vectorizer, used as the keyword baseline.
# NOTE: pickle.load executes code embedded in the file; only load artifacts
# produced by this project.
tfidf_matrix = load_npz("../src/vector_stores/baseline/tfidf_matrix.npz")
with open("../src/vector_stores/baseline/tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


Transformer embeddings shape: (519, 384)
Model used: all-MiniLM-L6-v2
TF-IDF matrix shape: (519, 1000)


In [8]:
# Re-using same evaluation metrics from notebook 03 placed into evaluation/metrics.py
# This is a simple function to print the evaluation results
def print_evaluation_results(results, method_name):
    """Print the HITS@1/5/10 and MRR values of one search method.

    Args:
        results: Mapping with the keys 'hits_at_1', 'hits_at_5',
            'hits_at_10' and 'mrr', each holding a number.
        method_name: Label shown in the header line.

    Returns:
        None. The report is written to stdout, one metric per line with
        three decimals.
    """
    # (line prefix, key in `results`) in display order; prefixes are padded
    # so the values line up in a column.
    rows = (
        ("HITS@1:  ", 'hits_at_1'),
        ("HITS@5:  ", 'hits_at_5'),
        ("HITS@10: ", 'hits_at_10'),
        ("MRR:     ", 'mrr'),
    )
    print(f"\n--- {method_name} Performance ---")
    for prefix, key in rows:
        print(f"{prefix}{results[key]:.3f}")

In [None]:
# Load the transformer model for query embedding.
# `model_name` was read from transformer_model_name.txt in the loading cell;
# presumably it is the model that produced the stored product embeddings, so
# queries and products share one vector space (confirm against the embedding
# notebook).
## Use simple cosine similarity search for now
transformer_model = SentenceTransformer(model_name)
print(f"Loaded transformer model: {model_name}")

def simple_cosine_search(query, df, embeddings, model, top_k=10):
    """Rank the rows of `df` by cosine similarity to `query`.

    Args:
        query: Query text to embed.
        df: DataFrame whose rows correspond positionally to `embeddings`.
        embeddings: 2-D array of pre-computed row embeddings.
        model: Object exposing `encode(list_of_texts)` that returns one
            embedding per text.
        top_k: Maximum number of rows to return. Values <= 0 yield an
            empty result.

    Returns:
        A copy of the best-matching rows of `df`, most similar first, with
        an added 'score' column holding the cosine similarity.
    """
    # Embed the query (encode takes a batch, so wrap it in a list)
    query_embedding = model.encode([query])

    # One similarity per row of `embeddings`
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Reverse the ascending argsort first and slice afterwards. The former
    # `argsort()[-top_k:]` turned top_k=0 into `[-0:]`, which selects every
    # row instead of none. For top_k >= 1 the order is unchanged.
    k = max(int(top_k), 0)
    top_indices = similarities.argsort()[::-1][:k]

    # Return results with scores
    results = df.iloc[top_indices].copy()
    results['score'] = similarities[top_indices]

    return results

Loaded transformer model: all-MiniLM-L6-v2


In [10]:
# Smoke-test the in-memory cosine search on one query and time the call.
# The timing covers the whole function, including embedding the query.
test_query = "coffee maker"
print(f"Testing simple cosine search with query: '{test_query}'")

start_time = time.time()
simple_results = simple_cosine_search(
    test_query,
    df_baseline,
    transformer_embeddings,
    transformer_model,
    top_k=5,
)
simple_search_time = time.time() - start_time

print(f"\nSearch time: {simple_search_time:.4f} seconds")
print("\nTop 5 results:")
# Titles are clipped to 80 characters to keep the listing compact.
for score, title in zip(simple_results['score'], simple_results['product_title']):
    print(f"Score: {score:.3f} | {title[:80]}...")

Testing simple cosine search with query: 'coffee maker'

Search time: 0.1697 seconds

Top 5 results:
Score: 0.602 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.593 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with Glass Cara...
Score: 0.587 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Brews in 30 ...
Score: 0.576 | Hamilton Beach (47950) Coffee Maker with 12 Cup Capacity & Internal Storage Coff...
Score: 0.563 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Coffee Maker...


In [11]:
# Setup LanceDB table with our embeddings and metadata
def create_lancedb_table(df, embeddings, db_path="../src/vector_databases/lancedb"):
    """Create (or replace) the LanceDB "products" table from rows and vectors.

    Each row of `df` is stored together with the embedding at the same
    position in `embeddings`.

    Args:
        df: DataFrame with at least 'product_id', 'query', 'product_title'
            and 'combined_text'. The description, bullet point, brand and
            color columns are optional and default to '' when absent.
        embeddings: Sequence of numpy vectors, one per row of `df`.
        db_path: Directory of the LanceDB database.

    Returns:
        Tuple `(db, table)`: the database connection and the new table.

    Raises:
        ValueError: If `df` is empty or `df` and `embeddings` differ in
            length (the rows and vectors would be misaligned).
    """
    if len(df) != len(embeddings):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)} vectors"
        )
    if len(df) == 0:
        raise ValueError("Cannot create a LanceDB table from an empty dataset")

    # Connect to LanceDB
    db = lancedb.connect(db_path)

    # Prepare data for LanceDB: list of dictionaries with vectors.
    # zip pairs row i with vector i regardless of the DataFrame's index labels.
    optional_columns = (
        'product_description',
        'product_bullet_point',
        'product_brand',
        'product_color',
    )
    data = []
    for vector, (_, row) in zip(embeddings, df.iterrows()):
        record = {
            "vector": vector.tolist(),  # Convert numpy array to list
            "product_id": row['product_id'],
            "query": row['query'],
            "product_title": row['product_title'],
        }
        for column in optional_columns:
            record[column] = row.get(column, '')
        record["combined_text"] = row['combined_text']
        data.append(record)

    # mode="overwrite" replaces an existing table in one step, instead of
    # listing the tables and dropping the old one first.
    table_name = "products"
    table = db.create_table(table_name, data, mode="overwrite")

    print(f"Created LanceDB table with {len(data)} vectors")
    print(f"Vector dimension: {len(embeddings[0])}")
    print(f"Database path: {db_path}")

    return db, table

In [12]:
# Create LanceDB table
# Stores the baseline rows with their pre-computed transformer embeddings at
# the function's default db_path; `lance_table` is the handle that the
# LanceDB search helpers query.
db, lance_table = create_lancedb_table(df_baseline, transformer_embeddings)

Created LanceDB table with 519 vectors
Vector dimension: 384
Database path: ../src/vector_databases/lancedb


In [21]:
# Implement search functions using LanceDB's vector search capabilities.
def lancedb_search(query, df, table, model, top_k=10):
    """Run a cosine-distance vector search for `query` against a LanceDB table.

    Args:
        query: Query text to embed.
        df: Unused; kept so the signature matches the other search functions.
        table: LanceDB table exposing `search(vector)`.
        model: Object exposing `encode(list_of_texts)`.
        top_k: Number of rows requested from LanceDB.

    Returns:
        The DataFrame produced by LanceDB with an added 'score' column equal
        to `1 - _distance`. Rows keep the order in which LanceDB returns them
        (nearest first in the runs recorded in this notebook), so no extra
        sort is applied.
    """
    # encode() works on batches; take the single resulting vector
    query_vector = model.encode([query])[0]

    # Build the query step by step: vector -> metric -> result size
    search = table.search(query_vector)
    search = search.distance_type("cosine")
    search = search.limit(top_k)
    hits = search.to_pandas()

    # Cosine distance -> cosine similarity
    hits['score'] = 1 - hits['_distance']
    return hits

def lancedb_search_with_filter(query, df, table, model, filter_condition=None, top_k=10):
    """Cosine-distance vector search with an optional metadata filter.

    Args:
        query: Query text to embed.
        df: Unused; kept so the signature matches the other search functions.
        table: LanceDB table exposing `search(vector)`.
        model: Object exposing `encode(list_of_texts)`.
        filter_condition: SQL-style predicate over the table's columns, e.g.
            "product_brand = 'Nike'". When None or empty, no filter is
            applied. The caller must escape any quotes inside values.
        top_k: Number of rows requested from LanceDB.

    Returns:
        The DataFrame produced by LanceDB with an added 'score' column equal
        to `1 - _distance`, in the order LanceDB returns the rows.
    """
    # Embed the query
    query_embedding = model.encode([query])[0]
    # Perform vector search with optional filtering
    search_query = table.search(query_embedding).distance_type("cosine").limit(top_k)

    if filter_condition:
        # Request pre-filtering explicitly: the predicate is applied before
        # the nearest-neighbour cut, so up to top_k *matching* rows come back.
        # With post-filtering the top_k nearest rows are filtered afterwards
        # and the result can be short or empty. The default for this flag has
        # differed between LanceDB releases, so do not rely on it.
        search_query = search_query.where(filter_condition, prefilter=True)

    results = search_query.to_pandas()
    # Convert cosine distance to a similarity score
    results['score'] = 1 - results['_distance']
    return results


In [22]:
# Smoke-test the LanceDB search (no metadata filter) on the same query as the
# cosine test, then put the two single-call timings side by side.
print(f"Testing LanceDB search with query: '{test_query}'")

start_time = time.time()
lance_results = lancedb_search(
    test_query,
    df_baseline,
    lance_table,
    transformer_model,
    top_k=5,
)
lance_search_time = time.time() - start_time

print(f"\nLanceDB search time: {lance_search_time:.4f} seconds")
print("\nTop 5 LanceDB results:")
for score, title in zip(lance_results['score'], lance_results['product_title']):
    print(f"Score: {score:.3f} | {title[:80]}...")

# Single-call timings only; both include embedding the query.
print("\nPerformance comparison:")
print(f"Simple cosine: {simple_search_time:.4f}s")
print(f"LanceDB:       {lance_search_time:.4f}s")

Testing LanceDB search with query: 'coffee maker'

LanceDB search time: 0.1741 seconds

Top 5 LanceDB results:
Score: 0.602 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.593 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with Glass Cara...
Score: 0.587 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Brews in 30 ...
Score: 0.576 | Hamilton Beach (47950) Coffee Maker with 12 Cup Capacity & Internal Storage Coff...
Score: 0.563 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Coffee Maker...

Performance comparison:
Simple cosine: 0.1697s
LanceDB:       0.1741s


* Both methods return the same results with comparable latency; simple cosine is marginally faster here:
  * Vector databases provide a performance benefit at scale, but not necessarily for a dataset this small.

In [23]:
# Also look at TF-IDF search for comparison
def tfidf_search(query, df, tfidf_matrix, vectorizer, top_k=10):
    """TF-IDF search - baseline approach from notebook 03.

    Args:
        query: Query text.
        df: DataFrame whose rows correspond positionally to `tfidf_matrix`.
        tfidf_matrix: Matrix of TF-IDF row vectors, one per row of `df`.
        vectorizer: Fitted vectorizer exposing `transform(list_of_texts)`.
        top_k: Maximum number of rows to return. Values <= 0 yield an
            empty result.

    Returns:
        A copy of the best-matching rows of `df`, most similar first, with
        an added 'score' column holding the cosine similarity.
    """
    # Transform query using fitted vectorizer
    query_vector = vectorizer.transform([query])
    # Calculate cosine similarities (one per row of the matrix)
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]
    # Reverse the ascending argsort first and slice afterwards. The former
    # `argsort()[-top_k:]` turned top_k=0 into `[-0:]`, which selects every
    # row instead of none. For top_k >= 1 the order is unchanged.
    k = max(int(top_k), 0)
    top_indices = similarities.argsort()[::-1][:k]
    # Return results with scores
    results = df.iloc[top_indices].copy()
    results['score'] = similarities[top_indices]

    return results

In [24]:
# Smoke-test the TF-IDF baseline on the same query and time the single call.
print(f"Testing TF-IDF search with query: '{test_query}'")

start_time = time.time()
tfidf_results = tfidf_search(
    test_query,
    df_baseline,
    tfidf_matrix,
    tfidf_vectorizer,
    top_k=5,
)
tfidf_search_time = time.time() - start_time

print(f"\nTF-IDF search time: {tfidf_search_time:.4f} seconds")
print("\nTop 5 TF-IDF results:")
for score, title in zip(tfidf_results['score'], tfidf_results['product_title']):
    print(f"Score: {score:.3f} | {title[:80]}...")

Testing TF-IDF search with query: 'coffee maker'

TF-IDF search time: 0.4533 seconds

Top 5 TF-IDF results:
Score: 0.750 | Outon Coffee Maker 10 Cup, Programmable Drip Coffee Maker, Multiple Brew Strengt...
Score: 0.641 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Coffee Maker...
Score: 0.616 | CHULUX Single Cup Coffee Maker Machine,12 Ounce Pod Coffee Brewer,One Touch Func...
Score: 0.589 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Brews in 30 ...
Score: 0.550 | Single Serve K Cup Coffee Maker for K-Cup Pods and Ground Coffee, Compact Design...


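* TF-IDF search took longer in this run, possibly because of the wider (1000-column) matrix; note that this is a single timed call, so one-off overhead may also contribute.
* It also returned different results from simple cosine and LanceDB, although the Elite Gourmet and CHEFMAN products appear in both rankings (at different positions).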

In [27]:
# Score TF-IDF, in-memory cosine and LanceDB with the shared HITS@K / MRR
# evaluation, then tabulate the three side by side.
banner = "=" * 60
print(banner)
print("COMPREHENSIVE SEARCH METHOD EVALUATION")
print(banner)

# 1) Keyword baseline
print("\n1. Evaluating TF-IDF baseline...")
tfidf_metrics = evaluate_search_system(df_baseline, tfidf_search, tfidf_matrix, tfidf_vectorizer)
print_evaluation_results(tfidf_metrics, "TF-IDF Baseline")

# 2) Transformer embeddings, brute-force cosine similarity
print("\n2. Evaluating simple cosine similarity...")
cosine_metrics = evaluate_search_system(df_baseline, simple_cosine_search, transformer_embeddings, transformer_model)
print_evaluation_results(cosine_metrics, "Simple Cosine Similarity")

# 3) Transformer embeddings served from LanceDB
print("\n3. Evaluating LanceDB vector search...")
lance_metrics = evaluate_search_system(df_baseline, lancedb_search, lance_table, transformer_model)
print_evaluation_results(lance_metrics, "LanceDB Vector Search")

print("\n" + banner)
print("SUMMARY COMPARISON")
print(banner)

# One row per method, one column per metric
method_metrics = [
    ('TF-IDF', tfidf_metrics),
    ('Simple Cosine', cosine_metrics),
    ('LanceDB', lance_metrics),
]
metric_columns = [
    ('HITS@1', 'hits_at_1'),
    ('HITS@5', 'hits_at_5'),
    ('HITS@10', 'hits_at_10'),
    ('MRR', 'mrr'),
]
comparison_df = pd.DataFrame({
    'Method': [name for name, _ in method_metrics],
    **{
        column: [metrics[key] for _, metrics in method_metrics]
        for column, key in metric_columns
    },
})

print(comparison_df.round(3))

# idxmax returns the first row holding the maximum, so a tie is reported as
# the method listed first.
best_hits1_row = comparison_df['HITS@1'].idxmax()
best_mrr_row = comparison_df['MRR'].idxmax()
hits1_gap = abs(cosine_metrics['hits_at_1'] - lance_metrics['hits_at_1'])

print("\n----  Key Insights  ----")
print(f"Best HITS@1: {comparison_df.loc[best_hits1_row, 'Method']} ({comparison_df['HITS@1'].max():.3f})")
print(f"Best MRR: {comparison_df.loc[best_mrr_row, 'Method']} ({comparison_df['MRR'].max():.3f})")
print(f"Simple Cosine vs LanceDB accuracy difference: {hits1_gap:.3f}")


COMPREHENSIVE SEARCH METHOD EVALUATION

1. Evaluating TF-IDF baseline...
Evaluating on 53 unique queries...

--- TF-IDF Baseline Performance ---
HITS@1:  0.849
HITS@5:  0.906
HITS@10: 0.943
MRR:     0.869

2. Evaluating simple cosine similarity...
Evaluating on 53 unique queries...

--- Simple Cosine Similarity Performance ---
HITS@1:  0.981
HITS@5:  0.981
HITS@10: 0.981
MRR:     0.981

3. Evaluating LanceDB vector search...
Evaluating on 53 unique queries...

--- LanceDB Vector Search Performance ---
HITS@1:  0.981
HITS@5:  0.981
HITS@10: 0.981
MRR:     0.981

SUMMARY COMPARISON
          Method  HITS@1  HITS@5  HITS@10    MRR
0         TF-IDF   0.849   0.906    0.943  0.869
1  Simple Cosine   0.981   0.981    0.981  0.981
2        LanceDB   0.981   0.981    0.981  0.981

----  Key Insights  ----
Best HITS@1: Simple Cosine (0.981)
Best MRR: Simple Cosine (0.981)
Simple Cosine vs LanceDB accuracy difference: 0.000


* The "Key Insights" code was written assuming a single winner, but simple cosine and LanceDB are tied; `idxmax` reports simple cosine only because it comes first among the tied rows.
* At small scale (519 items), simple cosine and LanceDB appear equivalent in accuracy. The choice between them depends on production requirements:
  * the need for metadata storage and filtering, and scalability

In [28]:
# Show how vector db can combine semantic search with metadata filtering (product brand keyword)
print("METADATA FILTERING EXAMPLE")
print("-" * 40)

query = "wireless headphones"
print(f"Query: '{query}'")

# Regular search
regular_results = lancedb_search(query, df_baseline, lance_table, transformer_model, top_k=5)
print("\nRegular search results:")
for idx, row in regular_results.iterrows():
    brand = row.get('product_brand', 'N/A')
    print(f"Score: {row['score']:.3f} | Brand: {brand} | {row['product_title'][:60]}...")

# Check available brands for filtering (value_counts ignores missing brands)
available_brands = df_baseline['product_brand'].value_counts().head(10)
print(f"\nTop brands in dataset: {list(available_brands.index[:5])}")

# Filtered search (if we have brand data)
if not df_baseline['product_brand'].isna().all():
    popular_brand = available_brands.index[0] if len(available_brands) > 0 else None
    if popular_brand and popular_brand != '':
        print(f"\nFiltered search (brand = '{popular_brand}'):")
        # Double any single quote so that brands such as "Rubie's" produce a
        # valid SQL string literal instead of a broken filter expression.
        escaped_brand = str(popular_brand).replace("'", "''")
        filtered_results = lancedb_search_with_filter(
            query, df_baseline, lance_table, transformer_model,
            filter_condition=f"product_brand = '{escaped_brand}'", top_k=5
        )
        for idx, row in filtered_results.iterrows():
            print(f"Score: {row['score']:.3f} | Brand: {row['product_brand']} | {row['product_title'][:60]}...")
    else:
        print("\nNote: Limited brand metadata available for filtering demo")
else:
    print("\nNote: No brand metadata available for filtering demo")

METADATA FILTERING EXAMPLE
----------------------------------------
Query: 'wireless headphones'

Regular search results:
Score: 0.378 | Brand: Nike | Nike Jacquard Hairbands 6 pack...
Score: 0.324 | Brand: CARESHINE | Full Face Mask with Free Adjustable Headgear Gel Full Face M...
Score: 0.306 | Brand: ZIKU | Compact Wireless Charger. ZIKU 3 in 1 Foldable 15W Wireless ...
Score: 0.295 | Brand: Nike | Nike Dri-Fit Headband Home & Away...
Score: 0.284 | Brand: Nike | Nike Speed Performance Headband(Black/White, Osfm)...

Top brands in dataset: ['Nike', 'DC', 'Apple', "Rubie's", 'Crocs']

Filtered search (brand = 'Nike'):
Score: 0.378 | Brand: Nike | Nike Jacquard Hairbands 6 pack...
Score: 0.295 | Brand: Nike | Nike Dri-Fit Headband Home & Away...
Score: 0.284 | Brand: Nike | Nike Speed Performance Headband(Black/White, Osfm)...
Score: 0.274 | Brand: Nike | Nike Men's Printed Dri-FIT Head Tie (Dark_Grey_Black_White)...
Score: 0.270 | Brand: Nike | Nike Swoosh Sport Headbands 6pk (One Si

In [30]:
# Time both transformer-based search paths over several queries.
# Queries are issued one at a time, and each timing includes embedding the query.
print("BATCH OPERATIONS COMPARISON")
print("-" * 40)

# Query sample: the first 10 distinct queries in the baseline data
test_queries = df_baseline['query'].unique()[:10]
n_queries = len(test_queries)
print(f"Testing batch operations with {n_queries} queries")

# In-memory cosine similarity, one call per query
print("\nSimple cosine similarity (individual queries):")
start_time = time.time()
cosine_batch_results = [
    simple_cosine_search(query, df_baseline, transformer_embeddings, transformer_model, top_k=5)
    for query in test_queries
]
cosine_batch_time = time.time() - start_time
print(f"Time for {n_queries} queries: {cosine_batch_time:.4f} seconds")
cosine_avg_time = cosine_batch_time / n_queries
print(f"Average time per query: {cosine_avg_time:.4f} seconds")

# LanceDB, one call per query
print("\nLanceDB (individual queries):")
start_time = time.time()
lance_batch_results = [
    lancedb_search(query, df_baseline, lance_table, transformer_model, top_k=5)
    for query in test_queries
]
lance_batch_time = time.time() - start_time
print(f"Time for {n_queries} queries: {lance_batch_time:.4f} seconds")
lance_avg_time = lance_batch_time / n_queries
print(f"Average time per query: {lance_avg_time:.4f} seconds")

# Side-by-side totals and per-query averages
print("\nBatch performance comparison:")
print(f"Simple cosine: {cosine_batch_time:.4f}s total ({cosine_avg_time:.4f}s per query)")
print(f"LanceDB:       {lance_batch_time:.4f}s total ({lance_avg_time:.4f}s per query)")

BATCH OPERATIONS COMPARISON
----------------------------------------
Testing batch operations with 10 queries

Simple cosine similarity (individual queries):
Time for 10 queries: 0.1443 seconds
Average time per query: 0.0144 seconds

LanceDB (individual queries):
Time for 10 queries: 0.1717 seconds
Average time per query: 0.0172 seconds

Batch performance comparison:
Simple cosine: 0.1443s total (0.0144s per query)
LanceDB:       0.1717s total (0.0172s per query)


In [32]:
# Repeat the per-query timing comparison with a larger sample of 30 queries.
# Queries are issued one at a time, and each timing includes embedding the query.
print("BATCH OPERATIONS COMPARISON (with more queries - n=30)")
print("-" * 40)

# Query sample: the first 30 distinct queries in the baseline data
test_queries = df_baseline['query'].unique()[:30]
n_queries = len(test_queries)
print(f"Testing batch operations with {n_queries} queries")

# In-memory cosine similarity, one call per query
print("\nSimple cosine similarity (individual queries):")
start_time = time.time()
cosine_batch_results = [
    simple_cosine_search(query, df_baseline, transformer_embeddings, transformer_model, top_k=5)
    for query in test_queries
]
cosine_batch_time = time.time() - start_time
print(f"Time for {n_queries} queries: {cosine_batch_time:.4f} seconds")
cosine_avg_time = cosine_batch_time / n_queries
print(f"Average time per query: {cosine_avg_time:.4f} seconds")

# LanceDB, one call per query
print("\nLanceDB (individual queries):")
start_time = time.time()
lance_batch_results = [
    lancedb_search(query, df_baseline, lance_table, transformer_model, top_k=5)
    for query in test_queries
]
lance_batch_time = time.time() - start_time
print(f"Time for {n_queries} queries: {lance_batch_time:.4f} seconds")
lance_avg_time = lance_batch_time / n_queries
print(f"Average time per query: {lance_avg_time:.4f} seconds")

# Side-by-side totals and per-query averages
print("\nBatch performance comparison:")
print(f"Simple cosine: {cosine_batch_time:.4f}s total ({cosine_avg_time:.4f}s per query)")
print(f"LanceDB:       {lance_batch_time:.4f}s total ({lance_avg_time:.4f}s per query)")

BATCH OPERATIONS COMPARISON (with more queries - n=30)
----------------------------------------
Testing batch operations with 30 queries

Simple cosine similarity (individual queries):
Time for 30 queries: 0.3758 seconds
Average time per query: 0.0125 seconds

LanceDB (individual queries):
Time for 30 queries: 0.4919 seconds
Average time per query: 0.0164 seconds

Batch performance comparison:
Simple cosine: 0.3758s total (0.0125s per query)
LanceDB:       0.4919s total (0.0164s per query)


In [33]:
# Repeat the per-query timing comparison with 50 queries (almost all of them).
# Queries are issued one at a time, and each timing includes embedding the query.
print("BATCH OPERATIONS COMPARISON (with more queries - n=50)")
print("-" * 40)

# Query sample: the first 50 distinct queries in the baseline data
test_queries = df_baseline['query'].unique()[:50]
n_queries = len(test_queries)
print(f"Testing batch operations with {n_queries} queries")

# In-memory cosine similarity, one call per query
print("\nSimple cosine similarity (individual queries):")
start_time = time.time()
cosine_batch_results = [
    simple_cosine_search(query, df_baseline, transformer_embeddings, transformer_model, top_k=5)
    for query in test_queries
]
cosine_batch_time = time.time() - start_time
print(f"Time for {n_queries} queries: {cosine_batch_time:.4f} seconds")
cosine_avg_time = cosine_batch_time / n_queries
print(f"Average time per query: {cosine_avg_time:.4f} seconds")

# LanceDB, one call per query
print("\nLanceDB (individual queries):")
start_time = time.time()
lance_batch_results = [
    lancedb_search(query, df_baseline, lance_table, transformer_model, top_k=5)
    for query in test_queries
]
lance_batch_time = time.time() - start_time
print(f"Time for {n_queries} queries: {lance_batch_time:.4f} seconds")
lance_avg_time = lance_batch_time / n_queries
print(f"Average time per query: {lance_avg_time:.4f} seconds")

# Side-by-side totals and per-query averages
print("\nBatch performance comparison:")
print(f"Simple cosine: {cosine_batch_time:.4f}s total ({cosine_avg_time:.4f}s per query)")
print(f"LanceDB:       {lance_batch_time:.4f}s total ({lance_avg_time:.4f}s per query)")

BATCH OPERATIONS COMPARISON (with more queries - n=50)
----------------------------------------
Testing batch operations with 50 queries

Simple cosine similarity (individual queries):
Time for 50 queries: 0.7599 seconds
Average time per query: 0.0152 seconds

LanceDB (individual queries):
Time for 50 queries: 0.9958 seconds
Average time per query: 0.0199 seconds

Batch performance comparison:
Simple cosine: 0.7599s total (0.0152s per query)
LanceDB:       0.9958s total (0.0199s per query)


* While transformer embeddings dramatically improve search accuracy over TF-IDF (98% vs 85% HITS@1), for such a small dataset, the simple cosine search is faster than vector database operations.
* Next step: scale up the dataset size and repeat the comparison.

In [35]:
# Section banner for the scalability experiment. The larger dataset used in
# this cell is synthetic: replicated baseline rows with noisy embeddings.
print("SCALABILITY SIMULATION")
print("-" * 40)

def create_scaled_dataset(df, embeddings, scale_factor=5, noise_std=0.001, seed=42):
    """Create a larger synthetic dataset by replicating rows and embeddings.

    Every replica is a copy of `df` whose 'product_id' values get the suffix
    `_copy_<i>`, paired with a copy of `embeddings` perturbed by Gaussian
    noise. The inputs are not modified.

    Args:
        df: DataFrame with a string 'product_id' column.
        embeddings: 2-D numpy array with one row per row of `df`.
        scale_factor: Number of replicas to concatenate (must be >= 1).
        noise_std: Standard deviation of the zero-mean noise added to each
            replica's embeddings.
        seed: Seed for the random generator. The fixed default makes the
            simulation reproducible across runs; pass None for fresh noise
            on every call.

    Returns:
        Tuple `(scaled_df, scaled_embeddings)` holding `scale_factor` times
        as many rows as the inputs, in matching order.

    Raises:
        ValueError: If `scale_factor` is smaller than 1.
    """
    if scale_factor < 1:
        raise ValueError("scale_factor must be at least 1")

    # A local generator keeps the noise reproducible without touching
    # numpy's global random state.
    rng = np.random.default_rng(seed)

    scaled_df_list = []
    scaled_embeddings_list = []

    for i in range(scale_factor):
        # Create copy with modified product_ids to avoid duplicates
        df_copy = df.copy()
        df_copy['product_id'] = df_copy['product_id'] + f"_copy_{i}"
        scaled_df_list.append(df_copy)

        # Add small noise to embeddings to simulate variation; each replica
        # draws fresh values from the same generator.
        noise = rng.normal(0, noise_std, embeddings.shape)
        scaled_embeddings_list.append(embeddings + noise)

    scaled_df = pd.concat(scaled_df_list, ignore_index=True)
    scaled_embeddings = np.vstack(scaled_embeddings_list)

    return scaled_df, scaled_embeddings

# Create scaled dataset (5x larger = ~2,500 products)
print("Creating scaled dataset for scalability test...")
scaled_df, scaled_embeddings = create_scaled_dataset(df_baseline, transformer_embeddings, scale_factor=5)
print(f"Scaled dataset size: {scaled_df.shape[0]} products")
print(f"Scaled embeddings shape: {scaled_embeddings.shape}")

# Create scaled LanceDB table (stored in its own database directory so the
# baseline table is left untouched)
print("\nCreating scaled LanceDB table...")
scaled_db, scaled_lance_table = create_lancedb_table(
    scaled_df, scaled_embeddings,
    db_path="../src/vector_databases/lancedb_scaled"
)

# Test performance on scaled data
test_query_scaled = "coffee maker"
print(f"\nTesting search performance on scaled data with query: '{test_query_scaled}'")

# Simple cosine on scaled data
print("\nSimple cosine similarity (scaled):")
start_time = time.time()
scaled_cosine_results = simple_cosine_search(
    test_query_scaled, scaled_df, scaled_embeddings, transformer_model, top_k=10
)
scaled_cosine_time = time.time() - start_time
print(f"Search time: {scaled_cosine_time:.4f} seconds")

# LanceDB on scaled data
print("\nLanceDB (scaled):")
start_time = time.time()
scaled_lance_results = lancedb_search(
    test_query_scaled, scaled_df, scaled_lance_table, transformer_model, top_k=10
)
scaled_lance_time = time.time() - start_time
print(f"Search time: {scaled_lance_time:.4f} seconds")

# Performance scaling analysis
# NOTE(review): simple_search_time and lance_search_time come from the single
# first calls made in the earlier test cells, which appear to include one-off
# warm-up cost (they are roughly 10x the per-query batch averages). Ratios
# below 1.0 therefore do not mean the search got faster with more data.
print(f"\n----  Scalability Analysis  ----")
print(f"Original dataset ({len(df_baseline)} products):")
print(f"  Simple cosine: {simple_search_time:.4f}s")
print(f"  LanceDB:       {lance_search_time:.4f}s")
print(f"\nScaled dataset ({len(scaled_df)} products):")
print(f"  Simple cosine: {scaled_cosine_time:.4f}s (slowdown: {scaled_cosine_time/simple_search_time:.2f}x)")
print(f"  LanceDB:       {scaled_lance_time:.4f}s (slowdown: {scaled_lance_time/lance_search_time:.2f}x)")

if scaled_lance_time > 0:
    scaled_speedup = scaled_cosine_time / scaled_lance_time
    if scaled_speedup >= 1:
        print(f"\nLanceDB advantage on scaled data: {scaled_speedup:.2f}x faster")
    else:
        # Do not report an "advantage" when LanceDB was the slower method.
        print(f"\nLanceDB is slower than simple cosine on scaled data: {scaled_speedup:.2f}x the speed")

SCALABILITY SIMULATION
----------------------------------------
Creating scaled dataset for scalability test...
Scaled dataset size: 2595 products
Scaled embeddings shape: (2595, 384)

Creating scaled LanceDB table...
Created LanceDB table with 2595 vectors
Vector dimension: 384
Database path: ../src/vector_databases/lancedb_scaled

Testing search performance on scaled data with query: 'coffee maker'

Simple cosine similarity (scaled):
Search time: 0.0429 seconds

LanceDB (scaled):
Search time: 0.0311 seconds

----  Scalability Analysis  ----
Original dataset (519 products):
  Simple cosine: 0.1697s
  LanceDB:       0.1741s

Scaled dataset (2595 products):
  Simple cosine: 0.0429s (slowdown: 0.25x)
  LanceDB:       0.0311s (slowdown: 0.18x)

LanceDB advantage on scaled data: 1.38x faster


In [36]:
# Recap: accuracy of the three methods and the single-query timings at both
# dataset sizes (values were computed in the cells above).
print("KEY FINDINGS SUMMARY")

print("\n ACCURACY COMPARISON:")
# Labels are left-aligned to 24 characters so the metric columns line up.
for method_label, metrics in (
    ("TF-IDF Baseline:", tfidf_metrics),
    ("Simple Cosine (Trans):", cosine_metrics),
    ("LanceDB (Trans):", lance_metrics),
):
    print(f"     {method_label:<24}HITS@1: {metrics['hits_at_1']:.3f}, MRR: {metrics['mrr']:.3f}")

print("\n PERFORMANCE CHARACTERISTICS:")
for scale_label, frame, cosine_seconds, lance_seconds in (
    ("Small", df_baseline, simple_search_time, lance_search_time),
    ("Scaled", scaled_df, scaled_cosine_time, scaled_lance_time),
):
    print(f"     {scale_label} dataset ({len(frame)} products):")
    print(f"       - Simple cosine: {cosine_seconds:.4f}s")
    print(f"       - LanceDB:       {lance_seconds:.4f}s")

KEY FINDINGS SUMMARY

 ACCURACY COMPARISON:
     TF-IDF Baseline:        HITS@1: 0.849, MRR: 0.869
     Simple Cosine (Trans):  HITS@1: 0.981, MRR: 0.981
     LanceDB (Trans):        HITS@1: 0.981, MRR: 0.981

 PERFORMANCE CHARACTERISTICS:
     Small dataset (519 products):
       - Simple cosine: 0.1697s
       - LanceDB:       0.1741s
     Scaled dataset (2595 products):
       - Simple cosine: 0.0429s
       - LanceDB:       0.0311s


* Vector databases allow metadata filtering, and LanceDB was faster at the larger scale tested here (1.38x on 2,595 products, based on a single timed query)
* Some potential next steps:
  * try out hybrid search (semantic + keyword)
  * caching for popular queries

In [37]:
# Save evaluation results: accuracy metrics, single-query timings, model
# details and the headline conclusions, bundled in one dictionary.
vector_db_results = {
    'method_comparison': {
        'tfidf_baseline': tfidf_metrics,
        'simple_cosine': cosine_metrics,
        'lancedb': lance_metrics
    },
    'performance_analysis': {
        'small_dataset_size': len(df_baseline),
        'scaled_dataset_size': len(scaled_df),
        'simple_cosine_time_small': simple_search_time,
        'lancedb_time_small': lance_search_time,
        'simple_cosine_time_scaled': scaled_cosine_time,
        'lancedb_time_scaled': scaled_lance_time
    },
    'model_info': {
        'transformer_model': model_name,
        'embedding_dimension': transformer_embeddings.shape[1],
        'evaluation_queries': len(df_baseline['query'].unique())
    },
    'conclusions': {
        # idxmax returns the first row on ties, so a tie is recorded as the
        # method listed first in comparison_df.
        'best_method_hits1': comparison_df.loc[comparison_df['HITS@1'].idxmax(), 'Method'],
        'best_method_mrr': comparison_df.loc[comparison_df['MRR'].idxmax(), 'Method'],
        'recommendation': 'LanceDB recommended for production due to metadata filtering and scalability'
    }
}

# Create the output directory if needed; on a fresh checkout it may not
# exist, and open()/to_csv() would fail with FileNotFoundError.
results_dir = "../src/results/vector_database"
os.makedirs(results_dir, exist_ok=True)

# Save results
with open(f"{results_dir}/vector_database_evaluation.pkl", "wb") as f:
    pickle.dump(vector_db_results, f)

# Save comparison table as CSV
comparison_df.to_csv(f"{results_dir}/method_comparison.csv", index=False)