In [1]:
import pandas as pd
import numpy as np
import json
import lancedb
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle
import time
from scipy.sparse import load_npz
import sys
sys.path.append("../")
from src.evaluation.metrics import calculate_hits_at_k, calculate_mrr, evaluate_search_system
from src.text_processing.cleaning import clean_text_light, clean_text_moderate


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\schel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\schel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load baseline and imputed datasets
df_baseline = pd.read_parquet("../data/processed/df_baseline_clean.parquet")
df_imputed = pd.read_parquet("../data/processed/df_imputed_clean.parquet")

print(f"Baseline dataset: {df_baseline.shape}")
print(f"Imputed dataset: {df_imputed.shape}")

# Load keyword artifacts - don't load raw since has many stopwords
with open("../src/text_processing/artifacts/light_keywords.json", "r") as f:
    light_keywords = json.load(f)

with open("../src/text_processing/artifacts/moderate_keywords.json", "r") as f:
    moderate_keywords = json.load(f)

print(f"\nKeyword artifacts loaded:")
print(f"Light keywords: {len(light_keywords)}")
print(f"Moderate keywords: {len(moderate_keywords)}")

Baseline dataset: (519, 11)
Imputed dataset: (519, 11)

Keyword artifacts loaded:
Light keywords: 150
Moderate keywords: 150


In [8]:
# Evaluate different text combination strategies
def create_combined_text_v1(row):
    """Original strategy: title + description + brand + bullets + color"""
    parts = [
        str(row.get('product_title', '')),
        str(row.get('product_description', '')),
        str(row.get('product_brand', '')),
        str(row.get('product_bullet_point', '')),
        str(row.get('product_color', ''))
    ]
    return ' '.join([p for p in parts if p and p.lower() not in ['nan', 'none', '']])

def create_combined_text_v2(row):
    """Strategy v2: title + brand + color + bullets + description (prioritize metadata)"""
    parts = [
        str(row.get('product_title', '')),
        str(row.get('product_brand', '')),
        str(row.get('product_color', '')),
        str(row.get('product_bullet_point', '')),
        str(row.get('product_description', ''))
    ]
    return ' '.join([p for p in parts if p and p.lower() not in ['nan', 'none', '']])

def create_combined_text_v3(row):
    """Strategy v3: title + bullets + brand + color + description (prioritize bullets)"""
    parts = [
        str(row.get('product_title', '')),
        str(row.get('product_bullet_point', '')),
        str(row.get('product_brand', '')),
        str(row.get('product_color', '')),
        str(row.get('product_description', ''))
    ]
    return ' '.join([p for p in parts if p and p.lower() not in ['nan', 'none', '']])

# Create datasets with different combination strategies
print("Creating datasets with different text combination strategies...")

df_baseline_v1 = df_baseline.copy()
df_baseline_v1['combined_text_v1'] = df_baseline_v1.apply(create_combined_text_v1, axis=1)

df_baseline_v2 = df_baseline.copy()
df_baseline_v2['combined_text_v2'] = df_baseline_v2.apply(create_combined_text_v2, axis=1)

df_baseline_v3 = df_baseline.copy()
df_baseline_v3['combined_text_v3'] = df_baseline_v3.apply(create_combined_text_v3, axis=1)

Creating datasets with different text combination strategies...


In [9]:
# Test to see if field ordering likely matters for embeddings
## See if embeddings are much different between strategies first

# Load a pre-trained model for quick testing
from sentence_transformers import SentenceTransformer
test_model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for each strategy
embeddings_v1 = test_model.encode(df_baseline_v1['combined_text_v1'].tolist())
embeddings_v2 = test_model.encode(df_baseline_v2['combined_text_v2'].tolist())
embeddings_v3 = test_model.encode(df_baseline_v3['combined_text_v3'].tolist())

# Calculate similarity between strategies
from sklearn.metrics.pairwise import cosine_similarity
sim_v1_v2 = np.mean([cosine_similarity([embeddings_v1[i]], [embeddings_v2[i]])[0][0] for i in range(len(df_baseline))])
sim_v1_v3 = np.mean([cosine_similarity([embeddings_v1[i]], [embeddings_v3[i]])[0][0] for i in range(len(df_baseline))])
sim_v2_v3 = np.mean([cosine_similarity([embeddings_v2[i]], [embeddings_v3[i]])[0][0] for i in range(len(df_baseline))])

print(f"\nEmbedding similarities between strategies:")
print(f"V1 vs V2: {sim_v1_v2:.4f}")
print(f"V1 vs V3: {sim_v1_v3:.4f}")
print(f"V2 vs V3: {sim_v2_v3:.4f}")

if min(sim_v1_v2, sim_v1_v3, sim_v2_v3) > 0.98:
    print("\nField ordering has MINIMAL impact on embeddings")
    print("We can proceed with V2 (metadata first) for conceptual clarity.")
    chosen_strategy = "v2"
else:
    print("\nField ordering DOES affect embeddings significantly!")
    print("We should evaluate all strategies systematically.")
    chosen_strategy = "evaluate_all"

print(f"\nChosen strategy: {chosen_strategy}")


Embedding similarities between strategies:
V1 vs V2: 0.9556
V1 vs V3: 0.9548
V2 vs V3: 0.9941

Field ordering DOES affect embeddings significantly!
We should evaluate all strategies systematically.

Chosen strategy: evaluate_all


In [10]:
# Evaluate text combination strategies
## Simple search function for all strategies
def simple_cosine_search(query, df, embeddings, model, top_k=10):
    """Simple cosine similarity search"""
    query_embedding = model.encode([query])[0]
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    results = []
    for idx in top_indices:
        results.append({
            'product_id': df.iloc[idx]['product_id'],
            'product_title': df.iloc[idx]['product_title'],
            'score': similarities[idx]
        })
    
    return pd.DataFrame(results)

print("TEXT COMBINATION STRATEGY EVALUATION")

print("1. Evaluating Strategy V1 (Original Order)...")
v1_metrics = evaluate_search_system(df_baseline_v1, simple_cosine_search, embeddings_v1, test_model)

print("2. Evaluating Strategy V2 (Metadata First)...")
v2_metrics = evaluate_search_system(df_baseline_v2, simple_cosine_search, embeddings_v2, test_model)

print("3. Evaluating Strategy V3 (Bullets First)...")
v3_metrics = evaluate_search_system(df_baseline_v3, simple_cosine_search, embeddings_v3, test_model)

# Compare results
strategy_comparison = pd.DataFrame({
    'Strategy': ['V1 (Original)', 'V2 (Metadata First)', 'V3 (Bullets First)'],
    'HITS@1': [v1_metrics['hits_at_1'], v2_metrics['hits_at_1'], v3_metrics['hits_at_1']],
    'HITS@5': [v1_metrics['hits_at_5'], v2_metrics['hits_at_5'], v3_metrics['hits_at_5']],
    'MRR': [v1_metrics['mrr'], v2_metrics['mrr'], v3_metrics['mrr']]
})

print("\nSTRATEGY COMPARISON")
print("=" * 40)
print(strategy_comparison.round(3))

# Find winner
best_idx = strategy_comparison['HITS@1'].idxmax()
best_strategy = strategy_comparison.loc[best_idx, 'Strategy']
print(f"\nWinner: {best_strategy} (HITS@1: {strategy_comparison.loc[best_idx, 'HITS@1']:.3f})")

TEXT COMBINATION STRATEGY EVALUATION
1. Evaluating Strategy V1 (Original Order)...
Evaluating on 53 unique queries...
2. Evaluating Strategy V2 (Metadata First)...
Evaluating on 53 unique queries...
3. Evaluating Strategy V3 (Bullets First)...
Evaluating on 53 unique queries...

STRATEGY COMPARISON
              Strategy  HITS@1  HITS@5    MRR
0        V1 (Original)   0.962   0.981  0.972
1  V2 (Metadata First)   0.943   0.981  0.962
2   V3 (Bullets First)   0.943   0.981  0.962

Winner: V1 (Original) (HITS@1: 0.962)


* Original text combining in current format performed best with marginally higher HITS@1 and MRR
  * Different imputation strategies could affect this in the future but will proceed with original text combination 

In [11]:
# Save embeddings for potential reuse
np.save("../src/embeddings/combining/strategy_v1_embeddings.npy", embeddings_v1)
np.save("../src/embeddings/combining/strategy_v2_embeddings.npy", embeddings_v2)
np.save("../src/embeddings/combining/strategy_v3_embeddings.npy", embeddings_v3)

# Save model name
with open("../src/embeddings/combining/model_name.txt", "w") as f:
    f.write("all-MiniLM-L6-v2")

In [12]:
# Prefer to build complete vector database from scratch with current v_1 strategy
## rather than using pre-computed embeddings
def create_embeddings_and_vector_db(df, text_column, model_name="all-MiniLM-L6-v2", db_path_suffix=""):
    """
    Complete pipeline: raw text -> embeddings -> vector database
    This demonstrates the full workflow a production system would use
    """
    print(f"Creating embeddings for {len(df)} products using {model_name}...")
    
    # Step 1: Load transformer model
    model = SentenceTransformer(model_name)
    
    # Step 2: Create embeddings from text
    texts = df[text_column].tolist()
    embeddings = model.encode(texts, show_progress_bar=True)
    print(f"Created embeddings shape: {embeddings.shape}")
    
    # Step 3: Create LanceDB vector database
    db_path = f"../src/vector_databases/lancedb_hybrid{db_path_suffix}"
    os.makedirs(db_path, exist_ok=True)
    db = lancedb.connect(db_path)
    
    # Prepare data for LanceDB
    data = []
    for idx, (_, row) in enumerate(df.iterrows()):
        data.append({
            "vector": embeddings[idx].tolist(),
            "product_id": row['product_id'],
            "query": row['query'],
            "product_title": row['product_title'],
            "product_description": row.get('product_description', ''),
            "product_bullet_point": row.get('product_bullet_point', ''),
            "product_brand": row.get('product_brand', ''),
            "product_color": row.get('product_color', ''),
            "combined_text": row[text_column]
        })
    
    # Create table (drop if exists)
    table_name = "products"
    if table_name in db.table_names():
        db.drop_table(table_name)
    
    table = db.create_table(table_name, data)
    
    print(f"Created LanceDB table: {len(data)} vectors, {embeddings.shape[1]} dimensions")
    print(f"Database path: {db_path}")
    
    return model, embeddings, db, table

In [13]:
# Strategy V2 (metadata first) - this addresses our truncation concerns
model_v1, embeddings_v1, db_v1, table_v1 = create_embeddings_and_vector_db(
    df_baseline_v1, 'combined_text_v1', db_path_suffix="_v1"
)

Creating embeddings for 519 products using all-MiniLM-L6-v2...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Created embeddings shape: (519, 384)
Created LanceDB table: 519 vectors, 384 dimensions
Database path: ../src/vector_databases/lancedb_hybrid_v1


In [23]:
# Hybrid search implementation (Keyword + Semantic)
def calculate_keyword_score(query, product_text, keywords_dict):
    """
    Calculate keyword matching score using frequency-weighted word matching
    keywords_dict contains word -> frequency counts
    """
    query_words = [word.lower().strip() for word in query.split()]
    product_text_lower = product_text.lower()
    
    if not query_words:
        return 0.0
    
    total_weight = 0
    matched_weight = 0
    
    for word in query_words:
        if word in keywords_dict:
            # Use inverse frequency as weight (rare words are more important)
            frequency = keywords_dict[word]
            weight = 1.0 / (1.0 + frequency)  # Inverse frequency weighting
            total_weight += weight
            
            # Check if word appears in product text
            if word in product_text_lower:
                matched_weight += weight
    
    if total_weight == 0:
        return 0.0
    
    # Return weighted fraction of matched keywords
    return matched_weight / total_weight

def hybrid_search(query, df, table, model, keywords_dict, alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=10):
    """
    Hybrid search combining semantic similarity and keyword matching
    alpha: weight for semantic score
    beta: weight for keyword score
    """
    # step 1: get semantic search results
    query_embedding = model.encode([query])[0]
    # Using top_k * 2 to allow for reranking - products low in semantic might rank high in hybrid
    semantic_results = table.search(query_embedding).distance_type("cosine").limit(top_k * 2).to_pandas()
    
    # step 2: calc hybrid scores
    hybrid_scores = []
    for idx, row in semantic_results.iterrows():
        semantic_score = 1 - row['_distance']  # Convert distance to similarity
        keyword_score = calculate_keyword_score(query, row['combined_text'], keywords_dict)
        
        # combine scores - only apply keyword boost if above threshold
        if keyword_score >= keyword_threshold:
            hybrid_score = alpha * semantic_score + beta * keyword_score
        else:
            hybrid_score = semantic_score  # Pure semantic if keywords don't meet threshold
        
        hybrid_scores.append({
            'semantic_score': semantic_score,
            'keyword_score': keyword_score,
            'hybrid_score': hybrid_score,
            'product_id': row['product_id'],
            'product_title': row['product_title'],
            'combined_text': row['combined_text']
        })
    
    # step3: sort by hybrid score and return top-k
    hybrid_scores.sort(key=lambda x: x['hybrid_score'], reverse=True)
    top_results = hybrid_scores[:top_k]
    
    # Converting back to df format for eval
    result_df = pd.DataFrame(top_results)
    result_df['score'] = result_df['hybrid_score']
    
    return result_df

def pure_semantic_search(query, df, table, model, top_k=10):
    """Pure semantic search for comparison"""
    query_embedding = model.encode([query])[0]
    results = table.search(query_embedding).distance_type("cosine").limit(top_k).to_pandas()
    results['score'] = 1 - results['_distance']
    return results

In [25]:
# Test hybrid search performance
test_query = "coffee maker"
print(f"Testing hybrid search with query: '{test_query}'")

# Pure semantic results
print("\n----  Pure Semantic Search  -----")
semantic_results = pure_semantic_search(test_query, df_baseline_v1, table_v1, model_v1, top_k=5)
for idx, row in semantic_results.iterrows():
    print(f"Score: {row['score']:.3f} | {row['product_title'][:70]}...")

# Hybrid search results  
print("\n----  Hybrid Search (70/30 semantic/keyword split)  -----")
hybrid_results = hybrid_search(test_query, df_baseline_v1, table_v1, model_v1, light_keywords, 
                               alpha=0.7, beta=0.3, top_k=5)
for idx, row in hybrid_results.iterrows():
    print(f"Hybrid: {row['hybrid_score']:.3f} (S:{row['semantic_score']:.3f}, K:{row['keyword_score']:.3f}) | {row['product_title'][:60]}...")

Testing hybrid search with query: 'coffee maker'

----  Pure Semantic Search  -----
Score: 0.613 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with ...
Score: 0.582 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.567 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Br...
Score: 0.551 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Co...
Score: 0.530 | Single Serve K Cup Coffee Maker for K-Cup Pods and Ground Coffee, Comp...

----  Hybrid Search (70/30 semantic/keyword split)  -----
Hybrid: 0.729 (S:0.613, K:1.000) | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progamm...
Hybrid: 0.707 (S:0.582, K:1.000) | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Hybrid: 0.697 (S:0.567, K:1.000) | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, Inst...
Hybrid: 0.686 (S:0.551, K:1.000) | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve ...
Hybrid: 0.671 (S:0.530, K:1.000) | Single Serve K 

In [None]:
# Evaluate hybrid search approaches
def print_evaluation_results(results, method_name):
    """Pretty print evaluation results"""
    print(f"\n--- {method_name} Performance ---")
    print(f"HITS@1:  {results['hits_at_1']:.3f}")
    print(f"HITS@5:  {results['hits_at_5']:.3f}")
    print(f"HITS@10: {results['hits_at_10']:.3f}")
    print(f"MRR:     {results['mrr']:.3f}")

# Create wrapper functions for evaluation compatibility with V1 strategy
def eval_semantic_search(query, df, *args, top_k=10):
    """Wrapper for semantic search evaluation"""
    table, model = args
    return pure_semantic_search(query, df, table, model, top_k)

def eval_hybrid_search_light(query, df, *args, top_k=10):
    """Wrapper for hybrid search with light keywords"""
    table, model = args
    return hybrid_search(query, df, table, model, light_keywords, alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=top_k)

def eval_hybrid_search_moderate(query, df, *args, top_k=10):
    """Wrapper for hybrid search with moderate keywords"""
    table, model = args
    return hybrid_search(query, df, table, model, moderate_keywords, alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=top_k)

print("=" * 60)
print("HYBRID SEARCH EVALUATION (V1 Strategy)")
print("=" * 60)

# Use the existing V1 LanceDB setup (winner from text combination evaluation)
print("Using V1 strategy (Original Order) - the winning text combination approach")
print(f"Database: {len(df_baseline_v1)} products, {table_v1.count_rows()} vectors")

# Evaluate pure semantic search using V1
print("\n1. Evaluating pure semantic search (V1 strategy)...")
semantic_metrics = evaluate_search_system(
    df_baseline_v1, eval_semantic_search, table_v1, model_v1
)
print_evaluation_results(semantic_metrics, "Pure Semantic Search")

# Evaluate hybrid search with light keywords
print("\n2. Evaluating hybrid search (light keywords)...")
hybrid_light_metrics = evaluate_search_system(
    df_baseline_v1, eval_hybrid_search_light, table_v1, model_v1
)
print_evaluation_results(hybrid_light_metrics, "Hybrid Search (Light Keywords)")

# Evaluate hybrid search with moderate keywords  
print("\n3. Evaluating hybrid search (moderate keywords)...")
hybrid_moderate_metrics = evaluate_search_system(
    df_baseline_v1, eval_hybrid_search_moderate, table_v1, model_v1
)
print_evaluation_results(hybrid_moderate_metrics, "Hybrid Search (Moderate Keywords)")

# Summary comparison
print("\n" + "=" * 60)
print("HYBRID SEARCH COMPARISON RESULTS")
print("=" * 60)

comparison_df = pd.DataFrame({
    'Method': ['Pure Semantic', 'Hybrid (Light)', 'Hybrid (Moderate)'],
    'HITS@1': [semantic_metrics['hits_at_1'], hybrid_light_metrics['hits_at_1'], hybrid_moderate_metrics['hits_at_1']],
    'HITS@5': [semantic_metrics['hits_at_5'], hybrid_light_metrics['hits_at_5'], hybrid_moderate_metrics['hits_at_5']],
    'HITS@10': [semantic_metrics['hits_at_10'], hybrid_light_metrics['hits_at_10'], hybrid_moderate_metrics['hits_at_10']],
    'MRR': [semantic_metrics['mrr'], hybrid_light_metrics['mrr'], hybrid_moderate_metrics['mrr']]
})

print(comparison_df.round(3))

# Performance insights
print("\n--- Key Insights ---")
best_hits1_idx = comparison_df['HITS@1'].idxmax()
best_method = comparison_df.loc[best_hits1_idx, 'Method']
best_score = comparison_df.loc[best_hits1_idx, 'HITS@1']
print(f"Best HITS@1: {best_method} ({best_score:.3f})")

best_mrr_idx = comparison_df['MRR'].idxmax()
best_mrr_method = comparison_df.loc[best_mrr_idx, 'Method'] 
best_mrr_score = comparison_df.loc[best_mrr_idx, 'MRR']
print(f"Best MRR: {best_mrr_method} ({best_mrr_score:.3f})")

# Calculate improvements
light_improvement = hybrid_light_metrics['hits_at_1'] - semantic_metrics['hits_at_1']
moderate_improvement = hybrid_moderate_metrics['hits_at_1'] - semantic_metrics['hits_at_1']
print(f"\nHybrid improvements over pure semantic:")
print(f"Light keywords:    {light_improvement:+.3f} HITS@1")
print(f"Moderate keywords: {moderate_improvement:+.3f} HITS@1")

# Keyword comparison
keyword_diff = hybrid_light_metrics['hits_at_1'] - hybrid_moderate_metrics['hits_at_1']
print(f"Light vs Moderate: {keyword_diff:+.3f} HITS@1 difference")

HYBRID SEARCH EVALUATION (V1 Strategy)
Using V1 strategy (Original Order) - the winning text combination approach
Database: 519 products, 519 vectors

1. Evaluating pure semantic search (V1 strategy)...
Evaluating on 53 unique queries...

--- Pure Semantic Search Performance ---
HITS@1:  0.962
HITS@5:  0.981
HITS@10: 0.981
MRR:     0.972

2. Evaluating hybrid search (light keywords)...
Evaluating on 53 unique queries...

--- Hybrid Search (Light Keywords) Performance ---
HITS@1:  0.981
HITS@5:  0.981
HITS@10: 0.981
MRR:     0.981

3. Evaluating hybrid search (moderate keywords)...
Evaluating on 53 unique queries...

--- Hybrid Search (Moderate Keywords) Performance ---
HITS@1:  0.981
HITS@5:  0.981
HITS@10: 0.981
MRR:     0.981

HYBRID SEARCH COMPARISON RESULTS
              Method  HITS@1  HITS@5  HITS@10    MRR
0      Pure Semantic   0.962   0.981    0.981  0.972
1     Hybrid (Light)   0.981   0.981    0.981  0.981
2  Hybrid (Moderate)   0.981   0.981    0.981  0.981

--- Key Insight

* Hybrid (light_keywords + semantic) approach slightly better than pure semantic search
  * Note moderate was just lemmatizing light keywords

In [27]:
# Prepare imputed dataset with original text combination strategy
print("=" * 60)
print("BASELINE vs IMPUTED DATA COMPARISON (V1 Strategy)")
print("=" * 60)

df_imputed_v1 = df_imputed.copy()
df_imputed_v1['combined_text_v1'] = df_imputed_v1.apply(create_combined_text_v1, axis=1)

print("Creating vector database for imputed dataset using V1 strategy...")
model_imputed, embeddings_imputed, db_imputed, table_imputed = create_embeddings_and_vector_db(
    df_imputed_v1, 'combined_text_v1', db_path_suffix="_imputed_v1"
)

print(f"Imputed database: {len(df_imputed_v1)} products, {table_imputed.count_rows()} vectors")

# Evaluate semantic search on imputed data
print("\n1. Evaluating semantic search on imputed data...")
semantic_imputed_metrics = evaluate_search_system(
    df_imputed_v1, eval_semantic_search, table_imputed, model_imputed
)
print_evaluation_results(semantic_imputed_metrics, "Semantic Search (Imputed Data)")

# Evaluate hybrid search on imputed data  
print("\n2. Evaluating hybrid search on imputed data...")
def eval_hybrid_imputed(query, df, *args, top_k=10):
    table, model = args
    return hybrid_search(query, df, table, model, light_keywords, alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=top_k)

hybrid_imputed_metrics = evaluate_search_system(
    df_imputed_v1, eval_hybrid_imputed, table_imputed, model_imputed
)
print_evaluation_results(hybrid_imputed_metrics, "Hybrid Search (Imputed Data)")

# Compare baseline vs imputed performance
print("\n" + "=" * 60)
print("BASELINE vs IMPUTED PERFORMANCE SUMMARY")
print("=" * 60)

imputed_comparison = pd.DataFrame({
    'Dataset': ['Baseline (V1)', 'Imputed (V1)'],
    'Semantic_HITS@1': [semantic_metrics['hits_at_1'], semantic_imputed_metrics['hits_at_1']],
    'Semantic_MRR': [semantic_metrics['mrr'], semantic_imputed_metrics['mrr']],
    'Hybrid_HITS@1': [hybrid_light_metrics['hits_at_1'], hybrid_imputed_metrics['hits_at_1']],
    'Hybrid_MRR': [hybrid_light_metrics['mrr'], hybrid_imputed_metrics['mrr']]
})

print(imputed_comparison.round(3))

# Calculate and display potential improvements from imputation
semantic_improvement = semantic_imputed_metrics['hits_at_1'] - semantic_metrics['hits_at_1']
hybrid_improvement = hybrid_imputed_metrics['hits_at_1'] - hybrid_light_metrics['hits_at_1']
semantic_mrr_improvement = semantic_imputed_metrics['mrr'] - semantic_metrics['mrr']
hybrid_mrr_improvement = hybrid_imputed_metrics['mrr'] - hybrid_light_metrics['mrr']

print(f"\n--- Imputation Impact ---")
print(f"Semantic search:")
print(f"  HITS@1: {semantic_improvement:+.3f}")
print(f"  MRR:    {semantic_mrr_improvement:+.3f}")
print(f"Hybrid search:")
print(f"  HITS@1: {hybrid_improvement:+.3f}")
print(f"  MRR:    {hybrid_mrr_improvement:+.3f}")

# Determine if imputation helps
if semantic_improvement > 0 or hybrid_improvement > 0:
    print(f"\nImputation IMPROVES search performance")
    best_dataset = "Imputed"
else:
    print(f"\nImputation does not improve search performance")
    best_dataset = "Baseline"

print(f"Recommendation: Use {best_dataset} dataset for production")

BASELINE vs IMPUTED DATA COMPARISON (V1 Strategy)
Creating vector database for imputed dataset using V1 strategy...
Creating embeddings for 519 products using all-MiniLM-L6-v2...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Created embeddings shape: (519, 384)
Created LanceDB table: 519 vectors, 384 dimensions
Database path: ../src/vector_databases/lancedb_hybrid_imputed_v1
Imputed database: 519 products, 519 vectors

1. Evaluating semantic search on imputed data...
Evaluating on 53 unique queries...

--- Semantic Search (Imputed Data) Performance ---
HITS@1:  0.962
HITS@5:  0.981
HITS@10: 0.981
MRR:     0.972

2. Evaluating hybrid search on imputed data...
Evaluating on 53 unique queries...

--- Hybrid Search (Imputed Data) Performance ---
HITS@1:  0.981
HITS@5:  0.981
HITS@10: 0.981
MRR:     0.981

BASELINE vs IMPUTED PERFORMANCE SUMMARY
         Dataset  Semantic_HITS@1  Semantic_MRR  Hybrid_HITS@1  Hybrid_MRR
0  Baseline (V1)            0.962         0.972          0.981       0.981
1   Imputed (V1)            0.962         0.972          0.981       0.981

--- Imputation Impact ---
Semantic search:
  HITS@1: +0.000
  MRR:    +0.000
Hybrid search:
  HITS@1: +0.000
  MRR:    +0.000

Imputation does not

* Continue with baseline data and original combined text utilizing hybrid search

In [30]:
# Evaluate different alpha/beta weight combinations for hybrid search
def evaluate_hybrid_weights(df, table, model, keywords_dict, alpha_values):
    """Test different alpha/beta weight combinations for hybrid search"""
    results = []
    
    for alpha in alpha_values:
        beta = 1 - alpha
        print(f"Testing alpha={alpha:.1f}, beta={beta:.1f}...")
        
        def eval_hybrid_weighted(query, df, *args, top_k=10):
            table, model = args
            return hybrid_search(query, df, table, model, keywords_dict, alpha, beta, keyword_threshold=0.1, top_k=top_k)
        
        metrics = evaluate_search_system(df, eval_hybrid_weighted, table, model, k_values=[1, 5])
        metrics['alpha'] = alpha
        metrics['beta'] = beta
        results.append(metrics)
    
    return pd.DataFrame(results)

In [31]:
# Test different weight combinations
print("Optimizing hybrid search weights...")
alpha_values = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]  # 1.0 = pure semantic

weight_results = evaluate_hybrid_weights(df_baseline_v1, table_v1, model_v1, light_keywords, alpha_values)

print("\n----  Weight Optimization Results  ------")
print(weight_results[['alpha', 'beta', 'hits_at_1', 'hits_at_5', 'mrr']].round(3))

# Find optimal weights
best_idx = weight_results['hits_at_1'].idxmax()
optimal_alpha = weight_results.loc[best_idx, 'alpha']
optimal_beta = weight_results.loc[best_idx, 'beta']
best_hits1 = weight_results.loc[best_idx, 'hits_at_1']

print(f"\nOptimal weights: alpha={optimal_alpha:.1f}, beta={optimal_beta:.1f}")
print(f"Best HITS@1: {best_hits1:.3f}")

# Check if hybrid is better than pure semantic
pure_semantic_hits1 = weight_results[weight_results['alpha'] == 1.0]['hits_at_1'].iloc[0]
improvement = best_hits1 - pure_semantic_hits1
print(f"Improvement over pure semantic: {improvement:+.3f} HITS@1")

Optimizing hybrid search weights...
Testing alpha=0.5, beta=0.5...
Evaluating on 53 unique queries...
Testing alpha=0.6, beta=0.4...
Evaluating on 53 unique queries...
Testing alpha=0.7, beta=0.3...
Evaluating on 53 unique queries...
Testing alpha=0.8, beta=0.2...
Evaluating on 53 unique queries...
Testing alpha=0.9, beta=0.1...
Evaluating on 53 unique queries...
Testing alpha=1.0, beta=0.0...
Evaluating on 53 unique queries...

----  Weight Optimization Results  ------
   alpha  beta  hits_at_1  hits_at_5    mrr
0    0.5   0.5      0.981      0.981  0.981
1    0.6   0.4      0.981      0.981  0.981
2    0.7   0.3      0.981      0.981  0.981
3    0.8   0.2      0.981      0.981  0.981
4    0.9   0.1      0.981      0.981  0.981
5    1.0   0.0      0.962      0.981  0.972

Optimal weights: alpha=0.5, beta=0.5
Best HITS@1: 0.981
Improvement over pure semantic: +0.019 HITS@1


* Not any improvement by varying alpha/beta combos
  * Will keep at 70/30 split for strong semantic foundation with keyword enhancement

In [32]:
# Look into secondary ranking on top of hybrid search
def apply_secondary_ranking(search_results, boost_brands=None, brand_boost=0.1):
    """
    Apply business logic secondary ranking to search results
    To demonstrate practical production considerations beyond pure relevance
    """
    results = search_results.copy()
    
    # Business rule 1: Brand preference boost
    if boost_brands:
        for brand in boost_brands:
            brand_mask = results['product_title'].str.contains(brand, case=False, na=False)
            results.loc[brand_mask, 'score'] += brand_boost
    
    # Business rule 2: Exact title match boost
    query_words = set(results.iloc[0].get('query', '').lower().split()) if len(results) > 0 else set()
    for idx, row in results.iterrows():
        title_words = set(str(row.get('product_title', '')).lower().split())
        overlap = len(query_words.intersection(title_words))
        if overlap >= 2:  # At least 2 words match
            results.loc[idx, 'score'] += 0.05 * overlap
    
    # Re-sort by updated scores
    results = results.sort_values('score', ascending=False).reset_index(drop=True)
    return results

def hybrid_search_with_ranking(query, df, table, model, keywords_dict, 
                               boost_brands=None, alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=10):
    """Hybrid search with secondary ranking"""
    # Get initial hybrid results
    initial_results = hybrid_search(query, df, table, model, keywords_dict, alpha, beta, keyword_threshold=0.1, top_k=top_k * 2)
    # Apply secondary ranking
    ranked_results = apply_secondary_ranking(initial_results, boost_brands)
     # Return top-k after ranking
    return ranked_results.head(top_k)

In [34]:
# Test secondary ranking
print("Testing secondary ranking with brand boost...")

test_query = "coffee maker"
print(f"\nQuery: '{test_query}'")

# Without secondary ranking - just show pure hybrid results
print("\n-----   Without Secondary Ranking   -------")
base_results = hybrid_search(test_query, df_baseline_v1, table_v1, model_v1, light_keywords, 
                             alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=5)
for idx, row in base_results.iterrows():
    print(f"Score: {row['hybrid_score']:.3f} | {row['product_title'][:70]}...")

# With secondary ranking - show Hamilton Beach boost effect
print("\n-------  With Secondary Ranking (Hamilton Beach boost)   ------")
ranked_results = hybrid_search_with_ranking(test_query, df_baseline_v1, table_v1, model_v1, light_keywords,
                    boost_brands=['Hamilton Beach'], alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=5)
for idx, row in ranked_results.iterrows():
    is_hamilton_beach = 'Hamilton Beach' in str(row.get('product_title', ''))
    boost_indicator = " [BOOSTED]" if is_hamilton_beach else ""
    print(f"Score: {row['score']:.3f}{boost_indicator} | {row['product_title'][:70]}...")

Testing secondary ranking with brand boost...

Query: 'coffee maker'

-----   Without Secondary Ranking   -------
Score: 0.729 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with ...
Score: 0.707 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.697 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Br...
Score: 0.686 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Co...
Score: 0.671 | Single Serve K Cup Coffee Maker for K-Cup Pods and Ground Coffee, Comp...

-------  With Secondary Ranking (Hamilton Beach boost)   ------
Score: 0.763 [BOOSTED] | Hamilton Beach (47950) Coffee Maker with 12 Cup Capacity & Internal St...
Score: 0.729 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with ...
Score: 0.707 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.697 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Br...
Score: 0.686 | Elite Gourmet EHC111A Maxi-Matic Person

In [35]:
# Test secondary ranking
print("Testing secondary ranking with brand boost...")

test_query = "coffee maker"
print(f"\nQuery: '{test_query}'")

# Without secondary ranking - just show pure hybrid results
print("\n-----   Without Secondary Ranking   -------")
base_results = hybrid_search(test_query, df_baseline_v1, table_v1, model_v1, light_keywords, 
                             alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=5)
for idx, row in base_results.iterrows():
    print(f"Score: {row['hybrid_score']:.3f} | {row['product_title'][:70]}...")

# With secondary ranking - show CHEFMAN boost effect (able to see it from un-boosted results)
print("\n-------  With Secondary Ranking (CHEFMAN boost)   ------")
ranked_results = hybrid_search_with_ranking(test_query, df_baseline_v1, table_v1, model_v1, light_keywords,
                    boost_brands=['CHEFMAN'], alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=5)
for idx, row in ranked_results.iterrows():
    is_chefman = 'CHEFMAN' in str(row.get('product_title', ''))
    boost_indicator = " [BOOSTED]" if is_chefman else ""
    print(f"Score: {row['score']:.3f}{boost_indicator} | {row['product_title'][:70]}...")

Testing secondary ranking with brand boost...

Query: 'coffee maker'

-----   Without Secondary Ranking   -------
Score: 0.729 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with ...
Score: 0.707 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.697 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Br...
Score: 0.686 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Co...
Score: 0.671 | Single Serve K Cup Coffee Maker for K-Cup Pods and Ground Coffee, Comp...

-------  With Secondary Ranking (CHEFMAN boost)   ------
Score: 0.797 [BOOSTED] | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Br...
Score: 0.729 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with ...
Score: 0.707 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.686 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Co...
Score: 0.671 | Single Serve K Cup Coffee Maker for K-Cup Pods

In [37]:
# Test secondary ranking
print("Testing secondary ranking with query/title overlap boost...")

test_query = "single serve K Cup coffee maker for K-Cup pods"
print(f"\nQuery: '{test_query}'")

# With secondary ranking - show query overlap boost effect (able to see it from un-boosted results)
print("\n-------  With Secondary Ranking (query overlap)   ------")
ranked_results = hybrid_search_with_ranking(test_query, df_baseline_v1, table_v1, model_v1, light_keywords,
                    alpha=0.7, beta=0.3, keyword_threshold=0.1, top_k=5)
for idx, row in ranked_results.iterrows():
    print(f"Score: {row['score']:.3f} | {row['product_title'][:70]}...")

Testing secondary ranking with query/title overlap boost...

Query: 'single serve K Cup coffee maker for K-Cup pods'

-------  With Secondary Ranking (query overlap)   ------
Score: 0.807 | Keurig K-Mini Coffee Maker, Single Serve K-Cup Pod Coffee Brewer, 6 to...
Score: 0.788 | Single Serve K Cup Coffee Maker for K-Cup Pods and Ground Coffee, Comp...
Score: 0.787 | Keurig K-Mini Plus Coffee Maker, Single Serve K-Cup Pod Coffee Brewer,...
Score: 0.785 | Keurig MAIN-85544 Compact Single-Serve K-Cup Pod Coffee Maker, Black, ...
Score: 0.743 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Br...


* Secondary ranking with brand boost or query overlap is powerful to coerce search results to user preference

In [38]:
# Finalize notebook findings and save results
print("\n1. HYBRID SEARCH PERFORMANCE:")
print(f"   Pure Semantic:     HITS@1={semantic_metrics['hits_at_1']:.3f}, MRR={semantic_metrics['mrr']:.3f}")
print(f"   Hybrid (Light):    HITS@1={hybrid_light_metrics['hits_at_1']:.3f}, MRR={hybrid_light_metrics['mrr']:.3f}")
print(f"   Hybrid (Moderate): HITS@1={hybrid_moderate_metrics['hits_at_1']:.3f}, MRR={hybrid_moderate_metrics['mrr']:.3f}")

print("\n2. TEXT COMBINATION STRATEGY:")
print("   Strategy V1 (Original Order - title + description + bullets + brand + color)")
print("   Winner with HITS@1=0.962 vs V2=0.943, V3=0.943")

print("\n3. BASELINE vs IMPUTED DATA:")
print(f"   Semantic improvement: {semantic_improvement:+.3f} HITS@1")
print(f"   Hybrid improvement:   {hybrid_improvement:+.3f} HITS@1")

print("\n4. WEIGHT OPTIMIZATION RESULTS:")
print(f"   All hybrid combinations (α=0.5-0.9): HITS@1=0.981")
print(f"   Pure semantic (α=1.0): HITS@1=0.962") 
print(f"   Conclusion: Any hybrid ratio works equally well")
print(f"   Chosen: α=0.7, β=0.3 (semantic priority + keyword enhancement)")

print("\n5. SECONDARY RANKING:")
print("   - Successfully demonstrated brand boosting (Hamilton Beach, CHEFMAN)")
print("   - Shows business logic can override pure relevance scores")
print("   - Ready for production customization (margins, partnerships, etc.)")

print("\n6. PRODUCTION RECOMMENDATIONS:")
print("   - Use hybrid search: +0.019 HITS@1 improvement (96.2% → 98.1%)")
print("   - Text combination: V1 Original Order (winner vs V2/V3)")
print("   - Dataset: Baseline (imputation showed +0.000 improvement)")
print("   - Weights: α=0.7, β=0.3 (semantic priority, keyword enhancement)")
print("   - Keyword threshold: 0.1 (avoid weak matches)")
print("   - Secondary ranking: Available for business logic customization")


1. HYBRID SEARCH PERFORMANCE:
   Pure Semantic:     HITS@1=0.962, MRR=0.972
   Hybrid (Light):    HITS@1=0.981, MRR=0.981
   Hybrid (Moderate): HITS@1=0.981, MRR=0.981

2. TEXT COMBINATION STRATEGY:
   Strategy V1 (Original Order - title + description + bullets + brand + color)
   Winner with HITS@1=0.962 vs V2=0.943, V3=0.943

3. BASELINE vs IMPUTED DATA:
   Semantic improvement: +0.000 HITS@1
   Hybrid improvement:   +0.000 HITS@1

4. WEIGHT OPTIMIZATION RESULTS:
   All hybrid combinations (α=0.5-0.9): HITS@1=0.981
   Pure semantic (α=1.0): HITS@1=0.962
   Conclusion: Any hybrid ratio works equally well
   Chosen: α=0.7, β=0.3 (semantic priority + keyword enhancement)

5. SECONDARY RANKING:
   - Successfully demonstrated brand boosting (Hamilton Beach, CHEFMAN)
   - Shows business logic can override pure relevance scores
   - Ready for production customization (margins, partnerships, etc.)

6. PRODUCTION RECOMMENDATIONS:
   - Use hybrid search: +0.019 HITS@1 improvement (96.2% → 98.

In [39]:
# Save to results directory
final_results = {
    'hybrid_search_results': {
        'pure_semantic': semantic_metrics,
        'hybrid_light': hybrid_light_metrics,
        'hybrid_moderate': hybrid_moderate_metrics
    },
    'imputation_impact': {
        'semantic_improvement': semantic_improvement,
        'hybrid_improvement': hybrid_improvement
    },
    'weight_optimization': {
        'all_hybrid_performance': 0.981,
        'pure_semantic_performance': 0.962,
        'chosen_alpha': 0.7,
        'chosen_beta': 0.3,
        'reasoning': 'semantic_priority_with_keyword_enhancement'
    },
    'production_config': {
        'text_combination': 'strategy_v1_original_order',
        'embedding_model': 'all-MiniLM-L6-v2',
        'keywords_type': 'light',
        'keyword_threshold': 0.1,
        'use_imputed_data': semantic_improvement > 0,
        'weights': {'alpha': 0.7, 'beta': 0.3},
        'secondary_ranking_available': True
    }
}

os.makedirs("../src/results/hybrid_search", exist_ok=True)
with open("../src/results/hybrid_search/hybrid_search_evaluation.pkl", "wb") as f:
    pickle.dump(final_results, f)