In [9]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import pickle
from scipy.sparse import save_npz

In [2]:
# Read the cleaned baseline dataset (path is relative to the notebook's working directory)
baseline_parquet_path = "../data/processed/df_baseline_clean.parquet"
df_baseline = pd.read_parquet(baseline_parquet_path)

In [4]:
# Baseline Embedding Functions
def create_tfidf_embeddings(df, max_features=1000):
    """Fit a TF-IDF vectorizer on ``df['combined_text']`` (baseline approach).

    Args:
        df: DataFrame with a ``combined_text`` column holding one document per row.
        max_features: Upper bound on the vocabulary size kept by the vectorizer.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the fitted-and-transformed
        document-term matrix and the fitted ``TfidfVectorizer``.
    """
    # Lower-cased unigrams and bigrams with English stop words removed.
    tfidf = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 2),
        max_features=max_features,
    )

    corpus = df['combined_text'].tolist()
    doc_term_matrix = tfidf.fit_transform(corpus)

    print(f"TF-IDF matrix shape: {doc_term_matrix.shape}")
    return doc_term_matrix, tfidf

def create_transformer_embeddings(df, model_name="all-MiniLM-L6-v2"):
    """Encode ``df['combined_text']`` with a sentence-transformers model.

    Args:
        df: DataFrame with a ``combined_text`` column holding one document per row.
        model_name: Name passed to ``SentenceTransformer`` to load the encoder.

    Returns:
        An ``(embeddings, model)`` tuple: the encoded documents and the
        loaded ``SentenceTransformer`` instance.
    """
    print(f"Creating transformer embeddings with {model_name}...")

    encoder = SentenceTransformer(model_name)

    # One embedding per row of df, in row order.
    vectors = encoder.encode(df['combined_text'].tolist(), show_progress_bar=True)

    print(f"Transformer embeddings shape: {vectors.shape}")
    return vectors, encoder

# Build the sparse (TF-IDF) and dense (transformer) baseline representations
tfidf_matrix, tfidf_vectorizer = create_tfidf_embeddings(df=df_baseline)
transformer_embeddings, transformer_model = create_transformer_embeddings(df=df_baseline)

TF-IDF matrix shape: (519, 1000)
Creating transformer embeddings with all-MiniLM-L6-v2...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Transformer embeddings shape: (519, 384)


In [5]:
# Simple search functions for baseline comparison
def tfidf_search(query, df, tfidf_matrix, vectorizer, top_k=10):
    """Rank the rows of ``df`` against ``query`` by TF-IDF cosine similarity.

    Args:
        query: Raw query string; it is vectorised with ``vectorizer.transform``.
        df: DataFrame whose rows are positionally aligned with the rows of
            ``tfidf_matrix`` (row ``i`` of the matrix scores ``df.iloc[i]``).
        tfidf_matrix: Document-term matrix the query is compared against.
        vectorizer: Fitted vectorizer exposing ``transform``.
        top_k: Maximum number of rows to return. Values <= 0 yield an empty
            result.

    Returns:
        A copy of the best-scoring rows of ``df``, best first, with an added
        ``score`` column holding the cosine similarity.
    """
    # Transform query
    query_vector = vectorizer.transform([query])

    # Compute cosine similarities
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Reverse first, then truncate. A ``[-top_k:]`` slice turns into ``[-0:]``
    # for top_k == 0 and would return every row instead of none.
    limit = max(top_k, 0)
    top_indices = similarities.argsort()[::-1][:limit]

    # Return results with scores
    results = df.iloc[top_indices].copy()
    results['score'] = similarities[top_indices]

    return results

def transformer_search(query, df, embeddings, model, top_k=10):
    """Rank the rows of ``df`` against ``query`` by embedding cosine similarity.

    Args:
        query: Raw query string; it is encoded with ``model.encode``.
        df: DataFrame whose rows are positionally aligned with the rows of
            ``embeddings`` (row ``i`` of the embeddings scores ``df.iloc[i]``).
        embeddings: Document embeddings the query is compared against.
        model: Encoder exposing ``encode``.
        top_k: Maximum number of rows to return. Values <= 0 yield an empty
            result.

    Returns:
        A copy of the best-scoring rows of ``df``, best first, with an added
        ``score`` column holding the cosine similarity.
    """
    # Encode query
    query_embedding = model.encode([query])

    # Compute cosine similarities
    similarities = cosine_similarity(query_embedding, embeddings).flatten()

    # Reverse first, then truncate. A ``[-top_k:]`` slice turns into ``[-0:]``
    # for top_k == 0 and would return every row instead of none.
    limit = max(top_k, 0)
    top_indices = similarities.argsort()[::-1][:limit]

    # Return results with scores
    results = df.iloc[top_indices].copy()
    results['score'] = similarities[top_indices]

    return results

In [6]:
# Smoke-test both retrieval baselines on a single sample query
test_query = "coffee maker"
print(f"Testing query: '{test_query}'\n")

print("----  TF-IDF Results  ---")
tfidf_results = tfidf_search(
    test_query, df_baseline, tfidf_matrix, tfidf_vectorizer, top_k=5
)
# Walk the score/title columns in parallel; titles are cut to 80 characters.
for score, title in zip(tfidf_results['score'], tfidf_results['product_title']):
    print(f"Score: {score:.3f} | {title[:80]}...")

print("\n----  Transformer Results  ----")
transformer_results = transformer_search(
    test_query, df_baseline, transformer_embeddings, transformer_model, top_k=5
)
for score, title in zip(transformer_results['score'], transformer_results['product_title']):
    print(f"Score: {score:.3f} | {title[:80]}...")

Testing query: 'coffee maker'

----  TF-IDF Results  ---
Score: 0.750 | Outon Coffee Maker 10 Cup, Programmable Drip Coffee Maker, Multiple Brew Strengt...
Score: 0.641 | Elite Gourmet EHC111A Maxi-Matic Personal 14oz Single-Serve Compact Coffee Maker...
Score: 0.616 | CHULUX Single Cup Coffee Maker Machine,12 Ounce Pod Coffee Brewer,One Touch Func...
Score: 0.589 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Brews in 30 ...
Score: 0.550 | Single Serve K Cup Coffee Maker for K-Cup Pods and Ground Coffee, Compact Design...

----  Transformer Results  ----
Score: 0.602 | Presto 02835 MyJo Single Cup Coffee Maker, Black...
Score: 0.593 | Cuisinart DCC-3200P1 Perfectemp Coffee Maker, 14 Cup Progammable with Glass Cara...
Score: 0.587 | CHEFMAN Single Serve One Cup Coffee Maker, up to 14 Oz, InstaCoffee Brews in 30 ...
Score: 0.576 | Hamilton Beach (47950) Coffee Maker with 12 Cup Capacity & Internal Storage Coff...
Score: 0.563 | Elite Gourmet EHC111A Maxi-Matic Per

In [7]:
# Set up evaluation metrics for search systems
def calculate_hits_at_k(search_results, relevant_product_ids, k):
    """HITS@K for one query: 1 if a relevant product is in the first ``k`` rows, else 0.

    Args:
        search_results: Ranked DataFrame with a ``product_id`` column.
        relevant_product_ids: Product ids considered relevant for the query.
        k: Number of leading result rows to inspect.

    Returns:
        ``1`` when at least one of the first ``k`` product ids is relevant,
        otherwise ``0``.
    """
    retrieved_ids = set(search_results['product_id'].head(k).tolist())
    overlap = retrieved_ids & set(relevant_product_ids)
    return int(bool(overlap))

def calculate_mrr(search_results, relevant_product_ids):
    """Reciprocal rank of the first relevant result for a single query.

    The mean over queries (the actual MRR) is taken by the caller.

    Args:
        search_results: Ranked DataFrame with a ``product_id`` column, best
            result first.
        relevant_product_ids: Iterable of hashable product ids considered
            relevant for the query.

    Returns:
        ``1.0 / rank`` (1-based) of the first relevant product id, or ``0.0``
        when none of the results is relevant.
    """
    # Materialise once: gives constant-time lookups and keeps one-shot
    # iterables (generators, iterators) from being exhausted by the first test.
    relevant = set(relevant_product_ids)

    for rank, product_id in enumerate(search_results['product_id'].tolist(), 1):
        if product_id in relevant:
            return 1.0 / rank
    return 0.0

def evaluate_search_system(df, search_function, *search_args, k_values=(1, 5, 10)):
    """Evaluate a search function using HITS@K and MRR, averaged over queries.

    For every distinct value in ``df['query']`` the relevant products are the
    ``product_id`` values of the rows carrying that query. The search function
    is asked for ``max(k_values)`` results, so the reciprocal rank only sees
    that many results (assuming ``search_function`` honours ``top_k``).

    Args:
        df: DataFrame with ``query`` and ``product_id`` columns; also passed
            to ``search_function`` as the corpus to search.
        search_function: Callable invoked as
            ``search_function(query, df, *search_args, top_k=...)`` returning
            a ranked DataFrame with a ``product_id`` column.
        *search_args: Extra positional arguments forwarded to
            ``search_function``.
        k_values: Cut-offs for HITS@K. Any non-empty iterable of ints.

    Returns:
        Dict mapping ``'hits_at_<k>'`` (one per cut-off) and ``'mrr'`` to the
        mean of the per-query values.

    Raises:
        ValueError: If ``k_values`` is empty.
    """
    # Copy into a list: accepts any iterable and avoids a shared mutable default.
    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one cut-off")
    max_k = max(k_values)  # loop-invariant, so computed once

    unique_queries = df['query'].unique()

    results = {f'hits_at_{k}': [] for k in k_values}
    results['mrr'] = []

    print(f"Evaluating on {len(unique_queries)} unique queries...")

    for query in unique_queries:
        # Find relevant products for this query
        relevant_products = df[df['query'] == query]['product_id'].tolist()

        # Get search results
        search_results = search_function(query, df, *search_args, top_k=max_k)

        # Calculate metrics
        for k in k_values:
            hits = calculate_hits_at_k(search_results, relevant_products, k)
            results[f'hits_at_{k}'].append(hits)

        results['mrr'].append(calculate_mrr(search_results, relevant_products))

    # Calculate averages
    return {metric: np.mean(values) for metric, values in results.items()}

In [8]:
# Evaluate both systems
def _better_system(tfidf_score, transformer_score):
    """Name the system with the higher score, or 'Tie' when neither is higher."""
    if transformer_score > tfidf_score:
        return 'Transformer'
    if tfidf_score > transformer_score:
        return 'TF-IDF'
    # Equal scores must not be reported as a win for either system.
    return 'Tie'


print("----  BASELINE EVALUATION RESULTS  ----")

print("\n--- TF-IDF Performance ---")
tfidf_metrics = evaluate_search_system(
    df_baseline, tfidf_search, tfidf_matrix, tfidf_vectorizer
)
for metric, score in tfidf_metrics.items():
    print(f"{metric.upper()}: {score:.3f}")

print("\n--- Transformer Performance ---")
transformer_metrics = evaluate_search_system(
    df_baseline, transformer_search, transformer_embeddings, transformer_model
)
for metric, score in transformer_metrics.items():
    print(f"{metric.upper()}: {score:.3f}")

print("\n====  SUMMARY  ====")
print(f"Better HITS@1: {_better_system(tfidf_metrics['hits_at_1'], transformer_metrics['hits_at_1'])}")
print(f"Better MRR: {_better_system(tfidf_metrics['mrr'], transformer_metrics['mrr'])}")

----  BASELINE EVALUATION RESULTS  ----

--- TF-IDF Performance ---
Evaluating on 53 unique queries...
HITS_AT_1: 0.849
HITS_AT_5: 0.906
HITS_AT_10: 0.943
MRR: 0.869

--- Transformer Performance ---
Evaluating on 53 unique queries...
HITS_AT_1: 0.981
HITS_AT_5: 0.981
HITS_AT_10: 0.981
MRR: 0.981

====  SUMMARY  ====
Better HITS@1: Transformer
Better MRR: Transformer


In [10]:
# Save baseline artifacts in organized structure

# Create the target directories up front so the writes below do not fail with
# FileNotFoundError when a directory is missing (e.g. on a fresh checkout).
for artifact_dir in (
    "../src/vector_stores/baseline",
    "../src/embeddings/baseline",
    "../src/evaluation/baseline",
):
    os.makedirs(artifact_dir, exist_ok=True)

# Save TF-IDF components in sparse vector store: the fitted vectorizer ...
with open("../src/vector_stores/baseline/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)
# ... and the document-term matrix it produced.
save_npz("../src/vector_stores/baseline/tfidf_matrix.npz", tfidf_matrix)

# Save transformer embeddings
np.save("../src/embeddings/baseline/transformer_embeddings.npy", transformer_embeddings)
# NOTE: the model name is hard-coded here; keep it in sync with the model_name
# used by create_transformer_embeddings (default "all-MiniLM-L6-v2").
with open("../src/embeddings/baseline/transformer_model_name.txt", "w", encoding="utf-8") as f:
    f.write("all-MiniLM-L6-v2")

# Save baseline evaluation metrics
baseline_results = {
    'tfidf_metrics': tfidf_metrics,
    'transformer_metrics': transformer_metrics,
    'dataset_info': {
        # Row count of df_baseline (not a de-duplicated product count).
        'num_products': len(df_baseline),
        'num_unique_queries': len(df_baseline['query'].unique()),
        'approach': 'baseline_no_imputation'
    }
}

with open("../src/evaluation/baseline/baseline_metrics.pkl", "wb") as f:
    pickle.dump(baseline_results, f)
