In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# dropping stopwords, Lemmatization 
import nltk                              # 
from nltk.corpus import stopwords        # removing stopwords
from nltk.stem import WordNetLemmatizer  # stemmization / lemmetization
from sklearn.pipeline import Pipeline    # 
from nltk.corpus import wordnet          # Enriching the query with also synonyms and semantically related words

nltk.download('stopwords')  # Adding Stopwords to the preprocessing the data phase 
nltk.download('wordnet')    # conceptual relationships between words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
#clone the git repo that contains the data and additional information about the dataset
!git clone https://github.com/wayfair/WANDS.git

In [None]:
# Adding preprocess data phase before running the tf-IDF algorithm
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Tokenize, remove stopwords, and lemmatize
    tokens = text.lower().split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Handling null values for more columns (product names, descriptions, class, hierarchy, features)
def preprocess_dataframe(dataframe):
    for col in dataframe:
        if dataframe[col].dtype == 'object': 
            if dataframe[col].isna().sum() > 0: 
                print('Number of null values in', col, 'is:', dataframe[col].isna().sum())
            dataframe[col] = dataframe[col].fillna('').apply(preprocess_text)
    return dataframe

# Defining functions for product search using Tf-IDF
def calculate_tfidf(dataframe):
    # combined_text = dataframe['product_name'] + ' ' + dataframe['product_description']
    combined_text = dataframe['product_name'].copy() # increasing the weight of product name by adding it twice
    for col in dataframe:
        if dataframe[col].dtype == 'object': combined_text += ' ' + dataframe[col] 

    # vectorizer = TfidfVectorizer()
    vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # adding biagram also; TfidfVectorizer: 1) transform text to numerical, 2) identify important words, 3) filter out common words
    
    # convert combined_text to list of unicode strings
    tfidf_matrix = vectorizer.fit_transform(combined_text.values.astype('U')) # U: Converting to the unicoce string format; fit_transform: converts each doc to a numerical vector
    return vectorizer, tfidf_matrix

def get_top_products(vectorizer, tfidf_matrix, query, top_n=10):
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top_product_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return top_product_indices

In [44]:
# Defining functions for evaluating retrieval performance
def map_at_k(true_ids, predicted_ids, k=10):
    #if either list is empty, return 0
    if not len(true_ids) or not len(predicted_ids):
        return 0.0

    score = 0.0
    num_hits = 0.0

    for i, p_id in enumerate(predicted_ids[:k]):
        if p_id in true_ids and p_id not in predicted_ids[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(true_ids), k)

In [13]:
# get search queries
query_df = pd.read_csv(r"WANDS/dataset/query.csv", sep='\t')

In [65]:
query_df.head()

Unnamed: 0,query_id,query,query_class
0,0,salon chair,Massage Chairs
1,1,smart coffee table,Coffee & Cocktail Tables
2,2,dinosaur,Kids Wall Décor
3,3,turquoise pillows,Accent Pillows
4,4,chair and a half recliner,Recliners


In [10]:
# get products
product_df = pd.read_csv("WANDS/dataset/product.csv", sep='\t')

In [82]:
product_df.head()

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0


In [51]:
# get manually labeled groundtruth lables
label_df = pd.read_csv("WANDS/dataset/label.csv", sep='\t')

In [52]:
label_df.head()

Unnamed: 0,id,query_id,product_id,label
0,0,0,25434,Exact
1,1,0,12088,Irrelevant
2,2,0,42931,Exact
3,3,0,2636,Exact
4,4,0,42923,Exact


In [53]:
#group the labels for each query to use when identifying exact matches
grouped_label_df = label_df.groupby('query_id')

In [71]:
# Preprocess of data - finding null values in product names and descriptions
preprocess_dataframe(product_df)

Number of null values in product_class is: 2852
Number of null values in category hierarchy is: 1556
Number of null values in product_description is: 6008


Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,bed,furniture / bedroom furniture / bed & headboar...,"good , deep sleep quite difficult busy age . f...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,slow cooker,kitchen & tabletop / small kitchen appliance /...,"create delicious slow-cooked meal , tender mea...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,2,all-clad electric 6.5 qt . slow cooker,slow cooker,kitchen & tabletop / small kitchen appliance /...,prepare home-cooked meal schedule essential sl...,feature : keep warm setting|capacityquarts:6.5...,208.0,3.0,181.0
3,3,all-clad professional tool pizza cutter,"slicers, peeler grater",browse brand / all-clad,original stainless tool designed complement ma...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,4,baldwin prestige alcott passage knob round ros...,door knob,home improvement / door & door hardware / door...,hardware rich heritage delivering modern luxur...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0
...,...,...,...,...,...,...,...,...,...
42989,42989,malibu pressure balanced diverter fixed shower...,shower panel,home improvement / bathroom remodel & bathroom...,malibu pressure balanced diverter fixed shower...,producttype : shower panel|spraypattern : rain...,3.0,4.5,2.0
42990,42990,emmeline 5 piece breakfast dining set,dining table set,furniture / kitchen & dining furniture / dinin...,,basematerialdetails : steel| : gray wood|ofhar...,1314.0,4.5,864.0
42991,42991,maloney 3 piece pub table set,dining table set,furniture / kitchen & dining furniture / dinin...,pub table set includes 1 counter height table ...,additionaltoolsrequirednotincluded : power dri...,49.0,4.0,41.0
42992,42992,fletcher 27.5 '' wide polyester armchair,teen lounge furniture|accent chair,furniture / living room furniture / chair & se...,"bring iconic , modern style space cinch armcha...",legmaterialdetails : rubberwood|backheight-sea...,1746.0,4.5,1226.0


In [23]:
# Calculate TF-IDF
vectorizer, tfidf_matrix = calculate_tfidf(product_df) # Using new enriched dataframe (having more columns to compare with the query)

In [36]:
# Usinmg WordNet to enrich the query 
def expand_query(query):
    synonyms = set()
    for syn in wordnet.synsets(query):
        for lemma in syn.lemmas():
            if lemma.name() not in synonyms: synonyms.add(lemma.name())
    return query + ' ' + ' '.join(synonyms)

# Sanity check code block to see if the search results are relevant
# Implementing a function to retrieve top K product IDs for a query
def get_top_product_ids_for_query(query):
    top_product_indices = get_top_products(vectorizer, tfidf_matrix, query, top_n=10)
    top_product_ids = product_df.iloc[top_product_indices]['product_id'].tolist()
    return top_product_ids

# Defining the test query
query = "armchair"
query = expand_query(query) # Enriching the query

# Obtain top product IDs
top_product_ids = get_top_product_ids_for_query(query)

print(f"Top products for '{query}':")
for product_id in top_product_ids:
    product = product_df.loc[product_df['product_id'] == product_id]
    print(product_id, product['product_name'].values[0])

Top products for 'armchair armchair':
42698 donham armchair
31564 biloxi 34.75 '' wide armchair
29626 armchair
41306 hartsell 33 '' wide armchair
23907 faizah 27.6 '' wide tufted polyester armchair
12756 24.41 '' wide tufted polyester armchair
41270 almaraz 33.7 '' wide leather match armchair
1140 charnley 47 '' wide chenille armchair
29627 bostick 25 '' wide polyester armchair
33678 colby 24 '' wide armchair


In [54]:
# Implementing a function to retrieve exact match product IDs for a query_id
def get_exact_matches_for_query(query_id):
    query_group = grouped_label_df.get_group(query_id)
    exact_matches = query_group.loc[query_group['label'] == 'Exact']['product_id'].values
    return exact_matches

# Applying the function to obtain top product IDs and adding top K product IDs to the dataframe 
# query_df['top_product_ids'] = query_df['query'].apply(get_top_product_ids_for_query)
query_df['expand_query'] = query_df['query'].apply(expand_query) # Enriching the query
query_df['top_product_ids'] = query_df['expand_query'].apply(get_top_product_ids_for_query) # Passing the enriched query to the function

# Adding the list of exact match product_IDs from labels_df
query_df['relevant_ids'] = query_df['query_id'].apply(get_exact_matches_for_query)

# Now assigning the map@k score
query_df['map@k'] = query_df.apply(lambda x: map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1)

In [76]:
# Calculating the MAP across the entire query set
query_df.loc[:, 'map@k'].mean()

np.float64(0.34261142408352235)

First, we compute the similarity (using cosine similarity on TF-IDF vectors) between each search query and all retrieved products.
For each query, we rank the products by similarity and calculate Average Precision based on whether the top results contain the relevant (exact match) products.
Then, we calculate Mean Average Precision (MAP) across all queries — this gives us an overall measure of how well our search engine retrieves relevant results.

In this setup, the MAP@10 score is approximately 34%, which means that on average, relevant products appear relatively high in the top 10 retrieved results.

The next step is to explore whether using more semantic embedding models (e.g., BM25, Word2Vec, Sentence Transformers, and BERT) can improve the MAP score by capturing deeper contextual or semantic similarity between queries and products.

Lets start with BM25!

In [38]:
from rank_bm25 import BM25Okapi                      # loading BM25 package

def calculate_bm25(dataframe):                       # Calculating tokenized corpus (Word Embeddings) using BM25
    combined_text = dataframe['product_name'].copy() # increasing the weight of product name by adding it twice
    for col in dataframe:
        if dataframe[col].dtype == 'object': combined_text += ' ' + dataframe[col] 

    tokenized_corpus = [doc.split() for doc in combined_text.values.astype('U')]
    bm25 = BM25Okapi(tokenized_corpus)
    return bm25, tokenized_corpus

bm25, tokenized_corpus = calculate_bm25(product_df)  # Using new enriched dataframe and also new model

In [39]:
def get_top_products_bm25(query, top_n=10): # Same function with get_top_product_ids_for_query but with using BM25
    query_tokens = query.split()
    scores = bm25.get_scores(query_tokens)
    top_product_indices = np.argsort(scores)[-top_n:][::-1]
    return top_product_indices

top_product_ids = get_top_products_bm25(query) # Getting indices of most similar products for the query using BM25

print(f"Top products for '{query}':")
for product_id in top_product_ids:
    product = product_df.loc[product_df['product_id'] == product_id]
    print(product_id, product['product_name'].values[0])

Top products for 'armchair armchair':
42698 donham armchair
31564 biloxi 34.75 '' wide armchair
41306 hartsell 33 '' wide armchair
41270 almaraz 33.7 '' wide leather match armchair
12756 24.41 '' wide tufted polyester armchair
23907 faizah 27.6 '' wide tufted polyester armchair
29627 bostick 25 '' wide polyester armchair
5989 caloundra 39.38 '' wide genuine leather armchair
1140 charnley 47 '' wide chenille armchair
15440 avoca 6 piece rattan sectional seating group with cushions


In [None]:
query_df['top_product_ids'] = query_df['expand_query'].apply(get_top_products_bm25)                           # Passing the enriched query to the function (BM25)
query_df['map@k'] = query_df.apply(lambda x: map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1) # Now assign the map@k score
query_df.loc[:, 'map@k'].mean()                                                                               # calculate the MAP across the entire query set

np.float64(0.35594770998677255)

The MAP@10 score has improved from 34% (TF-IDF) to 36% using BM25 — a 2% absolute gain.
This suggests that BM25 is more effective than TF-IDF for ranking relevant products based on keyword matching.

Next step: Evaluate semantic models like Word2Vec to capture contextual similarity beyond exact keyword overlap.

In [None]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Load pretrained Word2Vec model (e.g., Google News vectors)
# model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [34]:
# --- Step 1: Build sentence embeddings for all products ---
def calculate_word2vec(dataframe, model):
    combined_text = dataframe['product_name'].copy()
    for col in dataframe:
        if dataframe[col].dtype == 'object':
            combined_text += ' ' + dataframe[col]

    # Preprocess and tokenize each document
    tokenized_corpus = [doc.lower().split() for doc in combined_text.values.astype('U')]

    # Create product embeddings by averaging word vectors
    product_embeddings = []
    for tokens in tokenized_corpus:
        word_vecs = [model[word] for word in tokens if word in model]
        if word_vecs:
            product_embeddings.append(np.mean(word_vecs, axis=0))
        else:
            product_embeddings.append(np.zeros(model.vector_size))
    
    return np.array(product_embeddings)  # shape: (num_products, vector_size)

# --- Step 2: Query matching using cosine similarity ---
def get_top_products_word2vec(query, model, product_embeddings, top_n=10):
    query_tokens = query.lower().split()
    word_vecs = [model[word] for word in query_tokens if word in model]

    if not word_vecs:
        return []  # No valid tokens in query

    query_vec = np.mean(word_vecs, axis=0).reshape(1, -1)
    similarities = cosine_similarity(query_vec, product_embeddings).flatten()
    top_product_indices = similarities.argsort()[-top_n:][::-1]
    return top_product_indices

In [40]:
# Precompute embeddings for products
product_embeddings = calculate_word2vec(product_df, model)

# Enrich the query if needed (e.g., using expand_query)
top_product_indices = get_top_products_word2vec(query, model, product_embeddings, top_n=10)

# Retrieve product IDs
top_product_ids = product_df.iloc[top_product_indices]['product_id'].tolist()

print(f"Top products for '{query}':")
for product_id in top_product_ids:
    product = product_df.loc[product_df['product_id'] == product_id]
    print(product_id, product['product_name'].values[0])                                                                             # calculate the MAP across the entire query set

Top products for 'armchair armchair':
12756 24.41 '' wide tufted polyester armchair
37139 schmucker 39 '' wide tufted armchair
41306 hartsell 33 '' wide armchair
1140 charnley 47 '' wide chenille armchair
1141 colinton 41 '' wide chenille armchair
41270 almaraz 33.7 '' wide leather match armchair
29383 mikayla 27.75 '' wide armchair
543 waytrim indoor chaise lounge sofa , folding lazy sofa floor chair 6-position folding padded , lounger bed with armrests and a pillow chaise couch - gray
39722 alannis 27 '' wide armchair
3879 marrone 23.75 '' wide tufted armchair


In [56]:
query_df['top_product_ids'] = query_df['expand_query'].apply(lambda q: get_top_products_word2vec(q, model, product_embeddings)) # Passing the enriched query to the function (Word2Vec)
query_df['map@k'] = query_df.apply(lambda x: map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1) # Now assign the map@k score
query_df.loc[:, 'map@k'].mean()                                                                               # calculate the MAP across the entire query set

0.18117363637198705

⚠️ **Observation**: The Word2Vec model achieved significantly lower performance compared to BM25.  
This suggests that the pretrained Word2Vec embeddings (e.g., GoogleNews) may not be well-aligned with our domain-specific vocabulary or context.