In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# dropping stopwords, Lemmatization 
import nltk                              # 
from nltk.corpus import stopwords        # removing stopwords
from nltk.stem import WordNetLemmatizer  # stemmization / lemmetization
from sklearn.pipeline import Pipeline    # 
from nltk.corpus import wordnet          # Enriching the query with also synonyms and semantically related words

nltk.download('stopwords')  # Adding Stopwords to the preprocessing the data phase 
nltk.download('wordnet')    # conceptual relationships between words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
#clone the git repo that contains the data and additional information about the dataset
!git clone https://github.com/wayfair/WANDS.git

In [4]:
# Adding preprocess data phase before running the tf-IDF algorithm
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Tokenize, remove stopwords, and lemmatize
    tokens = text.lower().split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Handling null values for more columns (product names, descriptions, class, hierarchy, features)
def preprocess_dataframe(dataframe):
    for col in dataframe:
        if dataframe[col].dtype == 'object': 
            if dataframe[col].isna().sum() > 0: 
                print('Number of null values in', col, 'is:', dataframe[col].isna().sum())
            dataframe[col] = dataframe[col].fillna('').apply(preprocess_text)
    return dataframe

# Defining functions for product search using Tf-IDF
def calculate_tfidf(dataframe):
    # combined_text = dataframe['product_name'] + ' ' + dataframe['product_description']
    combined_text = dataframe['product_name'] # increasing the weight of product name by adding it twice
    for col in dataframe:
        if dataframe[col].dtype == 'object': combined_text += ' ' + dataframe[col] 

    # vectorizer = TfidfVectorizer()
    vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # adding biagram also
    
    # convert combined_text to list of unicode strings
    tfidf_matrix = vectorizer.fit_transform(combined_text.values.astype('U'))
    return vectorizer, tfidf_matrix

def get_top_products(vectorizer, tfidf_matrix, query, top_n=10):
    query_vector = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    top_product_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return top_product_indices

In [5]:
# Defining functions for evaluating retrieval performance
def map_at_k(true_ids, predicted_ids, k=10):
    #if either list is empty, return 0
    if not len(true_ids) or not len(predicted_ids):
        return 0.0

    score = 0.0
    num_hits = 0.0

    for i, p_id in enumerate(predicted_ids[:k]):
        if p_id in true_ids and p_id not in predicted_ids[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(true_ids), k)

In [6]:
# get search queries
query_df = pd.read_csv(r"WANDS/dataset/query.csv", sep='\t')

In [7]:
query_df.head()

Unnamed: 0,query_id,query,query_class
0,0,salon chair,Massage Chairs
1,1,smart coffee table,Coffee & Cocktail Tables
2,2,dinosaur,Kids Wall Décor
3,3,turquoise pillows,Accent Pillows
4,4,chair and a half recliner,Recliners


In [8]:
# get products
product_df = pd.read_csv("WANDS/dataset/product.csv", sep='\t')

In [9]:
product_df.head()

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0


In [10]:
# get manually labeled groundtruth lables
label_df = pd.read_csv("WANDS/dataset/label.csv", sep='\t')

In [11]:
label_df.head()

Unnamed: 0,id,query_id,product_id,label
0,0,0,25434,Exact
1,1,0,12088,Irrelevant
2,2,0,42931,Exact
3,3,0,2636,Exact
4,4,0,42923,Exact


In [12]:
#group the labels for each query to use when identifying exact matches
grouped_label_df = label_df.groupby('query_id')

In [13]:
# Preprocess of data - finding null values in product names and descriptions
new_df = preprocess_dataframe(product_df)

Number of null values in product_class is: 2852
Number of null values in category hierarchy is: 1556
Number of null values in product_description is: 6008


In [13]:
new_df.head()

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,bed,furniture / bedroom furniture / bed & headboar...,"good , deep sleep quite difficult busy age . f...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,slow cooker,kitchen & tabletop / small kitchen appliance /...,"create delicious slow-cooked meal , tender mea...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,2,all-clad electric 6.5 qt . slow cooker,slow cooker,kitchen & tabletop / small kitchen appliance /...,prepare home-cooked meal schedule essential sl...,feature : keep warm setting|capacityquarts:6.5...,208.0,3.0,181.0
3,3,all-clad professional tool pizza cutter,"slicers, peeler grater",browse brand / all-clad,original stainless tool designed complement ma...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,4,baldwin prestige alcott passage knob round ros...,door knob,home improvement / door & door hardware / door...,hardware rich heritage delivering modern luxur...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0


In [17]:
# Calculate TF-IDF
# vectorizer, tfidf_matrix = calculate_tfidf(product_df)
vectorizer, tfidf_matrix = calculate_tfidf(new_df) # Using new enriched dataframe (having more columns to compare with the query)

In [24]:
# Usinmg WordNet to enrich the query 
def expand_query(query):
    synonyms = set()
    for syn in wordnet.synsets(query):
        for lemma in syn.lemmas():
            if lemma.name() not in synonyms: synonyms.add(lemma.name())
    return query + ' ' + ' '.join(synonyms)

# Sanity check code block to see if the search results are relevant
# Implementing a function to retrieve top K product IDs for a query
def get_top_product_ids_for_query(query):
    top_product_indices = get_top_products(vectorizer, tfidf_matrix, query, top_n=10)
    top_product_ids = new_df.iloc[top_product_indices]['product_id'].tolist()
    return top_product_ids

# Defining the test query
query = "armchair"
query = expand_query(query) # Enriching the query

# Obtain top product IDs
top_product_ids = get_top_product_ids_for_query(query)

print(f"Top products for '{query}':")
for product_id in top_product_ids:
    product = new_df.loc[product_df['product_id'] == product_id]
    print(product_id, product['product_name'].values[0])

Top products for 'armchair armchair':
12756 24.41 '' wide tufted polyester armchair 24.41 '' wide tufted polyester armchair accent chair furniture / living room furniture / chair & seating / accent chair / arm accent chair nothing make contemporary design statement like armchair . armchair design tone allowing dramatic upholstery silhouette retain u shape leisure chair living room . modern style armchair generous padding simple stitch . backheight-seattotopofback:14|levelofassembly : partial assembly|upholsterycolor : gray|armmaterial : fabric|overalldepth-fronttoback:27.56|framematerial : manufactured wood|armtype : flared arms|backfillmaterial : foam|legcolor : black|overallproductweight:59.66|seatwidth-sidetoside:19.69|seatconstruction : web suspension|waterrepellant : resiliency|productcare : clean professional cleaner only|removablecushionlocation : seat|cushionconstruction : foam|minimumdoorwidth-sidetoside:26|dssecondaryproductstyle : mid-century modern|dsprimaryproductstyle : m

In [25]:
# Implementing a function to retrieve exact match product IDs for a query_id
def get_exact_matches_for_query(query_id):
    query_group = grouped_label_df.get_group(query_id)
    exact_matches = query_group.loc[query_group['label'] == 'Exact']['product_id'].values
    return exact_matches

# Applying the function to obtain top product IDs and adding top K product IDs to the dataframe 
# query_df['top_product_ids'] = query_df['query'].apply(get_top_product_ids_for_query)
query_df['expand_query'] = query_df['query'].apply(expand_query) # Enriching the query
query_df['top_product_ids'] = query_df['expand_query'].apply(get_top_product_ids_for_query) # Passing the enriched query to the function

# Adding the list of exact match product_IDs from labels_df
query_df['relevant_ids'] = query_df['query_id'].apply(get_exact_matches_for_query)

# Now assigning the map@k score
query_df['map@k'] = query_df.apply(lambda x: map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1)

In [26]:
# Calculating the MAP across the entire query set
query_df.loc[:, 'map@k'].mean()

np.float64(0.3271772670487948)

The average of MAP (Mean of Average Precision) at K ranked list for the k (10) retrieved products is 32%. This means there are 32% similarity between what we have searched (armchair) and descriptions of retrieved products.

The question is if it is different if we use another matching method such as Word2Vec?