## Experimenting with different preprocessing

In [182]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import math
import os
import time
import ast
import spacy


In [3]:
df = pd.read_csv("../data/racquets_with_embeddings.csv", index_col = 0)

Rename combined_text and embedded_text to indicate that they were the v1 iterations.

In [10]:
df.rename(columns = {"combined_text":"original_combined_text", "embedded_text":"original_embedded_text"}, inplace = True)

Because I saved the df as a .csv file, the column of vectors got saved a string. I'll need to convert them back to real lists. Code generated with ChatGPT.

In [179]:
# Safely parse stringified lists into real lists
df["original_embedded_text"] = df["original_embedded_text"].apply(ast.literal_eval)

In [11]:
df.columns

Index(['racquet_img', 'racquet_brand', 'racquet_name', 'racquet_rating',
       'racquet_price', 'racquet_desc', 'racquet_swingweight',
       'racquet_composition', 'racquet_power', 'racquet_stroke_style',
       'racquet_swing_speed', 'racquet_colors', 'racquet_grip_type',
       'racquet_head_size_sq_in', 'racquet_length_in', 'racquet_balance_in',
       'racquet_strung_weight_oz', 'racquet_balance_HH_HL',
       'racquet_stiffness', 'racquet_avg_beam_width', 'racquet_mains',
       'racquet_crosses', 'racquet_tension_lower', 'racquet_tension_upper',
       'original_combined_text', 'original_embedded_text'],
      dtype='object')

In [27]:
df.head()

Unnamed: 0,racquet_img,racquet_brand,racquet_name,racquet_rating,racquet_price,racquet_desc,racquet_swingweight,racquet_composition,racquet_power,racquet_stroke_style,...,racquet_strung_weight_oz,racquet_balance_HH_HL,racquet_stiffness,racquet_avg_beam_width,racquet_mains,racquet_crosses,racquet_tension_lower,racquet_tension_upper,original_combined_text,original_embedded_text
0,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive 2025,4.8,289.0,The Pure Drive is popular for a reason. Boast...,317.0,Graphite,Low-Medium,Medium-Full,...,11.2,4.0,69.0,24.0,16.0,19.0,46.0,55.0,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.03395332768559456, -0.024847345426678658, ..."
1,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive 98 2025,4.5,299.0,Originally launched in 2019 under the VS moni...,326.0,Graphite,Low-Medium,Medium-Full,...,11.4,3.0,69.0,21.666666,16.0,20.0,46.0,55.0,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.058281801640987396, -0.01862466149032116, ..."
2,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive 98 2-Pack 2025,5.0,579.0,This product is for 2 Pure Drive 98 racquets....,323.0,Graphite,Low-Medium,Medium-Full,...,11.4,3.0,69.0,21.666666,16.0,20.0,46.0,55.0,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.05299745500087738, -0.026440361514687538, ..."
3,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive Plus 2025,5.0,289.0,Babolat adds another chapter to one of the ga...,325.0,Graphite,Low-Medium,Medium-Full,...,11.2,6.0,69.0,24.0,16.0,19.0,46.0,55.0,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.032470472157001495, -0.010386866517364979,..."
4,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive Team 2025,5.0,269.0,The Pure Drive Team 2025 is defined by its us...,308.0,Graphite,Low-Medium,Medium-Full,...,10.6,5.0,69.0,24.0,16.0,19.0,44.0,53.0,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.04666471481323242, -0.019274618476629257, ..."


In [67]:
model = SentenceTransformer("all-MiniLM-L6-v2")
def embed_column(df:pd.DataFrame, column:str, model:SentenceTransformer = model) -> list[list[float]]:    
    
    # Make sure to convert to str to handle NA values
    texts = df[column].astype(str).tolist()
    
    embeddings = model.encode(texts, show_progress_bar = True)
    
    return embeddings.tolist()


In [76]:
def get_top_k_matches(df:pd.DataFrame, query:str, embedding_column:str, similarity_col_name:str, model:SentenceTransformer = model, k:int = 5) -> pd.DataFrame:
    print(f"Embedding query: {query}")
    query_embedding = model.encode([query], show_progress_bar = True)
    
    print(f"Calculating cosine similarity.")
    racquet_embeddings = np.array(df[embedding_column].tolist())
    similarities = cosine_similarity(query_embedding, racquet_embeddings)[0]
    
    print("Query shape:", query_embedding.shape)
    print("Doc shape:", racquet_embeddings.shape)
    print("Similarities shape:", similarities.shape)
    
    print(f"Sorting results and retrieving top {k} racquets.")
    top_k_indices = similarities.argsort()[::-1][:k]
    top_k_indices = np.array(top_k_indices).flatten()
    
    results = df.iloc[top_k_indices].copy()
    results[similarity_col_name] = similarities[top_k_indices]
    
    return results

## Experiment A: Create a less structured combined text column

In [64]:
def create_naturalized_combined_text(row: pd.Series) -> str:
    def safe(val):
        return "unkown" if pd.isna(val) else str(val.strip())
    combined_text = (
    f"The {safe(row['racquet_name'])} is a {safe(row['racquet_power']).lower()} powered racquet designed for players with "
    f"{safe(row['racquet_stroke_style']).lower()} strokes and {safe(row['racquet_swing_speed']).lower()} swings. "
    f"It features a stiffness rating of {row['racquet_stiffness']} and a {str(row['racquet_composition']).lower()} "
    f"composition. The racquet has a {row['racquet_swingweight']} ounce swing weight, a {row['racquet_head_size_sq_in']} "
    f"square inch head size, a {row['racquet_strung_weight_oz']} ounce strung weight, "
    f"and has a {row['racquet_mains']} by {row['racquet_crosses']} string pattern."
    )
            
    return " ".join(combined_text.split())

In [65]:
df["expA_combined_text"] = df.apply(create_naturalized_combined_text, axis = 1)

In [69]:
df["expA_embeddings"] = embed_column(df = df, column = "expA_combined_text", model = model)

Batches: 100%|██████████| 11/11 [00:05<00:00,  2.10it/s]


In [71]:
df[["expA_combined_text", "expA_embeddings"]]

Unnamed: 0,expA_combined_text,expA_embeddings
0,The Babolat Pure Drive 2025 is a low-medium po...,"[-0.01439664326608181, -0.03063753806054592, -..."
1,The Babolat Pure Drive 98 2025 is a low-medium...,"[-0.01053662784397602, -0.025784382596611977, ..."
2,The Babolat Pure Drive 98 2-Pack 2025 is a low...,"[-0.009660015814006329, -0.019450191408395767,..."
3,The Babolat Pure Drive Plus 2025 is a low-medi...,"[-0.0022637704387307167, -0.04049436002969742,..."
4,The Babolat Pure Drive Team 2025 is a low-medi...,"[-0.018208928406238556, -0.026571277529001236,..."
...,...,...
392,The Solinco Blackout 300 XTD is a low-medium p...,"[-0.07088436186313629, -0.001582358032464981, ..."
393,The Solinco Blackout 300 XTD+ is a low-medium ...,"[-0.07515248656272888, -0.00494010467082262, -..."
394,The Lacoste L23 is a low-medium powered racque...,"[0.04245226830244064, -0.05801337584853172, 0...."
395,The Lacoste L23L is a low-medium powered racqu...,"[0.013153678737580776, -0.06492088735103607, 0..."


In [None]:
query = "I am a high power player with compact strokes. I want an arm-friendly racquet that doesn't require a fast swing."

res = get_top_k_matches(df, query, embedding_column = "expA_embeddings", similarity_col_name = "expA_similarity", k = 5)

res #Still not very good

Embedding query: I am a high power player with compact strokes. I want an arm-friendly racquet that doesn't require a fast swing.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]

Calculating cosine similarity.
Query shape: (1, 384)
Doc shape: (324, 384)
Similarities shape: (324,)
Sorting results and retrieving top 5 racquets.





Unnamed: 0,racquet_img,racquet_brand,racquet_name,racquet_rating,racquet_price,racquet_desc,racquet_swingweight,racquet_composition,racquet_power,racquet_stroke_style,...,racquet_avg_beam_width,racquet_mains,racquet_crosses,racquet_tension_lower,racquet_tension_upper,original_combined_text,original_embedded_text,expA_combined_text,expA_embeddings,expA_similarity
96,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra 100 v4,5.0,239.0,Introducing the Wilson Ultra 100 v4! Like pre...,317.0,Carbon Fiber Graphite,Low-Medium,Medium-Full,...,25.166666,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.0355178564786911, -0.003755951067432761, -...",The Wilson Ultra 100 v4 is a low-medium powere...,"[0.014990380965173244, 0.025851238518953323, -...",0.697956
90,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Pro Staff 97L v14,4.1,269.0,Wilson adds another chapter to Pro Staff 97L ...,313.0,Graphite,Low-Medium,Medium-Full,...,23.5,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Pr...,"[-0.05154640972614288, -0.016933811828494072, ...",The Wilson Pro Staff 97L v14 is a low-medium p...,"[-0.02314380183815956, -0.004212844185531139, ...",0.695873
97,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra 100L v4,5.0,219.0,The Ultra 100L v4 takes the spin and power of...,299.0,Carbon Fiber Graphite,Low-Medium,Medium-Full,...,24.9,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.06917305290699005, -0.0008213474648073316,...",The Wilson Ultra 100L v4 is a low-medium power...,"[0.00945640541613102, 0.026841046288609505, -0...",0.694517
99,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra 108 v4,4.8,239.0,"Introducing the Ultra 108 v4, the most powerf...",311.0,Carbon Fiber Graphite,Medium-High,Compact-Medium,...,26.333334,16.0,18.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.04023877903819084, -0.024468939751386642, ...",The Wilson Ultra 108 v4 is a medium-high power...,"[0.011325204744935036, -0.006268652155995369, ...",0.692605
88,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Pro Staff 97 v14,4.6,289.0,Updated with Wilson's revolutionary layup tec...,325.0,Aramid/Graphite,Low,Full,...,21.5,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Pr...,"[-0.04986654594540596, -0.019648632034659386, ...",The Wilson Pro Staff 97 v14 is a low powered r...,"[-0.022547589614987373, -0.010201423428952694,...",0.691921


In [None]:
df["racquet_crosses"].unique()

array([19., 20., 17., 18., nan, 16.])

## Experiment B: Structure User Input Query

In [167]:
def get_options(df:pd.DataFrame, column:str) -> list[str]:
    return [item for item in df[column].unique().tolist() if pd.notna(item)]

def generate_prompt(column:str):
    return f"Choose a {column.replace('_', ' ')} from the list below."

def get_query_feature(input_prompt:str, options:list[str], no_preference_str:str):
    opt = [x.lower() for x in options]
    
    while True:
        print(f"{input_prompt}")
        
        for index, row in enumerate(options):
            print(f"{index + 1}. {row}")
            
        entry = input("Pick a choice from above or click enter for no preference.")
        
        if not entry:
            return no_preference_str
        
        elif entry.strip().replace(".", "").isdigit():
            return options[int(entry.strip().replace(".", ""))-1]
        
        elif entry.strip().lower() in opt:
            return options[opt.index(entry.strip().lower())]
        
        else:
            print("Invalid entry. Try again.")
 

In [164]:
# Function to generate a dictionary of user inputs mapped to a specific df column
def input_dict_generation(df, columns:list[str], no_preference_str:str):
    features_dict = {}
    for col in columns:
        options_list = get_options(df, col)
        prompt_str = generate_prompt(col)
        val = get_query_feature(input_prompt = prompt_str, options = options_list, no_preference_str = no_preference_str)
        features_dict[col] = val
        time.sleep(0.1)
        os.system("clear")
        
    return features_dict

In [170]:
# Test that it works
features = input_dict_generation(df, ["racquet_power", "racquet_stroke_style", "racquet_swing_speed"], no_preference_str = "any")

Choose a racquet power from the list below.
1. Low-Medium
2. Medium
3. Medium-High
4. Low
5. High
[H[2JChoose a racquet stroke style from the list below.
1. Medium-Full
2. Medium
3. Compact-Medium
4. Full
5. Compact
6. Long
[H[2JChoose a racquet swing speed from the list below.
1. Medium-Fast
2. Medium
3. Slow-Moderate
4. Fast
5. Slow
6. Moderate-Fast
[H[2J

In [171]:
features

{'racquet_power': 'any',
 'racquet_stroke_style': 'Full',
 'racquet_swing_speed': 'Slow-Moderate'}

In [180]:
query = f"Racquet should be {features['racquet_power'].lower()} power and designed for {features['racquet_stroke_style'].lower()} stroke styles and {features['racquet_swing_speed'].lower()} swing speeds."
res = get_top_k_matches(df = df.dropna(), query = query, embedding_column = "original_embedded_text", similarity_col_name = "expB_similarity")

Embedding query: Racquet should be any power and designed for full stroke styles and slow-moderate swing speeds.


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s]

Calculating cosine similarity.
Query shape: (1, 384)
Doc shape: (293, 384)
Similarities shape: (293,)
Sorting results and retrieving top 5 racquets.





In [None]:
res #Best so far

Unnamed: 0,racquet_img,racquet_brand,racquet_name,racquet_rating,racquet_price,racquet_desc,racquet_swingweight,racquet_composition,racquet_power,racquet_stroke_style,...,racquet_avg_beam_width,racquet_mains,racquet_crosses,racquet_tension_lower,racquet_tension_upper,original_combined_text,original_embedded_text,expA_combined_text,expA_embeddings,expB_similarity
97,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra 100L v4,5.0,219.0,The Ultra 100L v4 takes the spin and power of...,299.0,Carbon Fiber Graphite,Low-Medium,Medium-Full,...,24.9,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.06917305290699005, -0.0008213474648073316,...",The Wilson Ultra 100L v4 is a low-medium power...,"[0.00945640541613102, 0.026841046288609505, -0...",0.757694
123,https://img.tennis-warehouse.com/watermark/rs....,Head,Head Speed Team,5.0,274.0,Head makes big changes to the Speed Team for ...,306.0,Auxetic 2/Graphene 360+/Graphite,Medium,Medium,...,24.0,16.0,19.0,48.0,57.0,Racquet Brand: Head\nRacquet Name: Head Speed ...,"[-0.026125581935048103, 0.021720057353377342, ...",The Head Speed Team is a medium powered racque...,"[0.03225333243608475, 0.06821465492248535, -0....",0.756687
74,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra Pro 16x19 v4,4.6,239.0,Introducing the Ultra Pro 16x19 v4! This clas...,317.0,Graphite,Low,Full,...,20.6,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.04590383172035217, -0.014738207682967186, ...",The Wilson Ultra Pro 16x19 v4 is a low powered...,"[0.019274748861789703, 0.02141467109322548, -0...",0.755378
94,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra Pro 16x19 v4,4.6,239.0,Introducing the Ultra Pro 16x19 v4! This clas...,317.0,Graphite,Low,Full,...,20.6,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.04590383172035217, -0.014738207682967186, ...",The Wilson Ultra Pro 16x19 v4 is a low powered...,"[0.019274748861789703, 0.02141467109322548, -0...",0.755378
146,https://img.tennis-warehouse.com/watermark/rs....,Head,Head Radical Team 2025,5.0,274.0,Head adds another chapter to the Radical Team...,303.0,Graphene Inside/Graphite,Low-Medium,Medium-Full,...,23.333334,16.0,19.0,48.0,57.0,Racquet Brand: Head\nRacquet Name: Head Radica...,"[-0.04297531768679619, -0.025608988478779793, ...",The Head Radical Team 2025 is a low-medium pow...,"[-0.027225306257605553, 0.044008370488882065, ...",0.751204


## Hybrid Search

First, extract structured filters and apply them. Then run semantic search on just embeddings of the racquet description.