# Text Embedding Preprocessing

## Imports and data loading

In [1]:
from itertools import zip_longest
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

#HF sentence-transformers resource: https://huggingface.co/sentence-transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../data/interim/racquets_trimmed.csv", index_col = 0)

racquet_brand = [row.split(" ")[0] for row in df["racquet_name"]]

df.insert(1, "racquet_brand", racquet_brand)

df

Unnamed: 0,racquet_img,racquet_brand,racquet_name,racquet_rating,racquet_price,racquet_desc,racquet_swingweight,racquet_composition,racquet_power,racquet_stroke_style,...,racquet_length_in,racquet_balance_in,racquet_strung_weight_oz,racquet_balance_HH_HL,racquet_stiffness,racquet_avg_beam_width,racquet_mains,racquet_crosses,racquet_tension_lower,racquet_tension_upper
0,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive 2025,4.8,289.00,The Pure Drive is popular for a reason. Boast...,317.0,Graphite,Low-Medium,Medium-Full,...,27.0,12.99,11.2,4.0,69.0,24.000000,16.0,19.0,46.0,55.0
1,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive 98 2025,4.5,299.00,Originally launched in 2019 under the VS moni...,326.0,Graphite,Low-Medium,Medium-Full,...,27.0,13.18,11.4,3.0,69.0,21.666666,16.0,20.0,46.0,55.0
2,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive 98 2-Pack 2025,5.0,579.00,This product is for 2 Pure Drive 98 racquets....,323.0,Graphite,Low-Medium,Medium-Full,...,27.0,13.18,11.4,3.0,69.0,21.666666,16.0,20.0,46.0,55.0
3,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive Plus 2025,5.0,289.00,Babolat adds another chapter to one of the ga...,325.0,Graphite,Low-Medium,Medium-Full,...,27.5,13.00,11.2,6.0,69.0,24.000000,16.0,19.0,46.0,55.0
4,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat Pure Drive Team 2025,5.0,269.00,The Pure Drive Team 2025 is defined by its us...,308.0,Graphite,Low-Medium,Medium-Full,...,27.0,12.85,10.6,5.0,69.0,24.000000,16.0,19.0,44.0,53.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,https://img.tennis-warehouse.com/watermark/rs....,Solinco,Solinco Blackout 300 XTD,4.8,229.99,"With the Blackout 300 XTD, Solinco takes the ...",328.0,40T Carbon/Graphite,Low-Medium,Medium-Full,...,27.5,12.80,11.3,8.0,70.0,24.166666,16.0,19.0,50.0,60.0
393,https://img.tennis-warehouse.com/watermark/rs....,Solinco,Solinco Blackout 300 XTD+,5.0,229.99,"With the Blackout 300 XTD+, Solinco gives adv...",333.0,40T Carbon/Graphite,Low-Medium,Medium-Full,...,28.0,12.80,11.3,10.0,66.0,24.166666,16.0,19.0,50.0,60.0
394,https://img.tennis-warehouse.com/watermark/rs....,Lacoste,Lacoste L23,4.5,199.00,Introducing the Lascoste L23! Following on th...,318.0,Graphite,Low-Medium,Medium-Full,...,27.0,12.90,11.1,5.0,69.0,23.666666,16.0,19.0,51.0,55.0
395,https://img.tennis-warehouse.com/watermark/rs....,Lacoste,Lacoste L23L,5.0,199.00,Lacoste makes impressive updates to the L23L ...,310.0,Graphite,Low-Medium,Medium-Full,...,27.0,13.40,10.2,1.0,,23.666666,16.0,19.0,51.0,55.0


## Combining text columns for embedding

In [3]:
object_cols = df.select_dtypes(include = ["object"]).columns.to_list()
numeric_cols = df.select_dtypes(include = [np.number]).columns.to_list()

print(pd.DataFrame(
    list(zip_longest(object_cols, numeric_cols, fillvalue=None)), 
    columns = ["Object Columns", "Numeric Columns"]
    )
      )

          Object Columns           Numeric Columns
0            racquet_img            racquet_rating
1          racquet_brand             racquet_price
2           racquet_name       racquet_swingweight
3           racquet_desc   racquet_head_size_sq_in
4    racquet_composition         racquet_length_in
5          racquet_power        racquet_balance_in
6   racquet_stroke_style  racquet_strung_weight_oz
7    racquet_swing_speed     racquet_balance_HH_HL
8         racquet_colors         racquet_stiffness
9      racquet_grip_type    racquet_avg_beam_width
10                  None             racquet_mains
11                  None           racquet_crosses
12                  None     racquet_tension_lower
13                  None     racquet_tension_upper


In [4]:
df["combined_text"] = ""

### 

In [5]:
def combine_obj_cols(df:pd.DataFrame, object_cols:list[str]) -> pd.Series:
    _df = df.copy()
    _df["combined_col"] = ""
    
    _replacements = str.maketrans({
        "!":".",
        "_":" ",
        "&":"and",
        "²":"",
        "\xa0":"",
        "\n":"",
        "\r":"",
        '"':"",
        '“':"",
        "+":"plus",
        "%":"percent"
    })
    
    _title_dict = {
        "racquet_brand":"Racquet Brand",
        "racquet_name":"Racquet Name",
        "racquet_desc":"Racquet Description",
        "racquet_composition":"Racquet Composition",
        "racquet_power":"Racquet Power Level",
        "racquet_stroke_style":"Racquet Stroke Style",
        "racquet_swing_speed":"Racquet Swing Speed",
        "racquet_colors":"Racquet Colors",
        "racquet_grip_type":"Racquet Grip Type"
    }
    
    for col in object_cols:
        _content = _df[col].str.translate(_replacements)
        _content = _content.replace("in²", "inches squared").replace("  ", " ")
        _df["combined_col"] += _title_dict[col] + ": " +  _content + "\n"
        
    return _df["combined_col"]

In [6]:
df["combined_text"] = combine_obj_cols(df, object_cols[1:len(object_cols)])

In [7]:
df[["racquet_brand","racquet_name", "racquet_desc", "combined_text"]]

Unnamed: 0,racquet_brand,racquet_name,racquet_desc,combined_text
0,Babolat,Babolat Pure Drive 2025,The Pure Drive is popular for a reason. Boast...,Racquet Brand: Babolat\nRacquet Name: Babolat ...
1,Babolat,Babolat Pure Drive 98 2025,Originally launched in 2019 under the VS moni...,Racquet Brand: Babolat\nRacquet Name: Babolat ...
2,Babolat,Babolat Pure Drive 98 2-Pack 2025,This product is for 2 Pure Drive 98 racquets....,Racquet Brand: Babolat\nRacquet Name: Babolat ...
3,Babolat,Babolat Pure Drive Plus 2025,Babolat adds another chapter to one of the ga...,Racquet Brand: Babolat\nRacquet Name: Babolat ...
4,Babolat,Babolat Pure Drive Team 2025,The Pure Drive Team 2025 is defined by its us...,Racquet Brand: Babolat\nRacquet Name: Babolat ...
...,...,...,...,...
392,Solinco,Solinco Blackout 300 XTD,"With the Blackout 300 XTD, Solinco takes the ...",Racquet Brand: Solinco\nRacquet Name: Solinco ...
393,Solinco,Solinco Blackout 300 XTD+,"With the Blackout 300 XTD+, Solinco gives adv...",Racquet Brand: Solinco\nRacquet Name: Solinco ...
394,Lacoste,Lacoste L23,Introducing the Lascoste L23! Following on th...,Racquet Brand: Lacoste\nRacquet Name: Lacoste ...
395,Lacoste,Lacoste L23L,Lacoste makes impressive updates to the L23L ...,Racquet Brand: Lacoste\nRacquet Name: Lacoste ...


In [8]:
import collections
from itertools import chain


char_counts = collections.Counter(chain.from_iterable(df['combined_text'].dropna()))
print(char_counts)

Counter({' ': 64898, 'e': 42969, 't': 30146, 'a': 25954, 'i': 25243, 'o': 22983, 'n': 21132, 'r': 20634, 's': 18636, 'l': 15295, 'h': 13129, 'c': 12851, 'd': 11110, 'u': 10658, 'p': 9678, 'm': 7528, 'g': 7215, 'f': 6405, 'y': 5125, 'w': 4895, 'b': 4251, 'q': 3975, 'v': 3457, 'R': 3238, ',': 2894, '.': 2870, ':': 2773, '\n': 2763, 'k': 2611, 'S': 2200, 'T': 1740, '-': 1671, '0': 1516, 'x': 1445, 'P': 1365, 'C': 1235, 'F': 1200, '2': 1176, 'M': 1082, '1': 1030, 'G': 994, 'B': 979, 'L': 957, 'I': 745, 'A': 723, 'D': 704, 'V': 638, 'W': 571, 'N': 546, "'": 524, 'E': 511, '5': 485, '3': 467, 'O': 456, '9': 400, '/': 378, 'H': 374, 'U': 284, '(': 283, ')': 283, '8': 277, 'z': 252, 'Y': 246, 'X': 219, '6': 207, '4': 195, 'K': 191, '7': 156, 'Z': 95, 'j': 70, 'Q': 60, '’': 40, '?': 10, ';': 2})


## Test Embedding Process

In [9]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [10]:
texts = [df["combined_text"][0],
         df["combined_text"][1],
         df["combined_text"][2],
         df["combined_text"][374]]
texts

["Racquet Brand: Babolat\nRacquet Name: Babolat Pure Drive 2025\nRacquet Description:  ThePure Drive is popular for a reason. Boasting an appeal that cuts across ability levels, this modern player's racquet offers an unmistakably easy learning curve to virtually anyone looking to play aggressive tennis. Although it reserves its greatest charm for the baseliner who likes dictating action with heavy pace, it’s also dangerous in the front court, where the quick handling and powerful response practically beg you to finish volleys with a bang. Serving is another strength of the Pure Drive, where surgically placed slices and kickers are as easy to hit as flat bombs down the T. For 2025, Babolat, enhances comfort with an updated version of NF2 Tech, a dampening technology that uses flax fibers in the throat to smooth out harsh vibrations. This model also inherits FSI Power from the previous generation, consisting of a bundle of technologies that enable the racquet to transfer more energy to t

In [11]:
embeddings = model.encode(texts)
print(embeddings.shape)

(4, 384)


In [12]:
pd.DataFrame(embeddings)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.033953,-0.024847,0.006405,-0.050867,-0.068951,-0.046891,0.003465,0.120841,0.051757,0.05075,...,-0.011013,0.083371,-0.062081,0.012446,0.017826,0.050175,0.075972,-0.084262,0.023721,0.020156
1,-0.058282,-0.018625,0.010552,-0.040193,-0.017465,-0.039943,-0.014186,0.117243,0.024589,0.000139,...,0.000189,0.068433,-0.058803,0.028495,-0.021703,0.056423,0.099534,-0.078435,0.050541,0.026781
2,-0.052997,-0.02644,0.000464,-0.040866,-0.027805,-0.054675,-0.000313,0.125724,0.033623,-0.013678,...,-0.002499,0.061659,-0.051614,0.02267,-0.004192,0.051927,0.087324,-0.071866,0.051886,0.013404
3,-0.093516,0.041833,-0.035343,-0.088772,-0.076597,0.003802,0.046118,0.119499,0.016019,0.116547,...,-0.050261,0.102456,-0.092934,-0.00366,-0.066958,0.017879,0.06238,-0.130975,-0.027347,0.090979


In [13]:
similarities = model.similarity(embeddings, embeddings)
print(similarities) # First three should be more similar to each other than the fourth

tensor([[1.0000, 0.9181, 0.9312, 0.6332],
        [0.9181, 1.0000, 0.9818, 0.6054],
        [0.9312, 0.9818, 1.0000, 0.5957],
        [0.6332, 0.6054, 0.5957, 1.0000]])


## Embed combined_text columns

In [None]:
# Define embedding model, create embedding function
model = SentenceTransformer("all-MiniLM-L6-v2")
def embed_column(df:pd.DataFrame, column:str, model:SentenceTransformer = model) -> list[list[float]]:    
    
    # Make sure to convert to str to handle NA values
    texts = df[column].astype(str).tolist()
    
    embeddings = model.encode(texts, show_progress_bar = True)
    
    return embeddings.tolist()

In [15]:
df["embedded_text"] = embed_column(df, "combined_text")

Batches: 100%|██████████| 11/11 [00:03<00:00,  3.17it/s]


In [16]:
df[["combined_text", "embedded_text"]]

Unnamed: 0,combined_text,embedded_text
0,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.03395332768559456, -0.024847345426678658, ..."
1,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.058281801640987396, -0.01862466149032116, ..."
2,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.05299745500087738, -0.026440361514687538, ..."
3,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.032470472157001495, -0.010386866517364979,..."
4,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.04666471481323242, -0.019274618476629257, ..."
...,...,...
392,Racquet Brand: Solinco\nRacquet Name: Solinco ...,"[-0.07645091414451599, -0.022547654807567596, ..."
393,Racquet Brand: Solinco\nRacquet Name: Solinco ...,"[-0.07288163900375366, -0.0318160243332386, -0..."
394,Racquet Brand: Lacoste\nRacquet Name: Lacoste ...,"[-0.037468843162059784, -0.061095334589481354,..."
395,Racquet Brand: Lacoste\nRacquet Name: Lacoste ...,"[-0.046293120831251144, -0.04943633824586868, ..."


In [21]:
df.to_csv(path_or_buf = "../data/racquets_with_embeddings.csv", sep = ",")

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_top_k_matches(df:pd.DataFrame, query:str, embedding_column:str, model:SentenceTransformer = model, k:int = 5) -> pd.DataFrame:
    print(f"Embedding query: {query}")
    query_embedding = model.encode([query], show_progress_bar = True)
    
    print(f"Calculating cosine similarity.")
    racquet_embeddings = np.array(df[embedding_column].tolist())
    similarities = cosine_similarity(query_embedding, racquet_embeddings)[0]
    
    print("Query shape:", query_embedding.shape)
    print("Doc shape:", racquet_embeddings.shape)
    print("Similarities shape:", similarities.shape)
    
    print(f"Sorting results and retrieving top {k} racquets.")
    top_k_indices = similarities.argsort()[::-1][:k]
    top_k_indices = np.array(top_k_indices).flatten()
    
    results = df.iloc[top_k_indices].copy()
    results["similarity"] = similarities[top_k_indices]
    
    return results


In [19]:
query = "I am a high power player with compact strokes. I want an arm-friendly racquet that doesn't require a fast swing."

res = get_top_k_matches(df, query, embedding_column = "embedded_text", k = 10)

res #Did not perform well

Embedding query: I am a high power player with compact strokes. I want an arm-friendly racquet that doesn't require a fast swing.


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.08it/s]

Calculating cosine similarity.
Query shape: (1, 384)
Doc shape: (324, 384)
Similarities shape: (324,)
Sorting results and retrieving top 10 racquets.





Unnamed: 0,racquet_img,racquet_brand,racquet_name,racquet_rating,racquet_price,racquet_desc,racquet_swingweight,racquet_composition,racquet_power,racquet_stroke_style,...,racquet_balance_HH_HL,racquet_stiffness,racquet_avg_beam_width,racquet_mains,racquet_crosses,racquet_tension_lower,racquet_tension_upper,combined_text,embedded_text,similarity
108,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Triad Five,5.0,209.0,"Wilson's Triad Five combines ""best of class"" ...",324.0,Graphite,Medium,Medium,...,-6.0,,26.0,16.0,20.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Tr...,"[-0.02798600122332573, -0.0002932979550678283,...",0.723488
94,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra Pro 16x19 v4,4.6,239.0,Introducing the Ultra Pro 16x19 v4! This clas...,317.0,Graphite,Low,Full,...,6.0,62.0,20.6,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.04590383172035217, -0.014738207682967186, ...",0.717534
74,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra Pro 16x19 v4,4.6,239.0,Introducing the Ultra Pro 16x19 v4! This clas...,317.0,Graphite,Low,Full,...,6.0,62.0,20.6,16.0,19.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.04590383172035217, -0.014738207682967186, ...",0.717534
123,https://img.tennis-warehouse.com/watermark/rs....,Head,Head Speed Team,5.0,274.0,Head makes big changes to the Speed Team for ...,306.0,Auxetic 2/Graphene 360+/Graphite,Medium,Medium,...,3.0,61.0,24.0,16.0,19.0,48.0,57.0,Racquet Brand: Head\nRacquet Name: Head Speed ...,"[-0.026125581935048103, 0.021720057353377342, ...",0.715029
95,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra Pro 18x20 v4,4.8,239.0,Introducing the Wilson Ultra Pro 18x20 v4! O...,319.0,Graphite,Low,Full,...,6.0,62.0,20.6,18.0,20.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.026982324197888374, -0.022458204999566078,...",0.700565
75,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Ultra Pro 18x20 v4,4.8,239.0,Introducing the Wilson Ultra Pro 18x20 v4! O...,319.0,Graphite,Low,Full,...,6.0,62.0,20.6,18.0,20.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Ul...,"[-0.026982324197888374, -0.022458204999566078,...",0.700565
146,https://img.tennis-warehouse.com/watermark/rs....,Head,Head Radical Team 2025,5.0,274.0,Head adds another chapter to the Radical Team...,303.0,Graphene Inside/Graphite,Low-Medium,Medium-Full,...,4.0,63.0,23.333334,16.0,19.0,48.0,57.0,Racquet Brand: Head\nRacquet Name: Head Radica...,"[-0.04297531768679619, -0.025608988478779793, ...",0.695383
35,https://img.tennis-warehouse.com/watermark/rs....,Babolat,Babolat EVO Strike,4.0,199.0,This racquet comes pre-strung with a comforta...,309.0,Graphite,Low-Medium,Medium-Full,...,4.0,66.0,23.5,16.0,19.0,50.0,55.0,Racquet Brand: Babolat\nRacquet Name: Babolat ...,"[-0.00611136993393302, 0.024812890216708183, -...",0.694883
153,https://img.tennis-warehouse.com/watermark/rs....,Head,Head Graphene 360 Radical MP,4.5,119.0,"With this version of the Radical MP, Head add...",324.0,Graphene 360/Graphite,Low-Medium,Medium-Full,...,6.0,68.0,21.333334,16.0,19.0,48.0,57.0,Racquet Brand: Head\nRacquet Name: Head Graphe...,"[0.005895413924008608, -0.023976199328899384, ...",0.691304
76,https://img.tennis-warehouse.com/watermark/rs....,Wilson,Wilson Pro Staff Six.One 95 v14,4.9,289.0,Introducing the Pro Staff Six One 95 v14! Lik...,325.0,Graphite,Low,Full,...,9.0,64.0,21.5,18.0,20.0,50.0,60.0,Racquet Brand: Wilson\nRacquet Name: Wilson Pr...,"[-0.036166105419397354, 0.014084859751164913, ...",0.689659
