In [1]:
import pandas as pd

# === Load fully preprocessed full dataset ===
df_full = pd.read_csv("data/processed_small.csv")

# === Sanity check: drop duplicates (query, product_id) ===
df_full = df_full.drop_duplicates(subset=["query", "product_id"]).reset_index(drop=True)

# === Sample 25k for training ===
df_train = df_full.sample(n=25000, random_state=42).reset_index(drop=True)

# === Remove training rows from full set to form the test pool ===
df_test_pool = df_full.merge(
    df_train[['query', 'product_id']],
    on=['query', 'product_id'],
    how='left',
    indicator=True
)
df_test_pool = df_test_pool[df_test_pool['_merge'] == 'left_only'].drop(columns=['_merge'])

# === Sample 4k for testing ===
df_test = df_test_pool.sample(n=4000, random_state=123).reset_index(drop=True)

# === Save both sets ===
df_train.to_csv("data/processed_small_25k.csv", index=False)
df_test.to_csv("data/processed_small_4k_test.csv", index=False)

print(f"‚úÖ Training set saved: {len(df_train)} rows ‚Üí data/processed_small_25k.csv")
print(f"‚úÖ Test set saved:     {len(df_test)} rows ‚Üí data/processed_small_4k_test.csv")

‚úÖ Training set saved: 25000 rows ‚Üí data/processed_small_25k.csv
‚úÖ Test set saved:     4000 rows ‚Üí data/processed_small_4k_test.csv


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# === Load training set ===
df_train = pd.read_csv("data/processed_small_25k.csv")
df_train = df_train.dropna(subset=['query', 'full_product_text']).reset_index(drop=True)

# === Create training samples ===
train_samples = [
    InputExample(texts=[row['query'], row['full_product_text']])
    for _, row in df_train.iterrows()
]

# === Load model and prepare training ===
model = SentenceTransformer('all-MiniLM-L6-v2')
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)

# === Train the model ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=8,
    warmup_steps=100,
    show_progress_bar=True
)

# === Save trained model ===
model_save_path = "saved_models/two_tower_mnr_25k_v2"
model.save(model_save_path)
print(f"‚úÖ Model saved to: {model_save_path}")




Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.5186
1000,0.4216
1500,0.3658
2000,0.3036
2500,0.2655
3000,0.2331
3500,0.2051
4000,0.1995
4500,0.1656
5000,0.1614


‚úÖ Model saved to: saved_models/two_tower_mnr_25k_v2


In [3]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# === Load test set and product pool ===
df_test = pd.read_csv("data/processed_small_4k_test.csv")
df_products = pd.read_csv("data/processed_small_25k.csv")

# === Sanitize rows ===
df_test = df_test.dropna(subset=['query', 'full_product_text', 'product_id']).reset_index(drop=True)
df_products = df_products.dropna(subset=['full_product_text', 'product_id']).reset_index(drop=True)

# === Extract values ===
queries = df_test['query'].tolist()
true_product_ids = df_test['product_id'].tolist()
products = df_products['full_product_text'].tolist()
product_ids = df_products['product_id'].tolist()

# === Load trained model ===
model = SentenceTransformer("saved_models/two_tower_mnr_25k_v2")

# === Encode queries and products ===
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)
product_vecs = model.encode(products, convert_to_numpy=True, show_progress_bar=True)

# === Build ground truth map ===
ground_truth = {i: true_product_ids[i] for i in range(len(queries))}

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Batches:   0%|          | 0/782 [00:00<?, ?it/s]

In [4]:
def evaluate_recall_mrr(query_vecs, product_vecs, product_ids, ground_truth, K=10):
    correct_at_k = 0
    reciprocal_ranks = []

    sim_matrix = cosine_similarity(query_vecs, product_vecs)

    for i, sims in enumerate(sim_matrix):
        top_k_idx = np.argsort(sims)[::-1][:K]
        top_k_ids = [product_ids[j] for j in top_k_idx]

        true_pid = ground_truth[i]

        if true_pid in top_k_ids:
            correct_at_k += 1
            rank = top_k_ids.index(true_pid) + 1
            reciprocal_ranks.append(1 / rank)
        else:
            reciprocal_ranks.append(0)

    recall_at_k = correct_at_k / len(query_vecs)
    mrr_at_k = sum(reciprocal_ranks) / len(query_vecs)

    return recall_at_k, mrr_at_k

# === Run evaluation
recall, mrr = evaluate_recall_mrr(query_vecs, product_vecs, product_ids, ground_truth, K=10)
print(f"‚úÖ Recall@10: {recall:.4f}")
print(f"‚úÖ MRR@10:    {mrr:.4f}")

‚úÖ Recall@10: 0.0118
‚úÖ MRR@10:    0.0052


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# === Choose a query index from the test set
query_index = 0  # You can change this to test other queries

# === Get the query text
query_text = queries[query_index]
print(f"\nüîç Query: {query_text}")

# === Get embedding for the query
query_vec = model.encode([query_text], convert_to_numpy=True)

# === Compute cosine similarity with all product vectors
scores = cosine_similarity(query_vec, product_vecs)[0]

# === Get top 10 product indices
top_k = 10
top_k_indices = np.argsort(scores)[::-1][:top_k]

# === Display the top 10 matches
print(f"\nüìà Top {top_k} Products Retrieved:")

for rank, idx in enumerate(top_k_indices, 1):
    print(f"\nüîπ Rank {rank} | Product ID: {product_ids[idx]}")
    print(df_products.iloc[idx]['product_title'])
    print("‚Äî" * 50)


üîç Query: new balance mens orange

üìà Top 10 Products Retrieved:

üîπ Rank 1 | Product ID: B0751V26P5
New Balance Running 990V4 Green
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

üîπ Rank 2 | Product ID: B01N43LBWS
New Balance Men's Made 990 V4 Sneaker, Black/Grey, 12.5 D US
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

üîπ Rank 3 | Product ID: B072LDRR3X
New Balance Men's Tenacity Short Sleeve, Athletic Grey , Small
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

üîπ Rank 4 | Product ID: B07HG4439Z
New Balance Women's 574 V2 Sneaker, White/Veg Tan, 8.5
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

In [6]:
df_check = pd.read_csv("data/processed_small.csv")

In [7]:
df_check.head()

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance
0,16,!awnmower tires without rims,1,B075SCHMPY,us,I,1,1,train,"RamPro 10"" All Purpose Utility Air Tires/Wheel...","<b>About The Ram-Pro All Purpose Utility 10"" A...",‚úì The Ram-Pro Ten Inch ready to install Air Ti...,RamPro,10 Inch,"RamPro 10"" All Purpose Utility Air Tires/Wheel...",0
1,17,!awnmower tires without rims,1,B08L3B9B9P,us,E,1,1,train,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,Please check your existing tire Sidewall for t...,MaxAuto,,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,3
2,18,!awnmower tires without rims,1,B082K7V2GZ,us,I,1,1,train,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,,[QUALITY]: Hardened Steel-Iron construction wi...,Neiko,,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,0
3,19,!awnmower tires without rims,1,B07P4CF3DP,us,S,1,1,train,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,"Tire Size: 13 x 5.00 - 6 Axle: 3/4"" inside dia...",,Russo,,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,2
4,20,!awnmower tires without rims,1,B07C1WZG12,us,E,1,1,train,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,No fuss. Just take off your old assembly and r...,Tire size:15x6.00-6 Ply: 4 Tubeless\n6x4.5 Whe...,Antego Tire & Wheel,Husqvarna Silver,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,3


In [8]:
df_check[df_check['query']=='new balance mens orange']

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance
389564,1438348,new balance mens orange,73095,B07F1QT54H,us,S,1,1,test,"New Balance Men's 410 V6 Trail Running Shoe, T...",,"ACTEVA midsole cushioning delivers versatile, ...",New Balance,Team Royal/Alpha Orange,"New Balance Men's 410 V6 Trail Running Shoe, T...",2
389565,1438349,new balance mens orange,73095,B07HMVMT8N,us,E,1,1,test,"New Balance Men's 4040 V5 Metal Baseball Shoe,...",,Traction You Can Trust: These New Balance base...,New Balance,Black | Orange,"New Balance Men's 4040 V5 Metal Baseball Shoe,...",3
389566,1438350,new balance mens orange,73095,B07HMJTJS9,us,E,1,1,test,"New Balance Men's 4040 V5 Turf Baseball Shoe, ...",,A Home Run for Versatility: Designed for the e...,New Balance,Black/Orange,"New Balance Men's 4040 V5 Turf Baseball Shoe, ...",3
389567,1438351,new balance mens orange,73095,B01NA8VF0F,us,E,1,1,test,New Balance Men's 4040 V4 Metal Mid-Cut Baseba...,,Full-length REVlite RC&reg; midsole\nMetal cle...,New Balance,Black/Orange,New Balance Men's 4040 V4 Metal Mid-Cut Baseba...,3
389568,1438352,new balance mens orange,73095,B07PJX2XH1,us,S,1,1,test,"New Balance Men's 517 V2 Cross Trainer, Pigmen...",,Advanced Shock Absorption: These comfortable t...,New Balance,Pigment/Varsity Orange,"New Balance Men's 517 V2 Cross Trainer, Pigmen...",2
389569,1438353,new balance mens orange,73095,B07BL2MWY6,us,S,1,1,test,"New Balance Men's 590 V4 Trail Running Shoe, S...",,All Terrain Outsole\nSynthetic/Mesh Upper\nNB ...,New Balance,Serpent Green/Alpha Orange,"New Balance Men's 590 V4 Trail Running Shoe, S...",2
389570,1438354,new balance mens orange,73095,B07HMVN1PZ,us,E,1,1,test,New Balance Men's 4040 V5 TPU Molded Baseball ...,,Kinetic stitch synthetic upper\nFull-length RE...,New Balance,Black/Orange,New Balance Men's 4040 V5 TPU Molded Baseball ...,3
389571,1438355,new balance mens orange,73095,B079FHGY54,us,S,1,1,test,New Balance Mens X90 Orange,,,New Balance,Orange,New Balance Mens X90 Orange New Balance,2
389572,1438356,new balance mens orange,73095,B01MXNYAEQ,us,S,1,1,test,"New Balance Men's 481 V3 Trail Running Shoe, T...",,"Cush+ midsole cushioning delivers ultra-soft, ...",New Balance,Team Away Grey/Magnet/Black,"New Balance Men's 481 V3 Trail Running Shoe, T...",2
389573,1438357,new balance mens orange,73095,B075R6VGZK,us,E,1,1,test,"New Balance Men's 3000 V4 Turf Baseball Shoe, ...",,Designed for Comfort: This turf version of the...,New Balance,Black/Orange,"New Balance Men's 3000 V4 Turf Baseball Shoe, ...",3


In [12]:
df_train[df_train['product_id']=='B075D95T44']

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance


In [11]:
df_train[df_train['query']=='new balance mens orange']

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance


In [14]:
df_train[df_train['product_title']=="New Balance Men's 990v4 Sneaker, Orange, 11.5 D US"]

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance


Try feeding the full product pool to the model

In [15]:
df_full = pd.read_csv("data/processed_small.csv")
df_full = df_full.dropna(subset=['full_product_text', 'product_id']).reset_index(drop=True)

all_product_texts = df_full['full_product_text'].tolist()
all_product_ids = df_full['product_id'].tolist()

# Generate embeddings
all_product_vecs = model.encode(all_product_texts, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/18793 [00:00<?, ?it/s]

In [18]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp312-cp312-win_amd64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.11.0.post1-cp312-cp312-win_amd64.whl (14.9 MB)
   ---------------------------------------- 0.0/14.9 MB ? eta -:--:--
   -------------- ------------------------- 5.5/14.9 MB 37.2 MB/s eta 0:00:01
   ------------------------- -------------- 9.4/14.9 MB 26.7 MB/s eta 0:00:01
   ---------------------------------------- 14.9/14.9 MB 26.7 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np

# Save embeddings and product IDs
np.save("embeddings/all_product_vecs.npy", all_product_vecs)
np.save("embeddings/all_product_ids.npy", np.array(all_product_ids))

print("‚úÖ Saved product embeddings and IDs to /embeddings/")

In [21]:
import numpy as np

np.savez_compressed(
    'data/saved_embeddings/product_embeddings.npz',
    product_ids=np.array(all_product_ids),
    embeddings=all_product_vecs
)

In [25]:
import faiss
import numpy as np

# Normalize embeddings for cosine similarity
faiss.normalize_L2(all_product_vecs)

# Create FAISS index
index = faiss.IndexFlatIP(all_product_vecs.shape[1])
index.add(all_product_vecs)

# Encode a query
query_vec = model.encode(["new balance mens orange"], convert_to_numpy=True)
faiss.normalize_L2(query_vec)

# Search top 10
_, top_k_idx = index.search(query_vec, k=10)
top_k_ids = [all_product_ids[i] for i in top_k_idx[0]]

In [26]:
top_k_ids

['B079FHGY54',
 'B075D95T44',
 'B07W7CWG9F',
 'B0751RDTVZ',
 'B07J5672LP',
 'B07PJX2XH1',
 'B07RJWBW5F',
 'B07BL3D626',
 'B06Y2XLLR6',
 'B06Y2ZW6LP']

In [27]:
print(f"\nüîç Query: {query_text}")
print(f"\nüìà Top 10 Retrieved Products:")

for rank, i in enumerate(top_k_idx[0], 1):
    product_id = all_product_ids[i]
    product_title = df_full.iloc[i]['product_title']
    print(f"\nüîπ Rank {rank}")
    print(f"Product ID: {product_id}")
    print(f"Title     : {product_title}")
    print("‚Äî" * 50)


üîç Query: new balance mens orange

üìà Top 10 Retrieved Products:

üîπ Rank 1
Product ID: B079FHGY54
Title     : New Balance Mens X90 Orange
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

üîπ Rank 2
Product ID: B075D95T44
Title     : New Balance Men's 990v4 Sneaker, Orange, 11.5 D US
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

üîπ Rank 3
Product ID: B07W7CWG9F
Title     : New Balance Iconic Mens 500 V1 (8.5, Navy/Grey/Gum)
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

üîπ Rank 4
Product ID: B0751RDTVZ
Title     : New Balance Men's 574 V2 Essential Sneaker, Varsity Orange, 17 XW US
‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚

Retrain with relevant E and S data only 

In [28]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# === Load the full training set ===
df_train = pd.read_csv("data/processed_small_25k.csv")
df_train = df_train.dropna(subset=['query', 'full_product_text', 'relevance']).reset_index(drop=True)

# === Filter to only high-relevance (E or S) pairs ===
df_train = df_train[df_train['relevance'] >= 2]  # Keep only E (3) and S (2)
print(f"‚úÖ Filtered training set: {len(df_train)} pairs with relevance ‚â• 2")

# === Create InputExamples (positive pairs only) ===
train_samples = [
    InputExample(texts=[row['query'], row['full_product_text']])
    for _, row in df_train.iterrows()
]

# === Initialize sentence-transformers model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Prepare DataLoader and loss ===
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)

# === Train the model ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=8,
    warmup_steps=100,
    show_progress_bar=True
)

# === Save trained model ===
model_save_path = "saved_models/two_tower_mnr_25k_E_and_S"
model.save(model_save_path)
print(f"‚úÖ Model saved to: {model_save_path}")


‚úÖ Filtered training set: 19872 pairs with relevance ‚â• 2


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3667
1000,0.2668
1500,0.2125
2000,0.1762
2500,0.1472
3000,0.1234
3500,0.114
4000,0.104
4500,0.0957


‚úÖ Model saved to: saved_models/two_tower_mnr_25k_E_and_S


Create new embeddings 

In [36]:
# Load your fine-tuned model
model = SentenceTransformer("saved_models/two_tower_mnr_25k_E_and_S")

# Recreate product embeddings
df_full = pd.read_csv("data/processed_small.csv")
df_full = df_full.dropna(subset=['full_product_text', 'product_id']).reset_index(drop=True)

all_product_texts = df_full['full_product_text'].tolist()
all_product_ids = df_full['product_id'].tolist()

all_product_vecs = model.encode(all_product_texts, convert_to_numpy=True, show_progress_bar=True)

# Optional: save to disk
import numpy as np
np.savez_compressed(
    'data/saved_embeddings/product_embeddings_mnr_25k_ES.npz',
    product_ids=np.array(all_product_ids),
    embeddings=all_product_vecs
)

Batches:   0%|          | 0/18793 [00:00<?, ?it/s]

new test set

Load model and test set

In [38]:
# Load fine-tuned model
model = SentenceTransformer("saved_models/two_tower_mnr_25k_E_and_S")

# Load test set
df_test = pd.read_csv("data/processed_small_4k_test.csv")
df_test = df_test.dropna(subset=['query', 'product_id']).reset_index(drop=True)

queries = df_test['query'].tolist()
true_product_ids = df_test['product_id'].tolist()
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

check recall at 10

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

def recall_at_k(query_vecs, product_vecs, product_ids, true_product_ids, K=10, sample_size=None, seed=42):
    if sample_size is not None and sample_size < len(query_vecs):
        np.random.seed(seed)
        sample_idx = np.random.choice(len(query_vecs), size=sample_size, replace=False)
        query_vecs = query_vecs[sample_idx]
        true_product_ids = [true_product_ids[i] for i in sample_idx]

    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    hits = 0

    for i, sims in enumerate(sim_matrix):
        top_k_idx = np.argsort(sims)[::-1][:K]
        top_k_ids = [product_ids[j] for j in top_k_idx]
        if true_product_ids[i] in top_k_ids:
            hits += 1

    return hits / len(true_product_ids)

In [40]:
recall_10 = recall_at_k(
    query_vecs, all_product_vecs, all_product_ids,
    true_product_ids, K=10, sample_size=200
)

print(f"‚úÖ Recall@10 (fine-tuned model, 200 queries): {recall_10:.4f}")

‚úÖ Recall@10 (fine-tuned model, 200 queries): 0.1900


In [41]:
import faiss

# Normalize for cosine similarity
faiss.normalize_L2(all_product_vecs)
faiss.normalize_L2(query_vecs)

# Index all products
index = faiss.IndexFlatIP(all_product_vecs.shape[1])
index.add(all_product_vecs)

# Search top 10 for one query
_, top_k_idx = index.search(query_vecs[0:1], k=10)
top_k_ids = [all_product_ids[i] for i in top_k_idx[0]]

# Load product titles
df_products = pd.read_csv("data/processed_small.csv")
product_lookup = dict(zip(df_products['product_id'], df_products['product_title']))

print(f"\nüîç Query: {queries[0]}")
print(f"‚úÖ Ground truth: {product_lookup.get(true_product_ids[0], '[not found]')}\n")
print("üìå Top 10 Matches:")
for rank, pid in enumerate(top_k_ids, 1):
    print(f"{rank:2d}. {product_lookup.get(pid, '[title not found]')}")


üîç Query: new balance mens orange
‚úÖ Ground truth: New Balance Men's 4040 V5 Metal Baseball Shoe, Black | Orange, 10 M US

üìå Top 10 Matches:
 1. New Balance Mens X90 Orange
 2. New Balance Iconic Mens 500 V1 (8.5, Navy/Grey/Gum)
 3. New Balance Men's 990v4 Sneaker, Orange, 11.5 D US
 4. New Balance Men's 574 V2 Essential Sneaker, Varsity Orange, 17 XW US
 5. New Balance Men's 990v5, Burgundy/Navy, 9.5 D US
 6. New Balance Men's 574 V2 Sneaker, Black/Neo Classic Blue, 12 M US
 7. New Balance Men's 574 V2 Pebbled Sport Sneaker, Team Royal/Dark, 7 W US
 8. New Balance Men's 574 V2 Evergreen Sneaker, Grey/Grey, 11.5 Wide
 9. New Balance Men's 574 V2 Evergreen Sneaker, Grey/Grey, 13
10. New Balance Men's 517 V2 Cross Trainer, Pigment/Varsity Orange, 12


In [30]:
import pandas as pd

df_test = pd.read_csv("data/processed_small_4k_test.csv")
df_test = df_test.dropna(subset=['query', 'product_id']).reset_index(drop=True)

queries = df_test['query'].tolist()
true_product_ids = df_test['product_id'].tolist()


In [31]:
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

def recall_at_k(query_vecs, product_vecs, product_ids, true_product_ids, K=10, sample_size=None, seed=42):
    if sample_size is not None and sample_size < len(query_vecs):
        np.random.seed(seed)
        sample_idx = np.random.choice(len(query_vecs), size=sample_size, replace=False)
        query_vecs = query_vecs[sample_idx]
        true_product_ids = [true_product_ids[i] for i in sample_idx]

    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    hits = 0

    for i, sims in enumerate(sim_matrix):
        top_k_idx = np.argsort(sims)[::-1][:K]
        top_k_ids = [product_ids[j] for j in top_k_idx]
        if true_product_ids[i] in top_k_ids:
            hits += 1

    return hits / len(true_product_ids)


In [34]:
recall_10 = recall_at_k(query_vecs, all_product_vecs, all_product_ids, true_product_ids, K=10, sample_size=1)
print(f"‚úÖ Recall@10 (on 200 test queries, new model): {recall_10:.4f}")

‚úÖ Recall@10 (on 200 test queries, new model): 0.0000


In [35]:
recall_10

0.0

In [None]:
print(f"\nüîç Query: {query_text}")
print(f"\nüìà Top 10 Retrieved Products:")

for rank, i in enumerate(top_k_idx[0], 1):
    product_id = all_product_ids[i]
    product_title = df_full.iloc[i]['product_title']
    print(f"\nüîπ Rank {rank}")
    print(f"Product ID: {product_id}")
    print(f"Title     : {product_title}")
    print("‚Äî" * 50)