In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import faiss
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from sklearn.utils import shuffle


In [None]:

# === Load fully preprocessed full dataset ===
df_full = pd.read_csv("data/processed_small.csv")

# === Sanity check: drop duplicates (query, product_id) ===
df_full = df_full.drop_duplicates(subset=["query", "product_id"]).reset_index(drop=True)

# === Sample 25k for training ===
df_train = df_full.sample(n=25000, random_state=42).reset_index(drop=True)

# === Remove training rows from full set to form the test pool ===
df_test_pool = df_full.merge(
    df_train[['query', 'product_id']],
    on=['query', 'product_id'],
    how='left',
    indicator=True
)
df_test_pool = df_test_pool[df_test_pool['_merge'] == 'left_only'].drop(columns=['_merge'])

# === Sample 4k for testing ===
df_test = df_test_pool.sample(n=4000, random_state=123).reset_index(drop=True)

# === Save both sets ===
df_train.to_csv("data/processed_small_25k.csv", index=False)
df_test.to_csv("data/processed_small_4k_test.csv", index=False)

print(f"✅ Training set saved: {len(df_train)} rows → data/processed_small_25k.csv")
print(f"✅ Test set saved:     {len(df_test)} rows → data/processed_small_4k_test.csv")

✅ Training set saved: 25000 rows → data/processed_small_25k.csv
✅ Test set saved:     4000 rows → data/processed_small_4k_test.csv


In [None]:

# === Load training set ===
df_train = pd.read_csv("data/processed_small_25k.csv")
df_train = df_train.dropna(subset=['query', 'full_product_text']).reset_index(drop=True)

# === Create training samples ===
train_samples = [
    InputExample(texts=[row['query'], row['full_product_text']])
    for _, row in df_train.iterrows()
]

# === Load model and prepare training ===
model = SentenceTransformer('all-MiniLM-L6-v2')
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)

# === Train the model ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=8,
    warmup_steps=100,
    show_progress_bar=True
)

# === Save trained model ===
model_save_path = "saved_models/two_tower_mnr_25k_v2"
model.save(model_save_path)
print(f"✅ Model saved to: {model_save_path}")




Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.5186
1000,0.4216
1500,0.3658
2000,0.3036
2500,0.2655
3000,0.2331
3500,0.2051
4000,0.1995
4500,0.1656
5000,0.1614


✅ Model saved to: saved_models/two_tower_mnr_25k_v2


In [None]:


# === Load test set and product pool ===
df_test = pd.read_csv("data/processed_small_4k_test.csv")
df_products = pd.read_csv("data/processed_small_25k.csv")

# === Sanitize rows ===
df_test = df_test.dropna(subset=['query', 'full_product_text', 'product_id']).reset_index(drop=True)
df_products = df_products.dropna(subset=['full_product_text', 'product_id']).reset_index(drop=True)

# === Extract values ===
queries = df_test['query'].tolist()
true_product_ids = df_test['product_id'].tolist()
products = df_products['full_product_text'].tolist()
product_ids = df_products['product_id'].tolist()

# === Load trained model ===
model = SentenceTransformer("saved_models/two_tower_mnr_25k_v2")

# === Encode queries and products ===
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)
product_vecs = model.encode(products, convert_to_numpy=True, show_progress_bar=True)

# === Build ground truth map ===
ground_truth = {i: true_product_ids[i] for i in range(len(queries))}

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Batches:   0%|          | 0/782 [00:00<?, ?it/s]

In [4]:
def evaluate_recall_mrr(query_vecs, product_vecs, product_ids, ground_truth, K=10):
    correct_at_k = 0
    reciprocal_ranks = []

    sim_matrix = cosine_similarity(query_vecs, product_vecs)

    for i, sims in enumerate(sim_matrix):
        top_k_idx = np.argsort(sims)[::-1][:K]
        top_k_ids = [product_ids[j] for j in top_k_idx]

        true_pid = ground_truth[i]

        if true_pid in top_k_ids:
            correct_at_k += 1
            rank = top_k_ids.index(true_pid) + 1
            reciprocal_ranks.append(1 / rank)
        else:
            reciprocal_ranks.append(0)

    recall_at_k = correct_at_k / len(query_vecs)
    mrr_at_k = sum(reciprocal_ranks) / len(query_vecs)

    return recall_at_k, mrr_at_k

# === Run evaluation
recall, mrr = evaluate_recall_mrr(query_vecs, product_vecs, product_ids, ground_truth, K=10)
print(f"✅ Recall@10: {recall:.4f}")
print(f"✅ MRR@10:    {mrr:.4f}")

✅ Recall@10: 0.0118
✅ MRR@10:    0.0052


In [None]:


# === Choose a query index from the test set
query_index = 0  # You can change this to test other queries

# === Get the query text
query_text = queries[query_index]
print(f"\n🔍 Query: {query_text}")

# === Get embedding for the query
query_vec = model.encode([query_text], convert_to_numpy=True)

# === Compute cosine similarity with all product vectors
scores = cosine_similarity(query_vec, product_vecs)[0]

# === Get top 10 product indices
top_k = 10
top_k_indices = np.argsort(scores)[::-1][:top_k]

# === Display the top 10 matches
print(f"\n📈 Top {top_k} Products Retrieved:")

for rank, idx in enumerate(top_k_indices, 1):
    print(f"\n🔹 Rank {rank} | Product ID: {product_ids[idx]}")
    print(df_products.iloc[idx]['product_title'])
    print("—" * 50)


🔍 Query: new balance mens orange

📈 Top 10 Products Retrieved:

🔹 Rank 1 | Product ID: B0751V26P5
New Balance Running 990V4 Green
——————————————————————————————————————————————————

🔹 Rank 2 | Product ID: B01N43LBWS
New Balance Men's Made 990 V4 Sneaker, Black/Grey, 12.5 D US
——————————————————————————————————————————————————

🔹 Rank 3 | Product ID: B072LDRR3X
New Balance Men's Tenacity Short Sleeve, Athletic Grey , Small
——————————————————————————————————————————————————

🔹 Rank 4 | Product ID: B07HG4439Z
New Balance Women's 574 V2 Sneaker, White/Veg Tan, 8.5
——————————————————————————————————————————————————

🔹 Rank 5 | Product ID: B07DL7SYP8
New Balance Men's Fortitech 7 Inch 2 in 1 Short, Black, Large
——————————————————————————————————————————————————

🔹 Rank 6 | Product ID: B086KW7HQX
New Balance Men's Standard Boxer Brief, Black/Vision Blue/Marnet/Team Royal, Large
——————————————————————————————————————————————————

🔹 Rank 7 | Product ID: B07G25639W
New Balance Men's 1540 V3 Run

In [6]:
df_check = pd.read_csv("data/processed_small.csv")

In [7]:
df_check.head()

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance
0,16,!awnmower tires without rims,1,B075SCHMPY,us,I,1,1,train,"RamPro 10"" All Purpose Utility Air Tires/Wheel...","<b>About The Ram-Pro All Purpose Utility 10"" A...",✓ The Ram-Pro Ten Inch ready to install Air Ti...,RamPro,10 Inch,"RamPro 10"" All Purpose Utility Air Tires/Wheel...",0
1,17,!awnmower tires without rims,1,B08L3B9B9P,us,E,1,1,train,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,Please check your existing tire Sidewall for t...,MaxAuto,,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,3
2,18,!awnmower tires without rims,1,B082K7V2GZ,us,I,1,1,train,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,,[QUALITY]: Hardened Steel-Iron construction wi...,Neiko,,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,0
3,19,!awnmower tires without rims,1,B07P4CF3DP,us,S,1,1,train,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,"Tire Size: 13 x 5.00 - 6 Axle: 3/4"" inside dia...",,Russo,,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,2
4,20,!awnmower tires without rims,1,B07C1WZG12,us,E,1,1,train,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,No fuss. Just take off your old assembly and r...,Tire size:15x6.00-6 Ply: 4 Tubeless\n6x4.5 Whe...,Antego Tire & Wheel,Husqvarna Silver,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,3


In [8]:
df_check[df_check['query']=='new balance mens orange']

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance
389564,1438348,new balance mens orange,73095,B07F1QT54H,us,S,1,1,test,"New Balance Men's 410 V6 Trail Running Shoe, T...",,"ACTEVA midsole cushioning delivers versatile, ...",New Balance,Team Royal/Alpha Orange,"New Balance Men's 410 V6 Trail Running Shoe, T...",2
389565,1438349,new balance mens orange,73095,B07HMVMT8N,us,E,1,1,test,"New Balance Men's 4040 V5 Metal Baseball Shoe,...",,Traction You Can Trust: These New Balance base...,New Balance,Black | Orange,"New Balance Men's 4040 V5 Metal Baseball Shoe,...",3
389566,1438350,new balance mens orange,73095,B07HMJTJS9,us,E,1,1,test,"New Balance Men's 4040 V5 Turf Baseball Shoe, ...",,A Home Run for Versatility: Designed for the e...,New Balance,Black/Orange,"New Balance Men's 4040 V5 Turf Baseball Shoe, ...",3
389567,1438351,new balance mens orange,73095,B01NA8VF0F,us,E,1,1,test,New Balance Men's 4040 V4 Metal Mid-Cut Baseba...,,Full-length REVlite RC&reg; midsole\nMetal cle...,New Balance,Black/Orange,New Balance Men's 4040 V4 Metal Mid-Cut Baseba...,3
389568,1438352,new balance mens orange,73095,B07PJX2XH1,us,S,1,1,test,"New Balance Men's 517 V2 Cross Trainer, Pigmen...",,Advanced Shock Absorption: These comfortable t...,New Balance,Pigment/Varsity Orange,"New Balance Men's 517 V2 Cross Trainer, Pigmen...",2
389569,1438353,new balance mens orange,73095,B07BL2MWY6,us,S,1,1,test,"New Balance Men's 590 V4 Trail Running Shoe, S...",,All Terrain Outsole\nSynthetic/Mesh Upper\nNB ...,New Balance,Serpent Green/Alpha Orange,"New Balance Men's 590 V4 Trail Running Shoe, S...",2
389570,1438354,new balance mens orange,73095,B07HMVN1PZ,us,E,1,1,test,New Balance Men's 4040 V5 TPU Molded Baseball ...,,Kinetic stitch synthetic upper\nFull-length RE...,New Balance,Black/Orange,New Balance Men's 4040 V5 TPU Molded Baseball ...,3
389571,1438355,new balance mens orange,73095,B079FHGY54,us,S,1,1,test,New Balance Mens X90 Orange,,,New Balance,Orange,New Balance Mens X90 Orange New Balance,2
389572,1438356,new balance mens orange,73095,B01MXNYAEQ,us,S,1,1,test,"New Balance Men's 481 V3 Trail Running Shoe, T...",,"Cush+ midsole cushioning delivers ultra-soft, ...",New Balance,Team Away Grey/Magnet/Black,"New Balance Men's 481 V3 Trail Running Shoe, T...",2
389573,1438357,new balance mens orange,73095,B075R6VGZK,us,E,1,1,test,"New Balance Men's 3000 V4 Turf Baseball Shoe, ...",,Designed for Comfort: This turf version of the...,New Balance,Black/Orange,"New Balance Men's 3000 V4 Turf Baseball Shoe, ...",3


In [12]:
df_train[df_train['product_id']=='B075D95T44']

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance


In [11]:
df_train[df_train['query']=='new balance mens orange']

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance


In [14]:
df_train[df_train['product_title']=="New Balance Men's 990v4 Sneaker, Orange, 11.5 D US"]

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance


Try feeding the full product pool to the model

In [15]:
df_full = pd.read_csv("data/processed_small.csv")
df_full = df_full.dropna(subset=['full_product_text', 'product_id']).reset_index(drop=True)

all_product_texts = df_full['full_product_text'].tolist()
all_product_ids = df_full['product_id'].tolist()

# Generate embeddings
all_product_vecs = model.encode(all_product_texts, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/18793 [00:00<?, ?it/s]

In [None]:

# Save embeddings and product IDs
np.save("embeddings/all_product_vecs.npy", all_product_vecs)
np.save("embeddings/all_product_ids.npy", np.array(all_product_ids))

print("✅ Saved product embeddings and IDs to /embeddings/")

In [None]:


np.savez_compressed(
    'data/saved_embeddings/product_embeddings.npz',
    product_ids=np.array(all_product_ids),
    embeddings=all_product_vecs
)

In [None]:
import faiss

# Normalize embeddings for cosine similarity
faiss.normalize_L2(all_product_vecs)

# Create FAISS index
index = faiss.IndexFlatIP(all_product_vecs.shape[1])
index.add(all_product_vecs)

# Encode a query
query_vec = model.encode(["new balance mens orange"], convert_to_numpy=True)
faiss.normalize_L2(query_vec)

# Search top 10
_, top_k_idx = index.search(query_vec, k=10)
top_k_ids = [all_product_ids[i] for i in top_k_idx[0]]

In [26]:
top_k_ids

['B079FHGY54',
 'B075D95T44',
 'B07W7CWG9F',
 'B0751RDTVZ',
 'B07J5672LP',
 'B07PJX2XH1',
 'B07RJWBW5F',
 'B07BL3D626',
 'B06Y2XLLR6',
 'B06Y2ZW6LP']

In [27]:
print(f"\n🔍 Query: {query_text}")
print(f"\n📈 Top 10 Retrieved Products:")

for rank, i in enumerate(top_k_idx[0], 1):
    product_id = all_product_ids[i]
    product_title = df_full.iloc[i]['product_title']
    print(f"\n🔹 Rank {rank}")
    print(f"Product ID: {product_id}")
    print(f"Title     : {product_title}")
    print("—" * 50)


🔍 Query: new balance mens orange

📈 Top 10 Retrieved Products:

🔹 Rank 1
Product ID: B079FHGY54
Title     : New Balance Mens X90 Orange
——————————————————————————————————————————————————

🔹 Rank 2
Product ID: B075D95T44
Title     : New Balance Men's 990v4 Sneaker, Orange, 11.5 D US
——————————————————————————————————————————————————

🔹 Rank 3
Product ID: B07W7CWG9F
Title     : New Balance Iconic Mens 500 V1 (8.5, Navy/Grey/Gum)
——————————————————————————————————————————————————

🔹 Rank 4
Product ID: B0751RDTVZ
Title     : New Balance Men's 574 V2 Essential Sneaker, Varsity Orange, 17 XW US
——————————————————————————————————————————————————

🔹 Rank 5
Product ID: B07J5672LP
Title     : New Balance Men's 990v5, Burgundy/Navy, 9.5 D US
——————————————————————————————————————————————————

🔹 Rank 6
Product ID: B07PJX2XH1
Title     : New Balance Men's 517 V2 Cross Trainer, Pigment/Varsity Orange, 12
——————————————————————————————————————————————————

🔹 Rank 7
Product ID: B07RJWBW5F
Title     :

Retrain with relevant E and S data only 

In [None]:

# === Load the full training set ===
df_train = pd.read_csv("data/processed_small_25k.csv")
df_train = df_train.dropna(subset=['query', 'full_product_text', 'relevance']).reset_index(drop=True)

# === Filter to only high-relevance (E or S) pairs ===
df_train = df_train[df_train['relevance'] >= 2]  # Keep only E (3) and S (2)
print(f"✅ Filtered training set: {len(df_train)} pairs with relevance ≥ 2")

# === Create InputExamples (positive pairs only) ===
train_samples = [
    InputExample(texts=[row['query'], row['full_product_text']])
    for _, row in df_train.iterrows()
]

# === Initialize sentence-transformers model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Prepare DataLoader and loss ===
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)

# === Train the model ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=8,
    warmup_steps=100,
    show_progress_bar=True
)

# === Save trained model ===
model_save_path = "saved_models/two_tower_mnr_25k_E_and_S"
model.save(model_save_path)
print(f"✅ Model saved to: {model_save_path}")


✅ Filtered training set: 19872 pairs with relevance ≥ 2


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3667
1000,0.2668
1500,0.2125
2000,0.1762
2500,0.1472
3000,0.1234
3500,0.114
4000,0.104
4500,0.0957


✅ Model saved to: saved_models/two_tower_mnr_25k_E_and_S


Create new embeddings 

In [None]:
# Load your fine-tuned model
model = SentenceTransformer("saved_models/two_tower_mnr_25k_E_and_S")

# Recreate product embeddings
df_full = pd.read_csv("data/processed_small.csv")
df_full = df_full.dropna(subset=['full_product_text', 'product_id']).reset_index(drop=True)

all_product_texts = df_full['full_product_text'].tolist()
all_product_ids = df_full['product_id'].tolist()

all_product_vecs = model.encode(all_product_texts, convert_to_numpy=True, show_progress_bar=True)

# Optional: save to disk

np.savez_compressed(
    'data/saved_embeddings/product_embeddings_mnr_25k_ES.npz',
    product_ids=np.array(all_product_ids),
    embeddings=all_product_vecs
)

Batches:   0%|          | 0/18793 [00:00<?, ?it/s]

new test set

Load model and test set

In [38]:
# Load fine-tuned model
model = SentenceTransformer("saved_models/two_tower_mnr_25k_E_and_S")

# Load test set
df_test = pd.read_csv("data/processed_small_4k_test.csv")
df_test = df_test.dropna(subset=['query', 'product_id']).reset_index(drop=True)

queries = df_test['query'].tolist()
true_product_ids = df_test['product_id'].tolist()
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)

Batches:   0%|          | 0/125 [00:00<?, ?it/s]

check recall at 10

In [None]:


def recall_at_k(query_vecs, product_vecs, product_ids, true_product_ids, K=10, sample_size=None, seed=42):
    if sample_size is not None and sample_size < len(query_vecs):
        np.random.seed(seed)
        sample_idx = np.random.choice(len(query_vecs), size=sample_size, replace=False)
        query_vecs = query_vecs[sample_idx]
        true_product_ids = [true_product_ids[i] for i in sample_idx]

    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    hits = 0

    for i, sims in enumerate(sim_matrix):
        top_k_idx = np.argsort(sims)[::-1][:K]
        top_k_ids = [product_ids[j] for j in top_k_idx]
        if true_product_ids[i] in top_k_ids:
            hits += 1

    return hits / len(true_product_ids)

In [40]:
recall_10 = recall_at_k(
    query_vecs, all_product_vecs, all_product_ids,
    true_product_ids, K=10, sample_size=200
)

print(f"✅ Recall@10 (fine-tuned model, 200 queries): {recall_10:.4f}")

✅ Recall@10 (fine-tuned model, 200 queries): 0.1900


In [41]:
import faiss

# Normalize for cosine similarity
faiss.normalize_L2(all_product_vecs)
faiss.normalize_L2(query_vecs)

# Index all products
index = faiss.IndexFlatIP(all_product_vecs.shape[1])
index.add(all_product_vecs)

# Search top 10 for one query
_, top_k_idx = index.search(query_vecs[0:1], k=10)
top_k_ids = [all_product_ids[i] for i in top_k_idx[0]]

# Load product titles
df_products = pd.read_csv("data/processed_small.csv")
product_lookup = dict(zip(df_products['product_id'], df_products['product_title']))

print(f"\n🔍 Query: {queries[0]}")
print(f"✅ Ground truth: {product_lookup.get(true_product_ids[0], '[not found]')}\n")
print("📌 Top 10 Matches:")
for rank, pid in enumerate(top_k_ids, 1):
    print(f"{rank:2d}. {product_lookup.get(pid, '[title not found]')}")


🔍 Query: new balance mens orange
✅ Ground truth: New Balance Men's 4040 V5 Metal Baseball Shoe, Black | Orange, 10 M US

📌 Top 10 Matches:
 1. New Balance Mens X90 Orange
 2. New Balance Iconic Mens 500 V1 (8.5, Navy/Grey/Gum)
 3. New Balance Men's 990v4 Sneaker, Orange, 11.5 D US
 4. New Balance Men's 574 V2 Essential Sneaker, Varsity Orange, 17 XW US
 5. New Balance Men's 990v5, Burgundy/Navy, 9.5 D US
 6. New Balance Men's 574 V2 Sneaker, Black/Neo Classic Blue, 12 M US
 7. New Balance Men's 574 V2 Pebbled Sport Sneaker, Team Royal/Dark, 7 W US
 8. New Balance Men's 574 V2 Evergreen Sneaker, Grey/Grey, 11.5 Wide
 9. New Balance Men's 574 V2 Evergreen Sneaker, Grey/Grey, 13
10. New Balance Men's 517 V2 Cross Trainer, Pigment/Varsity Orange, 12


In [None]:

df_test = pd.read_csv("data/processed_small_4k_test.csv")
df_test = df_test.dropna(subset=['query', 'product_id']).reset_index(drop=True)

queries = df_test['query'].tolist()
true_product_ids = df_test['product_id'].tolist()


In [31]:
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

In [None]:

def recall_at_k(query_vecs, product_vecs, product_ids, true_product_ids, K=10, sample_size=None, seed=42):
    if sample_size is not None and sample_size < len(query_vecs):
        np.random.seed(seed)
        sample_idx = np.random.choice(len(query_vecs), size=sample_size, replace=False)
        query_vecs = query_vecs[sample_idx]
        true_product_ids = [true_product_ids[i] for i in sample_idx]

    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    hits = 0

    for i, sims in enumerate(sim_matrix):
        top_k_idx = np.argsort(sims)[::-1][:K]
        top_k_ids = [product_ids[j] for j in top_k_idx]
        if true_product_ids[i] in top_k_ids:
            hits += 1

    return hits / len(true_product_ids)


In [34]:
recall_10 = recall_at_k(query_vecs, all_product_vecs, all_product_ids, true_product_ids, K=10, sample_size=1)
print(f"✅ Recall@10 (on 200 test queries, new model): {recall_10:.4f}")

✅ Recall@10 (on 200 test queries, new model): 0.0000


In [35]:
recall_10

0.0

In [42]:
print(f"\n🔍 Query: {query_text}")
print(f"\n📈 Top 10 Retrieved Products:")

for rank, i in enumerate(top_k_idx[0], 1):
    product_id = all_product_ids[i]
    product_title = df_full.iloc[i]['product_title']
    print(f"\n🔹 Rank {rank}")
    print(f"Product ID: {product_id}")
    print(f"Title     : {product_title}")
    print("—" * 50)


🔍 Query: new balance mens orange

📈 Top 10 Retrieved Products:

🔹 Rank 1
Product ID: B079FHGY54
Title     : New Balance Mens X90 Orange
——————————————————————————————————————————————————

🔹 Rank 2
Product ID: B07W7CWG9F
Title     : New Balance Iconic Mens 500 V1 (8.5, Navy/Grey/Gum)
——————————————————————————————————————————————————

🔹 Rank 3
Product ID: B075D95T44
Title     : New Balance Men's 990v4 Sneaker, Orange, 11.5 D US
——————————————————————————————————————————————————

🔹 Rank 4
Product ID: B0751RDTVZ
Title     : New Balance Men's 574 V2 Essential Sneaker, Varsity Orange, 17 XW US
——————————————————————————————————————————————————

🔹 Rank 5
Product ID: B07J5672LP
Title     : New Balance Men's 990v5, Burgundy/Navy, 9.5 D US
——————————————————————————————————————————————————

🔹 Rank 6
Product ID: B07RJWBW5F
Title     : New Balance Men's 574 V2 Sneaker, Black/Neo Classic Blue, 12 M US
——————————————————————————————————————————————————

🔹 Rank 7
Product ID: B07BL3D626
Title     : 

See Impact of cleaning Dataset from HTML Tags

In [None]:


def desc_cleaner(text):
    if pd.isnull(text): return ""
    cleaner = re.compile('<.*?>')
    return re.sub(cleaner, '', str(text)).strip()

In [44]:
df = pd.read_csv("data/processed_small.csv")

columns_to_clean = [
    'product_title',
    'product_description',
    'product_bullet_point',
    'product_brand'
]

for col in columns_to_clean:
    if col in df.columns:
        df[col] = df[col].apply(desc_cleaner)

# Optional: regenerate full_product_text
df['full_product_text'] = (
    df['product_title'] + ' ' +
    df['product_description'] + ' ' +
    df['product_brand'] + ' ' +
    df['product_bullet_point']
)

# Save cleaned version
df.to_csv("data/processed_small_cleaned.csv", index=False)

In [45]:
df = pd.read_csv("data/processed_small_25k.csv")

columns_to_clean = [
    'product_title',
    'product_description',
    'product_bullet_point',
    'product_brand'
]

for col in columns_to_clean:
    if col in df.columns:
        df[col] = df[col].apply(desc_cleaner)

# Optional: regenerate full_product_text
df['full_product_text'] = (
    df['product_title'] + ' ' +
    df['product_description'] + ' ' +
    df['product_brand'] + ' ' +
    df['product_bullet_point']
)

# Save cleaned version
df.to_csv("data/processed_small_25k_cleaned.csv", index=False)

In [46]:
df = pd.read_csv("data/processed_small_4k_test.csv")

columns_to_clean = [
    'product_title',
    'product_description',
    'product_bullet_point',
    'product_brand'
]

for col in columns_to_clean:
    if col in df.columns:
        df[col] = df[col].apply(desc_cleaner)

# Optional: regenerate full_product_text
df['full_product_text'] = (
    df['product_title'] + ' ' +
    df['product_description'] + ' ' +
    df['product_brand'] + ' ' +
    df['product_bullet_point']
)

# Save cleaned version
df.to_csv("data/processed_small_4k_test_cleaned.csv", index=False)

What loss function to use ? 
MSE to : https://amazonkddcup.github.io/papers/8610.pdf


Train using cleaned data (NO HTML tags) , and also use softmax as the loss function.

In [None]:


# === Load Cleaned Dataset ===
df_train = pd.read_csv("data/processed_small_25k_cleaned.csv")
df_train = df_train.dropna(subset=['query', 'full_product_text', 'relevance']).reset_index(drop=True)

# === Optional: Cast relevance to int (in case it's float) ===
df_train['relevance'] = df_train['relevance'].astype(int)

# === Create label encoder (SoftmaxLoss requires integer labels starting from 0) ===
label_encoder = LabelEncoder()
df_train['relevance_label'] = label_encoder.fit_transform(df_train['relevance'])

# === Prepare InputExamples ===
train_samples = [
    InputExample(texts=[row['query'], row['full_product_text']], label=row['relevance_label'])
    for _, row in df_train.iterrows()
]

# === Load sentence-transformer model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Create DataLoader and SoftmaxLoss ===
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(df_train['relevance_label'].unique())
)

# === Train the model ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=8,  # adjust based on available GPU time
    warmup_steps=100,
    show_progress_bar=True
)

# === Save the model ===
model_save_path = "saved_models/two_tower_softmaxloss_25k_cleaned"
model.save(model_save_path)
print(f"✅ Model saved to: {model_save_path}")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,1.2325
1000,1.1927
1500,1.1833
2000,1.1723
2500,1.1666
3000,1.1529
3500,1.1523
4000,1.1399
4500,1.1357
5000,1.1333


✅ Model saved to: saved_models/two_tower_softmaxloss_25k_cleaned


In [57]:
df_train['relevance'].value_counts()

relevance
3    11006
2     8866
0     4077
1     1051
Name: count, dtype: int64

In [58]:
df_train

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance,relevance_label
0,2191513,woman penny loafers shoes,112396,B07FT598ZL,us,E,1,1,train,"Sperry Women's Seaport Penny Tri Tone Loafer, ...",,"MATERIALS Full-grain, tri-tone leather upper w...",Sperry,Tan/White,"Sperry Women's Seaport Penny Tri Tone Loafer, ...",3,3
1,1711261,retaine eye drops for dry eyes preservative free,87261,B0170RDJQW,us,S,1,1,train,Systane Long Lasting Lubricant Eye Drops Vials...,,#1 Dr Recommended Brand of Artificial Tears*\n...,Systane,N/a,Systane Long Lasting Lubricant Eye Drops Vials...,2,2
2,1096137,jamie oliver veg,55413,1452179611,us,S,1,1,test,From Crook to Cook: Platinum Recipes from Tha ...,,,Chronicle Books,,From Crook to Cook: Platinum Recipes from Tha ...,2,2
3,125101,75 inch grill cover,5188,B000NWAO74,us,I,1,1,train,Classic Accessories Veranda Water-Resistant 72...,,THE CLASSIC ACCESSORIES DIFFERENCE: Veranda co...,Classic Accessories,Pebble,Classic Accessories Veranda Water-Resistant 72...,0,0
4,1961897,tai chi,100405,B07YNXWHTR,us,E,1,1,train,Tai Chi: Discover The Ancient Art,,,,,Tai Chi: Discover The Ancient Art,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,382881,books best sellers,18405,B083QQ7V97,us,S,1,1,train,The Pale-Faced Lie: A True Story,"Growing up on the Navajo Indian Reservation, D...",,,,The Pale-Faced Lie: A True Story Growing up on...,2,2
24996,181505,alka seltzer without aspirin,8043,B01IAI022O,us,E,1,1,test,Alka-Seltzer Effervescent Tablets Lemon Lime 3...,UPC 16500567301,,Alka-Seltzer,,Alka-Seltzer Effervescent Tablets Lemon Lime 3...,3,3
24997,1644920,proud not loud,83828,B07SQH6RZ3,us,I,1,1,train,The Loud House Lincoln Loud The Man With The P...,,Officially Licensed The Loud House Apparel\n19...,The Loud House,White,The Loud House Lincoln Loud The Man With The P...,0,0
24998,2037272,torxh,104326,B07CWKNM5W,us,E,1,1,test,Authenzo BS-400 Kitchen Torch Lighter (Butane ...,,,Authenzo,Black,Authenzo BS-400 Kitchen Torch Lighter (Butane ...,3,3


In [None]:


# === Load cleaned full product pool ===
df_products = pd.read_csv("data/processed_small_cleaned.csv")

# === Drop rows with missing values just in case ===
df_products = df_products.dropna(subset=['product_id', 'full_product_text']).reset_index(drop=True)

# === Extract fields ===
product_ids = df_products['product_id'].tolist()
product_texts = df_products['full_product_text'].tolist()

# === Load trained model ===
model = SentenceTransformer("saved_models/two_tower_softmaxloss_25k_cleaned")

# === Encode products ===
print("🔄 Generating embeddings for full product pool...")
product_vecs = model.encode(product_texts, convert_to_numpy=True, show_progress_bar=True)

# === Save to compressed .npz format ===
np.savez_compressed(
    "data/saved_embeddings/product_embeddings_softmax_fullpool.npz",
    product_ids=np.array(product_ids),
    embeddings=product_vecs
)

print("✅ Saved full product pool embeddings to: product_embeddings_softmax_fullpool.npz")


🔄 Generating embeddings for full product pool...


Batches:   0%|          | 0/18793 [00:00<?, ?it/s]

✅ Saved full product pool embeddings to: product_embeddings_softmax_fullpool.npz


In [None]:

# === Load fine-tuned model ===
model = SentenceTransformer("saved_models/two_tower_softmaxloss_25k_cleaned")

# === Load full product pool embeddings ===
npz = np.load("data/saved_embeddings/product_embeddings_softmax_fullpool.npz", allow_pickle=True)
product_ids = npz["product_ids"]
product_vecs = npz["embeddings"]

# === Load cleaned test set ===
df_test = pd.read_csv("data/processed_small_4k_test_cleaned.csv")
df_test = df_test.dropna(subset=["query", "product_id", "full_product_text"]).reset_index(drop=True)

# === Prepare test queries and ground truth ===
queries = df_test["query"].tolist()
true_product_ids = df_test["product_id"].tolist()

# === Generate query embeddings ===
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)

# === Evaluation function ===
def evaluate_recall_mrr(query_vecs, product_vecs, product_ids, true_product_ids, K=10):
    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    recall_hits = 0
    reciprocal_ranks = []

    for i, sims in enumerate(sim_matrix):
        ranked_indices = np.argsort(sims)[::-1]
        ranked_ids = [product_ids[j] for j in ranked_indices[:K]]

        # Recall@K
        if true_product_ids[i] in ranked_ids:
            recall_hits += 1

        # MRR
        try:
            rank = ranked_ids.index(true_product_ids[i]) + 1
            reciprocal_ranks.append(1 / rank)
        except ValueError:
            reciprocal_ranks.append(0)

    recall_at_k = recall_hits / len(true_product_ids)
    mrr = np.mean(reciprocal_ranks)
    return recall_at_k, mrr

# === Run evaluation (on all or a subset) ===
subset_size = 200  # Optional: change this to evaluate fewer rows for speed
recall, mrr = evaluate_recall_mrr(
    query_vecs[:subset_size],
    product_vecs,
    product_ids,
    true_product_ids[:subset_size],
    K=10
)

print(f"✅ Recall@10: {recall:.4f}")
print(f"✅ MRR@10   : {mrr:.4f}")


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

✅ Recall@10: 0.0050
✅ MRR@10   : 0.0017


In [None]:
def compute_ndcg(query_vecs, product_vecs, product_ids, df_test, K=10):
    # Create a mapping of product_id to relevance
    relevance_dict = dict(zip(df_test["product_id"], df_test["relevance"]))

    # Similarity matrix
    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    ndcg_scores = []

    for i, sims in enumerate(sim_matrix):
        ranked_indices = np.argsort(sims)[::-1][:K]
        ranked_ids = [product_ids[j] for j in ranked_indices]

        # Get relevance scores for the top-K predictions
        rel_scores = [relevance_dict.get(pid, 0) for pid in ranked_ids]

        # Compute DCG
        dcg = sum([
            (rel / np.log2(idx + 2))  # log2(idx + 2) because idx starts at 0
            for idx, rel in enumerate(rel_scores)
        ])

        # Compute IDCG (ideal DCG)
        true_rel = sorted([relevance_dict.get(df_test.iloc[i]["product_id"], 0)] + [0]*(K-1), reverse=True)
        idcg = sum([
            (rel / np.log2(idx + 2))
            for idx, rel in enumerate(true_rel)
        ])

        ndcg = dcg / idcg if idcg > 0 else 0
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)


In [52]:
subset_size = 200  # adjust as needed

# Make sure 'relevance' column is available
df_test = df_test.copy()
df_test['relevance'] = df_test['relevance'].astype(int)

# Evaluate nDCG@10
ndcg_10 = compute_ndcg(
    query_vecs[:subset_size],
    product_vecs,
    product_ids,
    df_test.iloc[:subset_size],
    K=10
)

print(f"📊 nDCG@10: {ndcg_10:.4f}")


📊 nDCG@10: 0.0025


In [None]:


# === Load fine-tuned model ===
model = SentenceTransformer("saved_models/two_tower_softmaxloss_25k_cleaned")

# === Load full product pool data and embeddings ===
df_products = pd.read_csv("data/processed_small_cleaned.csv")  # full cleaned product pool
npz = np.load("data/saved_embeddings/product_embeddings_softmax_fullpool.npz", allow_pickle=True)
all_product_ids = npz["product_ids"]
all_product_vecs = npz["embeddings"]

# === Normalize for FAISS inner product search ===
faiss.normalize_L2(all_product_vecs)

# === Build FAISS index ===
index = faiss.IndexFlatIP(all_product_vecs.shape[1])
index.add(all_product_vecs)

# === Encode and normalize the query ===
query = "new balance mens orange"
query_vec = model.encode([query], convert_to_numpy=True)
faiss.normalize_L2(query_vec)

# === Search top 10 ===
_, top_k_idx = index.search(query_vec, k=50)
top_k_ids = [all_product_ids[i] for i in top_k_idx[0]]

# === Show top 10 products ===
print(f"\n🔍 Query: {query}\n📈 Top 10 Retrieved Products:\n")

for rank, pid in enumerate(top_k_ids, 1):
    match = df_products[df_products["product_id"] == pid]
    if not match.empty:
        title = match.iloc[0]["product_title"]
        print(f"🔹 Rank {rank}")
        print(f"Product ID: {pid}")
        print(f"Title     : {title}")
        print("—" * 60)



🔍 Query: new balance mens orange
📈 Top 10 Retrieved Products:

🔹 Rank 1
Product ID: B004APOM7S
Title     : Barbie in the Nutcracker
————————————————————————————————————————————————————————————
🔹 Rank 2
Product ID: B00C4QSH4E
Title     : Celtic Pride
————————————————————————————————————————————————————————————
🔹 Rank 3
Product ID: B007GMIHQE
Title     : Amazing Grace And Chuck
————————————————————————————————————————————————————————————
🔹 Rank 4
Product ID: 0988988526
Title     : Dark Church
————————————————————————————————————————————————————————————
🔹 Rank 5
Product ID: B07G9Q9YH6
Title     : The Real Bachelorettes
————————————————————————————————————————————————————————————
🔹 Rank 6
Product ID: 0738772143
Title     : Tarot Original 1909 Circular Deck
————————————————————————————————————————————————————————————
🔹 Rank 7
Product ID: B01ATQBGUC
Title     : Effie Gray
————————————————————————————————————————————————————————————
🔹 Rank 8
Product ID: 3037666005
Title     : Tuscany Nudes
—

In [54]:
# === Encode and normalize the query ===
query = "new balance mens orange"
query_vec = model.encode([query], convert_to_numpy=True)
faiss.normalize_L2(query_vec)

# === Search top 10 ===
_, top_k_idx = index.search(query_vec, k=50)
top_k_ids = [all_product_ids[i] for i in top_k_idx[0]]

# === Show top 10 products ===
print(f"\n🔍 Query: {query}\n📈 Top 10 Retrieved Products:\n")

for rank, pid in enumerate(top_k_ids, 1):
    match = df_products[df_products["product_id"] == pid]
    if not match.empty:
        title = match.iloc[0]["product_title"]
        print(f"🔹 Rank {rank}")
        print(f"Product ID: {pid}")
        print(f"Title     : {title}")
        print("—" * 60)



🔍 Query: new balance mens orange
📈 Top 10 Retrieved Products:

🔹 Rank 1
Product ID: B004APOM7S
Title     : Barbie in the Nutcracker
————————————————————————————————————————————————————————————
🔹 Rank 2
Product ID: B00C4QSH4E
Title     : Celtic Pride
————————————————————————————————————————————————————————————
🔹 Rank 3
Product ID: B007GMIHQE
Title     : Amazing Grace And Chuck
————————————————————————————————————————————————————————————
🔹 Rank 4
Product ID: 0988988526
Title     : Dark Church
————————————————————————————————————————————————————————————
🔹 Rank 5
Product ID: B07G9Q9YH6
Title     : The Real Bachelorettes
————————————————————————————————————————————————————————————
🔹 Rank 6
Product ID: 0738772143
Title     : Tarot Original 1909 Circular Deck
————————————————————————————————————————————————————————————
🔹 Rank 7
Product ID: B01ATQBGUC
Title     : Effie Gray
————————————————————————————————————————————————————————————
🔹 Rank 8
Product ID: 3037666005
Title     : Tuscany Nudes
—

In [55]:
df_products

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance
0,16,!awnmower tires without rims,1,B075SCHMPY,us,I,1,1,train,"RamPro 10"" All Purpose Utility Air Tires/Wheel...","About The Ram-Pro All Purpose Utility 10"" Air ...",✓ The Ram-Pro Ten Inch ready to install Air Ti...,RamPro,10 Inch,"RamPro 10"" All Purpose Utility Air Tires/Wheel...",0
1,17,!awnmower tires without rims,1,B08L3B9B9P,us,E,1,1,train,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,Please check your existing tire Sidewall for t...,MaxAuto,,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,3
2,18,!awnmower tires without rims,1,B082K7V2GZ,us,I,1,1,train,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,,[QUALITY]: Hardened Steel-Iron construction wi...,Neiko,,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,0
3,19,!awnmower tires without rims,1,B07P4CF3DP,us,S,1,1,train,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,"Tire Size: 13 x 5.00 - 6 Axle: 3/4"" inside dia...",,Russo,,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,2
4,20,!awnmower tires without rims,1,B07C1WZG12,us,E,1,1,train,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,No fuss. Just take off your old assembly and r...,Tire size:15x6.00-6 Ply: 4 Tubeless\n6x4.5 Whe...,Antego Tire & Wheel,Husqvarna Silver,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601349,2618565,자전거트레일러,130539,B06XTZYJL3,us,I,1,1,train,Nilight - ZH003 20Inch 126W Spot Flood Combo L...,,Super bright beam: High intensity 3W LED chips...,Nilight,,Nilight - ZH003 20Inch 126W Spot Flood Combo L...,0
601350,2618566,자전거트레일러,130539,B010LLGWL8,us,E,1,1,train,"Burley Honey Bee, 2 Seat Kids Bike Trailer & S...",,Sport Type: Cycling,Burley Design,Red,"Burley Honey Bee, 2 Seat Kids Bike Trailer & S...",3
601351,2618567,자전거트레일러,130539,B010LLGWKE,us,E,1,1,train,"BURLEY Design Bee, 2 Seat, Lightweight, Kids B...",,sport type: Cycling,Burley Design,Yellow,"BURLEY Design Bee, 2 Seat, Lightweight, Kids B...",3
601352,2618568,자전거트레일러,130539,B003CUBPUY,us,I,1,1,train,"Bell 20-Inch Universal Inner Tube, Width Fit R...",,Mold cured rubber for consistent side wall–pre...,BELL,"20""x1.75-2.25"" Schrader","Bell 20-Inch Universal Inner Tube, Width Fit R...",0


In [56]:
df_products[df_products['query']=='new balance mens orange']

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance
389564,1438348,new balance mens orange,73095,B07F1QT54H,us,S,1,1,test,"New Balance Men's 410 V6 Trail Running Shoe, T...",,"ACTEVA midsole cushioning delivers versatile, ...",New Balance,Team Royal/Alpha Orange,"New Balance Men's 410 V6 Trail Running Shoe, T...",2
389565,1438349,new balance mens orange,73095,B07HMVMT8N,us,E,1,1,test,"New Balance Men's 4040 V5 Metal Baseball Shoe,...",,Traction You Can Trust: These New Balance base...,New Balance,Black | Orange,"New Balance Men's 4040 V5 Metal Baseball Shoe,...",3
389566,1438350,new balance mens orange,73095,B07HMJTJS9,us,E,1,1,test,"New Balance Men's 4040 V5 Turf Baseball Shoe, ...",,A Home Run for Versatility: Designed for the e...,New Balance,Black/Orange,"New Balance Men's 4040 V5 Turf Baseball Shoe, ...",3
389567,1438351,new balance mens orange,73095,B01NA8VF0F,us,E,1,1,test,New Balance Men's 4040 V4 Metal Mid-Cut Baseba...,,Full-length REVlite RC&reg; midsole\nMetal cle...,New Balance,Black/Orange,New Balance Men's 4040 V4 Metal Mid-Cut Baseba...,3
389568,1438352,new balance mens orange,73095,B07PJX2XH1,us,S,1,1,test,"New Balance Men's 517 V2 Cross Trainer, Pigmen...",,Advanced Shock Absorption: These comfortable t...,New Balance,Pigment/Varsity Orange,"New Balance Men's 517 V2 Cross Trainer, Pigmen...",2
389569,1438353,new balance mens orange,73095,B07BL2MWY6,us,S,1,1,test,"New Balance Men's 590 V4 Trail Running Shoe, S...",,All Terrain Outsole\nSynthetic/Mesh Upper\nNB ...,New Balance,Serpent Green/Alpha Orange,"New Balance Men's 590 V4 Trail Running Shoe, S...",2
389570,1438354,new balance mens orange,73095,B07HMVN1PZ,us,E,1,1,test,New Balance Men's 4040 V5 TPU Molded Baseball ...,,Kinetic stitch synthetic upper\nFull-length RE...,New Balance,Black/Orange,New Balance Men's 4040 V5 TPU Molded Baseball ...,3
389571,1438355,new balance mens orange,73095,B079FHGY54,us,S,1,1,test,New Balance Mens X90 Orange,,,New Balance,Orange,New Balance Mens X90 Orange New Balance,2
389572,1438356,new balance mens orange,73095,B01MXNYAEQ,us,S,1,1,test,"New Balance Men's 481 V3 Trail Running Shoe, T...",,"Cush+ midsole cushioning delivers ultra-soft, ...",New Balance,Team Away Grey/Magnet/Black,"New Balance Men's 481 V3 Trail Running Shoe, T...",2
389573,1438357,new balance mens orange,73095,B075R6VGZK,us,E,1,1,test,"New Balance Men's 3000 V4 Turf Baseball Shoe, ...",,Designed for Comfort: This turf version of the...,New Balance,Black/Orange,"New Balance Men's 3000 V4 Turf Baseball Shoe, ...",3


Redo MNR with cleaned HTML

In [None]:


# === Load Cleaned Dataset ===
df_train = pd.read_csv("data/processed_small_25k_cleaned.csv")
df_train = df_train.dropna(subset=['query', 'full_product_text', 'relevance']).reset_index(drop=True)

# === Filter to only E and S relevance classes ===
df_train = df_train[df_train['relevance'] >= 2]  # Keep only E (3) and S (2)
print(f"✅ Filtered training set: {len(df_train)} pairs with relevance ≥ 2")

# === Create InputExample list (no labels needed for MNR) ===
train_samples = [
    InputExample(texts=[row['query'], row['full_product_text']])
    for _, row in df_train.iterrows()
]

# === Load pretrained SentenceTransformer model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Setup DataLoader and MNR Loss ===
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)

# === Train the model ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=8,
    warmup_steps=100,
    show_progress_bar=True
)

# === Save the fine-tuned model ===
model_save_path = "saved_models/two_tower_mnr_25k_cleaned_ES"
model.save(model_save_path)
print(f"✅ Model saved to: {model_save_path}")


✅ Filtered training set: 19872 pairs with relevance ≥ 2


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3655
1000,0.2663
1500,0.2121
2000,0.1754
2500,0.1471
3000,0.1236
3500,0.1142
4000,0.1037
4500,0.0951


✅ Model saved to: saved_models/two_tower_mnr_25k_cleaned_ES


Generate Embeddings

In [None]:
# === Load fine-tuned model ===
model_path = "saved_models/two_tower_mnr_25k_cleaned_ES"
model = SentenceTransformer(model_path)

# === Load full cleaned product pool ===
df_full = pd.read_csv("data/processed_small_cleaned.csv")
df_full = df_full.dropna(subset=['full_product_text', 'product_id']).reset_index(drop=True)

all_product_texts = df_full['full_product_text'].tolist() 
all_product_ids = df_full['product_id'].tolist()

# === Generate product embeddings ===
all_product_vecs = model.encode(all_product_texts, convert_to_numpy=True, show_progress_bar=True)

# === Save embeddings ===
np.savez_compressed(
    'data/saved_embeddings/product_embeddings_mnr_25k_cleaned_ES.npz',
    product_ids=np.array(all_product_ids),
    embeddings=all_product_vecs
)
print("✅ Product embeddings saved.")


Batches:   0%|          | 0/18793 [00:00<?, ?it/s]

✅ Product embeddings saved.


In [None]:
from sklearn.metrics import ndcg_score

# === Load test queries ===
df_test = pd.read_csv("data/processed_small_4k_test_cleaned.csv")
df_test = df_test.dropna(subset=['query', 'product_id', 'relevance']).reset_index(drop=True)

# Prepare query list and true product ids
queries = df_test['query'].tolist()
true_product_ids = df_test['product_id'].tolist()

# === Encode queries ===
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)

# === Load product pool ===
product_data = np.load('data/saved_embeddings/product_embeddings_mnr_25k_cleaned_ES.npz', allow_pickle=True)
all_product_vecs = product_data['embeddings']
all_product_ids = product_data['product_ids'].tolist()

# === Evaluation functions ===

def recall_at_k(query_vecs, product_vecs, product_ids, true_ids, k=10):
    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    hits = 0
    for i, sims in enumerate(sim_matrix):
        top_k = np.argsort(sims)[::-1][:k]
        top_k_ids = [product_ids[j] for j in top_k]
        if true_ids[i] in top_k_ids:
            hits += 1
    return hits / len(true_ids)

def mrr_at_k(query_vecs, product_vecs, product_ids, true_ids, k=10):
    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    reciprocal_ranks = []
    for i, sims in enumerate(sim_matrix):
        top_k = np.argsort(sims)[::-1][:k]
        top_k_ids = [product_ids[j] for j in top_k]
        if true_ids[i] in top_k_ids:
            rank = top_k_ids.index(true_ids[i]) + 1
            reciprocal_ranks.append(1 / rank)
        else:
            reciprocal_ranks.append(0.0)
    return np.mean(reciprocal_ranks)

def ndcg_at_k(query_vecs, product_vecs, product_ids, true_ids, k=10):
    sim_matrix = cosine_similarity(query_vecs, product_vecs)
    scores = []
    for i, sims in enumerate(sim_matrix):
        top_k = np.argsort(sims)[::-1][:k]
        relevance = [1 if product_ids[j] == true_ids[i] else 0 for j in top_k]
        scores.append(relevance)
    return np.mean([ndcg_score([rel], [list(range(len(rel), 0, -1))]) for rel in scores])

# === Sample 200 test cases for fast evaluation ===
sample_n = 200
query_vecs_sample = query_vecs[:sample_n]
true_ids_sample = true_product_ids[:sample_n]

# === Evaluate ===
recall = recall_at_k(query_vecs_sample, all_product_vecs, all_product_ids, true_ids_sample, k=10)
mrr = mrr_at_k(query_vecs_sample, all_product_vecs, all_product_ids, true_ids_sample, k=10)
ndcg = ndcg_at_k(query_vecs_sample, all_product_vecs, all_product_ids, true_ids_sample, k=10)

print(f"📊 Recall@10: {recall:.4f}")
print(f"📊 MRR@10   : {mrr:.4f}")
print(f"📊 nDCG@10  : {ndcg:.4f}")


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

📊 Recall@10: 0.1650
📊 MRR@10   : 0.0579
📊 nDCG@10  : 0.0836


Check Distribution of full Dataset , and generate balanced dataset

In [62]:
import pandas as pd

df_full = pd.read_csv("data/processed_small_cleaned.csv")
label_dist = df_full['relevance'].value_counts(normalize=True).sort_index()
print("🔍 Full dataset distribution:\n", label_dist)


🔍 Full dataset distribution:
 relevance
0    0.168698
1    0.045213
2    0.351192
3    0.434897
Name: proportion, dtype: float64


Generate Balanced Dataset

In [63]:
from sklearn.utils import shuffle

# Group all rows by query
grouped = df_full.groupby("query")

sampled_rows = []

# Define the target number of samples (e.g., 25k)
target_size = 25000

# Calculate how many per class
target_counts = (label_dist * target_size).round().astype(int)
print("🎯 Target sample counts:\n", target_counts)

# Sample from each class proportionally
for relevance_class, count in target_counts.items():
    # Filter all rows with this relevance class
    df_class = df_full[df_full['relevance'] == relevance_class]
    df_sample = df_class.sample(n=count, random_state=42)
    sampled_rows.append(df_sample)

# Combine and shuffle
df_balanced_sample = pd.concat(sampled_rows).reset_index(drop=True)
df_balanced_sample = shuffle(df_balanced_sample, random_state=42)

# Save for training
df_balanced_sample.to_csv("data/train_balanced_25k.csv", index=False)
print("✅ Saved balanced training set.")


🎯 Target sample counts:
 relevance
0     4217
1     1130
2     8780
3    10872
Name: proportion, dtype: int32
✅ Saved balanced training set.


Train New Model using balanced Dataset

In [None]:

# === Load cleaned and balanced training set ===
df_train = pd.read_csv("data/train_balanced_25k.csv")
df_train = df_train.dropna(subset=['query', 'full_product_text', 'relevance']).reset_index(drop=True)

# === Filter for E and S only (positive pairs) ===
df_train = df_train[df_train['relevance'] >= 2]
print(f"✅ Training on {len(df_train)} E/S samples.")

# === Convert to InputExample list ===
train_samples = [
    InputExample(texts=[row['query'], row['full_product_text']])
    for _, row in df_train.iterrows()
]

# === Load base model ===
model = SentenceTransformer('all-MiniLM-L6-v2')

# === Prepare DataLoader and Loss ===
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(model)

# === Train ===
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=8,
    warmup_steps=100,
    show_progress_bar=True
)

# === Save Trained Model ===
model_save_path = "saved_models/two_tower_mnr_balanced_25k"
model.save(model_save_path)
print(f"✅ Model saved to: {model_save_path}")


✅ Training on 19652 E/S samples.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.3592
1000,0.2654
1500,0.2041
2000,0.1702
2500,0.142
3000,0.1171
3500,0.1102
4000,0.0988
4500,0.0949


✅ Model saved to: saved_models/two_tower_mnr_balanced_25k


Generate Test Set

In [65]:
# === Load full cleaned dataset and training set ===
df_full = pd.read_csv("data/processed_small_cleaned.csv")
df_train = pd.read_csv("data/train_balanced_25k.csv")

# === Drop duplicates and NAs just to be safe ===
df_full = df_full.dropna(subset=['query', 'product_id', 'full_product_text']).drop_duplicates()
df_train = df_train.dropna(subset=['query', 'product_id']).drop_duplicates()

# === Filter out training queries from test candidates ===
train_queries = set(df_train['query'].unique())
df_test_candidates = df_full[~df_full['query'].isin(train_queries)]

# === Optional: Keep only rows with labeled relevance (E/S/C/I) ===
df_test_candidates = df_test_candidates[df_test_candidates['relevance'].notnull()]

# === Sample 4,000 for test ===
df_test = df_test_candidates.sample(n=4000, random_state=42).reset_index(drop=True)

# === Save test set ===
df_test.to_csv("data/test_4k_nonoverlap.csv", index=False)
print("✅ Test set saved to: data/test_4k_nonoverlap.csv")


✅ Test set saved to: data/test_4k_nonoverlap.csv


In [None]:

# Load cleaned product pool
df_products = pd.read_csv("data/processed_small_cleaned.csv")
df_products = df_products.dropna(subset=["product_id", "full_product_text"]).drop_duplicates()

product_texts = df_products["full_product_text"].astype(str).tolist()
product_ids = df_products["product_id"].astype(str).tolist()


In [None]:


# Load your trained two-tower model
model = SentenceTransformer("saved_models/two_tower_mnr_balanced_25k")

In [None]:

# Generate product embeddings
product_vecs = model.encode(
    product_texts,
    convert_to_numpy=True,
    batch_size=64,
    show_progress_bar=True
)

# Save embeddings for future inference
np.savez_compressed(
    "data/saved_embeddings/product_embeddings_mnr_balanced_25k.npz",
    product_ids=np.array(product_ids),
    embeddings=product_vecs
)

print("✅ Product embeddings saved.")


Batches:   0%|          | 0/9397 [00:00<?, ?it/s]

✅ Product embeddings saved.


Evaluation

In [75]:

# Load trained model
model = SentenceTransformer("saved_models/two_tower_mnr_balanced_25k")

# Load saved product embeddings
data = np.load("data/saved_embeddings/product_embeddings_mnr_balanced_25k.npz", allow_pickle=True)
product_vecs = data["embeddings"]
product_ids = data["product_ids"]

# Load product metadata
df_products = pd.read_csv("data/processed_small_cleaned.csv")
product_id_to_title = dict(zip(df_products["product_id"].astype(str), df_products["product_title"].astype(str)))



df_query_test = pd.read_csv("data/test_4k_query_only.csv")
# Get all queries and relevant product ID lists
queries = df_query_test["query"].tolist()
relevant_lists = df_query_test["relevant_product_ids"].apply(lambda x: set(str(pid) for pid in eval(x))).tolist()

# Encode all queries at once
query_vecs = model.encode(queries, convert_to_numpy=True, show_progress_bar=True)



Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [77]:
df_full

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color,full_product_text,relevance
0,16,!awnmower tires without rims,1,B075SCHMPY,us,I,1,1,train,"RamPro 10"" All Purpose Utility Air Tires/Wheel...","About The Ram-Pro All Purpose Utility 10"" Air ...",✓ The Ram-Pro Ten Inch ready to install Air Ti...,RamPro,10 Inch,"RamPro 10"" All Purpose Utility Air Tires/Wheel...",0
1,17,!awnmower tires without rims,1,B08L3B9B9P,us,E,1,1,train,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,Please check your existing tire Sidewall for t...,MaxAuto,,MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...,3
2,18,!awnmower tires without rims,1,B082K7V2GZ,us,I,1,1,train,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,,[QUALITY]: Hardened Steel-Iron construction wi...,Neiko,,NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...,0
3,19,!awnmower tires without rims,1,B07P4CF3DP,us,S,1,1,train,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,"Tire Size: 13 x 5.00 - 6 Axle: 3/4"" inside dia...",,Russo,,2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...,2
4,20,!awnmower tires without rims,1,B07C1WZG12,us,E,1,1,train,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,No fuss. Just take off your old assembly and r...,Tire size:15x6.00-6 Ply: 4 Tubeless\n6x4.5 Whe...,Antego Tire & Wheel,Husqvarna Silver,(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601349,2618565,자전거트레일러,130539,B06XTZYJL3,us,I,1,1,train,Nilight - ZH003 20Inch 126W Spot Flood Combo L...,,Super bright beam: High intensity 3W LED chips...,Nilight,,Nilight - ZH003 20Inch 126W Spot Flood Combo L...,0
601350,2618566,자전거트레일러,130539,B010LLGWL8,us,E,1,1,train,"Burley Honey Bee, 2 Seat Kids Bike Trailer & S...",,Sport Type: Cycling,Burley Design,Red,"Burley Honey Bee, 2 Seat Kids Bike Trailer & S...",3
601351,2618567,자전거트레일러,130539,B010LLGWKE,us,E,1,1,train,"BURLEY Design Bee, 2 Seat, Lightweight, Kids B...",,sport type: Cycling,Burley Design,Yellow,"BURLEY Design Bee, 2 Seat, Lightweight, Kids B...",3
601352,2618568,자전거트레일러,130539,B003CUBPUY,us,I,1,1,train,"Bell 20-Inch Universal Inner Tube, Width Fit R...",,Mold cured rubber for consistent side wall–pre...,BELL,"20""x1.75-2.25"" Schrader","Bell 20-Inch Universal Inner Tube, Width Fit R...",0


In [81]:
# === ESCI point mapping ===
esci_points = {3: 3, 2: 2, 1: 1, 0: 0} 

# === Load ESCI labels for all product IDs ===
product_id_to_label = dict(zip(df_full["product_id"].astype(str), df_full["relevance"].astype(int)))
product_id_to_title = dict(zip(df_full["product_id"].astype(str), df_full["product_title"]))

# === Setup
total_possible = 0
total_achieved = 0

# === Evaluate
for i, query_vec in enumerate(query_vecs):
    query = queries[i]
    relevant_ids = set(str(pid) for pid in eval(df_query_test.loc[i, "relevant_product_ids"]))

    scores = cosine_similarity([query_vec], product_vecs).flatten()
    top_k_idx = scores.argsort()[::-1][:3]
    top_k_pred_ids = [str(product_ids[idx]) for idx in top_k_idx]

    # print(f"\n🔍 Query: {query}")
    found = False
    for rank, pid in enumerate(top_k_pred_ids, 1):
        title = product_id_to_title.get(pid, "Unknown Title")
        # print(f"{rank}. {title} (ID: {pid})")
        if not found and pid in relevant_ids:
            label = product_id_to_label.get(pid, 0)  # default to 'I' if not found
            points = esci_points.get(label, 0)
            total_achieved += points
            found = True
            # print(f"✅ Matched with label {label} → +{points} points")

    # Add max possible score for this query
    # We assume the best label among relevant_ids
    max_label = max((product_id_to_label.get(pid, 0) for pid in relevant_ids), default=0)
    total_possible += esci_points.get(max_label, 0)

# === Final score
print(f"\n🎯 Total Possible Points: {total_possible}")
print(f"🌟 Total Achieved Points: {total_achieved}")
print(f"📊 ESCI Retrieval Score: {total_achieved / total_possible:.4f}")



🎯 Total Possible Points: 1498
🌟 Total Achieved Points: 795
📊 ESCI Retrieval Score: 0.5307


In [None]:

hits = 0
K = 10  # Recall@10

for i, query_vec in enumerate(query_vecs):
    query = queries[i]
    relevant_ids = set(str(pid) for pid in eval(df_query_test.loc[i, "relevant_product_ids"]))

    scores = cosine_similarity([query_vec], product_vecs).flatten()
    top_k_idx = scores.argsort()[::-1][:K]
    top_k_pred_ids = [str(product_ids[idx]) for idx in top_k_idx]

    if relevant_ids.intersection(top_k_pred_ids):
        hits += 1

recall_at_10 = hits / len(queries)
print(f"✅ Recall@10: {recall_at_10:.4f}")


✅ Recall@10: 0.7240


In [None]:


# Define the query
query = "new balance mens orange"

# Encode the query
query_vec = model.encode([query], convert_to_numpy=True)

# Compute cosine similarity
scores = cosine_similarity(query_vec, product_vecs).flatten()

# Get top 10 indices
top_k = 10
top_k_idx = scores.argsort()[::-1][:top_k]

# Display top 10 results
print(f"\n🔍 Top {top_k} matches for query: '{query}'\n")
for rank, idx in enumerate(top_k_idx, 1):
    pid = product_ids[idx]
    title = product_id_to_title.get(pid, "Unknown Title")
    print(f"🔹 Rank {rank}")
    print(f"Product ID: {pid}")
    print(f"Title     : {title}")
    print(f"Score     : {scores[idx]:.4f}")
    print("—" * 60)



🔍 Top 10 matches for query: 'new balance mens orange'

🔹 Rank 1
Product ID: B079FHGY54
Title     : New Balance Mens X90 Orange
Score     : 0.8939
————————————————————————————————————————————————————————————
🔹 Rank 2
Product ID: B07W7CWG9F
Title     : New Balance Iconic Mens 500 V1 (8.5, Navy/Grey/Gum)
Score     : 0.7436
————————————————————————————————————————————————————————————
🔹 Rank 3
Product ID: B075D95T44
Title     : New Balance Men's 990v4 Sneaker, Orange, 11.5 D US
Score     : 0.7431
————————————————————————————————————————————————————————————
🔹 Rank 4
Product ID: B0751RDTVZ
Title     : New Balance Men's 574 V2 Essential Sneaker, Varsity Orange, 17 XW US
Score     : 0.7367
————————————————————————————————————————————————————————————
🔹 Rank 5
Product ID: B07J5672LP
Title     : New Balance Men's 990v5, Burgundy/Navy, 9.5 D US
Score     : 0.6949
————————————————————————————————————————————————————————————
🔹 Rank 6
Product ID: B07PJX2XH1
Title     : New Balance Men's 517 V2 Cross 

In [84]:
# === Recall@10 Evaluation ===
K = 10
recall_scores = []

for i, query_vec in enumerate(query_vecs):
    relevant_ids = set(map(str, relevant_lists[i]))
    if not relevant_ids:
        continue  # Skip queries with no ground truth

    scores = cosine_similarity([query_vec], product_vecs).flatten()
    top_k_idx = scores.argsort()[::-1][:K]
    top_k_pred_ids = set(map(str, [product_ids[idx] for idx in top_k_idx]))

    num_relevant_retrieved = len(relevant_ids.intersection(top_k_pred_ids))
    recall = num_relevant_retrieved / len(relevant_ids)
    recall_scores.append(recall)

# === Final Result ===
average_recall_at_10 = np.mean(recall_scores)
print(f"✅ Average Recall@10 (Two-Tower): {average_recall_at_10:.4f}")

✅ Average Recall@10 (Two-Tower): 0.1879
