# Part 3: Ranking & Filtering


In [21]:
import os
import sys
import pandas as pd

# Add project root to path
sys.path.insert(0, os.path.abspath('../../'))

from myapp.search.load_corpus import load_preprocessed_corpus
from myapp.search.search_engine import SearchEngine

In [22]:
# Load the preprocessed corpus (same cache file as in Part 2).
# NOTE(review): the cache is a .pkl file; if load_preprocessed_corpus unpickles it,
# only load caches that were generated locally — unpickling untrusted files can
# execute arbitrary code. Confirm against the loader implementation.
corpus = load_preprocessed_corpus('../../data/processed/preprocessed_corpus.pkl')

print(f"Loaded {len(corpus)} documents")

# Every later cell indexes into `corpus`, so fail here with a clear message
# rather than with an IndexError/KeyError further down.
assert len(corpus) > 0, "Preprocessed corpus is empty - regenerate the cache from Part 2"

# Show the structure of one document. next(iter(...)) picks the first key
# without building a throw-away list of every PID.
example_pid = next(iter(corpus))
example_doc = corpus[example_pid]
print("\nExample document structure:")
print(f"  PID: {example_doc['pid']}")
print(f"  Tokens (first 10): {example_doc['searchable_text'][:10]}")
print(f"  Metadata: {example_doc['metadata']}")

Loaded 28080 documents

Example document structure:
  PID: TKPFCZ9EA7H5FYZH
  Tokens (first 10): ['solid', 'women', 'multicolor', 'track', 'pant', 'yorker', 'trackpant', 'made', '100', 'rich']
  Metadata: {'category': 'clothing and accessories', 'sub_category': 'bottomwear', 'brand': 'york', 'seller': 'shyam enterprises'}


In [23]:
# Benchmark queries shared by every ranking model in this notebook.
# The evaluation cells enumerate this list from 1 and use that position as the
# query id, so the order matters.
TEST_QUERIES = [
    "women cotton dress summer",         # 1
    "men leather shoes formal",          # 2
    "kids blue jeans comfortable",       # 3
    "sports running shoes lightweight",  # 4
    "winter jacket warm waterproof",     # 5
]

In [24]:
# Single engine instance reused by every search_* call in the cells below.
# NOTE(review): the captured outputs show an "Indexed ... unique terms ..." line
# after every search call, which suggests the index is rebuilt per query rather
# than once — confirm in SearchEngine.
search_engine = SearchEngine()

# TF-IDF

In [25]:
# Quick check of the TF-IDF ranker on a single two-term query.
test_query = "cotton sweatshirt"
print(f"Testing TF-IDF for query: '{test_query}'\n")

results = search_engine.search_tfidf(test_query, corpus, top_k=10)

# Header: blank line, rule, caption, rule (one print, newline-separated).
print("\n" + "="*60, "Top 10 Results (TF-IDF)", "="*60, sep="\n")

# Each hit is unpacked as a (pid, score) pair; titles are cut at 60 characters.
for rank, hit in enumerate(results, start=1):
    pid, score = hit
    title = corpus[pid]['original']['title']
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")

Testing TF-IDF for query: 'cotton sweatshirt'

Processed query terms: ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 1.11s
Found 1025 documents matching all terms

Top 10 Results (TF-IDF)
 1. [ 0.763] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
 2. [ 0.762] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
 3. [ 0.733] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
 4. [ 0.654] SWSFXMFPD9RDM2EK - Full Sleeve Printed Men Sweatshirt
 5. [ 0.649] SWSFXMFPZQGQ6ENA - Full Sleeve Printed Men Sweatshirt
 6. [ 0.644] SWSFVZRFCWPPZVGT - Full Sleeve Solid Men Sweatshirt
 7. [ 0.642] SWSFVZRFS7GHGKSF - Full Sleeve Solid Women Sweatshirt
 8. [ 0.640] SWSFVZRFDEQAKUH2 - Full Sleeve Solid Men Sweatshirt
 9. [ 0.639] SWSFVZRFTM4P9XGF - Full Sleeve Solid Men Sweatshirt
10. [ 0.638] SWSFVZRFVZ2SZ3HZ - Full Sleeve Solid Men Sweatshirt


In [26]:
# Rank every benchmark query with TF-IDF and keep the raw result list per query.
tfidf_results = {}

for query in TEST_QUERIES:
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}")

    tfidf_results[query] = search_engine.search_tfidf(query, corpus, top_k=20)
    results = tfidf_results[query]

    print("\nTop 20 Results (TF-IDF):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        # Display fields come from the untouched product record; optional
        # fields fall back to 'N/A'.
        original = corpus[pid]['original']
        title = original['title']
        brand = original.get('brand', 'N/A')
        price = original.get('selling_price', 'N/A')
        rating = original.get('average_rating', 'N/A')

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")


Query: 'women cotton dress summer'
Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.76s
Found 273 documents matching all terms

Top 20 Results (TF-IDF):
 1. [Score:  0.161] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6
 2. [Score:  0.160] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3
 3. [Score:  0.158] TSHFGBYKFQUMFKKU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 4. [Score:  0.158] TSHFGBYKFZHSCGGW
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 5. [Score:  0.158] TSHFGDEYGRTC7ZED
    Title: Solid Women Polo Neck Green, Blue, Black T-Shirt  (Pack of 3
    Brand: Keo | Price: 664 | Rating: 3.9
 6. [Score:  0.158] TSHFGBYKUQKCREZJ
    Title: Solid Women Polo Neck Multicolor T-Shirt

# BM25

In [27]:
# Quick check of the BM25 ranker on the same two-term query used for TF-IDF.
test_query = "cotton sweatshirt"
print(f"Testing BM25 for query: '{test_query}'\n")

results_bm25 = search_engine.search_bm25(test_query, corpus, top_k=10)

# Caption framed by two 60-character rules.
separator = "="*60
print("\n" + separator)
print("Top 10 Results (BM25)")
print(separator)

for rank, (pid, score) in enumerate(results_bm25, start=1):
    title = corpus[pid]['original']['title']
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")

Testing BM25 for query: 'cotton sweatshirt'

Processed query terms (BM25): ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 1.05s
Found 1025 documents matching all terms (BM25)

Top 10 Results (BM25)
 1. [ 6.582] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
 2. [ 6.582] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
 3. [ 6.582] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
 4. [ 6.419] SWSFXZWWYXPZXPTY - Full Sleeve Printed Men Sweatshirt
 5. [ 6.419] SWSFXZWGMGVDUCXA - Full Sleeve Printed Women Sweatshirt
 6. [ 6.419] SWSFXZWGFKHNG88S - Full Sleeve Solid Men Sweatshirt
 7. [ 6.368] SWSFYFFYJCSGNGD8 - Full Sleeve Printed Men Sweatshirt
 8. [ 6.368] SWSFYFFFMWQMAPJJ - Full Sleeve Printed Men Sweatshirt
 9. [ 6.342] SWSFYFFFFYZ896TJ - Full Sleeve Printed Women Sweatshirt
10. [ 6.323] SWSFXMFPDVRHYYPH - Full Sleeve Striped Women Sweatshirt


In [28]:
# Rank every benchmark query with BM25 and keep the raw result list per query.
bm25_results = {}

for query in TEST_QUERIES:
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}")

    bm25_results[query] = search_engine.search_bm25(query, corpus, top_k=20)
    results = bm25_results[query]

    print("\nTop 20 Results (BM25):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        # Optional display fields fall back to 'N/A' when missing.
        original = corpus[pid]['original']
        title = original['title']
        brand = original.get('brand', 'N/A')
        price = original.get('selling_price', 'N/A')
        rating = original.get('average_rating', 'N/A')

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")


Query: 'women cotton dress summer'
Processed query terms (BM25): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.83s
Found 273 documents matching all terms (BM25)

Top 20 Results (BM25):
 1. [Score:  5.817] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6
 2. [Score:  5.794] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3
 3. [Score:  5.656] TSHFWCRCEQSJUBPR
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 4. [Score:  5.656] TSHFWCR44UPHFCHU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 5. [Score:  5.656] TSHFWCSUQR6ECHFG
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 6. [Score:  5.656] TSHFWCRC55NY8FHU
    Title: Solid Women Polo Neck Multico

## COMPARISON TF-IDF vs BM25

In [29]:
# Side-by-side comparison of TF-IDF and BM25 on the first test query.
# (pandas is already imported as `pd` in the notebook's first cell.)
query = TEST_QUERIES[0]
print(f"Comparing TF-IDF vs BM25 for query:\n  '{query}'\n")

# Rank the query with both models. The lists get their own names so that the
# per-query dicts `tfidf_results` / `bm25_results` built by the earlier
# "run all test queries" cells are not overwritten with a different type.
tfidf_top = search_engine.search_tfidf(query, corpus, top_k=20)
bm25_top = search_engine.search_bm25(query, corpus, top_k=20)

# One row per hit with an explicit 1-based rank. Passing `columns=` keeps the
# column labels even when a model returns no hits.
tfidf_df = pd.DataFrame(
    [
        {
            "Rank_TFIDF": rank,
            "PID_TFIDF": pid,
            "Score_TFIDF": score,
            "Title_TFIDF": corpus[pid]["original"]["title"][:60],
        }
        for rank, (pid, score) in enumerate(tfidf_top, start=1)
    ],
    columns=["Rank_TFIDF", "PID_TFIDF", "Score_TFIDF", "Title_TFIDF"],
)

bm25_df = pd.DataFrame(
    [
        {
            "Rank_BM25": rank,
            "PID_BM25": pid,
            "Score_BM25": score,
            "Title_BM25": corpus[pid]["original"]["title"][:60],
        }
        for rank, (pid, score) in enumerate(bm25_top, start=1)
    ],
    columns=["Rank_BM25", "PID_BM25", "Score_BM25", "Title_BM25"],
)

# Pad both frames to the same number of rows so the rankings line up by position.
max_rows = max(len(tfidf_df), len(bm25_df))
tfidf_df = tfidf_df.reindex(range(max_rows))
bm25_df = bm25_df.reindex(range(max_rows))

comparison_df = pd.concat([tfidf_df, bm25_df], axis=1)

print("\n=== TOP-20 COMPARISON: TF-IDF vs BM25 (side by side) ===")
display(comparison_df)

Comparing TF-IDF vs BM25 for query:
  'women cotton dress summer'

Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.86s
Found 273 documents matching all terms
Processed query terms (BM25): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.06s
Found 273 documents matching all terms (BM25)

=== TOP-20 COMPARISON: TF-IDF vs BM25 (side by side) ===


Unnamed: 0,Rank_TFIDF,PID_TFIDF,Score_TFIDF,Title_TFIDF,Rank_BM25,PID_BM25,Score_BM25,Title_BM25
0,1,TSHFVUXNG5VCCCHU,0.16137,Solid Women Polo Neck Red T-Shirt,1,TSHFVUXNG5VCCCHU,5.817391,Solid Women Polo Neck Red T-Shirt
1,2,TSHFVUXGRXTQPUGH,0.15961,Solid Women Polo Neck Maroon T-Shirt,2,TSHFVUXGRXTQPUGH,5.793817,Solid Women Polo Neck Maroon T-Shirt
2,3,TSHFGBYKFQUMFKKU,0.158298,Solid Women Polo Neck Multicolor T-Shirt (Pac...,3,TSHFWCRCEQSJUBPR,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
3,4,TSHFGBYKFZHSCGGW,0.158078,Solid Women Polo Neck Multicolor T-Shirt (Pac...,4,TSHFWCR44UPHFCHU,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
4,5,TSHFGDEYGRTC7ZED,0.158078,"Solid Women Polo Neck Green, Blue, Black T-Shi...",5,TSHFWCSUQR6ECHFG,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
5,6,TSHFGBYKUQKCREZJ,0.157777,Solid Women Polo Neck Multicolor T-Shirt (Pac...,6,TSHFWCRC55NY8FHU,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
6,7,TSHFGBYKCDAEBHT7,0.157634,Solid Women Polo Neck Multicolor T-Shirt (Pac...,7,TSHFWCSK9YCSUBNZ,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...
7,8,TSHFGBYKEJC36UJF,0.157594,Solid Women Polo Neck Multicolor T-Shirt (Pac...,8,TSHFWCQXENGNZRVY,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...
8,9,TSHFGDEYZ9X7JXFN,0.15744,"Solid Women Polo Neck Green, Black, Grey T-Shi...",9,TSHFTT34Z2HPAZUB,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...
9,10,TSHFGBYK3R9GN9CK,0.157418,Solid Women Polo Neck Multicolor T-Shirt (Pac...,10,TSHFTT32HHGAHPCK,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...


## Your Score (Custom ranking: BM25 + Rating + Discount + Stock)

In [30]:
# Quick check of the custom-score ranker on a single two-term query.
test_query = "cotton sweatshirt"
print(f"Testing Custom Score for query: '{test_query}'\n")

custom_results = search_engine.search_custom(test_query, corpus, top_k=10)

# Caption framed by two 60-character rules.
print("\n" + "="*60, "Top 10 Results (Custom Score)", "="*60, sep="\n")

for rank, hit in enumerate(custom_results, start=1):
    pid, score = hit
    original = corpus[pid]["original"]
    title = original["title"]
    # Shown next to the score so the ranking can be checked by eye;
    # missing fields print as None.
    rating = original.get("average_rating")
    discount = original.get("discount")
    out_of_stock = original.get("out_of_stock")
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")
    print(f"    Rating: {rating} | Discount: {discount} | Out of stock: {out_of_stock}")

Testing Custom Score for query: 'cotton sweatshirt'

Processed query terms (Custom): ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 0.78s
Found 1025 documents matching all terms (Custom)

Top 10 Results (Custom Score)
 1. [ 4.835] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
    Rating: 4.2 | Discount: 60% off | Out of stock: False
 2. [ 4.835] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
    Rating: 4.2 | Discount: 60% off | Out of stock: False
 3. [ 4.830] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
    Rating: 4.2 | Discount: 55% off | Out of stock: False
 4. [ 4.685] SWSFXZWWYXPZXPTY - Full Sleeve Printed Men Sweatshirt
    Rating: 3.2 | Discount: 63% off | Out of stock: False
 5. [ 4.685] SWSFXZWGMGVDUCXA - Full Sleeve Printed Women Sweatshirt
    Rating: 3.2 | Discount: 63% off | Out of stock: False
 6. [ 4.685] SWSFXZWGFKHNG88S - Full Sleeve Solid Men Sweatshirt
    Rating: 3.2 | Discount: 63% off | Out of stock: False
 7. [ 4

In [31]:
# Rank every benchmark query with the custom score and keep the raw lists per query.
custom_results_all = {}

for query in TEST_QUERIES:
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}")

    custom_results_all[query] = search_engine.search_custom(query, corpus, top_k=20)
    results = custom_results_all[query]

    print("\nTop 20 Results (Custom Score):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        original = corpus[pid]["original"]
        title = original["title"]
        # Optional fields fall back to "N/A"; a missing stock flag is shown as False.
        brand = original.get("brand", "N/A")
        price = original.get("selling_price", "N/A")
        rating = original.get("average_rating", "N/A")
        discount = original.get("discount", "N/A")
        out_of_stock = original.get("out_of_stock", False)

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating} | Discount: {discount} | Out of stock: {out_of_stock}")


Query: 'women cotton dress summer'
Processed query terms (Custom): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.02s
Found 273 documents matching all terms (Custom)

Top 20 Results (Custom Score):
 1. [Score:  4.266] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6 | Discount: 50% off | Out of stock: False
 2. [Score:  4.226] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3 | Discount: 50% off | Out of stock: False
 3. [Score:  4.203] TSHFTT2BHNGZAK2M
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 4)
    Brand: Shoef | Price: 806 | Rating: 5 | Discount: 59% off | Out of stock: False
 4. [Score:  4.159] TSHFTT2WMXWR6ER2
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 4)
    Brand: Shoef | Price: 806 | Rating: 4.3 | Discount: 59% off | Out of stock: False
 5. [Score:  4.157] TSHFWCRCEQSJUBPR
    Titl

## Word2Vec + Cosine

In [32]:
from gensim.models import Word2Vec

# Training sentences: one token list per document (the `searchable_text` field).
sentences = [doc["searchable_text"] for doc in corpus.values()]

# Train Word2Vec.
# The seed is stated explicitly and training is limited to one worker thread:
# with several workers the OS thread scheduling changes the update order, so
# every retraining yields slightly different vectors (and different rankings in
# the cells below). For bit-identical vectors across kernel restarts,
# PYTHONHASHSEED must also be fixed before the interpreter starts.
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=10,
    min_count=5,   # or 1 if you want to cover more terms
    sg=1,          # skip-gram, default is 0 for CBOW
    seed=1,        # gensim's default value, made explicit
    workers=1,     # single thread => deterministic training order
)

# (optional) save model so it can be reloaded without retraining
w2v_model.save("../../data/word2vec_model.bin")

In [33]:
# Optional shortcut: reload the model saved by the training cell instead of
# retraining it. Uncomment the two lines below to use it.
# from gensim.models import Word2Vec
# w2v_model = Word2Vec.load("../../data/word2vec_model.bin")

In [34]:
# Dictionary token -> vector (list[float]) compatible with Word2VecRanker
embeddings = {word: w2v_model.wv[word].tolist() for word in w2v_model.wv.key_to_index}
len(embeddings), list(embeddings.keys())[:10]

(4418,
 ['shirt',
  'neck',
  'cotton',
  'wear',
  'round',
  'women',
  '1',
  'regular',
  'men',
  'wash'])

In [35]:
# Quick check of the Word2Vec ranker on a single two-term query.
test_query = "cotton sweatshirt"
print(f"Testing Word2Vec ranking for query: '{test_query}'\n")

results_w2v = search_engine.search_word2vec(test_query, corpus, embeddings, top_k=10)

# Caption framed by two 60-character rules.
separator = "="*60
print("\n" + separator)
print("Top 10 Results (Word2Vec)")
print(separator)

for rank, (pid, score) in enumerate(results_w2v, start=1):
    title = corpus[pid]["original"]["title"]
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")

Testing Word2Vec ranking for query: 'cotton sweatshirt'

Processed query terms (w2v): ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 0.79s
Found 1025 documents matching all terms (w2v)

Top 10 Results (Word2Vec)
 1. [ 0.839] SWSFXMFP2RUM86QD - Full Sleeve Printed Women Sweatshirt
 2. [ 0.837] SWSFY5ZHUEZPZZYV - Full Sleeve Printed Women Sweatshirt
 3. [ 0.836] SWSFY5ZHEJ2HYWDG - Full Sleeve Printed Men Sweatshirt
 4. [ 0.835] SWSFXMFPRTY4GKGF - Full Sleeve Printed Men Sweatshirt
 5. [ 0.833] SWSFXMFPNZQGZZNE - Full Sleeve Printed Women Sweatshirt
 6. [ 0.832] SWSFXMFPYYTZUZ3Z - Full Sleeve Printed Men Sweatshirt
 7. [ 0.829] SWSFUNF6XHCGGVCU - Full Sleeve Printed Men Sweatshirt
 8. [ 0.824] SWSFUNF6ZCFUZKHR - Full Sleeve Printed Men Sweatshirt
 9. [ 0.823] SWSFUNF6V4ASAUTP - Full Sleeve Printed Women Sweatshirt
10. [ 0.822] SWSFUNF6VFJBYZZC - Full Sleeve Printed Men Sweatshirt


In [36]:
# Rank every benchmark query with the Word2Vec ranker and keep the raw lists per query.
w2v_results = {}

for query in TEST_QUERIES:
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}")

    w2v_results[query] = search_engine.search_word2vec(query, corpus, embeddings, top_k=20)
    results = w2v_results[query]

    print("\nTop 20 Results (Word2Vec):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        # Optional display fields fall back to "N/A" when missing.
        original = corpus[pid]["original"]
        title = original["title"]
        brand = original.get("brand", "N/A")
        price = original.get("selling_price", "N/A")
        rating = original.get("average_rating", "N/A")

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")


Query: 'women cotton dress summer'
Processed query terms (w2v): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.76s
Found 273 documents matching all terms (w2v)

Top 20 Results (Word2Vec):
 1. [Score:  0.708] TSHFSFVPJ4HMCMUX
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Shoef | Price: 616 | Rating: 3.6
 2. [Score:  0.708] TSHFSFVMYXB3RECU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Shoef | Price: 616 | Rating: 4.2
 3. [Score:  0.708] TSHFSFVP4XRRRZQF
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Shoef | Price: 616 | Rating: 3.6
 4. [Score:  0.707] TSHFSFVQGA4SYWZ5
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Shoef | Price: 616 | Rating: 3.6
 5. [Score:  0.707] TSHFSFVR6ZXHJMSG
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Shoef | Price: 616 | Rating: 3.6
 6. [Score:  0.707] TSHFSFVZAQ4G2NT5

# Comparison of All Models

In [37]:
# We choose one of the TEST_QUERIES to compare (for example the first one)
query = TEST_QUERIES[0]
print(f"Comparing all 4 models for query:\n  '{query}'\n")

# We rerun the calculation
tfidf = search_engine.search_tfidf(query, corpus, top_k=20)
bm25 = search_engine.search_bm25(query, corpus, top_k=20)
custom = search_engine.search_custom(query, corpus, top_k=20)
w2v = search_engine.search_word2vec(query, corpus, embeddings, top_k=20)

# Convert each ranking into a DataFrame with explicit rank
tfidf_df = pd.DataFrame([
    {
        "Rank_TFIDF": rank,
        "PID_TFIDF": pid,
        "Score_TFIDF": score,
        "Title_TFIDF": corpus[pid]["original"]["title"][:60],
    }
    for rank, (pid, score) in enumerate(tfidf, start=1)
])

bm25_df = pd.DataFrame([
    {
        "Rank_BM25": rank,
        "PID_BM25": pid,
        "Score_BM25": score,
        "Title_BM25": corpus[pid]["original"]["title"][:60],
    }
    for rank, (pid, score) in enumerate(bm25, start=1)
])

custom_df = pd.DataFrame([
    {
        "Rank_Custom": rank,
        "PID_Custom": pid,
        "Score_Custom": score,
        "Title_Custom": corpus[pid]["original"]["title"][:60],
    }
    for rank, (pid, score) in enumerate(custom, start=1)
])

w2v_df = pd.DataFrame([
    {
        "Rank_W2V": rank,
        "PID_W2V": pid,
        "Score_W2V": score,
        "Title_W2V": corpus[pid]["original"]["title"][:60],
    }
    for rank, (pid, score) in enumerate(w2v, start=1)
])

# Reindex all to the same number of rows (maximum of the 4)
max_rows = max(len(tfidf_df), len(bm25_df), len(custom_df), len(w2v_df))
tfidf_df = tfidf_df.reindex(range(max_rows))
bm25_df = bm25_df.reindex(range(max_rows))
custom_df = custom_df.reindex(range(max_rows))
w2v_df = w2v_df.reindex(range(max_rows))

# Concatenate columns -> ranking side by side
comparison_all = pd.concat([tfidf_df, bm25_df, custom_df, w2v_df], axis=1)

print("\n=== TOP-20 COMPARISON: TF-IDF vs BM25 vs Custom vs Word2Vec ===")
display(comparison_all)

Comparing all 4 models for query:
  'women cotton dress summer'

Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.96s
Found 273 documents matching all terms
Processed query terms (BM25): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.78s
Found 273 documents matching all terms (BM25)
Processed query terms (Custom): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.97s
Found 273 documents matching all terms (Custom)
Processed query terms (w2v): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.74s
Found 273 documents matching all terms (w2v)

=== TOP-20 COMPARISON: TF-IDF vs BM25 vs Custom vs Word2Vec ===


Unnamed: 0,Rank_TFIDF,PID_TFIDF,Score_TFIDF,Title_TFIDF,Rank_BM25,PID_BM25,Score_BM25,Title_BM25,Rank_Custom,PID_Custom,Score_Custom,Title_Custom,Rank_W2V,PID_W2V,Score_W2V,Title_W2V
0,1,TSHFVUXNG5VCCCHU,0.16137,Solid Women Polo Neck Red T-Shirt,1,TSHFVUXNG5VCCCHU,5.817391,Solid Women Polo Neck Red T-Shirt,1,TSHFVUXNG5VCCCHU,4.266174,Solid Women Polo Neck Red T-Shirt,1,TSHFSFVPJ4HMCMUX,0.707745,Solid Women Polo Neck Multicolor T-Shirt (Pac...
1,2,TSHFVUXGRXTQPUGH,0.15961,Solid Women Polo Neck Maroon T-Shirt,2,TSHFVUXGRXTQPUGH,5.793817,Solid Women Polo Neck Maroon T-Shirt,2,TSHFVUXGRXTQPUGH,4.225672,Solid Women Polo Neck Maroon T-Shirt,2,TSHFSFVMYXB3RECU,0.707572,Solid Women Polo Neck Multicolor T-Shirt (Pac...
2,3,TSHFGBYKFQUMFKKU,0.158298,Solid Women Polo Neck Multicolor T-Shirt (Pac...,3,TSHFWCRCEQSJUBPR,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...,3,TSHFTT2BHNGZAK2M,4.202887,Solid Women Polo Neck Multicolor T-Shirt (Pac...,3,TSHFSFVP4XRRRZQF,0.707514,Solid Women Polo Neck Multicolor T-Shirt (Pac...
3,4,TSHFGBYKFZHSCGGW,0.158078,Solid Women Polo Neck Multicolor T-Shirt (Pac...,4,TSHFWCR44UPHFCHU,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...,4,TSHFTT2WMXWR6ER2,4.15943,Solid Women Polo Neck Multicolor T-Shirt (Pac...,4,TSHFSFVQGA4SYWZ5,0.707487,Solid Women Polo Neck Multicolor T-Shirt (Pac...
4,5,TSHFGDEYGRTC7ZED,0.158078,"Solid Women Polo Neck Green, Blue, Black T-Shi...",5,TSHFWCSUQR6ECHFG,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...,5,TSHFWCRCEQSJUBPR,4.157468,Solid Women Polo Neck Multicolor T-Shirt (Pac...,5,TSHFSFVR6ZXHJMSG,0.707484,Solid Women Polo Neck Multicolor T-Shirt (Pac...
5,6,TSHFGBYKUQKCREZJ,0.157777,Solid Women Polo Neck Multicolor T-Shirt (Pac...,6,TSHFWCRC55NY8FHU,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...,6,TSHFWCR44UPHFCHU,4.157468,Solid Women Polo Neck Multicolor T-Shirt (Pac...,6,TSHFSFVZAQ4G2NT5,0.707456,Solid Women Polo Neck Multicolor T-Shirt (Pac...
6,7,TSHFGBYKCDAEBHT7,0.157634,Solid Women Polo Neck Multicolor T-Shirt (Pac...,7,TSHFWCSK9YCSUBNZ,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...,7,TSHFWCSUQR6ECHFG,4.157468,Solid Women Polo Neck Multicolor T-Shirt (Pac...,7,TSHFSFVQYZWB3DPW,0.707411,Solid Women Polo Neck Multicolor T-Shirt (Pac...
7,8,TSHFGBYKEJC36UJF,0.157594,Solid Women Polo Neck Multicolor T-Shirt (Pac...,8,TSHFWCQXENGNZRVY,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...,8,TSHFWCRC55NY8FHU,4.157468,Solid Women Polo Neck Multicolor T-Shirt (Pac...,8,TSHFSFVQQQB9AZGY,0.707391,Solid Women Polo Neck Multicolor T-Shirt (Pac...
8,9,TSHFGDEYZ9X7JXFN,0.15744,"Solid Women Polo Neck Green, Black, Grey T-Shi...",9,TSHFTT34Z2HPAZUB,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...,9,TSHFFNRDKXHP6ETU,4.153963,"Solid Women Polo Neck Black, Orange T-Shirt (...",9,TSHFSFVHWG4EESGN,0.707373,Solid Women Polo Neck Multicolor T-Shirt (Pac...
9,10,TSHFGBYK3R9GN9CK,0.157418,Solid Women Polo Neck Multicolor T-Shirt (Pac...,10,TSHFTT32HHGAHPCK,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...,10,TSHFTTFWDKHUP9EY,4.15143,Solid Women Polo Neck Multicolor T-Shirt (Pac...,10,TSHFSFVN9RTR475R,0.707336,Solid Women Polo Neck Multicolor T-Shirt (Pac...


In [38]:
from myapp.search.evaluation import SearchEvaluator

# Relevance labels for the test queries (columns read here: query_id, pid, labels).
df = pd.read_csv("../../data/test_queries_labels.csv")

# Ground truth per query id as {pid: label}. Queries whose label sum is not
# positive are left out and skipped during evaluation.
gt_by_query = {
    qid: dict(zip(group["pid"], group["labels"]))
    for qid, group in df.groupby("query_id")
    if group["labels"].sum() > 0
}

# Metrics copied from the evaluator output into each result row, in column order.
REPORTED_METRICS = ("P@10", "R@10", "AP@20", "MRR", "NDCG@10")


def eval_model(qid, model_name, ranked_pids):
    """Score one model's ranking for one query.

    Builds a SearchEvaluator from the ground truth of `qid`, evaluates
    `ranked_pids` at k = 5, 10 and 20, and returns a flat dict with the query
    id, the model name and the reported metrics.
    """
    metrics = SearchEvaluator(gt_by_query[qid]).evaluate_all(ranked_pids, k_values=[5, 10, 20])
    row = {"QueryID": qid, "Model": model_name}
    for metric_name in REPORTED_METRICS:
        row[metric_name] = metrics[metric_name]
    return row


rows = []
# The 1-based position in TEST_QUERIES is used as the query id.
# NOTE(review): this assumes query_id in the CSV follows the same numbering — confirm.
for qid, query in enumerate(TEST_QUERIES, start=1):
    if qid in gt_by_query:
        # All four searches run first (same order as the rows), then each is scored.
        rankings = {
            "TFIDF": [pid for pid, _ in search_engine.search_tfidf(query, corpus, top_k=20)],
            "BM25": [pid for pid, _ in search_engine.search_bm25(query, corpus, top_k=20)],
            "Custom": [pid for pid, _ in search_engine.search_custom(query, corpus, top_k=20)],
            "Word2Vec": [pid for pid, _ in search_engine.search_word2vec(query, corpus, embeddings, top_k=20)],
        }
        for model_name, ranked_pids in rankings.items():
            rows.append(eval_model(qid, model_name, ranked_pids))

metrics_df = pd.DataFrame(rows)
metrics_df

# Extra: FastText as an Alternative to Word2Vec

In [39]:
from gensim.models.fasttext import FastText

# Training sentences: one token list per document (the `searchable_text` field).
sentences_ft = [doc["searchable_text"] for doc in corpus.values()]

# Train FastText (parameters inspired by the class notebook).
# The seed is stated explicitly and training uses a single worker thread so that
# retraining gives the same vectors; with several workers the thread scheduling
# changes the update order between runs. For bit-identical vectors across kernel
# restarts, PYTHONHASHSEED must also be fixed before the interpreter starts.
ft_model = FastText(
    sentences=sentences_ft,
    vector_size=100,    # dimension of the embedding
    window=10,          # large context
    min_count=5,        # ignores rare terms
    negative=15,        # more negative sampling
    sg=1,               # skip-gram
    min_n=2,            # character n-grams [2,5]
    max_n=5,
    seed=1,             # gensim's default value, made explicit
    workers=1,          # single thread => deterministic training order
)

In [40]:
# Dictionary token -> vector (list[float]) compatible with Word2VecRanker
embeddings_ft = {word: ft_model.wv[word].tolist() for word in ft_model.wv.key_to_index}
len(embeddings_ft), list(embeddings_ft.keys())[:10]

(4418,
 ['shirt',
  'neck',
  'cotton',
  'wear',
  'round',
  'women',
  '1',
  'regular',
  'men',
  'wash'])

In [41]:
# Quick check of the FastText embeddings on a single two-term query. The
# Word2Vec search entry point is reused, only the embedding dict changes.
test_query = "cotton sweatshirt"
print(f"Testing FastText ranking for query: '{test_query}'\n")

results_ft = search_engine.search_word2vec(test_query, corpus, embeddings_ft, top_k=10)

# Caption framed by two 60-character rules.
print("\n" + "="*60, "Top 10 Results (FastText)", "="*60, sep="\n")

for rank, hit in enumerate(results_ft, start=1):
    pid, score = hit
    title = corpus[pid]["original"]["title"]
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")

Testing FastText ranking for query: 'cotton sweatshirt'

Processed query terms (w2v): ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 0.92s
Found 1025 documents matching all terms (w2v)

Top 10 Results (FastText)
 1. [ 0.916] SWSFXMFPRTY4GKGF - Full Sleeve Printed Men Sweatshirt
 2. [ 0.912] SWSFXMFP2RUM86QD - Full Sleeve Printed Women Sweatshirt
 3. [ 0.911] SWSFXMFPYYTZUZ3Z - Full Sleeve Printed Men Sweatshirt
 4. [ 0.909] SWSFXMFPZQGQ6ENA - Full Sleeve Printed Men Sweatshirt
 5. [ 0.908] SWSFY5ZHEJ2HYWDG - Full Sleeve Printed Men Sweatshirt
 6. [ 0.907] SWSFXMFPD9RDM2EK - Full Sleeve Printed Men Sweatshirt
 7. [ 0.907] SWSFXMFPNZQGZZNE - Full Sleeve Printed Women Sweatshirt
 8. [ 0.907] SWSFUNF6XHCGGVCU - Full Sleeve Printed Men Sweatshirt
 9. [ 0.906] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
10. [ 0.906] SWSFUNF6VFJBYZZC - Full Sleeve Printed Men Sweatshirt


In [42]:
# Rank every benchmark query with the FastText embeddings and keep the raw lists per query.
ft_results = {}

for query in TEST_QUERIES:
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}")

    ft_results[query] = search_engine.search_word2vec(query, corpus, embeddings_ft, top_k=20)
    results = ft_results[query]

    print("\nTop 20 Results (FastText):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        # Optional display fields fall back to "N/A" when missing.
        original = corpus[pid]["original"]
        title = original["title"]
        brand = original.get("brand", "N/A")
        price = original.get("selling_price", "N/A")
        rating = original.get("average_rating", "N/A")

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")


Query: 'women cotton dress summer'
Processed query terms (w2v): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.96s
Found 273 documents matching all terms (w2v)

Top 20 Results (FastText):
 1. [Score:  0.871] TSHFHBNT7PMKUZWT
    Title: Solid Women Polo Neck Light Blue, Pink, Light Green T-Shirt 
    Brand: Keo | Price: 664 | Rating: 3.9
 2. [Score:  0.871] TSHFFWRGH3AGQYGJ
    Title: Solid Women Polo Neck Light Green, Yellow T-Shirt  (Pack of 
    Brand: Keo | Price: 521 | Rating: 1
 3. [Score:  0.871] TSHFWCRC55NY8FHU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 4. [Score:  0.871] TSHFWCR6EAAKWTEY
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 5. [Score:  0.871] TSHFGHV5UAHVZDGF
    Title: Solid Women Polo Neck Light Blue, Maroon, Light Green T-Shir
    Brand: Keo | Price: 597 | Rating: 4.3
 6. [Score:  0.871] TSH

In [43]:
# Head-to-head evaluation of the Word2Vec and FastText embeddings on the
# labelled test queries.
#
# `gt_by_query` (ground truth per query id) and `eval_model` (one metrics row
# per query/model) are defined in the earlier evaluation cell that builds
# `metrics_df`. They are reused here instead of re-reading the labels CSV and
# redefining the same function, so that cell must have been run first.

# Both rankings go through search_word2vec; only the embedding dict differs.
embedding_sets = {
    "Word2Vec": embeddings,
    "FastText": embeddings_ft,
}

rows = []

# The 1-based position in TEST_QUERIES is used as the query id.
for qid, query in enumerate(TEST_QUERIES, start=1):
    if qid not in gt_by_query:
        # No positively labelled documents for this query: nothing to score.
        continue

    # Run both searches first, then score them in the same order.
    rankings = {
        model_name: [
            pid for pid, _ in search_engine.search_word2vec(
                query, corpus, vectors, top_k=20
            )
        ]
        for model_name, vectors in embedding_sets.items()
    }

    for model_name, ranked_pids in rankings.items():
        rows.append(eval_model(qid, model_name, ranked_pids))

metrics_w2v_ft = pd.DataFrame(rows)
metrics_w2v_ft