# Part 3: Ranking & Filtering


In [1]:
import os
import sys
import pandas as pd

# Add project root to path
sys.path.insert(0, os.path.abspath('../../'))

from myapp.search.load_corpus import load_preprocessed_corpus
from myapp.search.search_engine import SearchEngine

In [2]:
# Load the preprocessed corpus from the on-disk cache (same cache as in Part 2).
# NOTE(review): the cache is a .pkl file, so the loader presumably unpickles it;
# only point this at caches produced by this project - confirm in load_corpus.
corpus = load_preprocessed_corpus('../../data/processed/preprocessed_corpus.pkl')

print(f"Loaded {len(corpus)} documents")

# Fail early with a readable message: the lines below and every later cell
# index into the corpus.
assert len(corpus) > 0, "Preprocessed corpus is empty"

# Show example structure. next(iter(...)) picks the first key without
# materialising a list of every key in the corpus.
example_pid = next(iter(corpus))
example_doc = corpus[example_pid]
print("\nExample document structure:")
print(f"  PID: {example_doc['pid']}")
print(f"  Tokens (first 10): {example_doc['searchable_text'][:10]}")
print(f"  Metadata: {example_doc['metadata']}")

Loaded 28080 documents

Example document structure:
  PID: TKPFCZ9EA7H5FYZH
  Tokens (first 10): ['solid', 'women', 'multicolor', 'track', 'pant', 'yorker', 'trackpant', 'made', '100', 'rich']
  Metadata: {'category': 'clothing and accessories', 'sub_category': 'bottomwear', 'brand': 'york', 'seller': 'shyam enterprises'}


In [3]:
# Fixed set of multi-term queries that every ranking model below is run against.
TEST_QUERIES = [
    'women cotton dress summer',
    'men leather shoes formal',
    'kids blue jeans comfortable',
    'sports running shoes lightweight',
    'winter jacket warm waterproof',
]

In [4]:
# Single SearchEngine instance (default constructor arguments) shared by all
# ranking cells below: search_tfidf, search_bm25, search_custom, search_word2vec.
search_engine = SearchEngine()

## TF-IDF

In [5]:
# Smoke-test the TF-IDF ranker on one short query before running the full set.
test_query = "cotton sweatshirt"
print(f"Testing TF-IDF for query: '{test_query}'\n")

results = search_engine.search_tfidf(test_query, corpus, top_k=10)

# Header block: blank line, rule, caption, rule.
rule = "=" * 60
print(f"\n{rule}", "Top 10 Results (TF-IDF)", rule, sep="\n")

# Each hit unpacks into a (pid, score) pair; ranks are 1-based.
for rank, hit in enumerate(results, start=1):
    pid, score = hit
    title = corpus[pid]['original']['title']
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")

Testing TF-IDF for query: 'cotton sweatshirt'

Processed query terms: ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 1.05s
Found 1025 documents matching all terms

Top 10 Results (TF-IDF)
 1. [ 0.763] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
 2. [ 0.762] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
 3. [ 0.733] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
 4. [ 0.654] SWSFXMFPD9RDM2EK - Full Sleeve Printed Men Sweatshirt
 5. [ 0.649] SWSFXMFPZQGQ6ENA - Full Sleeve Printed Men Sweatshirt
 6. [ 0.644] SWSFVZRFCWPPZVGT - Full Sleeve Solid Men Sweatshirt
 7. [ 0.642] SWSFVZRFS7GHGKSF - Full Sleeve Solid Women Sweatshirt
 8. [ 0.640] SWSFVZRFDEQAKUH2 - Full Sleeve Solid Men Sweatshirt
 9. [ 0.639] SWSFVZRFTM4P9XGF - Full Sleeve Solid Men Sweatshirt
10. [ 0.638] SWSFVZRFVZ2SZ3HZ - Full Sleeve Solid Men Sweatshirt


In [6]:
# Run every test query through TF-IDF and keep each ranking, keyed by query.
tfidf_results = {}

for query in TEST_QUERIES:
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Query: '{query}'")
    print(banner)

    results = search_engine.search_tfidf(query, corpus, top_k=20)
    tfidf_results[query] = results

    print("\nTop 20 Results (TF-IDF):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        original = corpus[pid]['original']
        title = original['title']
        # Optional fields fall back to 'N/A' when absent from the record.
        brand, price, rating = (
            original.get(field, 'N/A')
            for field in ('brand', 'selling_price', 'average_rating')
        )

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")


Query: 'women cotton dress summer'
Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.04s
Found 273 documents matching all terms

Top 20 Results (TF-IDF):
 1. [Score:  0.161] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6
 2. [Score:  0.160] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3
 3. [Score:  0.158] TSHFGBYKFQUMFKKU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 4. [Score:  0.158] TSHFGBYKFZHSCGGW
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 5. [Score:  0.158] TSHFGDEYGRTC7ZED
    Title: Solid Women Polo Neck Green, Blue, Black T-Shirt  (Pack of 3
    Brand: Keo | Price: 664 | Rating: 3.9
 6. [Score:  0.158] TSHFGBYKUQKCREZJ
    Title: Solid Women Polo Neck Multicolor T-Shirt

## BM25

In [7]:
# Smoke-test the BM25 ranker on the same short query used for TF-IDF.
test_query = "cotton sweatshirt"
print(f"Testing BM25 for query: '{test_query}'\n")

results_bm25 = search_engine.search_bm25(test_query, corpus, top_k=10)

# Header block: blank line, rule, caption, rule.
separator = "=" * 60
print("\n" + separator)
print("Top 10 Results (BM25)")
print(separator)

# Walk the ranking with a 1-based position and show a truncated title.
for position, (pid, score) in enumerate(results_bm25, start=1):
    short_title = corpus[pid]['original']['title'][:60]
    print(f"{position:2d}. [{score:6.3f}] {pid} - {short_title}")

Testing BM25 for query: 'cotton sweatshirt'

Processed query terms (BM25): ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 0.98s
Found 1025 documents matching all terms (BM25)

Top 10 Results (BM25)
 1. [ 6.582] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
 2. [ 6.582] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
 3. [ 6.582] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
 4. [ 6.419] SWSFXZWWYXPZXPTY - Full Sleeve Printed Men Sweatshirt
 5. [ 6.419] SWSFXZWGMGVDUCXA - Full Sleeve Printed Women Sweatshirt
 6. [ 6.419] SWSFXZWGFKHNG88S - Full Sleeve Solid Men Sweatshirt
 7. [ 6.368] SWSFYFFYJCSGNGD8 - Full Sleeve Printed Men Sweatshirt
 8. [ 6.368] SWSFYFFFMWQMAPJJ - Full Sleeve Printed Men Sweatshirt
 9. [ 6.342] SWSFYFFFFYZ896TJ - Full Sleeve Printed Women Sweatshirt
10. [ 6.323] SWSFXMFPDVRHYYPH - Full Sleeve Striped Women Sweatshirt


In [8]:
# Run every test query through BM25 and keep each ranking, keyed by query.
bm25_results = {}

for query in TEST_QUERIES:
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Query: '{query}'")
    print(banner)

    results = search_engine.search_bm25(query, corpus, top_k=20)
    bm25_results[query] = results

    print("\nTop 20 Results (BM25):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        original = corpus[pid]['original']
        title = original['title']
        # Optional fields fall back to 'N/A' when absent from the record.
        brand = original.get('brand', 'N/A')
        price = original.get('selling_price', 'N/A')
        rating = original.get('average_rating', 'N/A')

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")


Query: 'women cotton dress summer'
Processed query terms (BM25): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.83s
Found 273 documents matching all terms (BM25)

Top 20 Results (BM25):
 1. [Score:  5.817] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6
 2. [Score:  5.794] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3
 3. [Score:  5.656] TSHFWCRCEQSJUBPR
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 4. [Score:  5.656] TSHFWCR44UPHFCHU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 5. [Score:  5.656] TSHFWCSUQR6ECHFG
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 2)
    Brand: Shoef | Price: 458 | Rating: 3.6
 6. [Score:  5.656] TSHFWCRC55NY8FHU
    Title: Solid Women Polo Neck Multico

## COMPARISON TF-IDF vs BM25

In [9]:
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here.

# The first test query returned the most matches in the runs above, so it is
# the one used for the side-by-side comparison.
query = TEST_QUERIES[0]
print(f"Comparing TF-IDF vs BM25 for query:\n  '{query}'\n")


def _ranking_frame(results, label):
    """Turn a ranked list of (pid, score) pairs into a DataFrame.

    Args:
        results: iterable of (pid, score) pairs, best match first.
        label: suffix appended to every column name (e.g. ``"TFIDF"`` gives
            ``Rank_TFIDF``, ``PID_TFIDF``, ...), so two rankings can share
            one table without column clashes.

    Returns:
        A DataFrame with one row per result: 1-based rank, pid, score and the
        product title truncated to 60 characters. An empty ranking yields an
        empty frame.
    """
    return pd.DataFrame([
        {
            f"Rank_{label}": rank,
            f"PID_{label}": pid,
            f"Score_{label}": score,
            f"Title_{label}": corpus[pid]["original"]["title"][:60],
        }
        for rank, (pid, score) in enumerate(results, start=1)
    ])


# Run the query with both models. Dedicated names are used on purpose: the
# per-query dictionaries `tfidf_results` / `bm25_results` built by the earlier
# cells must not be overwritten with plain lists.
tfidf_top20 = search_engine.search_tfidf(query, corpus, top_k=20)
bm25_top20 = search_engine.search_bm25(query, corpus, top_k=20)

tfidf_ranked = _ranking_frame(tfidf_top20, "TFIDF")
bm25_ranked = _ranking_frame(bm25_top20, "BM25")

# Align both frames on the same 0..max_rows-1 index so that row i shows the
# rank i+1 hit of each model, padding the shorter ranking with NaN.
max_rows = max(len(tfidf_ranked), len(bm25_ranked))
tfidf_df = tfidf_ranked.reindex(range(max_rows))
bm25_df = bm25_ranked.reindex(range(max_rows))

comparison_df = pd.concat([tfidf_df, bm25_df], axis=1)

print("\n=== TOP-20 COMPARISON: TF-IDF vs BM25 (side by side) ===")
display(comparison_df)

Comparing TF-IDF vs BM25 for query:
  'women cotton dress summer'

Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.02s
Found 273 documents matching all terms
Processed query terms (BM25): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.04s
Found 273 documents matching all terms (BM25)

=== TOP-20 COMPARISON: TF-IDF vs BM25 (side by side) ===


Unnamed: 0,Rank_TFIDF,PID_TFIDF,Score_TFIDF,Title_TFIDF,Rank_BM25,PID_BM25,Score_BM25,Title_BM25
0,1,TSHFVUXNG5VCCCHU,0.16137,Solid Women Polo Neck Red T-Shirt,1,TSHFVUXNG5VCCCHU,5.817391,Solid Women Polo Neck Red T-Shirt
1,2,TSHFVUXGRXTQPUGH,0.15961,Solid Women Polo Neck Maroon T-Shirt,2,TSHFVUXGRXTQPUGH,5.793817,Solid Women Polo Neck Maroon T-Shirt
2,3,TSHFGBYKFQUMFKKU,0.158298,Solid Women Polo Neck Multicolor T-Shirt (Pac...,3,TSHFWCRCEQSJUBPR,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
3,4,TSHFGBYKFZHSCGGW,0.158078,Solid Women Polo Neck Multicolor T-Shirt (Pac...,4,TSHFWCR44UPHFCHU,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
4,5,TSHFGDEYGRTC7ZED,0.158078,"Solid Women Polo Neck Green, Blue, Black T-Shi...",5,TSHFWCSUQR6ECHFG,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
5,6,TSHFGBYKUQKCREZJ,0.157777,Solid Women Polo Neck Multicolor T-Shirt (Pac...,6,TSHFWCRC55NY8FHU,5.656383,Solid Women Polo Neck Multicolor T-Shirt (Pac...
6,7,TSHFGBYKCDAEBHT7,0.157634,Solid Women Polo Neck Multicolor T-Shirt (Pac...,7,TSHFWCSK9YCSUBNZ,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...
7,8,TSHFGBYKEJC36UJF,0.157594,Solid Women Polo Neck Multicolor T-Shirt (Pac...,8,TSHFWCQXENGNZRVY,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...
8,9,TSHFGDEYZ9X7JXFN,0.15744,"Solid Women Polo Neck Green, Black, Grey T-Shi...",9,TSHFTT34Z2HPAZUB,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...
9,10,TSHFGBYK3R9GN9CK,0.157418,Solid Women Polo Neck Multicolor T-Shirt (Pac...,10,TSHFTT32HHGAHPCK,5.634124,Solid Women Polo Neck Multicolor T-Shirt (Pac...


## Your Score (Custom ranking: BM25 + Rating + Discount + Stock)

In [10]:
# Smoke-test the custom score ranker on one short query.
test_query = "cotton sweatshirt"
print(f"Testing Custom Score for query: '{test_query}'\n")

custom_results = search_engine.search_custom(test_query, corpus, top_k=10)

# Header block: blank line, rule, caption, rule.
rule = "=" * 60
print(f"\n{rule}", "Top 10 Results (Custom Score)", rule, sep="\n")

# For each hit, also show the rating, discount and stock fields of the record.
for rank, (pid, score) in enumerate(custom_results, start=1):
    original = corpus[pid]["original"]
    title = original["title"]
    # Missing fields print as None (no default is supplied to .get()).
    rating, discount, out_of_stock = (
        original.get(field)
        for field in ("average_rating", "discount", "out_of_stock")
    )
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")
    print(f"    Rating: {rating} | Discount: {discount} | Out of stock: {out_of_stock}")

Testing Custom Score for query: 'cotton sweatshirt'

Processed query terms (Custom): ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 0.79s
Found 1025 documents matching all terms (Custom)

Top 10 Results (Custom Score)
 1. [ 4.835] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
    Rating: 4.2 | Discount: 60% off | Out of stock: False
 2. [ 4.835] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
    Rating: 4.2 | Discount: 60% off | Out of stock: False
 3. [ 4.830] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
    Rating: 4.2 | Discount: 55% off | Out of stock: False
 4. [ 4.685] SWSFXZWWYXPZXPTY - Full Sleeve Printed Men Sweatshirt
    Rating: 3.2 | Discount: 63% off | Out of stock: False
 5. [ 4.685] SWSFXZWGMGVDUCXA - Full Sleeve Printed Women Sweatshirt
    Rating: 3.2 | Discount: 63% off | Out of stock: False
 6. [ 4.685] SWSFXZWGFKHNG88S - Full Sleeve Solid Men Sweatshirt
    Rating: 3.2 | Discount: 63% off | Out of stock: False
 7. [ 4

In [11]:
# Run every test query through the custom score ranker, keyed by query.
custom_results_all = {}

for query in TEST_QUERIES:
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Query: '{query}'")
    print(banner)

    results = search_engine.search_custom(query, corpus, top_k=20)
    custom_results_all[query] = results

    print("\nTop 20 Results (Custom Score):")
    if not results:
        print("  No results found")
        continue

    for rank, (pid, score) in enumerate(results, start=1):
        original = corpus[pid]["original"]
        title = original["title"]
        # Descriptive fields default to "N/A"; the stock flag defaults to False.
        brand, price, rating, discount = (
            original.get(field, "N/A")
            for field in ("brand", "selling_price", "average_rating", "discount")
        )
        out_of_stock = original.get("out_of_stock", False)

        print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
        print(f"    Title: {title[:60]}")
        print(f"    Brand: {brand} | Price: {price} | Rating: {rating} | Discount: {discount} | Out of stock: {out_of_stock}")


Query: 'women cotton dress summer'
Processed query terms (Custom): ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.00s
Found 273 documents matching all terms (Custom)

Top 20 Results (Custom Score):
 1. [Score:  4.266] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6 | Discount: 50% off | Out of stock: False
 2. [Score:  4.226] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3 | Discount: 50% off | Out of stock: False
 3. [Score:  4.203] TSHFTT2BHNGZAK2M
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 4)
    Brand: Shoef | Price: 806 | Rating: 5 | Discount: 59% off | Out of stock: False
 4. [Score:  4.159] TSHFTT2WMXWR6ER2
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 4)
    Brand: Shoef | Price: 806 | Rating: 4.3 | Discount: 59% off | Out of stock: False
 5. [Score:  4.157] TSHFWCRCEQSJUBPR
    Titl

## Word2Vec + Cosine

In [None]:
# Smoke-test the Word2Vec + cosine ranker on one short query.
# NOTE(review): `embeddings` is not defined anywhere in the visible notebook;
# it has to be loaded or trained in an earlier cell before this one can run.
# TODO: add that cell (the expected type is whatever search_word2vec requires).
if "embeddings" not in globals():
    raise NameError(
        "`embeddings` is not defined - load or train the Word2Vec embeddings "
        "before running this cell"
    )

test_query = "cotton sweatshirt"
print(f"Testing Word2Vec for query: '{test_query}'\n")

# Stored under its own name so the custom-score results from the previous
# section are not overwritten.
w2v_results = search_engine.search_word2vec(test_query, corpus, embeddings, top_k=10)

print("\n" + "="*60)
print("Top 10 Results (Word2Vec)")
print("="*60)
for rank, (pid, score) in enumerate(w2v_results, 1):
    doc = corpus[pid]
    original = doc["original"]
    title = original["title"]
    # Missing fields print as None (no default is supplied to .get()).
    rating = original.get("average_rating")
    discount = original.get("discount")
    out_of_stock = original.get("out_of_stock")
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")
    print(f"    Rating: {rating} | Discount: {discount} | Out of stock: {out_of_stock}")