# Part 3: Ranking & Filtering


In [1]:
import os
import sys
import pandas as pd

# Add project root to path
sys.path.insert(0, os.path.abspath('../../'))

from myapp.search.load_corpus import load_preprocessed_corpus
from myapp.search.search_engine import SearchEngine

In [2]:
# Load the preprocessed corpus (same cache as in Part 2).
# NOTE(review): the .pkl extension suggests a pickle-backed cache; only load
# files produced by this project's own preprocessing — confirm in load_corpus.
corpus = load_preprocessed_corpus('../../data/processed/preprocessed_corpus.pkl')

print(f"Loaded {len(corpus)} documents")

# Fail early with a readable message instead of an IndexError on the lookup
# below; the example print needs at least one document.
if len(corpus) == 0:
    raise ValueError("Preprocessed corpus is empty; check the cache file path and contents")

# Show example structure using the first document in iteration order.
# next(iter(...)) reads one key without copying every key into a list.
example_pid = next(iter(corpus.keys()))
example_doc = corpus[example_pid]
print("\nExample document structure:")
print(f"  PID: {example_doc['pid']}")
print(f"  Tokens (first 10): {example_doc['searchable_text'][:10]}")
print(f"  Metadata: {example_doc['metadata']}")

Loaded 28080 documents

Example document structure:
  PID: TKPFCZ9EA7H5FYZH
  Tokens (first 10): ['solid', 'women', 'multicolor', 'track', 'pant', 'yorker', 'trackpant', 'made', '100', 'rich']
  Metadata: {'category': 'clothing and accessories', 'sub_category': 'bottomwear', 'brand': 'york', 'seller': 'shyam enterprises'}


In [3]:
# Fixed set of free-text queries; the batch TF-IDF cell iterates over these
# in order and keys its results by the exact query string.
TEST_QUERIES = [
    "women cotton dress summer",
    "men leather shoes formal",
    "kids blue jeans comfortable",
    "sports running shoes lightweight",
    "winter jacket warm waterproof",
]

In [4]:
# Single SearchEngine instance reused by the search_tfidf calls in the cells below.
# NOTE(review): the recorded output logs an "Indexed ... unique terms" line after
# every query, which suggests the index is rebuilt per call rather than once
# here — confirm in myapp.search.search_engine.
search_engine = SearchEngine()

In [5]:
# Smoke-test TF-IDF ranking on one query and print the ranked titles.
test_query = "cotton sweatshirt"
# One value drives both the search call and the printed header so the two
# cannot drift apart when the cutoff is changed.
demo_top_k = 10
print(f"Testing TF-IDF for query: '{test_query}'\n")

results = search_engine.search_tfidf(test_query, corpus, top_k=demo_top_k)

print("\n" + "="*60)
print(f"Top {demo_top_k} Results (TF-IDF)")
print("="*60)
if results:
    # Each result is unpacked as a (pid, score) pair; pid is looked up in
    # `corpus` to recover the original title for display.
    for rank, (pid, score) in enumerate(results, 1):
        doc = corpus[pid]
        title = doc['original']['title']
        # Titles are truncated to 60 characters to keep one result per line.
        print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:60]}")
else:
    # Same empty-result message as the batch TF-IDF cell, instead of a bare header.
    print("  No results found")

Testing TF-IDF for query: 'cotton sweatshirt'

Processed query terms: ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 0.68s
Found 1025 documents matching all terms

Top 10 Results (TF-IDF)
 1. [ 0.763] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
 2. [ 0.762] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
 3. [ 0.733] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
 4. [ 0.654] SWSFXMFPD9RDM2EK - Full Sleeve Printed Men Sweatshirt
 5. [ 0.649] SWSFXMFPZQGQ6ENA - Full Sleeve Printed Men Sweatshirt
 6. [ 0.644] SWSFVZRFCWPPZVGT - Full Sleeve Solid Men Sweatshirt
 7. [ 0.642] SWSFVZRFS7GHGKSF - Full Sleeve Solid Women Sweatshirt
 8. [ 0.640] SWSFVZRFDEQAKUH2 - Full Sleeve Solid Men Sweatshirt
 9. [ 0.639] SWSFVZRFTM4P9XGF - Full Sleeve Solid Men Sweatshirt
10. [ 0.638] SWSFVZRFVZ2SZ3HZ - Full Sleeve Solid Men Sweatshirt


In [6]:
# Run every query in TEST_QUERIES through TF-IDF, store each ranked list in
# `tfidf_results` keyed by the query string, and print a short report.
tfidf_results = {}
# One value drives both the search call and the printed header so the two
# cannot drift apart when the cutoff is changed.
eval_top_k = 20

for query in TEST_QUERIES:
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"Query: '{query}'")
    print(banner)

    results = search_engine.search_tfidf(query, corpus, top_k=eval_top_k)
    tfidf_results[query] = results

    print(f"\nTop {eval_top_k} Results (TF-IDF):")
    if results:
        # Each result is unpacked as a (pid, score) pair; pid indexes `corpus`.
        for rank, (pid, score) in enumerate(results, 1):
            doc = corpus[pid]
            # All display fields fall back to 'N/A' so one incomplete record
            # cannot abort the whole report with a KeyError.
            title = doc['original'].get('title', 'N/A')
            brand = doc['original'].get('brand', 'N/A')
            price = doc['original'].get('selling_price', 'N/A')
            rating = doc['original'].get('average_rating', 'N/A')

            print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
            # Titles are truncated to 60 characters to keep the report compact.
            print(f"    Title: {title[:60]}")
            print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")
    else:
        print("  No results found")


Query: 'women cotton dress summer'
Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 0.54s
Found 273 documents matching all terms

Top 20 Results (TF-IDF):
 1. [Score:  0.161] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6
 2. [Score:  0.160] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3
 3. [Score:  0.158] TSHFGBYKFQUMFKKU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 4. [Score:  0.158] TSHFGBYKFZHSCGGW
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 5. [Score:  0.158] TSHFGDEYGRTC7ZED
    Title: Solid Women Polo Neck Green, Blue, Black T-Shirt  (Pack of 3
    Brand: Keo | Price: 664 | Rating: 3.9
 6. [Score:  0.158] TSHFGBYKUQKCREZJ
    Title: Solid Women Polo Neck Multicolor T-Shirt