# Part 2: Indexing and Evaluation
## Part 1: Indexing

This notebook implements:
1. **Inverted Index** - Build index structure from preprocessed corpus
2. **Conjunctive Query (AND)** - Search for documents containing ALL query terms
3. **TF-IDF Ranking** - Rank results by relevance
4. **5 Test Queries** - Evaluate search engine with custom queries

In [1]:
import os
import sys
import pandas as pd

# Add project root to path
sys.path.insert(0, os.path.abspath('../../'))

In [2]:
from myapp.search.load_corpus import load_preprocessed_corpus

corpus = load_preprocessed_corpus('../../data/processed/preprocessed_corpus.pkl')

print(f"Loaded {len(corpus)} documents")
print(f"\nExample document structure:")
example_pid = list(corpus.keys())[0]
example_doc = corpus[example_pid]
print(f"  PID: {example_doc['pid']}")
print(f"  Tokens (first 10): {example_doc['searchable_text'][:10]}")
print(f"  Metadata: {example_doc['metadata']}")

Loaded 28080 documents

Example document structure:
  PID: TKPFCZ9EA7H5FYZH
  Tokens (first 10): ['solid', 'women', 'multicolor', 'track', 'pant', 'yorker', 'trackpant', 'made', '100', 'rich']
  Metadata: {'category': 'clothing and accessories', 'sub_category': 'bottomwear', 'brand': 'york', 'seller': 'shyam enterprises'}


In [3]:
TEST_QUERIES = [
    "women cotton dress summer",
    "men leather shoes formal",
    "kids blue jeans comfortable",
    "sports running shoes lightweight",
    "winter jacket warm waterproof"
]

In [8]:
from myapp.search.search_engine import SearchEngine
searchEngine = SearchEngine()
# Test with a simple query
test_query = "cotton sweatshirt"
print(f"Testing query: '{test_query}'\n")

results = searchEngine.search_tfidf(test_query, corpus=corpus, top_k=10)

print(f"\n{'='*60}")
print(f"Top 10 Results:")
print(f"{'='*60}")
for rank, (pid, score) in enumerate(results, 1):
    doc = corpus[pid]
    title = doc['original']['title']
    print(f"{rank:2d}. [{score:6.3f}] {pid} - {title[:50]}")

Testing query: 'cotton sweatshirt'

Processed query terms: ['cotton', 'sweatshirt']
Indexed 20906 unique terms from 28080 documents in 1.08s
Found 1025 documents matching all terms

Top 10 Results:
 1. [ 0.763] SWSFMTNHCG3SRHJR - Full Sleeve Solid Men Sweatshirt
 2. [ 0.762] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
 3. [ 0.733] SWSFMTNHZR59ZUXR - Full Sleeve Solid Men Sweatshirt
 4. [ 0.654] SWSFXMFPD9RDM2EK - Full Sleeve Printed Men Sweatshirt
 5. [ 0.649] SWSFXMFPZQGQ6ENA - Full Sleeve Printed Men Sweatshirt
 6. [ 0.644] SWSFVZRFCWPPZVGT - Full Sleeve Solid Men Sweatshirt
 7. [ 0.642] SWSFVZRFS7GHGKSF - Full Sleeve Solid Women Sweatshirt
 8. [ 0.640] SWSFVZRFDEQAKUH2 - Full Sleeve Solid Men Sweatshirt
 9. [ 0.639] SWSFVZRFTM4P9XGF - Full Sleeve Solid Men Sweatshirt
10. [ 0.638] SWSFVZRFVZ2SZ3HZ - Full Sleeve Solid Men Sweatshirt


In [9]:
# Run all test queries
all_results = {}

for query in TEST_QUERIES:
    print(f"\n{'='*80}")
    print(f"Query: '{query}'")
    print(f"{'='*80}")

    results = searchEngine.search_tfidf(query, corpus=corpus, top_k=20)
    all_results[query] = results

    print(f"\nTop 20 Results:")
    if results:
        for rank, (pid, score) in enumerate(results, 1):
            doc = corpus[pid]
            title = doc['original']['title']
            brand = doc['original'].get('brand', 'N/A')
            price = doc['original'].get('selling_price', 'N/A')
            rating = doc['original'].get('average_rating', 'N/A')

            print(f"{rank:2d}. [Score: {score:6.3f}] {pid}")
            print(f"    Title: {title[:60]}")
            print(f"    Brand: {brand} | Price: {price} | Rating: {rating}")
    else:
        print("  No results found")



Query: 'women cotton dress summer'
Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.06s
Found 273 documents matching all terms

Top 20 Results:
 1. [Score:  0.161] TSHFVUXNG5VCCCHU
    Title: Solid Women Polo Neck Red T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3.6
 2. [Score:  0.160] TSHFVUXGRXTQPUGH
    Title: Solid Women Polo Neck Maroon T-Shirt
    Brand: Shoef | Price: 246 | Rating: 3
 3. [Score:  0.158] TSHFGBYKFQUMFKKU
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 4. [Score:  0.158] TSHFGBYKFZHSCGGW
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack of 3)
    Brand: Keo | Price: 664 | Rating: 3.5
 5. [Score:  0.158] TSHFGDEYGRTC7ZED
    Title: Solid Women Polo Neck Green, Blue, Black T-Shirt  (Pack of 3
    Brand: Keo | Price: 664 | Rating: 3.9
 6. [Score:  0.158] TSHFGBYKUQKCREZJ
    Title: Solid Women Polo Neck Multicolor T-Shirt  (Pack o

In [10]:
# Create summary table
summary_data = []
for query, results in all_results.items():
    num_results = len(results)
    avg_score = sum(score for _, score in results) / num_results if num_results > 0 else 0
    max_score = max((score for _, score in results), default=0)
    min_score = min((score for _, score in results), default=0)

    summary_data.append({
        'Query': query,
        'Num Results': num_results,
        'Avg Score': f"{avg_score:.3f}",
        'Max Score': f"{max_score:.3f}",
        'Min Score': f"{min_score:.3f}"
    })

summary_df = pd.DataFrame(summary_data)
print("\n" + "="*80)
print("SUMMARY - Test Queries Performance")
print("="*80)
print(summary_df.to_string(index=False))



SUMMARY - Test Queries Performance
                           Query  Num Results Avg Score Max Score Min Score
       women cotton dress summer           20     0.157     0.161     0.156
        men leather shoes formal           15     0.327     0.506     0.206
     kids blue jeans comfortable            1     0.165     0.165     0.165
sports running shoes lightweight            3     0.433     0.457     0.392
   winter jacket warm waterproof            0     0.000     0.000     0.000


In [12]:
# 1. Get top-20 results for each test query
test_query_results = {}
for i, query in enumerate(TEST_QUERIES, start=1):
    results = searchEngine.search_tfidf(query, corpus=corpus, top_k=20)
    test_query_results[i] = {
        'query': query,
        'results': results
    }

# 2. Display results for manual labeling
ground_truth_data = []
for query_id, data in test_query_results.items():
    query = data['query']
    results = data['results']

    print(f"\n{'='*80}")
    print(f"Query {query_id}: {query}")
    print(f"{'='*80}")

    for rank, (pid, score) in enumerate(results, 1):
        doc = corpus[pid]
        title = doc['original']['title']
        print(f"{rank:2d}. {pid} - {title}")

        ground_truth_data.append({
            'query_id': query_id,
            'pid': pid,
            'title': title,
            'labels': None  # You'll fill this manually
        })

# 3. Create DataFrame
ground_truth_df = pd.DataFrame(ground_truth_data)

# 4. Save to CSV (you'll need to fill labels column first)
ground_truth_df.to_csv('../../data/test_queries_labels.csv', index=False)
print("\nGround truth saved to ../../data/test_queries_labels.csv")

# WE NEED TO MANUALLY LABEL THE DOCUMENTS, ALL THE QUERY 1-3 ARE 0 AND THE QUERY 4 = 1

Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.05s
Found 273 documents matching all terms
Processed query terms: ['men', 'leather', 'shoe', 'formal']
Indexed 20906 unique terms from 28080 documents in 0.79s
Found 15 documents matching all terms
Processed query terms: ['kid', 'blue', 'jean', 'comfort']
Indexed 20906 unique terms from 28080 documents in 1.01s
Found 1 documents matching all terms
Processed query terms: ['sport', 'run', 'shoe', 'lightweight']
Indexed 20906 unique terms from 28080 documents in 0.81s
Found 3 documents matching all terms
Processed query terms: ['winter', 'jacket', 'warm', 'waterproof']
Indexed 20906 unique terms from 28080 documents in 1.00s
No documents found matching all query terms

Query 1: women cotton dress summer
 1. TSHFVUXNG5VCCCHU - Solid Women Polo Neck Red T-Shirt
 2. TSHFVUXGRXTQPUGH - Solid Women Polo Neck Maroon T-Shirt
 3. TSHFGBYKFQUMFKKU - Solid Women Polo Neck Multicolor T-

In [14]:
from myapp.search.evaluation import (
    SearchEvaluator,
    load_validation_labels
)

# Load ground truth for test queries
test_labels = load_validation_labels('../../data/test_queries_labels.csv')

# Evaluate each test query
for query_id in range(5):
    query = TEST_QUERIES[query_id - 1]
    results = searchEngine.search_tfidf(query, corpus=corpus, top_k=20)
    ranked_pids = [pid for pid, score in results]

    if query_id in test_labels:
        evaluator = SearchEvaluator(test_labels[query_id])
        eval_results = evaluator.evaluate_all(ranked_pids, k_values=[5, 10, 20])

        print(f"\nQuery {query_id}: {query}")
        print(f"P@10={eval_results['P@10']:.3f}, R@10={eval_results['R@10']:.3f}")

Processed query terms: ['winter', 'jacket', 'warm', 'waterproof']
Indexed 20906 unique terms from 28080 documents in 0.79s
No documents found matching all query terms
Processed query terms: ['women', 'cotton', 'dress', 'summer']
Indexed 20906 unique terms from 28080 documents in 1.00s
Found 273 documents matching all terms

Query 1: women cotton dress summer
P@10=0.000, R@10=0.000
Processed query terms: ['men', 'leather', 'shoe', 'formal']
Indexed 20906 unique terms from 28080 documents in 0.79s
Found 15 documents matching all terms

Query 2: men leather shoes formal
P@10=0.000, R@10=0.000
Processed query terms: ['kid', 'blue', 'jean', 'comfort']
Indexed 20906 unique terms from 28080 documents in 0.99s
Found 1 documents matching all terms

Query 3: kids blue jeans comfortable
P@10=0.000, R@10=0.000
Processed query terms: ['sport', 'run', 'shoe', 'lightweight']
Indexed 20906 unique terms from 28080 documents in 0.75s
Found 3 documents matching all terms

Query 4: sports running shoes li

## Test Validation Queries

Test with the two predefined queries from `validation_labels.csv`:
- Query 1: "women full sleeve sweatshirt cotton"
- Query 2: "men slim jeans blue"


In [16]:
VALIDATION_QUERIES = [
    "women full sleeve sweatshirt cotton",
    "men slim jeans blue"
]

for i, query in enumerate(VALIDATION_QUERIES, 1):
    print(f"\n{'='*80}")
    print(f"Validation Query {i}: '{query}'")
    print(f"{'='*80}")

    results = searchEngine.search_tfidf(query, corpus=corpus, top_k=20)

    print(f"\nTop 20 Results (for evaluation in Part 2):")
    for rank, (pid, score) in enumerate(results, 1):
        doc = corpus[pid]
        title = doc['original']['title']
        print(f"{rank:2d}. [Score: {score:6.3f}] {pid} - {title[:50]}")



Validation Query 1: 'women full sleeve sweatshirt cotton'
Processed query terms: ['women', 'full', 'sleev', 'sweatshirt', 'cotton']
Indexed 20906 unique terms from 28080 documents in 0.97s
Found 500 documents matching all terms

Top 20 Results (for evaluation in Part 2):
 1. [Score:  0.723] SWSFMTNHEM5SNHVZ - Full Sleeve Solid Women Sweatshirt
 2. [Score:  0.689] SWSFVZRFS7GHGKSF - Full Sleeve Solid Women Sweatshirt
 3. [Score:  0.681] SWSFYTYMNTBNARUN - Full Sleeve Solid Women Sweatshirt
 4. [Score:  0.675] SWSFVZRFCWPPZVGT - Full Sleeve Solid Men Sweatshirt
 5. [Score:  0.670] SWSFVZRFDEQAKUH2 - Full Sleeve Solid Men Sweatshirt
 6. [Score:  0.670] SWSFVZRFTM4P9XGF - Full Sleeve Solid Men Sweatshirt
 7. [Score:  0.668] SWSFVZRFVZ2SZ3HZ - Full Sleeve Solid Men Sweatshirt
 8. [Score:  0.642] SWSFVZRFNFMYYRCK - Full Sleeve Solid Men Sweatshirt
 9. [Score:  0.635] SWSFZVTTQCB4SJ7F - Full Sleeve Solid Women Sweatshirt
10. [Score:  0.633] SWSFZ2H4KMYXZXX7 - Full Sleeve Solid Women Sweatshi

In [17]:
import pandas as pd

from myapp.search.evaluation import (
    SearchEvaluator,
    calculate_map,
    calculate_mrr,
)

# Load validation labels with ranking
validation_df = pd.read_csv('../../data/validation_labels.csv')

# Build ranking (ordered pids) and ground truth (pid -> label) per query
ranked_pids_by_query = {}
labels_by_query = {}

for query_id, group in validation_df.groupby('query_id'):
    ranked_pids_by_query[query_id] = group['pid'].tolist()
    labels_by_query[query_id] = dict(zip(group['pid'], group['labels']))

print("=" * 80)
print("PART 2: EVALUATION METRICS - PROVIDED QUERIES")
print("=" * 80)
print("\nValidation labels loaded (from validation_labels.csv):")
for query_id in sorted(ranked_pids_by_query.keys()):
    labels = labels_by_query[query_id]
    relevant_count = sum(1 for label in labels.values() if label == 1)
    print(f"  Query {query_id}: {len(labels)} documents ({relevant_count} relevant)")

# Mapping from query_id to query text (for pretty printing)
VALIDATION_QUERIES = {
    1: "women full sleeve sweatshirt cotton",
    2: "men slim jeans blue",
}

# Evaluate each query using only the provided ranking from the CSV
evaluators = []
all_ranked = []

for query_id in sorted(ranked_pids_by_query.keys()):
    ranked_pids = ranked_pids_by_query[query_id]
    evaluator = SearchEvaluator(labels_by_query[query_id])
    eval_results = evaluator.evaluate_all(ranked_pids, k_values=[5, 10, 20])

    print("\n" + "=" * 80)
    print(f"QUERY {query_id}: {VALIDATION_QUERIES[query_id]}")
    print("=" * 80)
    print(f"MRR:     {eval_results['MRR']:.3f}")
    print(
        f"\nK=5:   P@5={eval_results['P@5']:.3f}  R@5={eval_results['R@5']:.3f}  "
        f"AP@5={eval_results['AP@5']:.3f}  F1@5={eval_results['F1@5']:.3f}  "
        f"NDCG@5={eval_results['NDCG@5']:.3f}"
    )
    print(
        f"K=10:  P@10={eval_results['P@10']:.3f}  R@10={eval_results['R@10']:.3f}  "
        f"AP@10={eval_results['AP@10']:.3f}  F1@10={eval_results['F1@10']:.3f}  "
        f"NDCG@10={eval_results['NDCG@10']:.3f}"
    )
    print(
        f"K=20:  P@20={eval_results['P@20']:.3f}  R@20={eval_results['R@20']:.3f}  "
        f"AP@20={eval_results['AP@20']:.3f}  F1@20={eval_results['F1@20']:.3f}  "
        f"NDCG@20={eval_results['NDCG@20']:.3f}"
    )

    evaluators.append(evaluator)
    all_ranked.append(ranked_pids)

# Aggregate metrics across queries
map_score = calculate_map(evaluators, all_ranked, k=20)
mrr_score = calculate_mrr(evaluators, all_ranked)

print("\n" + "=" * 80)
print("AGGREGATE METRICS (Both Queries)")
print("=" * 80)
print(f"MAP@20: {map_score:.3f}")
print(f"MRR:    {mrr_score:.3f}")

# Summary table
summary_data = []
for query_id, query_name in VALIDATION_QUERIES.items():
    evaluator = SearchEvaluator(labels_by_query[query_id])
    ranked_pids = ranked_pids_by_query[query_id]
    eval_results = evaluator.evaluate_all(ranked_pids, k_values=[10, 20])

    summary_data.append({
        'Query': query_name,
        'P@10': f"{eval_results['P@10']:.3f}",
        'R@10': f"{eval_results['R@10']:.3f}",
        'F1@10': f"{eval_results['F1@10']:.3f}",
        'AP@20': f"{eval_results['AP@20']:.3f}",
        'MRR': f"{eval_results['MRR']:.3f}",
        'NDCG@10': f"{eval_results['NDCG@10']:.3f}",
    })

summary_df = pd.DataFrame(summary_data)
print("\n" + "=" * 80)
print("SUMMARY - Validation Queries Evaluation (Provided System)")
print("=" * 80)
print(summary_df.to_string(index=False))

PART 2: EVALUATION METRICS - PROVIDED QUERIES

Validation labels loaded (from validation_labels.csv):
  Query 1: 20 documents (13 relevant)
  Query 2: 20 documents (10 relevant)

QUERY 1: women full sleeve sweatshirt cotton
MRR:     1.000

K=5:   P@5=0.600  R@5=0.231  AP@5=0.483  F1@5=0.333  NDCG@5=0.655
K=10:  P@10=0.700  R@10=0.538  AP@10=0.508  F1@10=0.609  NDCG@10=0.703
K=20:  P@20=0.650  R@20=1.000  AP@20=0.694  F1@20=0.788  NDCG@20=0.873

QUERY 2: men slim jeans blue
MRR:     1.000

K=5:   P@5=0.600  R@5=0.300  AP@5=0.420  F1@5=0.400  NDCG@5=0.616
K=10:  P@10=0.600  R@10=0.600  AP@10=0.408  F1@10=0.600  NDCG@10=0.615
K=20:  P@20=0.500  R@20=1.000  AP@20=0.627  F1@20=0.667  NDCG@20=0.833

AGGREGATE METRICS (Both Queries)
MAP@20: 0.660
MRR:    1.000

SUMMARY - Validation Queries Evaluation (Provided System)
                              Query  P@10  R@10 F1@10 AP@20   MRR NDCG@10
women full sleeve sweatshirt cotton 0.700 0.538 0.609 0.694 1.000   0.703
                men slim jean