# Part 1: Indexing

In [27]:
import os
import sys
# Add project root to path
sys.path.insert(0, os.path.abspath('../../'))

# Import preprocessing functions
from myapp.preprocessing.text_processing import build_query_terms


In [None]:
import pickle

# Relative path of the pickled, already-preprocessed corpus.
cache_path = '../../data/processed/preprocessed_corpus.pkl'

# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so only open cache files that this project produced itself.
with open(cache_path, 'rb') as f:
    preprocessed_data = pickle.load(f)

# First record, kept around to show what a single document looks like.
sample = preprocessed_data[0]

# Flatten the "searchable_text" entries of every document into one list
# (duplicates kept), then collapse that list into the set of distinct terms.
all_tokens = [
    tok
    for record in preprocessed_data
    for tok in record["searchable_text"]
]
vocabulary = set()
vocabulary.update(all_tokens)

# Quick sanity report on what was loaded.
print(f"  Total documents: {len(preprocessed_data)}")
print(f"  Sample structure keys: {list(sample.keys())}")
print(f"  Vocabulary size: {len(vocabulary)}")
print(f"  PID accessible at root: {'pid' in sample}")

  Total documents: 28080
  Sample structure keys: ['pid', 'searchable_text', 'metadata', 'original']
  Vocabulary size: 20906
  PID accessible at root: True
TKPFCZ9EA7H5FYZH


In [6]:
from collections import defaultdict
import time

# Build an inverted index: term id -> set of positions (within
# preprocessed_data) of the documents whose "searchable_text" contains the term.

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Number the terms in sorted order. Iterating a set of strings follows hash
# order, which changes between interpreter sessions, so enumerating the raw
# set would hand out different ids after every kernel restart.
term_index = {term: term_id for term_id, term in enumerate(sorted(vocabulary))}
index = defaultdict(set)

for doc_pos, doc in enumerate(preprocessed_data):
    for term in doc['searchable_text']:
        # A set per term: each document is recorded once, however many
        # times the term occurs in it.
        index[term_index[term]].add(doc_pos)

print(f"  Total indexed terms: {len(index)}")
print(f"  Indexing time: {time.perf_counter() - start_time:.2f} seconds")

  Total indexed terms: 20906
  Indexing time: 0.35 seconds


In [40]:
def search(query, index):
    """Return the documents that contain every term of ``query`` (boolean AND).

    The raw query is first passed through ``build_query_terms`` (imported from
    the project's preprocessing module; presumably it applies the same
    normalization as the corpus -- confirm) and the processed terms are
    printed. Each term is then mapped to its id through the notebook-level
    ``term_index`` dictionary and looked up in ``index``.

    Args:
        query: Raw query text as typed by the user.
        index: Mapping from term id to the set of document ids containing
            that term.

    Returns:
        A list of document ids in ascending order. The list is empty when the
        query yields no terms, when any term is unknown, or when no document
        contains all of the terms. No relevance ranking is applied.
    """
    query = build_query_terms(query)
    print(f"Processed query terms: {query}")

    postings = []
    for term in query:
        term_id = term_index.get(term)
        # Use a membership test rather than index[term_id]: subscripting a
        # defaultdict would silently insert an empty entry for the miss.
        if term_id not in index:
            # One unknown term makes the whole conjunction unsatisfiable.
            return []
        postings.append(index[term_id])

    if not postings:
        return []

    # Start from the shortest posting list so the running intersection is as
    # small as possible from the first step.
    postings.sort(key=len)
    matches = set(postings[0]).intersection(*postings[1:])

    # Set iteration order is arbitrary; sort so that slicing the result
    # (e.g. results[:10]) is stable and follows document order.
    return sorted(matches)

In [41]:
# Interactive query against the inverted index.
# NOTE: input() blocks until something is typed, so this cell pauses a
# "Run All" until the user answers.
print("Insert your query:")
query = input()
results = search(query, index)
top = 10  # maximum number of hits to display

# search() applies no relevance ranking, so "top" here simply means the
# first `top` matches it returned.
print(f"\n====================\n Top {top} results for query: '{query}'")
if not results:
    # Without this line an unmatched query shows a header followed by
    # nothing, which reads like a display glitch rather than "no hits".
    print("  No documents match all query terms.")
for doc_id in results[:top]:
    doc = preprocessed_data[doc_id]
    print(f"  PID: {doc['pid']}, Title: {doc['original']['title']}")

Insert your query:
Processed query terms: ['cotton', 'sweatshirt']

 Top 10 results for query: 'cotton sweatshirt'
  PID: SWSFPGX2M7SXUMP7, Title: Full Sleeve Solid Men Sweatshirt
  PID: SWSFUNF6VFJBYZZC, Title: Full Sleeve Printed Men Sweatshirt
  PID: SWSFVZPXWGQBHJZT, Title: Full Sleeve Printed Women Sweatshirt
  PID: SWSFPHFF4H2EDHRH, Title: Full Sleeve Solid Women Sweatshirt
  PID: SWSFPGEZETE9UCZT, Title: Full Sleeve Printed Men Sweatshirt
  PID: SWSFTEBYH3YXT3JZ, Title: Full Sleeve Solid Men Sweatshirt
  PID: SWSFSUDD247YZBKJ, Title: Full Sleeve Printed Women Sweatshirt
  PID: SWSFWE3GFTYKGHJZ, Title: Full Sleeve Graphic Print Women Sweatshirt
  PID: SWSFV4E6G62HGJVG, Title: Full Sleeve Printed Women Sweatshirt
  PID: SWSFTEBYPNVAVZW7, Title: Full Sleeve Solid Men Sweatshirt
