***The pipeline demonstrates how champion lists reduce search cost while preserving ranking quality.***


Step 1: build_synthetic_corpus
Generates a controlled synthetic corpus for IR experiments.

Step 2:doc_frequency(docs)
Computes document frequency to inspect term distribution.

Step 3:build_sparse_tfidf(docs)
Converts documents into sparse TF-IDF vectors.

Step 4:build_inverted_index_from_tfidf(tfidf)
Builds an inverted index mapping terms to documents.

Step 5: build_champion_list(inv_idx, r)
Selects top-r high-weight documents per term to reduce candidates.

Step 6 : tfidf_to_dense(tfidf, term2idx, N)
Converts sparse TF-IDF vectors into a dense matrix.

Step 7: cosine_dense(a, b)
Computes cosine similarity between two vectors.

Step 8: vectorize_query(query_terms, term2idx, idf)
Represents a query in the same TF-IDF space as documents.

Step 9:full_search_dense(...)
Performs exhaustive search over all documents as a baseline.

Step 10: champion_search_dense(...)
Performs approximate search using champion lists.

In [41]:
import random
from collections import Counter
import numpy as np
import math
from collections import defaultdict

In [42]:
#Step 1 : Build build_synthetic_corpus

We use a synthetic corpus to control DF/TF patterns so that the impact of champion lists on candidate reduction and top-K preservation is clearly observable.

In [43]:
#1-1 vocab 생성 함수

In [44]:
def make_vocab_sets(

    topics_n=5,
    topic_vocab_size=120,
    common_vocab_size=120,
    head_terms_n=25,
    super_common_n=15
):
    """
    Create controlled token sets for a synthetic corpus.
    Returns:
      topics: dict[str, list[str]]   (topic -> topic-specific terms)
      common: list[str]             (moderately common terms)
      head_terms: list[str]         (high-DF terms to create long posting lists)
      super_common: list[str]       (stopword-like ultra-high-DF terms)
    """

    # Iterate over the topic vocabulary size and generate unique terms
    # by combining the topic index (i) and the word index (j).
    # Topic-specific vocabularies: t{i}_w{j}
    topics = {
        f"topic{i}": [f"t{i}_w{j}" for j in range(topic_vocab_size)]
        #This loop iterates over the number of topics.
        for i in range(topics_n)
    }

    # Common terms shared across topics: c_w{j}
    common = [f"c_w{j}" for j in range(common_vocab_size)]

    # Head terms: controllable high-frequency tokens
    # Head terms simulate ultra-high-frequency tokens to create long posting lists.
    head_terms = [f"head_w{j}" for j in range(head_terms_n)]

    # Super common terms: stopword-like ultra-high DF tokens
    super_common = [f"sw_w{j}" for j in range(super_common_n)]

    return topics, common, head_terms, super_common


In [45]:
# 1-2 build_synthetic_corpus_v2()
# → Responsible only for the document generation flow

In [46]:
def build_synthetic_corpus_v2(
    n_docs=100,
    seed=42,
    topics_n=5,
    topic_vocab_size=120,
    common_vocab_size=120,
    doc_len_range=(20, 35),
    topic_ratio=0.7,
    burst_prob=0.45,
    burst_repeat_range=(2, 6),
    super_common_n=15,
    super_common_ratio=0.12,
    super_common_df_target=0.95
):
    """
    Extended synthetic corpus generator for champion list experiments.
    """
    random.seed(seed)

    # Structure the vocabulary into disjoint sets by role:
    # topic-specific, common, head, and super-common.
    topics, common, head_terms, super_common = make_vocab_sets(
        topics_n=topics_n,
        topic_vocab_size=topic_vocab_size,
        common_vocab_size=common_vocab_size,
        head_terms_n=25,
        super_common_n=super_common_n
    )

    # Store document texts and their topic labels (labels help for kNN/classification).
    docs = []
    labels = []

    for doc_id in range(n_docs):
        # Randomly assign a topic ID to this document
        topic_id = random.randrange(topics_n)
        topic_name = f"topic{topic_id}"
        labels.append(topic_id)

        # Choose document length
        L = random.randint(*doc_len_range)

        # Allocate super-common tokens (stopword-like) in most documents,
        # so that these terms have very high DF (df ≈ N).
        n_super = 0
        if random.random() < super_common_df_target:
            n_super = max(1, int(L * super_common_ratio))

        # Distribute remaining tokens between topic-specific and regular common terms
        L_rem = max(1, L - n_super)
        n_topic = int(L_rem * topic_ratio)
        n_common = L_rem - n_topic

        words = []
        words += random.choices(topics[topic_name], k=n_topic)
        words += random.choices(common, k=n_common)

        if n_super > 0:
            words += random.choices(super_common, k=n_super)

        # Add head terms to create long posting lists
        if random.random() < 0.55:
            words += random.sample(head_terms, k=random.randint(1, 3))

        # Burst to create high TF for some topic terms
        if random.random() < burst_prob:
            burst_word = random.choice(topics[topic_name])
            words += [burst_word] * random.randint(*burst_repeat_range)

        random.shuffle(words)
        docs.append(" ".join(words))

    return docs, labels, topics, common, head_terms, super_common


In [47]:
docs, labels, topics, common, head_terms, super_common = build_synthetic_corpus_v2(
    n_docs=100,
    seed=42
)

In [48]:
#def doc_frequency(docs): Quick inspection of term-level document frequency (DF)

In [49]:
def doc_frequency(docs):
    df = Counter()
    for text in docs:
        # Each document is split into whitespace-separated tokens,
        # and each term contributes at most 1 to document frequency (DF).
        df.update(set(text.split()))
    return df

df = doc_frequency(docs)

# Head terms are expected to have relatively large DF values
print("Sample head terms df:")
for t in head_terms[:10]:
    print(t, df[t])

# Some terms should be very frequent, while others should be rare,
# so that the champion list becomes meaningful
print("\nRare-ish terms df (df == 1 ~ 3) examples:")
rare = [t for t, c in df.items() if 1 <= c <= 3]


Sample head terms df:
head_w0 7
head_w1 1
head_w2 5
head_w3 6
head_w4 8
head_w5 2
head_w6 10
head_w7 2
head_w8 2
head_w9 7

Rare-ish terms df (df == 1 ~ 3) examples:


In [50]:
#def build_sparse_tfidf(docs):  
# Build sparse TF-IDF representations and auxiliary indices
# for IR and champion list experiments.

In [51]:
import math
from collections import Counter

def build_sparse_tfidf(docs):

    # Step 1. tokenize (already tokenized; split by space)
    # Simple whitespace tokenization is sufficient here
    # because the synthetic corpus already consists of pre-tokenized terms.

    tokenized = [doc.split() for doc in docs]
    N = len(tokenized)

    # Step 2. DF
    # Use set(toks) so that each term contributes at most once per document,
    # which matches the definition of document frequency (DF).

    df = Counter()
    for toks in tokenized:
        df.update(set(toks))

    # Step 3. Add-one smoothed IDF to avoid idf=0
    # Add-one (Laplace) smoothing:
    # - avoids division by zero
    # - keeps IDF well-defined even for very frequent terms
    idf = {
        term: math.log((N + 1) / (df_t + 1)) + 1.0
        for term, df_t in df.items()
    }

    # Step 4. Compute sparse TF-IDF
    # Store TF-IDF vectors in sparse form (only non-zero entries).
    tfidf = {}  # Output dictionary: doc_id -> {term: tf-idf value}

    for doc_id, toks in enumerate(tokenized):
        # 1) Compute term frequency (TF)
        tf = Counter(toks)

        # 2) Compute TF-IDF weights
        tfidf[doc_id] = {
            term: tf_count * idf[term]
            for term, tf_count in tf.items()
        }

    # Step 5. vocabulary index  (IMPORTANT: outside the loop)
    # Build a global vocabulary index for later conversion
    # to dense vectors and cosine similarity computation.
    vocab = sorted(df.keys())
    term2idx = {term: idx for idx, term in enumerate(vocab)}

    # Return after all documents are processed
    return tfidf, term2idx, idf


In [52]:
tfidf, term2idx, idf = build_sparse_tfidf(docs)

In [60]:
from collections import defaultdict

def build_inverted_index_from_tfidf(tfidf):
    """
    Build an inverted index from sparse TF-IDF representations.

    Args:
        tfidf (dict[int, dict[str, float]]):
            Mapping from document ID to a sparse TF-IDF vector.
            Example:
              {
                0: {"t0_w1": 0.32, "c_w5": 0.11},
                1: {"t1_w3": 0.45, "head_w2": 0.08},
                ...
              }

    Returns:
        inv_idx (dict[str, list[tuple[int, float]]]):
            Inverted index mapping each term to a posting list of
            (doc_id, tf-idf weight) pairs.
            Example:
              {
                "t0_w1": [(0, 0.32), (5, 0.27), ...],
                "head_w2": [(1, 0.08), (7, 0.06), ...],
                ...
              }
    """

    # Initialize an empty inverted index.
    # defaultdict(list) automatically creates an empty list
    # when a new term is encountered.
    inv_idx = defaultdict(list)

    # Iterate over documents
    for doc_id, term_dict in tfidf.items():
        # term_dict is the sparse TF-IDF vector for this document:
        # {term: tf-idf weight}

        # Iterate over terms appearing in the document
        for term, weight in term_dict.items():
            # Append (doc_id, weight) to the posting list of this term
            inv_idx[term].append((doc_id, weight))

    return inv_idx

In [54]:
inv_idx = build_inverted_index_from_tfidf(tfidf)
print("num terms in inv_idx:", len(inv_idx))

num terms in inv_idx: 720


In [61]:
def build_champion_list(inv_idx, r=30):
    """
    Build a champion list for each term.

    Args:
        inv_idx (dict[str, list[tuple[int, float]]]):
            Inverted index mapping each term to a posting list of
            (doc_id, tf-idf weight) pairs.

        r (int):
            Number of top documents to keep per term (champion size).

    Returns:
        champion (dict[str, list[tuple[int, float]]]):
            Champion list mapping each term to its top-r postings,
            sorted by descending TF-IDF weight.
    """

    champion = {}

    # Iterate over each term and its posting list
    for term, postings in inv_idx.items():
        # Sort postings by TF-IDF weight in descending order
        # and keep only the top-r documents
        champion[term] = sorted(
            postings,
            key=lambda x: x[1],  # x[1] = TF-IDF weight
            reverse=True
        )[:r]

    return champion

In [62]:
# Build champion list
r = 30
champion = build_champion_list(inv_idx, r=r)

print("Total number of terms:", len(inv_idx))
print("Champion list size per term (r):", r)

# Pick a sample term to inspect
sample_term = next(iter(champion.keys()))

print("\nSample term:", sample_term)
print("Posting list length (full):", len(inv_idx[sample_term]))
print("Champion list length:", len(champion[sample_term]))

print("\nTop-5 champion postings (doc_id, tf-idf):")
for doc_id, weight in champion[sample_term][:5]:
    print(f"doc {doc_id}, weight={weight:.4f}")

Total number of terms: 720
Champion list size per term (r): 30

Sample term: c_w50
Posting list length (full): 8
Champion list length: 8

Top-5 champion postings (doc_id, tf-idf):
doc 0, weight=3.4179
doc 4, weight=3.4179
doc 24, weight=3.4179
doc 28, weight=3.4179
doc 41, weight=3.4179


In [63]:
for t in head_terms[:3]:
    print("\nTerm:", t)
    print("Full postings:", len(inv_idx[t]))
    print("Champion postings:", len(champion[t]))



Term: head_w0
Full postings: 7
Champion postings: 7

Term: head_w1
Full postings: 1
Champion postings: 1

Term: head_w2
Full postings: 5
Champion postings: 5


In [64]:
term_longest = max(inv_idx.keys(), key=lambda t: len(inv_idx[t]))
print("Longest term:", term_longest, "df=", len(inv_idx[term_longest]))
print("Champion len:", len(champion[term_longest]))
print("Top 5:", champion[term_longest][:5])

Longest term: sw_w3 df= 23
Champion len: 23
Top 5: [(1, 4.874133372986627), (34, 4.874133372986627), (63, 4.874133372986627), (81, 4.874133372986627), (4, 2.4370666864933135)]


In [56]:
def tfidf_to_dense(tfidf, term2idx, N):
    V = len(term2idx)
    X = np.zeros((N, V))
    for doc_id, term_dict in tfidf.items():
        for term, val in term_dict.items():
            X[doc_id, term2idx[term]] = val
    return X


N = len(docs)
X_tfidf = tfidf_to_dense(tfidf, term2idx, N)
print("X_tfidf shape:", X_tfidf.shape)


X_tfidf shape: (100, 720)


In [57]:
import numpy as np

def cosine_dense(a, b):
    """
    Compute cosine similarity between two dense vectors.

    Args:
        a (np.ndarray): dense vector (e.g., query vector)
        b (np.ndarray): dense vector (e.g., document vector)

    Returns:
        float: cosine similarity value in [0, 1]
    """
    # Compute L2 norms
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    # Handle zero vectors
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0

    # Cosine similarity = (a · b) / (||a|| * ||b||)
    return float(np.dot(a, b) / (norm_a * norm_b))

In [65]:
import numpy as np
from collections import Counter

def vectorize_query(query_terms, term2idx, idf):
    """
    Convert a query (list of terms) into a dense TF-IDF vector.

    Args:
        query_terms (List[str]): tokenized query terms
        term2idx (dict[str, int]): mapping from term to column index
        idf (dict[str, float]): inverse document frequency values

    Returns:
        np.ndarray: dense query vector of shape (V,)
    """
    # Initialize dense query vector
    q = np.zeros((len(term2idx),), dtype=float)

    # Compute term frequency in the query
    q_tf = Counter(query_terms)

    # Fill the vector with TF-IDF weights
    for term, tf_count in q_tf.items():
        #  # Ignore query terms not seen in the vocabulary
        if term in term2idx:
            q[term2idx[term]] = tf_count * idf.get(term, 0.0)

    return q

In [68]:
# Baseline search that computes cosine similarity against all documents.

In [67]:
def full_search_dense(query_terms, X_tfidf, term2idx, idf, topk=5):

    # Vectorize the query into a dense TF-IDF vector
    q = vectorize_query(query_terms, term2idx, idf)

    scores = []

    # Compute cosine similarity against ALL documents
    for doc_id in range(X_tfidf.shape[0]):
        s = cosine_dense(q, X_tfidf[doc_id])
        if s > 0:
            scores.append((doc_id, s))

    # Sort documents by similarity score (descending)
    scores.sort(key=lambda x: x[1], reverse=True)

    # Return top-k results and the total number of evaluated documents
    return scores[:topk], X_tfidf.shape[0]


In [69]:
# --- Debug / sanity check for full_search_dense ---

query_terms = ["t0_w81", "t0_w84", "head_w6", "c_w50"]  # adjust as you like
topk = 5

full_res, full_cands = full_search_dense(query_terms, X_tfidf, term2idx, idf, topk=topk)

print("Query terms:", query_terms)
print("FULL candidates evaluated:", full_cands)
print(f"FULL top{topk} results (doc_id, score):")
for doc_id, score in full_res:
    print(f"  doc {doc_id:3d}  score={score:.6f}")

# Optional: inspect one returned document snippet
if full_res:
    best_doc_id = full_res[0][0]
    print("\nTop-1 document text (first 120 chars):")
    print(docs[best_doc_id][:120])


Query terms: ['t0_w81', 't0_w84', 'head_w6', 'c_w50']
FULL candidates evaluated: 100
FULL top5 results (doc_id, score):
  doc  37  score=0.231855
  doc  30  score=0.193575
  doc  50  score=0.097691
  doc  28  score=0.092566
  doc   4  score=0.091946

Top-1 document text (first 120 chars):
sw_w8 t0_w108 t0_w9 c_w83 c_w68 t0_w84 c_w110 t0_w45 c_w33 t0_w117 t0_w57 sw_w9 t0_w110 c_w32 t0_w77 t0_w107 head_w6 t0_


#The champion list reduced the candidate set by 80% while preserving the top-5 ranking.

In [70]:
def champion_search_dense(query_terms, X_tfidf, term2idx, idf, champion, topk=5):
    """
    Perform champion-list-based search:
    Instead of scoring all documents, score only candidate documents
    collected from the union of champion postings for the query terms.

    Returns:
        top_results: top-k (doc_id, score)
        num_candidates: number of candidate docs actually scored
    """
    # Vectorize query into a dense TF-IDF vector
    q = vectorize_query(query_terms, term2idx, idf)

    # Collect candidate documents:
    # Take the union of champion postings for each unique query term.
    cand = set()
    for term in set(query_terms):
        for doc_id, _ in champion.get(term, []):
            cand.add(doc_id)

    # Score only candidate documents
    scores = []
    for doc_id in cand:
        s = cosine_dense(q, X_tfidf[doc_id])
        if s > 0:
            scores.append((doc_id, s))

    # Sort by similarity score (descending) and return top-k
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:topk], len(cand)

In [None]:
# --- Debug / comparison: FULL vs CHAMPION ---

In [72]:
query_terms = ["t0_w81", "t0_w84", "head_w6", "c_w50"]
topk = 5

full_res, full_cands = full_search_dense(query_terms, X_tfidf, term2idx, idf, topk=topk)
ch_res, ch_cands = champion_search_dense(query_terms, X_tfidf, term2idx, idf, champion, topk=topk)

print("Query terms:", query_terms)
print("FULL   candidates:", full_cands, "topk:", full_res)
print("CHAMP  candidates:", ch_cands,  "topk:", ch_res)

# Overlap check for top-k
full_top = [d for d, _ in full_res]
ch_top = [d for d, _ in ch_res]
overlap = len(set(full_top) & set(ch_top))

print(f"top{topk} overlap:", overlap, "/", topk)

# Optional: inspect texts of the returned docs
if full_res:
    print("\nFULL top-1 doc preview:", docs[full_res[0][0]][:120])
if ch_res:
    print("CHAMP top-1 doc preview:", docs[ch_res[0][0]][:120])


Query terms: ['t0_w81', 't0_w84', 'head_w6', 'c_w50']
FULL   candidates: 100 topk: [(37, 0.23185539959571902), (30, 0.1935747915713828), (50, 0.0976913145431689), (28, 0.09256649153379709), (4, 0.09194558587983924)]
CHAMP  candidates: 20 topk: [(37, 0.23185539959571902), (30, 0.1935747915713828), (50, 0.0976913145431689), (28, 0.09256649153379709), (4, 0.09194558587983924)]
top5 overlap: 5 / 5

FULL top-1 doc preview: sw_w8 t0_w108 t0_w9 c_w83 c_w68 t0_w84 c_w110 t0_w45 c_w33 t0_w117 t0_w57 sw_w9 t0_w110 c_w32 t0_w77 t0_w107 head_w6 t0_
CHAMP top-1 doc preview: sw_w8 t0_w108 t0_w9 c_w83 c_w68 t0_w84 c_w110 t0_w45 c_w33 t0_w117 t0_w57 sw_w9 t0_w110 c_w32 t0_w77 t0_w107 head_w6 t0_
