This notebook implements a TF-IDF–based tiered inverted index for efficient top-K retrieval, demonstrated on toy data.

# ==================================================
# Pipeline Overview (Tiered Index with TF-IDF)
# ==================================================
# 0) Toy data
#    docs = {doc_id: sentence}
#
# 1) Tokenizer (leveled-up)
#    - lowercase
#    - regex: extract alphanumeric tokens
#    - remove stopwords (members of the STOP set)
#    - remove tokens of length <= 1
#    -> doc_tokens = {doc_id: [term, term, ...]}
#
# 2) Build basic inverted index
#    inv_index[term][doc_id] = tf
#    -> term -> {doc_id: tf}
#
# 3) Compute DF & IDF
#    df(term) = number of documents containing term
#    idf(term) = log((N+1)/(df+1)) + 1   (toy smoothing)
#    -> idf[term]
#
# 4) TF-IDF postings (sorted)
#    score = tf * idf(term)
#    postings_tfidf[term] = [(doc_id, score), ...] sorted desc
#
# 5) Split into tiers (super simple rule)
#    Toy:    tier1 top-1 / tier2 next-1 / tier3 rest
#    Real-ish: tier1 top-50 / tier2 next-200 / rest
# ==================================================



In [94]:
import re
import math
from typing import List
from collections import defaultdict, Counter

In [95]:
# Step 0. Toy data
# Each sentence's doc_id is its 0-based position in the list below.
docs = dict(enumerate([
    "nlp search search model",
    "search engine ranking",
    "nlp language model",
    "cat dog dog",
]))

In [96]:
# Stopwords that the tokenizer discards.
STOP = {
    "the", "a", "an",    # articles
    "is",                # verb
    "and", "or",         # conjunctions
    "to", "of", "in",    # prepositions
}

In [97]:
# Step 1. Advanced tokenizer
def tokenize(text: str) -> List[str]:
    """Turn raw text into a list of normalized terms.

    The text is lowercased, then every maximal run of ``a-z`` / ``0-9``
    characters becomes a candidate token. Candidates that are in ``STOP``
    or are a single character long are discarded. Order and duplicates of
    the surviving tokens are preserved.
    """
    kept = []
    for match in re.finditer(r"[a-z0-9]+", text.lower()):
        word = match.group()
        # Skip one-character tokens and stopwords.
        if len(word) <= 1 or word in STOP:
            continue
        kept.append(word)
    return kept

In [98]:
# Step 2. Apply tokenizer to each document (doc_id -> token list)
# - Input:  docs = {doc_id: raw_text}
# - Output: doc_tokens = {doc_id: [term1, term2, ...]}
# Pair every doc_id with the tokenized form of its sentence (same key order as docs).
doc_tokens = dict(zip(docs, map(tokenize, docs.values())))

In [99]:
# Show the token list produced for each document.
print("=== doc_tokens ===")
for doc_id in doc_tokens:
    print(doc_id, ":", doc_tokens[doc_id])

=== doc_tokens ===
0 : ['nlp', 'search', 'search', 'model']
1 : ['search', 'engine', 'ranking']
2 : ['nlp', 'language', 'model']
3 : ['cat', 'dog', 'dog']


In [100]:
# Step 3. Build basic inverted index
# - Input:  doc_tokens = {doc_id: [term, term, ...]}
# - Output: inv_index = {term: {doc_id: tf}}

In [101]:
# Inverted index: term -> {doc_id: tf}. A defaultdict creates the inner
# dict the first time a term is seen.
inv_index = defaultdict(dict)

# Walk the tokenized documents produced in Step 2.
for doc_id, tokens in doc_tokens.items():
    term_counts = Counter(tokens)  # term frequency within this document
    for term in term_counts:
        inv_index[term][doc_id] = term_counts[term]

In [102]:
# Pretty-print the inverted index, terms in alphabetical order.
print("\n=== Inverted Index (pretty view) ===")
for term, postings in sorted(inv_index.items()):
    print(f"Term: '{term}'")
    for doc_id in postings:
        tf = postings[doc_id]
        print(f"  doc {doc_id}: tf = {tf}")


=== Inverted Index (pretty view) ===
Term: 'cat'
  doc 3: tf = 1
Term: 'dog'
  doc 3: tf = 2
Term: 'engine'
  doc 1: tf = 1
Term: 'language'
  doc 2: tf = 1
Term: 'model'
  doc 0: tf = 1
  doc 2: tf = 1
Term: 'nlp'
  doc 0: tf = 1
  doc 2: tf = 1
Term: 'ranking'
  doc 1: tf = 1
Term: 'search'
  doc 0: tf = 2
  doc 1: tf = 1


In [103]:
# Compute DF & IDF

In [104]:
def doc_freq_from_inv(inv_index):
    """
    Derive document frequencies from an inverted index.

    inv_index maps term -> postings, where postings holds one entry per
    document containing the term, so DF(term) is the size of its postings.
    Returns a plain dict {term: df}.
    """
    doc_freq = {}
    for term in inv_index:
        doc_freq[term] = len(inv_index[term])
    return doc_freq

In [105]:
# Document frequency per term, taken from the inverted index.
df = doc_freq_from_inv(inv_index)

# Report DF values, terms in alphabetical order.
print("=== Document Frequency (DF) ===")
for term in sorted(df):
    print(f"{term:10s} -> df = {df[term]}")

=== Document Frequency (DF) ===
cat        -> df = 1
dog        -> df = 1
engine     -> df = 1
language   -> df = 1
model      -> df = 2
nlp        -> df = 2
ranking    -> df = 1
search     -> df = 2


In [106]:
# NOTE(review): redundant recomputation. `df` is built from the same
# inv_index in an earlier cell and rebuilt again right before the IDF is
# computed, so this cell does not change any result.
df = doc_freq_from_inv(inv_index)

In [107]:
# Compute IDF with add-one smoothing

In [108]:
# Step 4. Compute IDF with add-one smoothing
# idf(term) = log((N + 1) / (df + 1)) + 1
def compute_idf(df, N):
    """
    Map each term to its smoothed inverse document frequency.

    df: dict {term: document frequency}
    N:  total number of documents

    idf(term) = log((N + 1) / (df + 1)) + 1, so a term that appears in
    every document gets exactly 1.0.
    """
    idf_by_term = {}
    for term in df:
        idf_by_term[term] = math.log((N + 1) / (df[term] + 1)) + 1.0
    return idf_by_term

In [109]:
# Corpus size = number of tokenized documents.
N = len(doc_tokens)

# Document frequencies from the inverted index, then the smoothed IDF.
df = doc_freq_from_inv(inv_index)
idf = compute_idf(df, N)

# Report IDF values, terms in alphabetical order.
print("=== IDF values ===")
for term in sorted(idf):
    print(f"{term:10s} -> idf = {idf[term]:.4f}")


=== IDF values ===
cat        -> idf = 1.9163
dog        -> idf = 1.9163
engine     -> idf = 1.9163
language   -> idf = 1.9163
model      -> idf = 1.5108
nlp        -> idf = 1.5108
ranking    -> idf = 1.9163
search     -> idf = 1.5108


In [110]:
# Step 5. Build TF-IDF postings (term -> [(doc_id, score)])
# - Input:  inv_index (term -> {doc_id: tf}), idf (term -> idf)
# - Output: tfidf_postings (term -> [(doc_id, tf-idf), ...]) sorted desc

In [111]:
# Build per-term postings scored by TF-IDF: score = tf * idf[term].
tfidf_postings = {}
for term, postings in inv_index.items():
    # Named `scored` rather than `list` so the builtin `list` stays usable
    # in every cell that runs after this one.
    scored = [(doc_id, tf * idf[term]) for doc_id, tf in postings.items()]

    # Highest score first; equal scores are ordered by ascending doc_id so
    # the result is deterministic.
    scored.sort(key=lambda entry: (-entry[1], entry[0]))
    tfidf_postings[term] = scored

In [112]:
# Show each term's postings with scores rounded to 4 decimals for display.
print("=== TF-IDF Postings (term -> [(doc_id, score)]) ===")
for term, postings in sorted(tfidf_postings.items()):
    scores = [(doc_id, round(score, 4)) for doc_id, score in postings]
    print(f"{term:10s} -> {scores}")

=== TF-IDF Postings (term -> [(doc_id, score)]) ===
cat        -> [(3, 1.9163)]
dog        -> [(3, 3.8326)]
engine     -> [(1, 1.9163)]
language   -> [(2, 1.9163)]
model      -> [(0, 1.5108), (2, 1.5108)]
nlp        -> [(0, 1.5108), (2, 1.5108)]
ranking    -> [(1, 1.9163)]
search     -> [(0, 3.0217), (1, 1.5108)]


In [113]:
# Step 6. Split postings into tiers (toy rule: 1 / 1 / rest)
def split_tiers(sorted_postings, t1=1, t2=1):
    """Partition an already-sorted postings list into three consecutive tiers.

    Args:
        sorted_postings: Sliceable sequence of postings, best first.
        t1: Number of leading postings placed in tier 1.
        t2: Number of following postings placed in tier 2.

    Returns:
        Dict with keys "tier1", "tier2" and "tier3". Tier 3 holds whatever
        remains after the first ``t1 + t2`` postings. Tiers are empty when
        there are not enough postings to fill them.

    Raises:
        ValueError: If ``t1`` or ``t2`` is negative.
    """
    # A negative size would be interpreted as a from-the-end slice offset,
    # which makes the tiers overlap instead of partitioning the list.
    if t1 < 0 or t2 < 0:
        raise ValueError(
            f"tier sizes must be non-negative, got t1={t1}, t2={t2}"
        )

    tier2_end = t1 + t2
    return {
        "tier1": sorted_postings[:t1],
        "tier2": sorted_postings[t1:tier2_end],
        "tier3": sorted_postings[tier2_end:],
    }

# Apply the toy 1 / 1 / rest split to every term's sorted postings.
tiered_index = {}
for term, postings in tfidf_postings.items():
    tiered_index[term] = split_tiers(postings, t1=1, t2=1)

# Print tiered index
print("=== Tiered Index (term -> tier1/tier2/tier3) ===")
for term in sorted(tiered_index):
    print(f"{term:10s} -> {tiered_index[term]}")


=== Tiered Index (term -> tier1/tier2/tier3) ===
cat        -> {'tier1': [(3, 1.916290731874155)], 'tier2': [], 'tier3': []}
dog        -> {'tier1': [(3, 3.83258146374831)], 'tier2': [], 'tier3': []}
engine     -> {'tier1': [(1, 1.916290731874155)], 'tier2': [], 'tier3': []}
language   -> {'tier1': [(2, 1.916290731874155)], 'tier2': [], 'tier3': []}
model      -> {'tier1': [(0, 1.5108256237659907)], 'tier2': [(2, 1.5108256237659907)], 'tier3': []}
nlp        -> {'tier1': [(0, 1.5108256237659907)], 'tier2': [(2, 1.5108256237659907)], 'tier3': []}
ranking    -> {'tier1': [(1, 1.916290731874155)], 'tier2': [], 'tier3': []}
search     -> {'tier1': [(0, 3.0216512475319814)], 'tier2': [(1, 1.5108256237659907)], 'tier3': []}


In [114]:
# Same tier layout as tiered_index, but keeping only the doc_ids (scores dropped).
tiered_docs_only = {}
for term, tiers in tiered_index.items():
    tiered_docs_only[term] = {
        tier_name: [doc_id for doc_id, _score in tiers[tier_name]]
        for tier_name in ("tier1", "tier2", "tier3")
    }

print("=== Tiered Index (docs only) ===")
for term in sorted(tiered_docs_only):
    print(f"{term:10s} -> {tiered_docs_only[term]}")


=== Tiered Index (docs only) ===
cat        -> {'tier1': [3], 'tier2': [], 'tier3': []}
dog        -> {'tier1': [3], 'tier2': [], 'tier3': []}
engine     -> {'tier1': [1], 'tier2': [], 'tier3': []}
language   -> {'tier1': [2], 'tier2': [], 'tier3': []}
model      -> {'tier1': [0], 'tier2': [2], 'tier3': []}
nlp        -> {'tier1': [0], 'tier2': [2], 'tier3': []}
ranking    -> {'tier1': [1], 'tier2': [], 'tier3': []}
search     -> {'tier1': [0], 'tier2': [1], 'tier3': []}
