# 16. Indexing Strategies: BSBI & SPIMI

When dealing with millions of documents, we cannot hold the entire index in memory while building it. We need **Disk-Based Index Construction**.

We will demonstrate two standard algorithms:
1. **BSBI (Block Sort-Based Indexing)**: Sorts (TermID, DocID) pairs.
2. **SPIMI (Single-Pass In-Memory Indexing)**: Builds separate dictionaries for blocks and merges them.

In [1]:
import os
import glob
import shutil
from collections import defaultdict

# Input corpus location and the scratch directory that stands in for "disk".
DATA_DIR = "../data"
DISK_DIR = "../data/disk_simulation"

# Start every run from an empty scratch directory: remove any previous one,
# then recreate it.
if os.path.exists(DISK_DIR):
    shutil.rmtree(DISK_DIR)
os.makedirs(DISK_DIR)

# Collect the document paths in sorted order so the processing order is stable.
doc_pattern = os.path.join(DATA_DIR, "doc*.txt")
files = sorted(glob.glob(doc_pattern))
print(f"Processing {len(files)} documents.")

Processing 60 documents.


## 1. BSBI (Block Sort-Based Indexing)
**Idea**: Map terms to IDs, collect (TermID, DocID) pairs, sort them, and write to disk blocks.

*(Since we are in Python, we simulate blocks.)*

In [2]:
class BSBIIndexer:
    """Block Sort-Based Indexing (BSBI) over batches of document files.

    Terms are mapped to integer TermIDs that are shared by every block.
    Each batch of documents is inverted in memory by sorting its
    (TermID, DocID) pairs, and the resulting block is written to its own
    file on disk.
    """

    def __init__(self, block_size=10, disk_dir=None):
        """Create an empty indexer.

        Args:
            block_size: Number of documents per block. It is only stored
                here; the code that chunks the corpus is expected to read
                it, none of the methods below do.
            disk_dir: Directory that receives the block files. ``None``
                (the default) means the module-level ``DISK_DIR``, looked
                up at the moment a block is written.
        """
        self.term2id = {}
        self.id2term = {}
        self.current_term_id = 0
        self.block_size = block_size
        self.block_count = 0
        self.disk_dir = disk_dir

    def get_term_id(self, term):
        """Return the TermID of *term*, assigning the next free ID if new."""
        if term not in self.term2id:
            self.term2id[term] = self.current_term_id
            self.id2term[self.current_term_id] = term
            self.current_term_id += 1
        return self.term2id[term]

    def invert_block(self, doc_batch):
        """Build the in-memory inverted index for one batch of documents.

        Args:
            doc_batch: Iterable of paths to UTF-8 text files. The file's
                base name is used as its DocID and tokens are obtained by
                whitespace splitting.

        Returns:
            A ``defaultdict(list)`` mapping TermID to the sorted list of
            distinct DocIDs in this batch that contain the term.
        """
        # 1. Parse & convert to (TermID, DocID) pairs.
        pairs = []
        for doc_path in doc_batch:
            doc_id = os.path.basename(doc_path)
            with open(doc_path, 'r', encoding='utf-8') as f:
                tokens = f.read().split()
            pairs.extend((self.get_term_id(token), doc_id) for token in tokens)

        # 2. Sort pairs: by TermID first, then by DocID.
        pairs.sort()

        # 3. Create posting lists. Because the pairs are sorted, duplicates
        #    of the same (TermID, DocID) are adjacent, so comparing with the
        #    last posting is enough to drop them.
        block_index = defaultdict(list)
        for tid, did in pairs:
            postings = block_index[tid]
            if not postings or postings[-1] != did:
                postings.append(did)

        return block_index

    def write_block(self, block_index):
        """Write one inverted block to ``bsbi_block_<n>.txt`` and count it.

        Each line has the form ``term:doc1,doc2,...``.

        NOTE: lines are ordered by TermID (i.e. the order in which terms
        were first seen), not alphabetically by the term text that is
        actually written.

        Args:
            block_index: Mapping of TermID to a list of DocID strings, as
                returned by ``invert_block``.
        """
        out_dir = DISK_DIR if self.disk_dir is None else self.disk_dir
        filename = os.path.join(out_dir, f"bsbi_block_{self.block_count}.txt")
        with open(filename, 'w', encoding='utf-8') as f:
            for tid in sorted(block_index):
                postings = ",".join(block_index[tid])
                term = self.id2term[tid]
                f.write(f"{term}:{postings}\n")
        self.block_count += 1
        print(f"  -> Wrote block {filename}")

# block_size is the number of documents that go into one block.
bsbi = BSBIIndexer(block_size=20)

# Process the corpus one block at a time. The step comes from the indexer's
# own block_size so the setting above is the single source of truth.
for i in range(0, len(files), bsbi.block_size):
    chunk = files[i:i + bsbi.block_size]
    index_block = bsbi.invert_block(chunk)
    bsbi.write_block(index_block)

  -> Wrote block ../data/disk_simulation\bsbi_block_0.txt
  -> Wrote block ../data/disk_simulation\bsbi_block_1.txt
  -> Wrote block ../data/disk_simulation\bsbi_block_2.txt


## 2. SPIMI (Single-Pass In-Memory Indexing)
**Idea**: No TermIDs needed. Just build a dictionary, sort terms *only when the block is full*, and write.

Advantages:
- No need to maintain a global TermID mapping (saves memory)
- Faster (no sorting of pairs)

In [3]:
def spimi_invert(doc_batch, block_id, disk_dir=None):
    """Invert one batch of documents SPIMI-style and write it as a block file.

    Postings are appended straight into a term -> DocID-list dictionary as
    tokens are read; terms are sorted only once, when the block is written.
    Each output line has the form ``term:doc1,doc2,...``. Within a line the
    DocIDs appear in the order the documents occur in *doc_batch*.

    Args:
        doc_batch: Iterable of paths to UTF-8 text files. The file's base
            name is used as its DocID and tokens are obtained by whitespace
            splitting.
        block_id: Number used in the output file name
            ``spimi_block_<block_id>.txt``.
        disk_dir: Directory that receives the block file. ``None`` (the
            default) means the module-level ``DISK_DIR``.
    """
    if disk_dir is None:
        disk_dir = DISK_DIR

    dictionary = defaultdict(list)

    for doc_path in doc_batch:
        doc_id = os.path.basename(doc_path)
        with open(doc_path, 'r', encoding='utf-8') as f:
            tokens = f.read().split()
        for token in tokens:
            postings = dictionary[token]
            # A document's tokens are processed contiguously, so a repeat of
            # the current DocID can only be the last posting.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)

    filename = os.path.join(disk_dir, f"spimi_block_{block_id}.txt")
    with open(filename, 'w', encoding='utf-8') as f:
        # Sort terms only at write time.
        for term in sorted(dictionary):
            postings = ",".join(dictionary[term])
            f.write(f"{term}:{postings}\n")
    print(f"  -> Wrote SPIMI block {filename}")

# Feed the corpus to SPIMI in consecutive 20-document slices, numbering the
# blocks from 0 in the order they are written.
block_id = 0
for start in range(0, len(files), 20):
    spimi_invert(files[start:start + 20], block_id)
    block_id += 1

  -> Wrote SPIMI block ../data/disk_simulation\spimi_block_0.txt
  -> Wrote SPIMI block ../data/disk_simulation\spimi_block_1.txt
  -> Wrote SPIMI block ../data/disk_simulation\spimi_block_2.txt


## 3. Merging Blocks (The Final Step)
Both algorithms end with merging sorted blocks.

In [4]:
def merge_blocks(pattern, disk_dir=None):
    """Merge every block file matching *pattern* into one in-memory index.

    Simplified merge: all matching blocks are loaded and combined in memory.
    In reality, this would be a streaming k-way merge.

    Each non-blank line must have the form ``term:doc1,doc2,...``. The line
    is split on its LAST ':' so that terms which themselves contain ':'
    (e.g. the token ``Note:``) are read back intact; this assumes DocIDs
    never contain ':'.

    Args:
        pattern: Glob pattern for block file names, e.g.
            ``"spimi_block_*.txt"``.
        disk_dir: Directory that holds the block files. ``None`` (the
            default) means the module-level ``DISK_DIR``.

    Returns:
        A ``defaultdict(list)`` mapping each term to the sorted list of
        distinct DocIDs collected from all blocks.

    Raises:
        ValueError: If a non-blank line contains no ':' separator.
    """
    if disk_dir is None:
        disk_dir = DISK_DIR

    # Escape the directory part so only *pattern* is treated as a glob;
    # sort the matches so blocks are always read in the same order.
    block_files = sorted(glob.glob(os.path.join(glob.escape(disk_dir), pattern)))

    # Accumulate into sets so postings repeated across blocks collapse as
    # they are read instead of being stored and removed afterwards.
    postings_by_term = defaultdict(set)
    for bf in block_files:
        with open(bf, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing empty lines
                term, sep, postings = line.rpartition(':')
                if not sep:
                    raise ValueError(f"Malformed index line in {bf}: {line!r}")
                postings_by_term[term].update(postings.split(','))

    # Emit each posting list sorted and free of duplicates.
    final_index = defaultdict(list)
    for term, doc_ids in postings_by_term.items():
        final_index[term] = sorted(doc_ids)

    return final_index

# Combine all SPIMI block files into one index and report how many distinct
# terms it holds.
spimi_pattern = "spimi_block_*.txt"
merged_index = merge_blocks(spimi_pattern)
print(f"\nMerged Index Size: {len(merged_index)} terms.")


Merged Index Size: 572 terms.
