In [1]:
import collections
import string

In [2]:
class Document:
    """A piece of text paired with the identifier it is indexed under."""

    def __init__(self, text, doc_id):
        """Store the raw ``text`` and its ``doc_id`` on the instance."""
        self.text, self.doc_id = text, doc_id

In [3]:
class Tokenizer:
    """Turn raw text into a list of lowercase, punctuation-free tokens."""

    # Translation table that deletes every character in string.punctuation.
    # Built once so tokenize() strips punctuation in a single pass instead of
    # testing each character against the punctuation string.
    _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

    @staticmethod
    def tokenize(text):
        """Return the whitespace-separated tokens of ``text``.

        Characters listed in ``string.punctuation`` are removed (not replaced
        by a space, so ``"don't"`` becomes ``"dont"``), the remainder is
        lowercased, and the result is split on runs of whitespace.

        Args:
            text: The string to tokenize.

        Returns:
            A list of token strings; empty if ``text`` contains no tokens.
        """
        return text.translate(Tokenizer._STRIP_PUNCTUATION).lower().split()

In [4]:
class InvertedIndex:
    """Map each token to the IDs of the documents that contain it.

    ``self.index`` is a ``defaultdict`` keyed by token. Each value is either a
    ``set`` of document IDs (uncompressed) or a ``list`` of gaps between
    consecutive sorted document IDs (after ``compress()``). Document IDs must
    support subtraction from/addition to ``0`` (e.g. integers) for the
    compressed form to work.
    """

    def __init__(self):
        """Create an empty index."""
        self.index = collections.defaultdict(set)

    def construct(self, docs):
        """Add every document in ``docs`` to the index.

        Each document must expose ``text`` and ``doc_id`` attributes. Terms
        whose postings were already compressed are expanded back to a set
        before the new ID is added, so this method is safe to call after
        ``compress()``; call ``compress()`` again afterwards to re-encode.

        Args:
            docs: An iterable of document objects.
        """
        for doc in docs:
            for token in Tokenizer.tokenize(doc.text):
                postings = self.index[token]
                if not isinstance(postings, set):
                    # Compressed gap list: a list has no .add(), and appending
                    # a raw ID to gaps would corrupt it, so decode first.
                    postings = set(self.delta_decode(postings))
                    self.index[token] = postings
                postings.add(doc.doc_id)

    def compress(self):
        """Replace every uncompressed postings set with its gap list.

        Postings that are already lists are left untouched, so calling this
        method repeatedly does not re-encode (and thereby corrupt) them.
        """
        # Only values of existing keys are reassigned, which does not change
        # the dict's size and is therefore safe while iterating over it.
        for term, postings in self.index.items():
            if isinstance(postings, set):
                self.index[term] = self.delta_encode(postings)

    def delta_encode(self, postings):
        """Return the gaps between consecutive sorted IDs in ``postings``.

        The first gap is measured from ``0``, so it equals the smallest ID.

        Args:
            postings: An iterable of document IDs in any order.

        Returns:
            A list of gaps, one per ID; empty for empty input.
        """
        ordered = sorted(postings)
        return [cur - prev for prev, cur in zip([0] + ordered, ordered)]

    def delta_decode(self, encoded_postings):
        """Rebuild the sorted document IDs from a list of gaps.

        Args:
            encoded_postings: Gaps as produced by ``delta_encode``.

        Returns:
            A list of document IDs (running sums of the gaps, starting at 0).
        """
        decoded = []
        running = 0
        for gap in encoded_postings:
            running += gap
            decoded.append(running)
        return decoded

In [5]:
def search(query, index):
    """Return the IDs of documents that contain every token of ``query``.

    The query is tokenized with ``Tokenizer.tokenize`` and the postings of all
    tokens are intersected (boolean AND). A token that is absent from the
    index has empty postings, so the overall result is then empty.

    Args:
        query: The query string.
        index: An object exposing an ``index`` mapping (token -> postings) and
            a ``delta_decode`` method. Postings may be a set of raw document
            IDs (index not compressed) or a delta-encoded list.

    Returns:
        A set of matching document IDs; empty if the query has no tokens or
        no document contains all of them.
    """
    tokens = Tokenizer.tokenize(query)

    # None means "no token processed yet". An empty set must stay distinct
    # from that state, otherwise an empty intersection would be overwritten
    # by the postings of the next token.
    result = None
    for token in tokens:
        postings = index.index.get(token, [])
        if isinstance(postings, (set, frozenset)):
            # Uncompressed postings already hold raw document IDs; decoding
            # them as gaps would yield wrong IDs.
            docs = set(postings)
        else:
            docs = set(index.delta_decode(postings))

        result = docs if result is None else result & docs
        if not result:
            # The intersection can only shrink, so stop early.
            break

    return set() if result is None else result