# BIR retrieval with inverted files

## Helper functions for the BIRModel

### Feedback class

In [2]:
class Feedback:
    """
        Stores the relevance judgements given so far. Every judged
        document id is kept in `assessed`; ids that were judged relevant
        are additionally kept in `relevant`.
    """
    def __init__(self):
        self.assessed = set()
        self.relevant = set()

    def is_initial_step(self) -> bool:
        """Return True as long as no document has been assessed."""
        return not self.assessed

    def add(self, doc: int, relevant: bool) -> None:
        """Record a judgement for `doc`; a document is never removed from `relevant` again."""
        buckets = (self.assessed, self.relevant) if relevant else (self.assessed,)
        for bucket in buckets:
            bucket.add(doc)

    def is_relevant(self, doc: int) -> bool:
        """Return True if `doc` was judged relevant."""
        return doc in self.relevant

    def is_assessed(self, doc: int) -> bool:
        """Return True if `doc` received any judgement."""
        return doc in self.assessed

    def is_not_relevant(self, doc: int) -> bool:
        """Return True if `doc` was judged and the judgement was not 'relevant'."""
        return self.is_assessed(doc) and not self.is_relevant(doc)

### TopKList class

In [3]:
from heapq import heappop, heappush, nsmallest
from typing import Callable

class TopKList:
    """
        Maintains a list of top-k documents. Initializer accepts
        a list of tuples (term, weight) to provide information about
        weights used by retrieval model. Implements the iter() interface.
        Takes an optional predicate(doc: int) function to filter documents
        before returning them. For selective predicate, you may have to
        adjust the pruning thresholds.

        Iterating yields at most k dicts {'id': doc, 'score': score} in order
        of descending score (ties: ascending document id). Only documents that
        pass the predicate count towards k. Iteration works on a copy of the
        heap, so the list can be iterated more than once.
    """
    # set this property to define the pruning threshold, None/False to turn pruning off
    # pruning ensures that the number of elements in the heap ranges between
    # first and second value of the tuple (both are raised by k - first value if k is larger,
    # so that pruning can never drop one of the k best documents)
    PRUNING_THRESHOLDS = (1000, 2000)

    def __init__(self, k: int, term_weights: list[tuple[str,float]] = None, predicate: Callable[[int], bool] = None):
        """
            k: maximum number of documents returned by iteration
            term_weights: optional list of (term, weight) tuples; exposed as
                `term_weights`, `terms` and `weights` (empty if omitted)
            predicate: optional filter, called with the document id
        """
        # min-heap of (-score, doc): the best document is always at the root
        self.docs_heap = []
        self.k = k
        self.predicate = predicate
        # always define the attributes so that callers can read them unconditionally
        self.term_weights = term_weights if term_weights else []
        self.terms = [term for term, _ in self.term_weights]
        self.weights = dict(self.term_weights)

    def add(self, doc: int, score: float):
        """Insert a document with its score and prune the heap if it grew too large."""
        # plain (-score, doc) tuples stay comparable even if a document is added twice
        heappush(self.docs_heap, (-score, doc))
        # optional pruning if heap grows too large; be careful to not trigger each time for performance
        thresholds = TopKList.PRUNING_THRESHOLDS
        if thresholds:
            low, high = thresholds
            # never prune below k; keep the distance between both thresholds
            extra = max(0, self.k - low)
            if len(self.docs_heap) > high + extra:
                # nsmallest returns a sorted list, which is a valid heap
                self.docs_heap = nsmallest(low + extra, self.docs_heap)

    def __iter__(self):
        # pop from a copy so that iterating does not consume the stored documents
        heap = list(self.docs_heap)
        n = 0
        while n < self.k and heap:
            neg_score, doc = heappop(heap)
            if self.predicate is None or self.predicate(doc):
                n += 1
                yield {'id': doc, 'score': -neg_score}

In [4]:
# we use these global variables to drive the examples
# (each example section below re-initialises them before building its own index)
DEBUG = False       # when True, BIRModel.cj_weight prints the values it computes
nDocs = 100         # number of documents; document ids run from 1 to nDocs
index = {}          # inverted index: term -> posting list of document ids
documents = []      # documents[doc] is the set of terms of document `doc`
vocabulary = {}     # term -> document frequency

def print_feedback(feedback, text = 'feedback'):
    """
        Prints all assessed documents on one line, prefixed with `text`.
        Relevant documents come first (marked '+'), followed by the
        non-relevant ones (marked '-'); each group is sorted by document id.
    """
    ordered = sorted(feedback.assessed, key=lambda doc: (not feedback.is_relevant(doc), doc))
    labels = []
    for doc in ordered:
        sign = '+' if feedback.is_relevant(doc) else '-'
        labels.append(sign + str(doc))
    print(text + ":", ", ".join(labels))

# helper function to display result and get feedback
def print_topk_and_get_feedback(topk: TopKList, feedback: Feedback, is_relevant: Callable[[int], bool], nFeedback = 5, extra_feedback: bool = False):
    """
        Iterates over `topk`, prints the first 10 results and simulates user feedback.

        topk: result list to iterate; its `weights` are printed afterwards
        feedback: Feedback object that is extended in place
        is_relevant: oracle called once for every newly assessed document id
        nFeedback: maximum number of new judgements collected in this call
        extra_feedback: if True, spend the remaining judgements on not yet
            assessed documents in ascending id order (ids 1..nDocs)
    """
    print("\n  f  r  id score  document\n-------------------------------------")
    for position, doc in enumerate(topk, start = 1):
        doc_id = doc['id']
        # let's assume user provides feedback for n not yet assessed documents
        if nFeedback > 0 and not feedback.is_assessed(doc_id):
            feedback.add(doc_id, is_relevant(doc_id))
            nFeedback -= 1
        # only the first ten rows are displayed, feedback is collected beyond them
        if position > 10:
            continue
        if feedback.is_relevant(doc_id):
            relevancy = '+'
        elif feedback.is_assessed(doc_id):
            relevancy = '-'
        else:
            relevancy = ' '
        row = "  {relevancy}{rank: >3d} {id: >3d} ({score:.2f})".format(rank = position, relevancy = relevancy, **doc)
        print(row, documents[doc_id])
    if extra_feedback:
        for candidate in range(1, nDocs + 1):
            if feedback.is_assessed(candidate):
                continue
            if nFeedback <= 0:
                break
            nFeedback -= 1
            feedback.add(candidate, is_relevant(candidate))
    print()
    # term weights in descending order
    for term, weight in sorted(topk.weights.items(), key = lambda item: -item[1]):
        print(term.rjust(16), weight)
    print_feedback(feedback, "\nnext feedback")


### Functions for pretty printing

## BIR Model Implementation

### Base class

- pruning of terms
- setting weights based on feedback

In [5]:
import math

class BIRModel:
    """
        Common base of the document-at-a-time (DAAT) and term-at-a-time (TAAT)
        evaluation of the BIR model. It provides the cj-weights of the query
        terms and the selection (pruning) of the terms used for scoring.
        Reads the notebook globals `index`, `documents`, `vocabulary` and `DEBUG`.
    """
    # set this property to True to remove terms with negative weights
    PRUNE_NEGATIVE_WEIGHTS = False

    # set this property to remove terms with absolute weights smaller than this value
    PRUNE_WEIGHT_THRESHOLD  = False

    # set this property to select top-k weights based on absolute values
    PRUNE_TOPK = False

    # set this property to true to prune non-relevant documents from result list
    PRUNE_NON_RELEVANT = False

    @staticmethod
    def cj_weight(term: str, feedback: Feedback):
        """
            Returns cj = log(rj / (1 - rj) * (1 - nj) / nj) for `term`.
            Without feedback rj is fixed to 0.5 and nj is estimated from the
            document frequency; with feedback both are estimated from the
            assessed documents (each with 0.5 / 1 smoothing).
        """
        postings = index[term]
        if feedback.is_initial_step():
            rj = 0.5
            # NOTE(review): in this notebook `documents` carries an unused entry at
            # index 0, so the denominator equals nDocs + 2 - confirm this is intended
            nj = (len(postings) + 0.5) / (len(documents) + 1)
            trace = (term, "rj=", rj, "nj=", nj)
        else:
            # postings as set to allow intersections with the feedback sets
            docs = set(postings)
            # lj of the L relevant documents contain the term
            lj = len(feedback.relevant & docs)
            L = len(feedback.relevant)
            # kj of the K assessed documents contain the term
            kj = len(feedback.assessed & docs)
            K = len(feedback.assessed)
            rj = (lj + 0.5) / (L + 1)
            nj = (kj - lj + 0.5) / (K - L + 1)
            trace = (term, "l=", lj, "/", L, "k=", kj, "/", K, "rj=", rj, "nj=", nj)
        cj = math.log(rj / (1 - rj) * (1 - nj) / nj)
        if DEBUG:
            print(*trace, "cj=", cj)
        return cj

    @staticmethod
    def filter_terms(terms: list[str], feedback: Feedback) -> list[tuple[str,float]]:
        """
            Returns (term, weight) tuples for the query terms found in the
            vocabulary, reduced according to the PRUNE_* class properties.
            The result is only sorted (by descending absolute weight) if
            PRUNE_TOPK is set; otherwise the query order is kept.
        """
        # weights for all terms that are part of the vocabulary
        term_weights = [(term, BIRModel.cj_weight(term, feedback)) for term in terms if term in vocabulary]
        # drop terms with negative weights
        if BIRModel.PRUNE_NEGATIVE_WEIGHTS:
            print('pruning negative weights')
            term_weights = [(term, weight) for term, weight in term_weights if weight >= 0]
        # drop terms with small absolute weights
        if BIRModel.PRUNE_WEIGHT_THRESHOLD:
            print('pruning small weights')
            term_weights = [(term, weight) for term, weight in term_weights if abs(weight) > BIRModel.PRUNE_WEIGHT_THRESHOLD]
        # keep the top-k terms by absolute weight; ties: shorter posting list, then term
        if BIRModel.PRUNE_TOPK:
            print('pruning top-k weights')
            def importance(entry):
                term, weight = entry
                return (-abs(weight), len(index[term]), term)
            term_weights = sorted(term_weights, key = importance)[:BIRModel.PRUNE_TOPK]
        return term_weights

### Document-at-a-time for BIR Model

In [6]:
class BIRModel_DAAT(BIRModel):
    """
        Implements the DAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback, predicate: Callable[[int], bool] = None, selected_docs: set[int] = None):
        """
            Merges the posting lists of all query terms document by document
            and returns a TopKList with the scored documents.

            terms: query terms; filtered and weighted by BIRModel.filter_terms
            k: number of documents the returned TopKList yields
            feedback: relevance feedback used for the weights and, if
                BIRModel.PRUNE_NON_RELEVANT is set, to skip non-relevant documents
            predicate: optional document filter handed to the TopKList
            selected_docs: optional container of document ids; if given, only
                these documents are scored

            The merge relies on posting lists sorted by ascending document id.
        """
        # filter terms and obtain their weights
        term_weights = BIRModel.filter_terms(terms, feedback)
        weights = [weight for _, weight in term_weights]

        # get iterators for each term and fetch first posting
        cursors = [iter(index[term]) for (term, _) in term_weights]
        heads = [next(cursor, None) for cursor in cursors]

        # collects all retrieved documents with their score
        topk = TopKList(k, term_weights, predicate)
        while any(head is not None for head in heads):
            # smallest document id among the current postings, ignoring exhausted lists
            smallest = min(head for head in heads if head is not None)
            # posting lists currently positioned on that document; compare by value:
            # equal ids from different lists are not guaranteed to be the same object
            matching = [i for i, head in enumerate(heads) if head == smallest]
            # if we have feedback, make sure document is either relevant or not assessed so far;
            # if we have selected_docs, make sure document is in it
            pruned = BIRModel.PRUNE_NON_RELEVANT and feedback.is_not_relevant(smallest)
            selected = selected_docs is None or smallest in selected_docs
            if not pruned and selected:
                topk.add(smallest, sum([weights[i] for i in matching]))
            # advance every list that was positioned on the smallest document
            for i in matching:
                heads[i] = next(cursors[i], None)

        # finished, return topk for result iteration
        return topk

### Term-at-a-time for BIR Model

In [7]:
class BIRModel_TAAT(BIRModel):
    """
        Implements the TAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback, predicate: Callable[[int], bool] = None, selected_docs: set[int] = None):
        """
            Processes one posting list after the other, accumulates the scores
            per document and returns a TopKList with the scored documents.

            terms: query terms; filtered and weighted by BIRModel.filter_terms
            k: number of documents the returned TopKList yields
            feedback: relevance feedback used for the weights and, if
                BIRModel.PRUNE_NON_RELEVANT is set, to skip non-relevant documents
            predicate: optional document filter handed to the TopKList
            selected_docs: optional container of document ids; if given, only
                these documents are added to the result (same meaning as in
                BIRModel_DAAT.query)
        """
        # filter terms and obtain their weights
        term_weights = BIRModel.filter_terms(terms, feedback)
        doc_scores = {}

        # iterate over terms and add the term weight to every document in its posting list
        for (term, weight) in term_weights:
            for posting in index[term]:
                doc_scores[posting] = doc_scores.get(posting, 0) + weight

        # we do not need a full sort of doc_scores, but can use the heap in TopKList
        topk = TopKList(k, term_weights, predicate)
        for doc, score in doc_scores.items():
            # apply the same document filters as the DAAT evaluation
            if BIRModel.PRUNE_NON_RELEVANT and feedback.is_not_relevant(doc):
                continue
            if selected_docs is not None and doc not in selected_docs:
                continue
            topk.add(doc, score)

        # finished, return topk for result iteration
        return topk

## Random data example
### Create inverted index
The next section generates random inverted index postings for a set of terms. It simulates the indexing process for Boolean retrieval by associating random document IDs with each term. The `vocabulary` dictionary records each term together with its document frequency (the number of documents that contain the term). The generated postings are stored in the `index` dictionary, with each term mapped to a sorted list of the corresponding document IDs.

* `nDocs = 100`: Defines the total number of documents (document IDs) as 100.
* `index = {}`: Initializes an empty dictionary to store the postings for each term.
* `DEBUG = False`: A debug flag (we use it later to illustrate code execution).
* `vocabulary`: Defines a dictionary where each term is associated with its document frequency (the number of documents that contain the term).
* `documents`: List of all documents with each entry holding the set of terms in the document

`create_postings(term: str, docFreq: int = None)` takes a term (string) and an optional document frequency (docFreq, integer) as arguments. It generates random postings for the term by sampling `docFreq` distinct document IDs and storing them as a sorted list. If docFreq is not provided, it generates a random document frequency between 1 and nDocs. The for-loop iterates through each term in the `terms` list and calls the create_postings function with a random document frequency between 10% and 50% of nDocs.

`is_relevant(doc: int)` simulates the user's relevance judgement: it ignores the document ID and returns True with a probability of 0.8.

In [8]:
import random

DEBUG = False
nDocs = 100
index = {}
documents = []
vocabulary = {}

# fixed seed so that a fresh "Restart & Run All" reproduces the same index and judgements
SEED = 42
random.seed(SEED)

# helper function to rate if newly encountered document is relevant
def is_relevant(doc):
    """Simulated judgement: ignores `doc` and returns True with probability 0.8."""
    return random.random() < 0.8

# helper function to create random postings with given document frequency
def create_postings(term: str, docFreq: int = None):
    """
        Creates a random posting list for `term` and registers it in
        `index`, `vocabulary` and `documents`.

        term: term to create (or replace) the postings for
        docFreq: number of documents that contain the term; if omitted,
            a random value between 1 and nDocs is used
    """
    if docFreq is None:
        docFreq = random.randint(1, nDocs)
    # forget the previous postings if the term is created a second time
    for doc in index.get(term, ()):
        documents[doc].discard(term)
    # sorted list of distinct random ids in 1..nDocs
    index[term] = sorted(random.sample(range(1, nDocs + 1), docFreq))
    vocabulary[term] = docFreq
    # extend feature vectors for documents
    for doc in index[term]:
        documents[doc].add(term)

# set all feature vectors of documents to empty. We use sets since BIR uses set-of-word model
# (index 0 stays unused because document ids start at 1)
for _ in range(nDocs + 1):
    documents.append(set())

# we use some animal terms to create random documents
terms = ['dog', 'cat', 'horse', 'rabit', 'ostrich', 'bear', 'tiger', 'lion', 'bird']

# call create_postings for each term to create the inverted index
for term in terms:
    create_postings(term, random.randint(nDocs // 10, nDocs // 2))

Let's have a look at the postings for each term:

In [9]:
# print postings with term and list of documents
# (one row per term: term, length of its posting list, at most the first 25 document ids)
for term, posting in index.items():
    print(term.ljust(10), str(len(posting)).ljust(4), sorted(posting[:25]))

dog        28   [2, 6, 8, 10, 13, 20, 23, 26, 27, 30, 32, 39, 44, 45, 51, 53, 55, 58, 59, 62, 65, 67, 73, 82, 84]
cat        26   [2, 6, 11, 12, 15, 16, 22, 24, 26, 28, 32, 36, 38, 45, 47, 51, 57, 58, 68, 69, 70, 76, 79, 85, 88]
horse      46   [5, 8, 9, 10, 12, 13, 15, 16, 17, 20, 24, 30, 34, 36, 37, 41, 42, 43, 44, 45, 46, 47, 48, 53, 54]
rabit      40   [2, 4, 6, 7, 13, 15, 17, 21, 25, 28, 29, 30, 31, 38, 41, 43, 45, 46, 47, 48, 49, 51, 60, 61, 63]
ostrich    34   [4, 11, 12, 13, 16, 18, 21, 23, 24, 25, 27, 32, 33, 35, 42, 44, 51, 52, 54, 55, 56, 59, 69, 70, 73]
bear       50   [4, 5, 6, 7, 8, 12, 14, 15, 18, 20, 21, 22, 24, 30, 34, 35, 37, 38, 41, 42, 43, 45, 47, 48, 50]
tiger      14   [9, 11, 18, 21, 32, 54, 60, 61, 63, 74, 75, 77, 88, 95]
lion       49   [2, 4, 5, 6, 11, 12, 13, 19, 20, 25, 28, 30, 31, 32, 34, 36, 37, 40, 46, 47, 48, 50, 53, 54, 55]
bird       50   [3, 5, 6, 7, 9, 10, 12, 16, 17, 21, 22, 27, 30, 33, 34, 35, 36, 37, 38, 42, 44, 45, 46, 50, 51]


In [10]:
# print a few documents
# (term sets of documents 1..20; `doc + 1` skips entry 0 of `documents`)
for doc in range(20):
    print(doc + 1, documents[doc + 1])

1 set()
2 {'dog', 'cat', 'lion', 'rabit'}
3 {'bird'}
4 {'bear', 'ostrich', 'rabit', 'lion'}
5 {'horse', 'lion', 'bird', 'bear'}
6 {'dog', 'rabit', 'bird', 'bear', 'lion', 'cat'}
7 {'bear', 'rabit', 'bird'}
8 {'horse', 'dog', 'bear'}
9 {'horse', 'bird', 'tiger'}
10 {'horse', 'dog', 'bird'}
11 {'ostrich', 'cat', 'lion', 'tiger'}
12 {'bird', 'bear', 'lion', 'horse', 'ostrich', 'cat'}
13 {'dog', 'rabit', 'lion', 'horse', 'ostrich'}
14 {'bear'}
15 {'horse', 'cat', 'rabit', 'bear'}
16 {'horse', 'ostrich', 'cat', 'bird'}
17 {'horse', 'rabit', 'bird'}
18 {'bear', 'ostrich', 'tiger'}
19 {'lion'}
20 {'horse', 'dog', 'lion', 'bear'}


### Evaluation - Initial step without feedback

In [13]:
# set to True to let BIRModel.cj_weight print the values it computes
DEBUG = False

# initial step for "bird horse"
query = ['bird', 'horse']
k = 20
# fresh, empty feedback: the weights of this run are computed without judgements
feedback = Feedback()
print(' '.join(query))

# set pruning behavior
# (class-level properties: they stay in effect for later cells until they are changed again)
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = False
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = False
BIRModel.PRUNE_NON_RELEVANT         = True

# (optional) enable a predicate for the filtering step
predicate = None
# predicate = lambda doc: doc % 2 == 0
# predicate = lambda doc: doc % 2 == 1
# (optional) restrict scoring to the given document ids
selected_docs = None
# selected_docs = list(range(10))

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback, predicate, selected_docs=selected_docs)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5)

bird horse

  f  r  id score  document
-------------------------------------
  +  1   5 (0.20) {'horse', 'lion', 'bird', 'bear'}
  +  2   9 (0.20) {'horse', 'bird', 'tiger'}
  +  3  10 (0.20) {'horse', 'dog', 'bird'}
  +  4  12 (0.20) {'bird', 'bear', 'lion', 'horse', 'ostrich', 'cat'}
  -  5  16 (0.20) {'horse', 'ostrich', 'cat', 'bird'}
     6  17 (0.20) {'horse', 'rabit', 'bird'}
     7  30 (0.20) {'dog', 'rabit', 'bird', 'bear', 'lion', 'horse'}
     8  34 (0.20) {'horse', 'lion', 'bird', 'bear'}
     9  36 (0.20) {'horse', 'cat', 'lion', 'bird'}
    10  37 (0.20) {'horse', 'lion', 'bird', 'bear'}

           horse 0.17693070815907844
            bird 0.019608471388376337

next feedback: +5, +9, +10, +12, -16


### Evaluation - Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell)

In [26]:
# reuses query, k, feedback, predicate and selected_docs from the initial step;
# the weights are now computed from the feedback collected so far
print(' '.join(query))
print_feedback(feedback)

# run query, display result, and get feedback
# (every execution of this cell adds up to 5 new judgements to `feedback`)
topk = BIRModel_DAAT.query(query, k, feedback, predicate, selected_docs=selected_docs)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5)

bird horse
feedback: +3, +5, +7, +8, +9, +10, +12, +13, +15, +17, +20, +21, +24, +27, +30, +33, +34, +35, +36, +37, +38, +41, +42, +44, +45, +46, +47, +51, +53, +54, +55, +57, +58, +60, +63, +67, +69, +71, +72, +73, +74, +76, +77, +83, +84, +86, +88, +89, +92, +95, +97, -6, -16, -22, -43, -48, -50, -56, -62, -66, -70, -75, -78, -80, -94

  f  r  id score  document
-------------------------------------
  +  1   5 (1.12) {'horse', 'lion', 'bird', 'bear'}
  +  2   9 (1.12) {'horse', 'bird', 'tiger'}
  +  3  10 (1.12) {'horse', 'dog', 'bird'}
  +  4  12 (1.12) {'bird', 'bear', 'lion', 'horse', 'ostrich', 'cat'}
  +  5  17 (1.12) {'horse', 'rabit', 'bird'}
  +  6  30 (1.12) {'dog', 'rabit', 'bird', 'bear', 'lion', 'horse'}
  +  7  34 (1.12) {'horse', 'lion', 'bird', 'bear'}
  +  8  36 (1.12) {'horse', 'cat', 'lion', 'bird'}
  +  9  37 (1.12) {'horse', 'lion', 'bird', 'bear'}
  + 10  42 (1.12) {'horse', 'ostrich', 'bird', 'bear'}

            bird 0.7124112323752194
           horse 0.410494

## Small document example for DAAT and TAAT
### Create inverted index

In [76]:
# example from the exercise
nDocs = 0
index = {}
# entry 0 is a placeholder so that documents[doc] matches the document ids starting at 1
documents = [set()]
vocabulary = {}
stopwords = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'i', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
    'were', 'will', 'with'
}

# helper function to rate if newly encountered document is relevant
def is_relevant(doc):
    """Judgement used for the small example: documents 1 to 5 are relevant."""
    return doc < 6

def add_document(text: str):
    """
        Adds a document to the collection and to the inverted index.

        The text is lower-cased and split on whitespace; stop words are
        skipped and every remaining term is counted once per document.
        The new document receives the next id (nDocs is incremented), which
        is appended to the posting list of each of its terms.
    """
    global nDocs
    nDocs += 1
    terms = set()
    # split() ignores repeated, leading and trailing blanks (no empty-string term);
    # dict.fromkeys removes duplicates but keeps the order of first occurrence,
    # so the index is built in the same order on every run
    for term in dict.fromkeys(text.lower().split()):
        if term in stopwords:
            continue
        terms.add(term)
        if term not in vocabulary:
            index[term] = [nDocs]
            vocabulary[term] = 1
        else:
            index[term].append(nDocs)
            vocabulary[term] += 1
    documents.append(terms)

# documents 1-5 (these are the ids that is_relevant rates as relevant)
add_document("Human machine interface for Lab ABC computer applications")
add_document("A survey of user opinion of computer system response time")
add_document("The EPS user interface management system")
add_document("System and human system engineering testing of EPS")
add_document("Relation of user perceived response time to error measurement")

# documents 6-9 (rated as not relevant by is_relevant)
add_document("The generation of random binary unordered trees")
add_document("The intersection graph of paths in trees")
add_document("Graph minors IV Widths of trees and well quasi ordering")
add_document("Graph minors a survey")

In [77]:
from itertools import islice 

# print postings with term and list of documents
# (limited to the first 20 index entries; per term at most the first 25 document ids)
for term, posting in islice(index.items(), 20):
    print(term.ljust(14), str(len(posting)).ljust(4), sorted(posting[:25]))

computer       2    [1, 2]
human          2    [1, 4]
machine        1    [1]
applications   1    [1]
interface      2    [1, 3]
lab            1    [1]
abc            1    [1]
survey         2    [2, 9]
response       2    [2, 5]
opinion        1    [2]
user           3    [2, 3, 5]
time           2    [2, 5]
system         3    [2, 3, 4]
eps            2    [3, 4]
management     1    [3]
engineering    1    [4]
testing        1    [4]
error          1    [5]
perceived      1    [5]
relation       1    [5]


In [78]:
# print all documents
# (entry 0 of `documents` is the placeholder, hence the shift by one)
print()
for doc in range(len(documents) - 1):
    print(doc + 1, documents[doc + 1])


1 {'computer', 'human', 'applications', 'machine', 'interface', 'lab', 'abc'}
2 {'survey', 'computer', 'response', 'opinion', 'user', 'time', 'system'}
3 {'interface', 'user', 'eps', 'management', 'system'}
4 {'engineering', 'human', 'eps', 'system', 'testing'}
5 {'error', 'response', 'perceived', 'relation', 'user', 'time', 'measurement'}
6 {'binary', 'trees', 'generation', 'random', 'unordered'}
7 {'trees', 'paths', 'graph', 'intersection'}
8 {'minors', 'iv', 'ordering', 'widths', 'graph', 'quasi', 'trees', 'well'}
9 {'minors', 'graph', 'survey'}


### Initial step

In [79]:
# set to True to let BIRModel.cj_weight print the values it computes
DEBUG = False

# initial step for "human computer interaction"
# (terms that are not part of the vocabulary are dropped by BIRModel.filter_terms)
query = ['human', 'computer', 'interaction']
k = 9
# fresh, empty feedback: the weights of this run are computed without judgements
feedback = Feedback()
print(' '.join(query))

# set behavior
# (class-level properties: they stay in effect for later cells until they are changed again)
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = True
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = 5
BIRModel.PRUNE_NON_RELEVANT         = False

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5)

human computer interaction
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   1 (2.45) {'computer', 'human', 'applications', 'machine', 'interface', 'lab', 'abc'}
  +  2   2 (1.22) {'survey', 'computer', 'response', 'opinion', 'user', 'time', 'system'}
  +  3   4 (1.22) {'engineering', 'human', 'eps', 'system', 'testing'}

        computer 1.2237754316221157
           human 1.2237754316221157

next feedback: +1, +2, +4


### Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell) + query expansion with terms from relevant documents

In [85]:
from functools import reduce
# reuses query, k and feedback from the initial step
print(' '.join(query))
print_feedback(feedback)

# run query, display result, and get feedback
# (extra_feedback: remaining judgements are spent on not yet assessed documents in id order)
topk = BIRModel_DAAT.query(query, k, feedback)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5, extra_feedback = True)

# query expansion: add all terms of the documents judged relevant so far;
# the expanded query takes effect the next time this cell is executed
query = sorted(set(query) | reduce(lambda terms, doc: terms | documents[doc], feedback.relevant, set()))

abc applications computer engineering eps error human interaction interface lab machine management measurement opinion perceived relation response survey system testing time user
feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   2 (6.93) {'survey', 'computer', 'response', 'opinion', 'user', 'time', 'system'}
  +  2   3 (6.93) {'interface', 'user', 'eps', 'management', 'system'}
  +  3   4 (6.26) {'engineering', 'human', 'eps', 'system', 'testing'}
  +  4   1 (3.72) {'computer', 'human', 'applications', 'machine', 'interface', 'lab', 'abc'}
  +  5   5 (2.53) {'error', 'response', 'perceived', 'relation', 'user', 'time', 'measurement'}

          system 2.5336968139574325
            user 2.5336968139574325
        computer 1.8607523407150066
             eps 1.8607523407150066
           human 1.8607523407150066

next feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9
