# BIR retrieval with inverted files

## Helper functions for the BIRModel

### Feedback class

In [32]:
class Feedback:
    """
        Collects feedback for documents and provides
        functions to check if document is assessed,
        relevant or not relevant.
    """
    def __init__(self):
        self.assessed = set()
        self.relevant = set()
    
    def is_initial_step(self) -> bool:
        return len(self.assessed) == 0

    def add(self, doc_id: int, relevant: bool) -> None:
        self.assessed.add(doc_id)
        if relevant:
            self.relevant.add(doc_id)
        
    def is_relevant(self, doc_id: int) -> bool:
        return doc_id in self.relevant
    
    def is_assessed(self, doc_id: int) -> bool:
        return doc_id in self.assessed
    
    def is_not_relevant(self, doc_id: int) -> bool:
        return (doc_id in self.assessed) and (doc_id not in self.relevant)

### TopKList class

In [33]:
from heapq import heappop, heappush, nsmallest
from typing import Callable

class TopKList:
    """
        Maintains a list of top-k documents. Initializer accepts
        a list of tuples (term, weight) to provide information about
        weights used by retrieval model. Implements the iter() interface.
        Takes an optional predicate(doc_id: int) function to filter documents
        before returning them. 
    """
    def __init__(self, k: int, term_weights: list[tuple[str,float]] = None, predicate: Callable[[int], bool] = None):
        self.docs_heap = []
        self.k = k
        self.predicate = predicate
        if term_weights:
            self.term_weights = term_weights
            self.terms = [term for term, _ in self.term_weights]
            self.weights = dict(self.term_weights)
    
    def add(self, doc_id: int, score: float):
        heappush(self.docs_heap, (-score, doc_id, {'id': doc_id, 'score': score}))
        # optional (infrequent) pruning if heap grows too large

    def __iter__(self):
        rank = 0
        while rank < self.k and len(self.docs_heap) > 0:
            entry = heappop(self.docs_heap)[2]
            if self.predicate == None or self.predicate(entry['id']):
                rank += 1
                entry['rank'] = rank
                yield entry

In [51]:
# we use this global variables o drive the examples
DEBUG = False
nDocs = 100
index = {}
documents = {}
vocabulary = {}

def print_feedback(feedback, text = 'feedback'):
    print(text + ":", ", ".join([('+' if feedback.is_relevant(doc_id) else '-') + str(doc_id) for doc_id in sorted(feedback.assessed, key=lambda doc_id: (not feedback.is_relevant(doc_id), doc_id))]))

# helper function to display result and get feedback
def print_topk_and_get_feedback(topk: TopKList, feedback: Feedback, is_relevant: Callable[[int], bool], nFeedback = 5, extra_feedback: bool = False):
    print("\n  f  r  id score  document\n-------------------------------------")
    for entry in topk:
        # let's assume user provides feedback for n not yet assessed documents
        if not feedback.is_assessed(entry['id']) and nFeedback > 0:
            nFeedback -= 1
            feedback.add(entry['id'], is_relevant(entry['id']))
        relevancy = '+' if feedback.is_relevant(entry['id']) else '-' if feedback.is_assessed(entry['id']) else ' '
        print("  {relevancy}{rank:>3d} {id:>3d} ({score:.2f})".format(relevancy = relevancy, **entry), documents[entry['id']])
    if extra_feedback:
        for doc_id in filter(lambda doc_id: not feedback.is_assessed(doc_id), documents.keys()):
            if nFeedback <= 0:
                break
            nFeedback -= 1
            feedback.add(doc_id, is_relevant(doc_id))
    print()
    for term in sorted(topk.weights.keys(), key = lambda term: -topk.weights[term]):
        print(term.rjust(16), topk.weights[term])
    print_feedback(feedback, "\nnext feedback")


### Functions for pretty printing

## BIR Model Implementation

### Base class

- pruning of terms
- setting weigths based on feedback

In [52]:
import math

class BIRModel:
    """
        Generic class for the evaluation of the BIR model, inherited by the document-at-a-time (DAAT) and 
        term-at-a-time (TAAT) models. This superclass defines the cj-weights including filtering the most
        important terms.
    """
    # set this property to True to remove terms with negative weights
    PRUNE_NEGATIVE_WEIGHTS = False

    # set this property to remove terms with absolute weights smaller than this value
    PRUNE_WEIGHT_THRESHOLD  = False

    # set this property to select top-k weights based on absolute values
    PRUNE_TOPK = False

    # set this property to true to prune non-relevant documents from result list
    PRUNE_NON_RELEVANT = False

    @staticmethod
    def cj_weight(term: str, feedback: Feedback):
        docFreq = len(index[term])
        if feedback.is_initial_step():
            rj = 0.5
            nj = (docFreq + 0.5) / (len(documents) + 1)
            if DEBUG:
                print(term, "rj=", rj, "nj=", nj, "cj=", math.log(rj / (1 - rj) * (1 - nj) / nj))
        else:
            # get postings as set to siplify calculations in Python
            docs = set(index[term])
            # number of assessed relevant documents which have the term
            lj, L = len(feedback.relevant & docs), len(feedback.relevant)
            # number of assessed documents which have the term
            kj, K = len(feedback.assessed & docs), len(feedback.assessed)
            # calculate rj and nj
            rj = (lj + 0.5) / (L + 1)
            nj = (kj - lj + 0.5) / (K - L + 1)
            if DEBUG:
                print(term, "l=", lj, "/", L, "k=", kj, "/", K, "rj=", rj, "nj=", nj, "cj=", math.log(rj / (1 - rj) * (1 - nj) / nj))
        return math.log(rj / (1 - rj) * (1 - nj) / nj)

    @staticmethod
    def filter_terms(terms: list[str], feedback: Feedback) -> list[tuple[str,float]]:
        # remove terms not in vocabulary
        terms = list(filter(lambda t: t in vocabulary, terms))
        # calculate weigths and produce tuples (term, weight)
        term_weights = list(map(lambda t: (t, BIRModel.cj_weight(t, feedback)), terms))
        # filter terms with negative weights
        if BIRModel.PRUNE_NEGATIVE_WEIGHTS:
            print('pruning negative weights')
            term_weights = list(filter(lambda t: t[1] >= 0, term_weights))
        # filter terms with small absolute weights
        if BIRModel.PRUNE_WEIGHT_THRESHOLD:
            print('pruning small weights')
            term_weights = list(filter(lambda t: abs(t[1]) > BIRModel.PRUNE_WEIGHT_THRESHOLD, term_weights))
        # select top-k terms based on absolute values
        if BIRModel.PRUNE_TOPK:
            print('pruning top-k weights')
            term_weights = sorted(term_weights, key = lambda t: (-abs(t[1]),len(index[t[0]]),t[0]))[:BIRModel.PRUNE_TOPK]
        return term_weights

### Document-at-a-time for BIR Model

In [53]:
class BIRModel_DAAT(BIRModel):
    """
        Implements the DAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback, predicate: Callable[[int], bool] = None, selected_docs: set[int] = None):
        # filter terms and obtain weights for terms in order of their importance 
        term_weights = BIRModel.filter_terms(terms, feedback)
        
        # get iterators for each term and fetch first posting
        iters = [iter(index[term]) for (term, _) in term_weights]
        nexts = [next(iter, None) for iter in iters]

        # keep track of all retrieved documents and their score; stored as tuples (doc_id, score)
        topk = TopKList(k, term_weights, predicate)
        while not all(e is None for e in nexts):
            # get smallest value from nexts, ignoring None values
            smallest = min(nexts, key = lambda x: x if x is not None else float('inf'))
            # if we have feedback, make sure document is either relevant or not assessed so far; if we have selected_docs, make sure document is in it
            if not(BIRModel.PRUNE_NON_RELEVANT and feedback.is_not_relevant(smallest)) and (selected_docs == None or smallest in selected_docs):
                # if so, add it to topk
                score = sum([term_weights[i][1] for i in range(len(nexts)) if nexts[i] == smallest])
                topk.add(smallest, score)
            # for each entry in nexts, fetch next item if entry equals smallest
            for i, e in enumerate(nexts):
                if e is smallest:
                    nexts[i] = next(iters[i], None)
        
        # finsihed, return topk for result iteration
        return topk

### Term-at-a-time for BIR Model

In [54]:
class BIRModel_TAAT(BIRModel):
    """
        Implements the TAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback, predicate: Callable[[int], bool] = None):
        # filter terms and obtain weights for terms in order of their importance 
        term_weights = BIRModel.filter_terms(terms, feedback)
        doc_scores = {}

        # iterate over terms and fetch postings
        for (term, weight) in term_weights:
            for posting in index[term]:
                # if document is not already in doc_scores, add it
                if posting not in doc_scores:
                    doc_scores[posting] = 0
                # add weight to document score
                doc_scores[posting] += weight

        # we do not need a full sort of doc_scores, but can use the heap in TopKList
        topk = TopKList(k, term_weights, predicate)
        for doc_id, score in doc_scores.items():
            topk.add(doc_id, score)
        
        # finsihed, return topk for result iteration
        return topk

## Random data example
### Create inverted index
The next section generates random inverted index postings for a set of terms. It simulates the indexing process for Boolean retrieval by associating random document IDs with each term. The `vocabulary` dictionary defines terms and their desired document frequencies (as a %-figure). The generated postings are stored in the `index` dictionary, with each term mapped to a set of corresponding document IDs.

* `nDocs = 100`: Defines the total number of documents (document IDs) as 100.
* `index = {}`: Initializes an empty dictionary to store the postings for each term.
* `DEBUG = False`: A debug flag (we use it later to illustrate code execution).
* `vocabulary`: Defines a dictionary where each term is associated with its desired document frequency (expressed as a percentage).
* `documents`: List of all documents with each entry holding the set of terms in the document

`create_postings(term: str, docFreq: int = None)` takes a term (string) and an optional document frequency (docFreq, integer) as arguments. It generates random postings for the term by creating a set of document IDs. If docFreq is not provided, it generates a random document frequency between 1 and nDocs. The for-loop iterates through each term in the vocabulary dictionary and calls the create_postings function. For each term, it fetches the desired document frequency from the vocabulary (values are percentages) and passes it to create_postings.

`is_relevant(doc_id: int)` returns True if document is relevant ()

In [55]:
import random

DEBUG = False
nDocs = 100
index = {}
documents = {}
vocabulary = {}

# helper function to rate if newly encountered document is relevant
def is_relevant(doc_id):
    return random.random() < 0.8

# helper function to create random postings with given document frequency
def create_postings(term: str, docFreq: int = None):
    # create sets with random ids
    index[term] = sorted(random.sample(range(1, nDocs + 1), docFreq))
    vocabulary[term] = docFreq
    # extend feature vectors for documents
    for doc_id in index[term]:
        documents[doc_id].add(term)

# set all feature vectors of documents to empty. We use sets since BIR uses set-of-word model
for doc_id in range(1, nDocs + 1):
    documents[doc_id] = set()

# we use some animal terms to create random documents
terms = ['dog', 'cat', 'horse', 'rabit', 'ostrich', 'bear', 'tiger', 'lion', 'bird']

# call create_postings for each entry in vocabulary to create the inverted index
for term in terms:
    create_postings(term, random.randint(nDocs // 10, nDocs // 2))

Let's have a look at the postings for each term:

In [56]:
# print postings with term and list of documents
for term, posting in index.items():
    print(term.ljust(10), str(len(posting)).ljust(4), sorted(posting[:25]))

dog        11   [17, 34, 37, 49, 59, 70, 77, 83, 88, 91, 94]
cat        17   [9, 12, 15, 27, 31, 34, 37, 41, 51, 62, 63, 66, 69, 73, 76, 85, 88]
horse      11   [1, 2, 11, 23, 24, 32, 56, 60, 66, 69, 98]
rabit      16   [5, 8, 11, 16, 27, 32, 37, 52, 58, 61, 65, 67, 73, 76, 87, 97]
ostrich    41   [3, 5, 6, 8, 11, 16, 17, 19, 21, 22, 24, 25, 27, 29, 31, 33, 35, 40, 41, 45, 49, 51, 52, 55, 57]
bear       30   [3, 12, 22, 23, 25, 28, 30, 33, 41, 42, 43, 46, 53, 55, 59, 61, 62, 63, 64, 68, 69, 73, 79, 81, 82]
tiger      50   [1, 8, 9, 11, 12, 17, 18, 19, 21, 22, 24, 25, 31, 35, 36, 38, 41, 42, 43, 46, 47, 48, 49, 50, 51]
lion       26   [3, 5, 10, 16, 19, 23, 29, 32, 35, 40, 46, 55, 57, 62, 65, 67, 72, 73, 74, 75, 76, 79, 89, 90, 99]
bird       39   [2, 4, 5, 6, 10, 11, 13, 16, 17, 19, 24, 25, 26, 29, 34, 35, 36, 37, 43, 46, 50, 53, 57, 58, 60]


In [57]:
# print a few documents
for doc_id in range(1, 21):
    print('{id:>3d} {terms}'.format(id=doc_id, terms=documents[doc_id]))

  1 {'tiger', 'horse'}
  2 {'bird', 'horse'}
  3 {'bear', 'ostrich', 'lion'}
  4 {'bird'}
  5 {'bird', 'ostrich', 'rabit', 'lion'}
  6 {'bird', 'ostrich'}
  7 set()
  8 {'tiger', 'ostrich', 'rabit'}
  9 {'cat', 'tiger'}
 10 {'bird', 'lion'}
 11 {'bird', 'horse', 'tiger', 'rabit', 'ostrich'}
 12 {'cat', 'bear', 'tiger'}
 13 {'bird'}
 14 set()
 15 {'cat'}
 16 {'bird', 'ostrich', 'rabit', 'lion'}
 17 {'bird', 'tiger', 'ostrich', 'dog'}
 18 {'tiger'}
 19 {'bird', 'ostrich', 'tiger', 'lion'}
 20 set()


### Evaluation - Initial step without feedback

In [58]:
DEBUG = False

# initial step for "bird horse"
query = ['bird', 'horse']
k = 10
feedback = Feedback()
print(' '.join(query))

# set pruning behavior
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = False
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = False
BIRModel.PRUNE_NON_RELEVANT         = True

# (optional) enable a predicate for the filtering step
predicate = None
# predicate = lambda doc_id: doc_id % 2 == 0
# predicate = lambda doc_id: doc_id % 2 == 1
selected_docs = None
# selected_docs = list(range(10))

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback, predicate, selected_docs=selected_docs)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5)

bird horse

  f  r  id score  document
-------------------------------------
  -  1   2 (2.49) {'bird', 'horse'}
  +  2  11 (2.49) {'bird', 'horse', 'tiger', 'rabit', 'ostrich'}
  +  3  24 (2.49) {'bird', 'ostrich', 'tiger', 'horse'}
  -  4  60 (2.49) {'bird', 'horse'}
  +  5  66 (2.49) {'cat', 'bird', 'tiger', 'horse'}
     6  98 (2.49) {'bird', 'ostrich', 'tiger', 'horse'}
     7   1 (2.05) {'tiger', 'horse'}
     8  23 (2.05) {'bear', 'lion', 'horse'}
     9  32 (2.05) {'rabit', 'lion', 'horse'}
    10  56 (2.05) {'tiger', 'horse'}

           horse 2.0518915899116053
            bird 0.4427365029053959

next feedback: +11, +24, +66, -2, -60


### Evaluation - Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell)

In [67]:
print(' '.join(query))
print_feedback(feedback)

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback, predicate, selected_docs=selected_docs)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5)

bird horse
feedback: +1, +4, +6, +11, +23, +24, +32, +56, +66, +69, +98, -2, -5, -60

  f  r  id score  document
-------------------------------------
  +  1   1 (0.82) {'tiger', 'horse'}
  +  2  23 (0.82) {'bear', 'lion', 'horse'}
  +  3  32 (0.82) {'rabit', 'lion', 'horse'}
  +  4  56 (0.82) {'tiger', 'horse'}
  +  5  69 (0.82) {'horse', 'bear', 'tiger', 'cat', 'ostrich'}
  +  6  11 (-0.95) {'bird', 'horse', 'tiger', 'rabit', 'ostrich'}
  +  7  24 (-0.95) {'bird', 'ostrich', 'tiger', 'horse'}
  +  8  66 (-0.95) {'cat', 'bird', 'tiger', 'horse'}
  +  9  98 (-0.95) {'bird', 'ostrich', 'tiger', 'horse'}
  + 10   4 (-1.78) {'bird'}

           horse 0.8241754429663491
            bird -1.7788560643921472

next feedback: +1, +4, +6, +11, +23, +24, +32, +56, +66, +69, +98, -2, -5, -60


## Small document example
### Create inverted index

In [68]:
# example from the exercise
nDocs = 0
index = {}
documents = {}
vocabulary = {}
stopwords = set([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'i', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
    'were', 'will', 'with'
])

# helper function to rate if newly encountered document is relevant
def is_relevant(doc_id):
    return doc_id < 6

def add_document(text: str):
    global nDocs
    nDocs += 1
    terms = set()
    for term in set(text.lower().split(' ')):
        if term in stopwords:
            continue
        terms.add(term)
        if term not in vocabulary:
            index[term] = [nDocs]
            vocabulary[term] = 1
        else:
            index[term].append(nDocs)
            vocabulary[term] += 1
    documents[nDocs] = terms

add_document("Human machine interface for Lab ABC computer applications")
add_document("A survey of user opinion of computer system response time")
add_document("The EPS user interface management system")
add_document("System and human system engineering testing of EPS")
add_document("Relation of user perceived response time to error measurement")

add_document("The generation of random binary unordered trees")
add_document("The intersection graph of paths in trees")
add_document("Graph minors IV Widths of trees and well quasi ordering")
add_document("Graph minors a survey")

In [69]:
from itertools import islice 

# print postings with term and list of documents
for term, posting in islice(index.items(), 20):
    print(term.ljust(14), str(len(posting)).ljust(4), sorted(posting[:25]))

lab            1    [1]
applications   1    [1]
interface      2    [1, 3]
human          2    [1, 4]
machine        1    [1]
computer       2    [1, 2]
abc            1    [1]
time           2    [2, 5]
response       2    [2, 5]
user           3    [2, 3, 5]
system         3    [2, 3, 4]
survey         2    [2, 9]
opinion        1    [2]
eps            2    [3, 4]
management     1    [3]
engineering    1    [4]
testing        1    [4]
perceived      1    [5]
measurement    1    [5]
error          1    [5]


In [70]:
# print all documents
for doc_id, terms in documents.items():
    print('{id:>3d} {terms}'.format(id=doc_id, terms=terms))

  1 {'applications', 'lab', 'interface', 'human', 'machine', 'computer', 'abc'}
  2 {'time', 'response', 'user', 'system', 'computer', 'survey', 'opinion'}
  3 {'eps', 'interface', 'user', 'system', 'management'}
  4 {'eps', 'engineering', 'human', 'system', 'testing'}
  5 {'perceived', 'time', 'response', 'user', 'measurement', 'error', 'relation'}
  6 {'random', 'binary', 'unordered', 'generation', 'trees'}
  7 {'graph', 'paths', 'intersection', 'trees'}
  8 {'graph', 'iv', 'well', 'minors', 'widths', 'ordering', 'quasi', 'trees'}
  9 {'survey', 'graph', 'minors'}


### Initial step

In [71]:
DEBUG = False

# initial step for "cat dog"
query = ['human', 'computer', 'interaction']
k = 9
feedback = Feedback()
print(' '.join(query))

# set behavior
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = True
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = 5
BIRModel.PRUNE_NON_RELEVANT         = False

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5)

human computer interaction
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   1 (2.20) {'applications', 'lab', 'interface', 'human', 'machine', 'computer', 'abc'}
  +  2   2 (1.10) {'time', 'response', 'user', 'system', 'computer', 'survey', 'opinion'}
  +  3   4 (1.10) {'eps', 'engineering', 'human', 'system', 'testing'}

        computer 1.0986122886681098
           human 1.0986122886681098

next feedback: +1, +2, +4


### Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell) + query expansion with terms from relevant documents

In [79]:
from functools import reduce
print(' '.join(query))
print_feedback(feedback)

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
print_topk_and_get_feedback(topk, feedback, is_relevant, nFeedback = 5, extra_feedback = True)

query = sorted(set(query) | reduce(lambda terms, doc_id: terms | documents[doc_id], feedback.relevant, set()))

abc applications computer engineering eps error human interaction interface lab machine management measurement opinion perceived relation response survey system testing time user
feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   2 (6.93) {'time', 'response', 'user', 'system', 'computer', 'survey', 'opinion'}
  +  2   3 (6.93) {'eps', 'interface', 'user', 'system', 'management'}
  +  3   4 (6.26) {'eps', 'engineering', 'human', 'system', 'testing'}
  +  4   1 (3.72) {'applications', 'lab', 'interface', 'human', 'machine', 'computer', 'abc'}
  +  5   5 (2.53) {'perceived', 'time', 'response', 'user', 'measurement', 'error', 'relation'}

          system 2.5336968139574325
            user 2.5336968139574325
        computer 1.8607523407150066
             eps 1.8607523407150066
           human 1.8607523407150066

next feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9


### What's next?