# BIR retrieval with inverted files

## Helper functions for the BIRModel

### Feedback class

In [1]:
class Feedback:
    def __init__(self):
        self.assessed = set()
        self.relevant = set()
    
    def is_initial_step(self) -> bool:
        return len(self.assessed) == 0

    def add(self, doc: int, relevant: bool) -> None:
        self.assessed.add(doc)
        if relevant:
            self.relevant.add(doc)
        
    def is_relevant(self, doc: int) -> bool:
        return doc in self.relevant
    
    def is_assessed(self, doc: int) -> bool:
        return doc in self.assessed
    
    def is_not_relevant(self, doc: int) -> bool:
        return (doc in self.assessed) and (doc not in self.relevant)

### TopKList class

In [2]:
from heapq import heappop, heappush, nsmallest

class TopKList:
    def __init__(self, k: int, term_weights: list[tuple[str,float]]):
        self.term_weights = term_weights
        self.docs_heap = []
        self.k = k
        self.terms = [term for term, _ in self.term_weights]
        self.weights = dict(self.term_weights)
    
    def add(self, doc: int, score: float):
        heappush(self.docs_heap, (-score, doc, {'id': doc, 'score': score}))
        # optional pruning if heap grows too large; be careful to not trigger each time for performance
        if len(self.docs_heap) > min(10 * self.k, 100):
            self.docs_heap = nsmallest(self.k, self.docs_heap)

    def __iter__(self):
        for i in range(min(self.k, len(self.docs_heap))):
            yield heappop(self.docs_heap)[2]

### Functions for pretty printing

In [3]:
def print_feedback(text = 'feedback'):
    print(text + ":", ", ".join([('+' if feedback.is_relevant(doc) else '-') + str(doc) for doc in sorted(feedback.assessed, key=lambda doc: (not feedback.is_relevant(doc), doc))]))

# helper function to display result and get feedback
def display_and_get_feedback(n = 8, all_docs = False):
    n = topk.size if n < 0 else n
    print("\n  f  r  id score  document\n-------------------------------------")
    for rank, doc in enumerate(topk):
        # let's assume user provides feedback for n not yet assessed documents
        if not feedback.is_assessed(doc['id']) and n > 0:
            n -= 1
            feedback.add(doc['id'], is_relevant(doc['id']))
        if rank < 10:
            relevancy = '+' if feedback.is_relevant(doc['id']) else '-' if feedback.is_assessed(doc['id']) else ' '
            print("  {relevancy}{rank: >3d} {id: >3d} ({score:.2f})".format(rank = rank + 1, relevancy = relevancy, **doc), documents[doc['id']])
    if all_docs:
        for doc in filter(lambda doc: not feedback.is_assessed(doc), range(1, nDocs + 1)):
            if n > 0:
                n -= 1
                feedback.add(doc, is_relevant(doc))
    print()
    for term in sorted(topk.weights.keys(), key = lambda term: -topk.weights[term]):
        print(term.rjust(16), topk.weights[term])
    print_feedback("\nnext feedback")


## BIR Model Implementation

### Base class

- pruning of terms
- setting weigths based on feedback

In [4]:
import math

class BIRModel:
    """
        Generic class for the evaluation of the BIR model, inherited by the document-at-a-time (DAAT) and 
        term-at-a-time (TAAT) models. This superclass defines the cj-weights including filtering the most
        important terms.
    """
    # set this property to True to remove terms with negative weights
    PRUNE_NEGATIVE_WEIGHTS = False

    # set this property to remove terms with absolute weights smaller than this value
    PRUNE_WEIGHT_THRESHOLD  = False

    # set this property to select top-k weights based on absolute values
    PRUNE_TOPK = False

    # set this property to true to prune non-relevant documents from result list
    PRUNE_NON_RELEVANT = False

    @staticmethod
    def cj_weight(term: str, feedback: Feedback):
        docFreq = len(index[term])
        if feedback.is_initial_step():
            rj = 0.5
            nj = (docFreq + 0.5) / (len(documents) + 1)
            if DEBUG:
                print(term, "rj=", rj, "nj=", nj, "cj=", math.log(rj / (1 - rj) * (1 - nj) / nj))
        else:
            # get postings as set to siplify calculations in Python
            docs = set(index[term])
            # number of assessed relevant documents which have the term
            lj, L = len(feedback.relevant & docs), len(feedback.relevant)
            # number of assessed documents which have the term
            kj, K = len(feedback.assessed & docs), len(feedback.assessed)
            # calculate rj and nj
            rj = (lj + 0.5) / (L + 1)
            nj = (kj - lj + 0.5) / (K - L + 1)
            if DEBUG:
                print(term, "l=", lj, "/", L, "k=", kj, "/", K, "rj=", rj, "nj=", nj, "cj=", math.log(rj / (1 - rj) * (1 - nj) / nj))
        return math.log(rj / (1 - rj) * (1 - nj) / nj)

    @staticmethod
    def filter_terms(terms: list[str], feedback: Feedback) -> list[tuple[str,float]]:
        # remove terms not in vocabulary
        terms = list(filter(lambda t: t in vocabulary, terms))
        # calculate weigths and produce tuples (term, weight)
        term_weights = list(map(lambda t: (t, BIRModel.cj_weight(t, feedback)), terms))
        # filter terms with negative weights
        if BIRModel.PRUNE_NEGATIVE_WEIGHTS:
            print('pruning negative weights')
            term_weights = list(filter(lambda t: t[1] >= 0, term_weights))
        # filter terms with small absolute weights
        if BIRModel.PRUNE_WEIGHT_THRESHOLD:
            print('pruning small weights')
            term_weights = list(filter(lambda t: abs(t[1]) > BIRModel.PRUNE_WEIGHT_THRESHOLD, term_weights))
        # select top-k terms based on absolute values
        if BIRModel.PRUNE_TOPK:
            print('pruning top-k weights')
            term_weights = sorted(term_weights, key = lambda t: (-abs(t[1]),len(index[t[0]]),t[0]))[:BIRModel.PRUNE_TOPK]
        return term_weights

### Document-at-a-time for BIR Model

In [5]:
class BIRModel_DAAT(BIRModel):
    """
        Implements the DAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback):
        # filter terms and obtain weights for terms in order of their importance 
        term_weights = BIRModel.filter_terms(terms, feedback)
        
        # get iterators for each term and fetch first posting
        iters = [iter(index[term]) for (term, _) in term_weights]
        nexts = [next(iter, None) for iter in iters]

        # keep track of all retrieved documents and their score; stored as tuples (doc_id, score)
        topk = TopKList(k, term_weights)
        while not all(e is None for e in nexts):
            # get smallest value from nexts, ignoring None values
            smallest = min(nexts, key = lambda x: x if x is not None else float('inf'))
            # if we have feedback, make sure document is either relevant or not assessed so far
            if not(BIRModel.PRUNE_NON_RELEVANT and feedback.is_not_relevant(smallest)):
                # if so, add it to topk
                score = sum([term_weights[i][1] for i in range(len(nexts)) if nexts[i] == smallest])
                topk.add(smallest, score)
            # for each entry in nexts, fetch next item if entry equals smallest
            for i, e in enumerate(nexts):
                if e is smallest:
                    nexts[i] = next(iters[i], None)
        
        # finsihed, return topk for result iteration
        return topk

### Term-at-a-time for BIR Model

In [6]:
class BIRModel_TAAT(BIRModel):
    """
        Implements the TAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback):
        # filter terms and obtain weights for terms in order of their importance 
        term_weights = BIRModel.filter_terms(terms, feedback)
        doc_scores = {}

        # iterate over terms and fetch postings
        for (term, weight) in term_weights:
            for posting in index[term]:
                # if document is not already in doc_scores, add it
                if posting not in doc_scores:
                    doc_scores[posting] = 0
                # add weight to document score
                doc_scores[posting] += weight

        # we do not need a full sort of doc_scores, but can use the heap in TopKList
        topk = TopKList(k, term_weights)
        for doc, score in doc_scores.items():
            topk.add(doc, score)
        
        # finsihed, return topk for result iteration
        return topk

## Random data example
### Create inverted index
The next section generates random inverted index postings for a set of terms. It simulates the indexing process for Boolean retrieval by associating random document IDs with each term. The `vocabulary` dictionary defines terms and their desired document frequencies (as a %-figure). The generated postings are stored in the `index` dictionary, with each term mapped to a set of corresponding document IDs.

* `nDocs = 100`: Defines the total number of documents (document IDs) as 100.
* `index = {}`: Initializes an empty dictionary to store the postings for each term.
* `DEBUG = False`: A debug flag (we use it later to illustrate code execution).
* `vocabulary`: Defines a dictionary where each term is associated with its desired document frequency (expressed as a percentage).
* `documents`: List of all documents with each entry holding the set of terms in the document

`create_postings(term: str, docFreq: int = None)` takes a term (string) and an optional document frequency (docFreq, integer) as arguments. It generates random postings for the term by creating a set of document IDs. If docFreq is not provided, it generates a random document frequency between 1 and nDocs. The for-loop iterates through each term in the vocabulary dictionary and calls the create_postings function. For each term, it fetches the desired document frequency from the vocabulary (values are percentages) and passes it to create_postings.

`is_relevant(doc: int)` returns True if document is relevant ()

In [7]:
import random

DEBUG = False
nDocs = 100
index = {}
documents = []

# helper function to rate if newly encountered document is relevant
def is_relevant(doc):
    # we check if bird or ostrich is present in the document
    # if 'horse' in documents[doc]:
    #     return random.random() < 0.5
    # if 'bird' in documents[doc]:
    #     return random.random() < 0.5
    # if 'ostrich' in documents[doc]:
    #     return random.random() < 1.0
    return random.random() < 0.8


def create_postings(term: str, docFreq: int = None):
    # create random postings for the term for ids between 1 and nDocs
    if docFreq is None:
        docFreq = random.randint(1, nDocs)
    # create sets with random ids
    index[term] = sorted(random.sample(range(1, nDocs + 1), docFreq))
    # extend feature vectors for documents
    for doc in index[term]:
        documents[doc].add(term)

# define vocabulary and create random postings with given document frequency (in percents)
vocabulary = {
    'dog':       33,
    'cat':       28,
    'horse':     18,
    'rabit':     20,
    'ostrich':   50,
    'bear':      13,
    'tiger':     12,
    'lion':      15,
    'bird':      30
}

# set all feature vectors of documents to empty. We use sets since BIR uses set-of-word model
for doc in range(nDocs + 1):
    documents.append(set())

# call create_postings for each entry in vocabulary to create the inverted index
for word in vocabulary:
    create_postings(word, vocabulary[word] * nDocs // 100)

Let's have a look at the postings for each term:

In [8]:
# print postings with term and list of documents
for term, posting in index.items():
    print(term.ljust(10), str(len(posting)).ljust(4), sorted(posting[:25]))

dog        33   [4, 9, 10, 12, 15, 16, 17, 18, 19, 20, 24, 26, 30, 31, 35, 38, 44, 55, 58, 64, 66, 70, 72, 74, 76]
cat        28   [2, 5, 7, 13, 17, 26, 31, 32, 41, 42, 43, 44, 46, 48, 50, 53, 56, 64, 72, 77, 83, 84, 89, 90, 94]
horse      18   [1, 4, 6, 20, 23, 24, 25, 31, 33, 36, 46, 47, 50, 69, 73, 91, 94, 100]
rabit      20   [1, 2, 14, 15, 22, 23, 25, 29, 31, 35, 43, 44, 47, 48, 54, 72, 81, 85, 90, 96]
ostrich    50   [1, 3, 5, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 28, 29, 30, 32, 33, 35, 36, 39, 40, 45, 48, 49]
bear       13   [14, 20, 22, 30, 33, 37, 50, 66, 73, 80, 81, 84, 96]
tiger      12   [4, 13, 31, 49, 64, 69, 81, 82, 84, 95, 97, 99]
lion       15   [2, 9, 16, 17, 39, 43, 44, 48, 54, 56, 60, 61, 86, 95, 98]
bird       30   [1, 4, 9, 13, 20, 22, 24, 25, 26, 28, 29, 34, 35, 36, 38, 39, 48, 54, 56, 57, 60, 65, 67, 69, 71]


In [9]:
# print a few documents
for doc in range(20):
    print(doc + 1, documents[doc + 1])

1 {'ostrich', 'rabit', 'bird', 'horse'}
2 {'rabit', 'cat', 'lion'}
3 {'ostrich'}
4 {'dog', 'bird', 'horse', 'tiger'}
5 {'ostrich', 'cat'}
6 {'horse'}
7 {'cat'}
8 set()
9 {'dog', 'bird', 'lion', 'ostrich'}
10 {'dog', 'ostrich'}
11 {'ostrich'}
12 {'dog'}
13 {'bird', 'cat', 'tiger'}
14 {'bear', 'rabit'}
15 {'dog', 'rabit'}
16 {'dog', 'lion'}
17 {'dog', 'cat', 'lion'}
18 {'dog', 'ostrich'}
19 {'dog', 'ostrich'}
20 {'dog', 'bear', 'ostrich', 'horse', 'bird'}


### Evaluation - Initial step without feedback

In [10]:
DEBUG = False

# initial step for "cat dog"
query = ['bird', 'horse']
k = 20
feedback = Feedback()
print(' '.join(query))

# set pruning behavior
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = False
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = False
BIRModel.PRUNE_NON_RELEVANT         = True

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback()

bird horse

  f  r  id score  document
-------------------------------------
  -  1   1 (2.36) {'ostrich', 'rabit', 'bird', 'horse'}
  +  2   4 (2.36) {'dog', 'bird', 'horse', 'tiger'}
  +  3  20 (2.36) {'dog', 'bear', 'ostrich', 'horse', 'bird'}
  +  4  24 (2.36) {'dog', 'bird', 'horse', 'ostrich'}
  +  5  25 (2.36) {'rabit', 'bird', 'horse'}
  +  6  36 (2.36) {'ostrich', 'bird', 'horse'}
  +  7  69 (2.36) {'ostrich', 'bird', 'horse', 'tiger'}
  +  8  94 (2.36) {'dog', 'horse', 'bird', 'cat'}
     9   6 (1.51) {'horse'}
    10  23 (1.51) {'ostrich', 'rabit', 'horse'}

           horse 1.5070758997725306
            bird 0.8519707660865959

next feedback: +4, +20, +24, +25, +36, +69, +94, -1


### Evaluation - Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell)

In [20]:
print(' '.join(query))
print_feedback()

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback()

bird horse
feedback: +4, +6, +9, +20, +22, +23, +24, +25, +29, +31, +35, +36, +39, +46, +47, +50, +69, +73, +91, +94, -1, -13, -26, -28, -33, -34, -38, -100

  f  r  id score  document
-------------------------------------
  +  1   6 (1.49) {'horse'}
  +  2  23 (1.49) {'ostrich', 'rabit', 'horse'}
  +  3  31 (1.49) {'dog', 'tiger', 'rabit', 'horse', 'cat'}
  +  4  46 (1.49) {'horse', 'cat'}
  +  5  47 (1.49) {'rabit', 'horse'}
  +  6  50 (1.49) {'bear', 'horse', 'cat'}
  +  7  73 (1.49) {'ostrich', 'bear', 'horse'}
  +  8  91 (1.49) {'horse'}
  +  9   4 (0.92) {'dog', 'bird', 'horse', 'tiger'}
  + 10  20 (0.92) {'dog', 'bear', 'ostrich', 'horse', 'bird'}

           horse 1.4880770554298333
            bird -0.5698489642154517

next feedback: +4, +6, +9, +20, +22, +23, +24, +25, +29, +31, +35, +36, +39, +46, +47, +50, +69, +73, +91, +94, -1, -13, -26, -28, -33, -34, -38, -100


## Small document example for DAAT and TAAT
### Create inverted index

In [21]:
# example from the exercise
nDocs = 0
index = {}
documents = [set()]
vocabulary = {}
stopwords = set([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'i', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
    'were', 'will', 'with'
])

# helper function to rate if newly encountered document is relevant
def is_relevant(doc):
    return doc < 6

def add_document(text: str):
    global nDocs
    nDocs += 1
    terms = set()
    for term in set(text.lower().split(' ')):
        if term in stopwords:
            continue
        terms.add(term)
        if term not in vocabulary:
            index[term] = [nDocs]
            vocabulary[term] = 1
        else:
            index[term].append(nDocs)
            vocabulary[term] += 1
    documents.append(terms)

add_document("Human machine interface for Lab ABC computer applications")
add_document("A survey of user opinion of computer system response time")
add_document("The EPS user interface management system")
add_document("System and human system engineering testing of EPS")
add_document("Relation of user perceived response time to error measurement")

add_document("The generation of random binary unordered trees")
add_document("The intersection graph of paths in trees")
add_document("Graph minors IV Widths of trees and well quasi ordering")
add_document("Graph minors a survey")

In [22]:
from itertools import islice 

# print postings with term and list of documents
for term, posting in islice(index.items(), 20):
    print(term.ljust(14), str(len(posting)).ljust(4), sorted(posting[:25]))

interface      2    [1, 3]
human          2    [1, 4]
lab            1    [1]
machine        1    [1]
computer       2    [1, 2]
applications   1    [1]
abc            1    [1]
opinion        1    [2]
user           3    [2, 3, 5]
system         3    [2, 3, 4]
time           2    [2, 5]
response       2    [2, 5]
survey         2    [2, 9]
management     1    [3]
eps            2    [3, 4]
engineering    1    [4]
testing        1    [4]
measurement    1    [5]
relation       1    [5]
error          1    [5]


In [24]:
# print all documents
print()
for doc in range(len(documents) - 1):
    print(doc + 1, documents[doc + 1])


1 {'interface', 'human', 'machine', 'lab', 'computer', 'applications', 'abc'}
2 {'opinion', 'computer', 'user', 'system', 'time', 'response', 'survey'}
3 {'interface', 'management', 'user', 'system', 'eps'}
4 {'human', 'engineering', 'testing', 'system', 'eps'}
5 {'measurement', 'relation', 'user', 'time', 'error', 'response', 'perceived'}
6 {'trees', 'unordered', 'random', 'binary', 'generation'}
7 {'intersection', 'graph', 'paths', 'trees'}
8 {'widths', 'well', 'trees', 'graph', 'minors', 'quasi', 'iv', 'ordering'}
9 {'graph', 'minors', 'survey'}


## Document-at-a-time evaluation

### Initial step

In [25]:
DEBUG = False

# initial step for "cat dog"
query = ['human', 'computer', 'interaction']
k = 9
feedback = Feedback()
print(' '.join(query))

# set behavior
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = True
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = 5
BIRModel.PRUNE_NON_RELEVANT         = False

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback(k, all_docs = True)

human computer interaction
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   1 (2.45) {'interface', 'human', 'machine', 'lab', 'computer', 'applications', 'abc'}
  +  2   2 (1.22) {'opinion', 'computer', 'user', 'system', 'time', 'response', 'survey'}
  +  3   4 (1.22) {'human', 'engineering', 'testing', 'system', 'eps'}

        computer 1.2237754316221157
           human 1.2237754316221157

next feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9


### Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell) + query expansion with terms from relevant documents

In [32]:
from functools import reduce
print(' '.join(query))
print_feedback()

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback(k)

query = sorted(set(query) | reduce(lambda terms, doc: terms | documents[doc], feedback.relevant, set()))

abc applications computer engineering eps error human interaction interface lab machine management measurement opinion perceived relation response survey system testing time user
feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   2 (6.93) {'opinion', 'computer', 'user', 'system', 'time', 'response', 'survey'}
  +  2   3 (6.93) {'interface', 'management', 'user', 'system', 'eps'}
  +  3   4 (6.26) {'human', 'engineering', 'testing', 'system', 'eps'}
  +  4   1 (3.72) {'interface', 'human', 'machine', 'lab', 'computer', 'applications', 'abc'}
  +  5   5 (2.53) {'measurement', 'relation', 'user', 'time', 'error', 'response', 'perceived'}

          system 2.5336968139574325
            user 2.5336968139574325
        computer 1.8607523407150066
             eps 1.8607523407150066
           human 1.8607523407150066

next feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9
