# Vector Space retrieval with inverted files

## Helper functions

### TopKList class

In [15]:
from heapq import heappop, heappush, nsmallest
from typing import Callable

class TopKList:
    """
        Maintains a list of top-k documents. Initializer accepts
        a list of tuples (term, weight) to provide information about
        weights used by retrieval model. Implements the iter() interface.
        Takes an optional predicate(doc: int) function to filter documents
        before returning them. For selective predicate, you may have to
        adjust the pruning thresholds.

        Iterating is destructive: documents are popped from the internal heap,
        so a TopKList can be iterated only once.
    """
    # set this property to define the pruning threshold, None/False to turn pruning off
    # pruning ensures that the number of elements in the heap ranges between
    # first and second value of the tuple
    PRUNING_THRESHOLDS = (1000, 2000)

    def __init__(self, k: int, term_weights: list[tuple[str,float]] = None, predicate: Callable[[int], bool] = None):
        """
            k:            maximum number of documents returned by iteration
            term_weights: optional list of (term, weight) tuples used by the model
            predicate:    optional filter; only documents for which it returns
                          True are returned by iteration
        """
        self.docs_heap = []
        self.k = k
        self.predicate = predicate
        # always define the weight attributes so that callers can rely on them
        # even when no (or an empty list of) term weights was passed
        self.term_weights = term_weights if term_weights is not None else []
        self.terms = [term for term, _ in self.term_weights]
        self.weights = dict(self.term_weights)

    def add(self, doc: int, score: float):
        """Add a document with its score; prunes the heap if it grows too large."""
        # scores are negated because heapq implements a min-heap
        heappush(self.docs_heap, (-score, doc, {'id': doc, 'score': score}))
        thresholds = TopKList.PRUNING_THRESHOLDS
        if not thresholds:
            # pruning is turned off (None/False)
            return
        low, high = thresholds
        if self.k > low:
            # never prune below k, otherwise requested results would be lost
            low, high = self.k, max(high, 2 * self.k)
        # optional pruning if heap grows too large; be careful to not trigger each time for performance
        if len(self.docs_heap) > high:
            # nsmallest returns a sorted list, which is a valid heap
            self.docs_heap = nsmallest(low, self.docs_heap)

    def __iter__(self):
        """Yield at most k documents (dicts with 'id' and 'score') in order of descending score."""
        n = 0
        while n < self.k and self.docs_heap:
            doc = heappop(self.docs_heap)[2]
            if self.predicate is None or self.predicate(doc['id']):
                # count only documents that pass the predicate
                n += 1
                yield doc

### IDF implementations

In [16]:
import math
def idf(doc_freq: int, num_docs: int) -> float:
    """Return the smoothed inverse document frequency log((N + 1) / (df + 1))."""
    smoothed_total = num_docs + 1
    smoothed_freq = doc_freq + 1
    return math.log(smoothed_total / smoothed_freq)

def idf_bm25(doc_freq: int, num_docs: int) -> float:
    """Return the BM25-style idf log((N - df + 0.5) / (df + 0.5)); negative when df > N / 2."""
    docs_without_term = num_docs - doc_freq + 0.5
    docs_with_term = doc_freq + 0.5
    return math.log(docs_without_term / docs_with_term)

### Functions for pretty printing

In [2]:
# These module-level globals drive the examples below.
DEBUG = False       # when True, the weight computation prints intermediate values
nDocs = 100         # number of documents in the collection
index = dict()      # inverted index: term -> postings
documents = list()  # document representations, addressed by document id
topk = None         # most recent result list, read by display_and_get_feedback()

# helper function to display result and get feedback
def display_and_get_feedback(n = 8, all_docs = False):
    """
        Print the result list held in the global `topk`, followed by the
        query term weights in order of descending weight.

        Reads the globals `topk` (iterated once, which consumes it) and
        `documents` (looked up by document id).

        n:        a negative value selects the size k of the result list.
                  NOTE(review): n is not used further in this version of the
                  function; it does not limit the number of printed rows.
        all_docs: accepted for compatibility; not used in this version.
    """
    # TopKList stores its result size as `k`; it has no `size` attribute
    n = topk.k if n < 0 else n
    print("\n  f  r  id score  document\n-------------------------------------")
    for rank, doc in enumerate(topk, start=1):
        print("  {rank: >3d} {id: >3d} ({score:.2f})".format(rank = rank, **doc), documents[doc['id']])
    print()
    # sorting (term, weight) pairs by negated weight lists the largest weight first
    for term, weight in sorted(topk.weights.items(), key = lambda item: -item[1]):
        print(term.rjust(16), weight)

## Vector Space Model Implementation

### Base class

- scoring functions (`cj_weight`)
- term filtering and pruning (`filter_terms`)

In [34]:
import math

class BIRModel:
    """
        Common base of the BIR model evaluation strategies; the document-at-a-time (DAAT)
        and term-at-a-time (TAAT) classes inherit from it. It provides the cj-weights of
        query terms and the selection (pruning) of the most important terms.
    """
    # set this property to True to remove terms with negative weights
    PRUNE_NEGATIVE_WEIGHTS = False

    # set this property to remove terms with absolute weights smaller than this value
    PRUNE_WEIGHT_THRESHOLD  = False

    # set this property to select top-k weights based on absolute values
    PRUNE_TOPK = False

    # set this property to true to prune non-relevant documents from result list
    PRUNE_NON_RELEVANT = False

    @staticmethod
    def cj_weight(term: str, feedback: Feedback):
        """
            Return the cj-weight log(rj / (1 - rj) * (1 - nj) / nj) of a term.

            Without feedback, rj is fixed at 0.5 and nj is estimated from the
            document frequency. With feedback, rj and nj are estimated from the
            assessed documents. Prints the intermediate values when DEBUG is set.
        """
        doc_freq = len(index[term])
        initial = feedback.is_initial_step()
        if initial:
            rj = 0.5
            # NOTE(review): in the examples `documents` carries a placeholder at
            # position 0, so len(documents) + 1 is N + 2 -- confirm this is intended
            nj = (doc_freq + 0.5) / (len(documents) + 1)
        else:
            # postings as a set make the intersections below one-liners
            postings = set(index[term])
            # lj: assessed relevant documents containing the term, L: all assessed relevant documents
            lj = len(feedback.relevant & postings)
            L = len(feedback.relevant)
            # kj: assessed documents containing the term, K: all assessed documents
            kj = len(feedback.assessed & postings)
            K = len(feedback.assessed)
            rj = (lj + 0.5) / (L + 1)
            nj = (kj - lj + 0.5) / (K - L + 1)
        cj = math.log(rj / (1 - rj) * (1 - nj) / nj)
        if DEBUG:
            if initial:
                print(term, "rj=", rj, "nj=", nj, "cj=", cj)
            else:
                print(term, "l=", lj, "/", L, "k=", kj, "/", K, "rj=", rj, "nj=", nj, "cj=", cj)
        return cj

    @staticmethod
    def filter_terms(terms: list[str], feedback: Feedback) -> list[tuple[str,float]]:
        """
            Return (term, weight) tuples for the query terms found in the vocabulary,
            reduced according to the PRUNE_* settings of BIRModel.
        """
        # keep only known terms and attach their cj-weight
        weighted = [(t, BIRModel.cj_weight(t, feedback)) for t in terms if t in vocabulary]

        # drop terms with negative weights
        if BIRModel.PRUNE_NEGATIVE_WEIGHTS:
            print('pruning negative weights')
            weighted = [tw for tw in weighted if tw[1] >= 0]

        # drop terms whose absolute weight does not exceed the threshold
        if BIRModel.PRUNE_WEIGHT_THRESHOLD:
            print('pruning small weights')
            weighted = [tw for tw in weighted if abs(tw[1]) > BIRModel.PRUNE_WEIGHT_THRESHOLD]

        # keep the k terms with the largest absolute weights;
        # ties are broken by shorter posting list, then alphabetically
        if BIRModel.PRUNE_TOPK:
            print('pruning top-k weights')

            def importance(tw):
                return (-abs(tw[1]), len(index[tw[0]]), tw[0])

            weighted.sort(key = importance)
            weighted = weighted[:BIRModel.PRUNE_TOPK]
        return weighted

### Document-at-a-time for BIR Model

In [35]:
class BIRModel_DAAT(BIRModel):
    """
        Implements the DAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback, predicate: Callable[[int], bool] = None):
        """
            Evaluate the query document-at-a-time and return a TopKList.

            All posting lists are traversed in parallel; in each round the
            smallest current posting is scored with the weights of every term
            whose current posting equals it. Posting lists are expected to be
            sorted in ascending order.

            terms:     query terms
            k:         number of documents the returned TopKList yields
            feedback:  relevance feedback used for weights and optional pruning
            predicate: optional document filter passed on to the TopKList
        """
        # filter terms and obtain weights for terms in order of their importance 
        term_weights = BIRModel.filter_terms(terms, feedback)
        
        # one cursor per term; heads[i] is the current posting of term i (None when exhausted)
        cursors = [iter(index[term]) for (term, _) in term_weights]
        heads = [next(cursor, None) for cursor in cursors]

        # collects all retrieved documents with their scores
        topk = TopKList(k, term_weights, predicate)
        while any(head is not None for head in heads):
            # smallest current posting among the cursors that are not exhausted
            smallest = min(head for head in heads if head is not None)
            # compare by value: equal ids from different posting lists are in
            # general distinct objects, so an identity test would miss them
            matching = [i for i, head in enumerate(heads) if head == smallest]
            # if we have feedback, make sure document is either relevant or not assessed so far
            if not (BIRModel.PRUNE_NON_RELEVANT and feedback.is_not_relevant(smallest)):
                score = sum(term_weights[i][1] for i in matching)
                topk.add(smallest, score)
            # advance every cursor that contributed to this document
            for i in matching:
                heads[i] = next(cursors[i], None)
        
        # finished, return topk for result iteration
        return topk

### Term-at-a-time for BIR Model

In [36]:
class BIRModel_TAAT(BIRModel):
    """
        Implements the TAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback, predicate: Callable[[int], bool] = None):
        """
            Evaluate the query term-at-a-time and return a TopKList.

            The posting list of each term is processed completely before the
            next one; partial scores are accumulated per document.

            terms:     query terms
            k:         number of documents the returned TopKList yields
            feedback:  relevance feedback used for weights and optional pruning
            predicate: optional document filter passed on to the TopKList
        """
        # filter terms and obtain weights for terms in order of their importance 
        term_weights = BIRModel.filter_terms(terms, feedback)
        doc_scores = {}

        # iterate over terms and accumulate the weight for every posting
        for (term, weight) in term_weights:
            for posting in index[term]:
                doc_scores[posting] = doc_scores.get(posting, 0) + weight

        # we do not need a full sort of doc_scores, but can use the heap in TopKList
        topk = TopKList(k, term_weights, predicate)
        for doc, score in doc_scores.items():
            # same rule as the DAAT variant: if requested, skip documents
            # that were assessed as not relevant
            if BIRModel.PRUNE_NON_RELEVANT and feedback.is_not_relevant(doc):
                continue
            topk.add(doc, score)
        
        # finished, return topk for result iteration
        return topk

## Random data example
### Create inverted index
The next section generates a random inverted index for a set of terms. It simulates the indexing process by associating random document IDs and random term frequencies with each term. The `terms` list defines the terms; for each term a random document frequency is drawn. The generated postings are stored in the `index` dictionary, with each term mapped to a list of `(document ID, term frequency)` tuples sorted by document ID. The `vocabulary` dictionary records the document frequency (`df`) and the inverse document frequency (`idf`) of each term.

* `nDocs = 40`: Defines the total number of documents (document IDs 1 to 40).
* `index = {}`: Initializes an empty dictionary to store the postings for each term.
* `DEBUG = False`: A debug flag (we use it later to illustrate code execution).
* `vocabulary`: A dictionary where each term is associated with its document frequency (`df`) and its inverse document frequency (`idf`).
* `documents`: List of all documents with each entry holding a dictionary {terms: dict, len: int, norm: float}
  - terms holds the term frequencies as dictionary (key=term, value=term frequency)
  - len is the number of terms in the document (its length)
  - norm is the Euclidean norm of the document's tf-idf vector

`create_postings(term: str, docFreq: int = None)` takes a term (string) and an optional document frequency (docFreq, integer) as arguments. It generates random postings for the term by sampling `docFreq` distinct document IDs and assigning each of them a random term frequency between 1 and 10. If docFreq is not provided, it generates a random document frequency between 1 and nDocs. The for-loop iterates through each term in the `terms` list and calls the create_postings function. For each term, it draws a random document frequency between `nDocs // 10` and `nDocs // 2` and passes it to create_postings.

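`is_relevant(doc: int)` returns True if the document is rated as relevant (it is defined in the small document example below, where documents with an ID below 6 are relevant).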

In [53]:
import random
import math

# reset the shared state for the random data example
DEBUG = False
nDocs = 40
index = dict()
documents = list()
vocabulary = dict()

# helper function to create random postings with given document frequency
def create_postings(term: str, docFreq: int = None):
    """
        Create a random posting list for `term` and register the term.

        Writes to the globals `index` (list of (doc, tf) tuples sorted by
        document id), `vocabulary` (df and a placeholder idf of 0) and
        `documents` (term frequency of the term in each sampled document).

        term:    the term to create postings for
        docFreq: number of documents containing the term; if omitted, a
                 random value between 1 and nDocs is used
    """
    if docFreq is None:
        # no document frequency requested: pick a random one
        docFreq = random.randint(1, nDocs)
    index[term] = []
    vocabulary[term] = {'df': docFreq, 'idf': 0}
    # sample distinct document ids (1..nDocs); sorting keeps the posting list ordered
    for doc in sorted(random.sample(range(1, nDocs + 1), docFreq)):
        # select a random term frequency for the term
        tf = random.randint(1, 10)
        index[term].append((doc, tf))
        documents[doc]['terms'][term] = tf

# start with an empty feature vector for every document; each entry maps
# term -> term frequency. Index 0 is an unused placeholder so that document ids start at 1
for doc in range(nDocs + 1):
    documents.append({'terms': {}, 'len': 0, 'norm': 0})

# we use some animal terms to create random documents
terms = ['dog', 'cat', 'horse', 'rabit', 'ostrich', 'bear', 'tiger', 'lion', 'bird']

# call create_postings for each term with a random document frequency
# between nDocs // 10 and nDocs // 2 to create the inverted index
for term in terms:
    create_postings(term, random.randint(nDocs // 10, nDocs // 2))

# now calculate the idf for each term and the norm for each document
for item in vocabulary.values():
    item['idf'] = idf(item['df'], nDocs)
for doc in documents:
    # len: sum of all term frequencies of the document
    doc['len'] = sum([tf for _, tf in doc['terms'].items()])
    # norm: Euclidean norm of the tf * idf weighted term vector
    doc['norm'] = sum([(tf * vocabulary[term]['idf']) ** 2 for term, tf in doc['terms'].items()]) ** 0.5


Let's have a look at the postings for each term:

In [54]:
# print one line per term: term, document frequency (df), idf (2 decimals)
# and the posting list as (document id, term frequency) tuples
for term, item in vocabulary.items():
    print("{term:10} {df:<4d} {idf:<7.2f} {postings}".format(term=term.ljust(10), df=item['df'], idf=item['idf'], postings=index[term]))

dog        5    1.92    [(2, 4), (11, 6), (17, 7), (30, 2), (32, 9)]
cat        20   0.67    [(4, 7), (8, 6), (10, 8), (11, 1), (12, 9), (13, 9), (14, 4), (15, 4), (18, 2), (21, 1), (26, 4), (28, 9), (30, 5), (32, 6), (33, 6), (36, 10), (37, 4), (38, 10), (39, 8), (40, 5)]
horse      13   1.07    [(7, 4), (9, 6), (10, 8), (11, 5), (13, 8), (15, 8), (19, 2), (23, 2), (27, 3), (34, 6), (38, 8), (39, 2), (40, 1)]
rabit      17   0.82    [(1, 6), (4, 8), (8, 8), (9, 7), (15, 6), (16, 10), (17, 9), (19, 5), (23, 7), (25, 6), (27, 3), (28, 7), (29, 10), (31, 9), (32, 6), (33, 3), (34, 7)]
ostrich    17   0.82    [(1, 2), (5, 4), (7, 3), (9, 7), (10, 10), (13, 8), (14, 3), (15, 9), (17, 8), (19, 3), (22, 8), (26, 10), (29, 4), (30, 5), (31, 9), (37, 4), (40, 6)]
bear       17   0.82    [(2, 6), (5, 1), (7, 6), (9, 3), (10, 9), (11, 10), (12, 9), (13, 8), (14, 8), (18, 7), (27, 10), (31, 9), (32, 7), (35, 4), (38, 9), (39, 9), (40, 8)]
tiger      20   0.67    [(3, 9), (5, 8), (6, 3), (7, 2), (

In [59]:
# print the first 20 documents (ids 1..20; index 0 is a placeholder):
# id | length, norm and the term -> term frequency dictionary
for doc in range(20):
    print("{id:>4} | {len:<5d} {norm:<7.2f} {terms}".format(id=doc + 1, len=documents[doc + 1]['len'], norm=documents[doc + 1]['norm'], terms=str(documents[doc + 1]['terms'])))


   1 | 8     5.21    {'rabit': 6, 'ostrich': 2}
   2 | 14    9.98    {'dog': 4, 'bear': 6, 'bird': 4}
   3 | 11    6.35    {'tiger': 9, 'bird': 2}
   4 | 19    9.03    {'cat': 7, 'rabit': 8, 'bird': 4}
   5 | 13    6.34    {'ostrich': 4, 'bear': 1, 'tiger': 8}
   6 | 9     7.64    {'tiger': 3, 'lion': 6}
   7 | 25    12.32   {'horse': 4, 'ostrich': 3, 'bear': 6, 'tiger': 2, 'bird': 10}
   8 | 23    11.89   {'cat': 6, 'rabit': 8, 'bird': 9}
   9 | 28    12.32   {'horse': 6, 'rabit': 7, 'ostrich': 7, 'bear': 3, 'lion': 5}
  10 | 44    18.64   {'cat': 8, 'horse': 8, 'ostrich': 10, 'bear': 9, 'lion': 9}
  11 | 29    16.34   {'dog': 6, 'cat': 1, 'horse': 5, 'bear': 10, 'tiger': 1, 'bird': 6}
  12 | 18    9.55    {'cat': 9, 'bear': 9}
  13 | 33    14.03   {'cat': 9, 'horse': 8, 'ostrich': 8, 'bear': 8}
  14 | 27    10.46   {'cat': 4, 'ostrich': 3, 'bear': 8, 'tiger': 8, 'lion': 4}
  15 | 37    16.17   {'cat': 4, 'horse': 8, 'rabit': 6, 'ostrich': 9, 'bird': 10}
  16 | 10    8.23    {'rabit':

### Evaluation - Initial step without feedback

In [42]:
DEBUG = False

# initial step for the query "bird horse": no feedback has been collected yet
query = ['bird', 'horse']
k = 20
feedback = Feedback()
print(' '.join(query))

# set pruning behavior
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = False
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = False
BIRModel.PRUNE_NON_RELEVANT         = True

# (optional) enable a predicate for the filtering step
predicate = None
# predicate = lambda doc: doc % 2 == 0
# predicate = lambda doc: doc % 2 == 1

# run query, display result, and get feedback
# (the result is stored in the global topk, which display_and_get_feedback reads)
topk = BIRModel_DAAT.query(query, k, feedback, predicate)
display_and_get_feedback()

bird horse

  f  r  id score  document
-------------------------------------
  +  1  78 (2.36) {'horse', 'bird'}
  +  2  80 (2.36) {'horse', 'rabit', 'bird'}
  +  3  34 (1.51) {'lion', 'horse'}
  +  4  38 (1.51) {'horse'}
  +  5  44 (1.51) {'ostrich', 'horse', 'rabit', 'cat'}
  +  6  58 (1.51) {'ostrich', 'tiger', 'dog', 'horse', 'cat'}
  +  7  68 (1.51) {'ostrich', 'horse'}
  -  8  82 (1.51) {'ostrich', 'horse'}
     9  96 (1.51) {'horse', 'cat'}
    10  12 (0.85) {'dog', 'ostrich', 'bird'}

           horse 1.5070758997725306
            bird 0.8519707660865959

next feedback: +34, +38, +44, +58, +68, +78, +80, -82


### Evaluation - Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell)

In [46]:
# show the current query; print_feedback is not defined in this part of the
# notebook -- presumably it prints the feedback collected so far
print(' '.join(query))
print_feedback()

# run query, display result, and get feedback
# (re-uses query, k, feedback and predicate from the initial step)
topk = BIRModel_DAAT.query(query, k, feedback, predicate)
display_and_get_feedback()

bird horse
feedback: +12, +34, +36, +38, +44, +48, +52, +58, +68, +76, +78, +80, +98, -28, -60, -82, -96

  f  r  id score  document
-------------------------------------
  +  1  78 (0.58) {'horse', 'bird'}
  +  2  80 (0.58) {'horse', 'rabit', 'bird'}
  +  3  12 (0.44) {'dog', 'ostrich', 'bird'}
  +  4  36 (0.44) {'ostrich', 'bird', 'dog', 'lion', 'cat'}
  +  5  48 (0.44) {'dog', 'ostrich', 'bird', 'cat'}
  +  6  52 (0.44) {'bird'}
  +  7  76 (0.44) {'ostrich', 'bear', 'bird', 'lion', 'rabit'}
  +  8  98 (0.44) {'ostrich', 'bird'}
  +  9  34 (0.14) {'lion', 'horse'}
  + 10  38 (0.14) {'horse'}

            bird 0.4353180712578454
           horse 0.14310084364067324

next feedback: +12, +34, +36, +38, +44, +48, +52, +58, +68, +76, +78, +80, +98, -28, -60, -82, -96


## Small document example for DAAT and TAAT
### Create inverted index

In [21]:
# example from the exercise: start again from an empty collection
nDocs = 0
index = dict()
# position 0 is a placeholder so that document ids start at 1
documents = [set()]
vocabulary = dict()
# terms that are skipped when documents are added
stopwords = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'i', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
    'were', 'will', 'with',
}

# helper function to rate if newly encountered document is relevant
def is_relevant(doc):
    """Rate a document: ids below 6 count as relevant, all others do not."""
    first_non_relevant_id = 6
    return doc < first_non_relevant_id

def add_document(text: str):
    """
        Add a document to the collection using the set-of-words model.

        The text is lower-cased and split on whitespace; stopwords are
        skipped. The document receives the next id (the incremented global
        `nDocs`), which is appended to the posting list of each of its terms
        in `index`. `vocabulary` counts the documents per term, and the set
        of terms is appended to `documents`.
    """
    global nDocs
    nDocs += 1
    terms = set()
    # split() without a separator splits on any whitespace and never yields
    # empty strings, so repeated blanks or empty text add no bogus '' term
    for term in set(text.lower().split()):
        if term in stopwords:
            continue
        terms.add(term)
        if term not in vocabulary:
            index[term] = [nDocs]
            vocabulary[term] = 1
        else:
            index[term].append(nDocs)
            vocabulary[term] += 1
    documents.append(terms)

# documents 1-5: these ids are rated as relevant by is_relevant() (id < 6)
add_document("Human machine interface for Lab ABC computer applications")
add_document("A survey of user opinion of computer system response time")
add_document("The EPS user interface management system")
add_document("System and human system engineering testing of EPS")
add_document("Relation of user perceived response time to error measurement")

# documents 6-9: these ids are rated as not relevant by is_relevant()
add_document("The generation of random binary unordered trees")
add_document("The intersection graph of paths in trees")
add_document("Graph minors IV Widths of trees and well quasi ordering")
add_document("Graph minors a survey")

In [22]:
from itertools import islice 

# print the first 20 terms of the index with their number of postings
# and (at most the first 25 of) their document ids
for term, posting in islice(index.items(), 20):
    print(term.ljust(14), str(len(posting)).ljust(4), sorted(posting[:25]))

interface      2    [1, 3]
human          2    [1, 4]
lab            1    [1]
machine        1    [1]
computer       2    [1, 2]
applications   1    [1]
abc            1    [1]
opinion        1    [2]
user           3    [2, 3, 5]
system         3    [2, 3, 4]
time           2    [2, 5]
response       2    [2, 5]
survey         2    [2, 9]
management     1    [3]
eps            2    [3, 4]
engineering    1    [4]
testing        1    [4]
measurement    1    [5]
relation       1    [5]
error          1    [5]


In [24]:
# print all documents with their id; position 0 of documents is a
# placeholder, so ids run from 1 to len(documents) - 1
print()
for doc in range(len(documents) - 1):
    print(doc + 1, documents[doc + 1])


1 {'interface', 'human', 'machine', 'lab', 'computer', 'applications', 'abc'}
2 {'opinion', 'computer', 'user', 'system', 'time', 'response', 'survey'}
3 {'interface', 'management', 'user', 'system', 'eps'}
4 {'human', 'engineering', 'testing', 'system', 'eps'}
5 {'measurement', 'relation', 'user', 'time', 'error', 'response', 'perceived'}
6 {'trees', 'unordered', 'random', 'binary', 'generation'}
7 {'intersection', 'graph', 'paths', 'trees'}
8 {'widths', 'well', 'trees', 'graph', 'minors', 'quasi', 'iv', 'ordering'}
9 {'graph', 'minors', 'survey'}


### Initial step

In [25]:
DEBUG = False

# initial step for the query "human computer interaction": no feedback yet
# (terms that are not in the vocabulary are dropped by BIRModel.filter_terms)
query = ['human', 'computer', 'interaction']
k = 9
feedback = Feedback()
print(' '.join(query))

# set behavior
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = True
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = 5
BIRModel.PRUNE_NON_RELEVANT         = False

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback(k, all_docs = True)

human computer interaction
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   1 (2.45) {'interface', 'human', 'machine', 'lab', 'computer', 'applications', 'abc'}
  +  2   2 (1.22) {'opinion', 'computer', 'user', 'system', 'time', 'response', 'survey'}
  +  3   4 (1.22) {'human', 'engineering', 'testing', 'system', 'eps'}

        computer 1.2237754316221157
           human 1.2237754316221157

next feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9


### Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell) + query expansion with terms from relevant documents

In [4]:
from functools import reduce
# show the current query; print_feedback is not defined in this part of the
# notebook -- presumably it prints the feedback collected so far
print(' '.join(query))
print_feedback()

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback(k)

# query expansion: extend the query with all terms of the documents in
# feedback.relevant; the result is a sorted list without duplicates
query = sorted(set(query) | reduce(lambda terms, doc: terms | documents[doc], feedback.relevant, set()))

NameError: name 'query' is not defined

In [5]:
# scratch cell: draw a random integer between 10 and 40 (both inclusive)
random.randint(10, 40)

23