# Vector Space retrieval with inverted files

## Helper functions

### TopKList class

In [1]:
from heapq import heappop, heappush, nsmallest
from typing import Callable

class TopKList:
    """
        Maintains a list of top-k documents. Initializer accepts
        a list of tuples (term, weight) to provide information about
        weights used by retrieval model. Implements the iter() interface.
        Takes an optional predicate(doc: int) function to filter documents
        before returning them. For selective predicate, you may have to
        adjust the pruning thresholds.
    """
    # set this property to define the pruning threshold, None/False to turn pruning off
    # pruning ensures that the number of elements in the heap ranges between
    # first and second value of the tuple
    PRUNING_THRESHOLDS = (1000, 2000)

    def __init__(self, k: int, term_weights: list[tuple[str,float]] = None, predicate: Callable[[int], bool] = None):
        self.docs_heap = []
        self.k = k
        self.predicate = predicate
        if term_weights:
            self.term_weights = term_weights
            self.terms = [term for term, _ in self.term_weights]
            self.weights = dict(self.term_weights)
    
    def add(self, doc: int, score: float):
        heappush(self.docs_heap, (-score, doc, {'id': doc, 'score': score}))
        # optional pruning if heap grows too large; be careful to not trigger each time for performance
        if len(self.docs_heap) > TopKList.PRUNING_THRESHOLDS[1]:
            self.docs_heap = nsmallest(TopKList.PRUNING_THRESHOLDS[0], self.docs_heap)

    def __iter__(self):
        n = 0
        while n < self.k and len(self.docs_heap) > 0:
            doc = heappop(self.docs_heap)[2]
            if self.predicate == None or self.predicate(doc['id']):
                yield doc

### IDF implementations

In [2]:
import math
def idf(doc_freq: int, num_docs: int) -> float:
    return math.log((num_docs + 1) / (doc_freq + 1))

def idf_bm25(doc_freq: int, num_docs: int) -> float:
    return math.log((num_docs - doc_freq + 0.5) / (doc_freq + 0.5))

### Functions for pretty printing

In [34]:
# we use this global variables o drive the examples
DEBUG = False
nDocs = 100
index = {}
documents = []
vocabulary = {}

# helper function to display result and get feedback
def print_topk(topk):
    print("\n    r  id score  document\n-------------------------------------")
    for rank, doc in enumerate(topk):
        print("  {rank: >3d} {id: >3d} ({score:.2f})".format(rank = rank + 1, **doc), documents[doc['id']])
    print()
    for term in sorted(topk.weights.keys(), key = lambda term: -topk.weights[term]):
        print(term.rjust(16), topk.weights[term])

## Vector Space Model Implementation

### Scoring functions


In [120]:
class VSMeasure: pass

# implements the cosine measure
class CosineMeasure(VSMeasure):
    def __init__(self, query_vector: dict[str, int]):
        self.query_vector = {}
        self.term_weights = []
        self.query_norm = 0
        for term in query_vector.keys():
            if term in vocabulary:
                idf_2 = vocabulary[term]['idf'] ** 2
                self.query_vector[term] = query_vector[term] * idf_2
                self.term_weights.append((term, vocabulary[term]['idf']))
                self.query_norm += idf_2 * query_vector[term] ** 2
            else:
                del self.query_vector[term]
        self.query_norm = self.query_norm ** 0.5
    
    def similarity(self, doc: int):
        doc_vector = documents[doc]['vector']
        dot_product = sum([doc_vector.get(term, 0) * q for (term, q) in self.query_vector.items()])
        return dot_product / self.query_norm / documents[doc]['norm']

    def update_sum(self, term: str, doc_tf: int, doc: int):
        return doc_tf * self.query_vector.get(term, 0) / self.query_norm / documents[doc]['norm']

# implements the dot product
class DotProduct(VSMeasure):
    def __init__(self, query_vector: dict[str, int]):
        self.query_vector = {}
        self.term_weights = []
        for term in query_vector.keys():
            if term in vocabulary:
                idf_2 = vocabulary[term]['idf'] ** 2
                self.query_vector[term] = query_vector[term] * idf_2
                self.term_weights.append((term, vocabulary[term]['idf']))
            else:
                del self.query_vector[term]
    
    def similarity(self, doc: int):
        doc_vector = documents[doc]['vector']
        dot_product = sum([doc_vector.get(term, 0) * q for (term, q) in self.query_vector.items()])
        return dot_product

    def update_sum(self, term: str, doc_tf: int, doc: int):
        return doc_tf * self.query_vector.get(term, 0)

### Base class

In [36]:
import math

class VSModel:
    """
        Generic class for the evaluation of the Vector Space model, inherited by the document-at-a-time (DAAT) and 
        term-at-a-time (TAAT) implementation. This superclass defines the idf-weights including filtering the most
        important terms.
    """
    @staticmethod
    def get_similarity_measure(measure: str, query_vector: dict[str, int]) -> VSMeasure:
        return {
            'cosine': CosineMeasure(query_vector)
        }[measure]

### Document-at-a-time for Vector Space Model

In [135]:
class VSModel_DAAT(VSModel):
    """
        Implements the DAAT model for the Vector Space model using inverted index method.
    """
    @staticmethod
    def query(query_vector: dict[str, int], k: int, measure: str = 'dot', predicate: Callable[[int], bool] = None, selected_docs: set[int] = None):
        # determine simialrity measure for this query
        sim = VSModel.get_similarity_measure(measure, query_vector)
        
        # get iterators for each term and fetch first posting
        iters = [iter(index[term]) for (term, _) in sim.term_weights]
        nexts = [next(iter, None) for iter in iters]

        # keep track of all retrieved documents and their score; stored as tuples (doc_id, score)
        topk = TopKList(k, sim.term_weights, predicate)
        while not all(e is None for e in nexts):
            # get smallest value from nexts, ignoring None values
            smallest = min(nexts, key = lambda x: x[0] if x is not None else float('inf'))[0]
            # calculate score with the query terms present in the document
            score = sum([sim.update_sum(sim.term_weights[i][0], nexts[i][1], smallest) for i in range(len(nexts)) if nexts[i] and nexts[i][0] == smallest])
            # assert score == sim.similarity(smallest)
            topk.add(smallest, score)
            # for each entry in nexts, fetch next item if entry equals smallest
            for i, e in enumerate(nexts):
                if e and e[0] is smallest:
                    nexts[i] = next(iters[i], None)
        
        # finsihed, return topk for result iteration
        return topk

### Term-at-a-time for BIR Model

In [36]:
class BIRModel_TAAT(BIRModel):
    """
        Implements the TAAT model for the BIR model using inverted index method.
    """
    @staticmethod
    def query(terms: list[str], k: int, feedback: Feedback, predicate: Callable[[int], bool] = None):
        # filter terms and obtain weights for terms in order of their importance 
        term_weights = BIRModel.filter_terms(terms, feedback)
        doc_scores = {}

        # iterate over terms and fetch postings
        for (term, weight) in term_weights:
            for posting in index[term]:
                # if document is not already in doc_scores, add it
                if posting not in doc_scores:
                    doc_scores[posting] = 0
                # add weight to document score
                doc_scores[posting] += weight

        # we do not need a full sort of doc_scores, but can use the heap in TopKList
        topk = TopKList(k, term_weights, predicate)
        for doc, score in doc_scores.items():
            topk.add(doc, score)
        
        # finsihed, return topk for result iteration
        return topk

## Random data example
### Create inverted index
The next section generates random inverted index postings for a set of terms. It simulates the indexing process for Boolean retrieval by associating random document IDs with each term. The `vocabulary` dictionary defines terms and their desired document frequencies (as a %-figure). The generated postings are stored in the `index` dictionary, with each term mapped to a set of corresponding document IDs.

* `nDocs = 100`: Defines the total number of documents (document IDs) as 100.
* `index = {}`: Initializes an empty dictionary to store the postings for each term.
* `DEBUG = False`: A debug flag (we use it later to illustrate code execution).
* `vocabulary`: Defines a dictionary where each term is associated with its desired document frequency (expressed as a percentage).
* `documents`: List of all documents with each entry holding a dictionary {vector: dict, len: float, norm: float}
  - vector holds the term freqeuncies as dictionary (key=term, value=term frequency)
  - len is the number of terms in the document (its length)
  - norm is the 

`create_postings(term: str, docFreq: int = None)` takes a term (string) and an optional document frequency (docFreq, integer) as arguments. It generates random postings for the term by creating a set of document IDs. If docFreq is not provided, it generates a random document frequency between 1 and nDocs. The for-loop iterates through each term in the vocabulary dictionary and calls the create_postings function. For each term, it fetches the desired document frequency from the vocabulary (values are percentages) and passes it to create_postings.

`is_relevant(doc: int)` returns True if document is relevant ()

In [137]:
import random

DEBUG = False
nDocs = 40
index = {}
documents = []
vocabulary = {}

# helper function to create random postings with given document frequency
def create_postings(term: str, docFreq: int = None):
    # create sets with random ids
    index[term] = []
    vocabulary[term] = {'df': docFreq, 'idf': 0}
    # extend feature vectors for documents with a random term frequency
    for doc in sorted(random.sample(range(1, nDocs + 1), docFreq)):
        # select a random term frequency for the term
        tf = random.randint(1, 10)
        index[term].append((doc, tf))
        documents[doc]['vector'][term] = tf

# set all feature vectors of documents to empty. We use sets since BIR uses set-of-word model
for doc in range(nDocs + 1):
    documents.append({'vector': {}, 'len': 0, 'norm': 0})

# we use some animal terms to create random documents
terms = ['dog', 'cat', 'horse', 'rabit', 'ostrich', 'bear', 'tiger', 'lion', 'bird']

# call create_postings for each entry in vocabulary to create the inverted index
for term in terms:
    create_postings(term, random.randint(nDocs // 10, nDocs // 2))

# now calculate the idf for each term and the norm for each document
for item in vocabulary.values():
    item['idf'] = idf(item['df'], nDocs)
    item['idf_bm25'] = idf_bm25(item['df'], nDocs)
for doc in documents:
    doc['len'] = sum([tf for _, tf in doc['vector'].items()])
    doc['norm'] = sum([(tf * vocabulary[term]['idf']) ** 2 for term, tf in doc['vector'].items()]) ** 0.5


Let's have a look at the postings for each term:

In [140]:
# print vocabulary with df and idf
for term, item in vocabulary.items():
    print("{term:10} {df:<4d} {idf:<7.2f} {idf_bm25:<7.2f} {postings}".format(term=term.ljust(10), df=item['df'], idf=item['idf'], idf_bm25=item['idf_bm25'], postings=index[term]))

dog        20   0.67    0.00    [(3, 4), (4, 9), (7, 9), (11, 1), (13, 6), (14, 5), (15, 9), (18, 3), (19, 3), (20, 10), (21, 10), (25, 4), (26, 8), (27, 10), (29, 5), (30, 1), (33, 2), (34, 7), (37, 10), (38, 1)]
cat        10   1.32    1.07    [(1, 3), (5, 9), (6, 3), (7, 8), (20, 7), (21, 2), (28, 3), (30, 3), (34, 7), (37, 8)]
horse      7    1.63    1.50    [(5, 6), (10, 7), (16, 3), (22, 10), (31, 7), (32, 10), (38, 4)]
rabit      8    1.52    1.34    [(14, 1), (21, 6), (22, 10), (28, 5), (30, 6), (32, 5), (33, 4), (38, 2)]
ostrich    11   1.23    0.94    [(5, 10), (7, 4), (9, 9), (11, 7), (12, 5), (13, 6), (17, 1), (18, 5), (24, 2), (26, 5), (34, 6)]
bear       9    1.41    1.20    [(9, 6), (10, 9), (13, 1), (15, 8), (18, 4), (28, 9), (30, 3), (31, 9), (37, 9)]
tiger      14   1.01    0.60    [(5, 3), (6, 2), (7, 1), (9, 8), (12, 9), (14, 3), (21, 5), (22, 10), (23, 9), (25, 4), (26, 5), (34, 9), (37, 4), (38, 4)]
lion       19   0.72    0.10    [(2, 4), (3, 8), (7, 5), (9, 7), 

In [139]:
# print a few documents
for doc in range(20):
    print("{id:>4} | {len:<5d} {norm:<7.2f} {terms}".format(id=doc + 1, len=documents[doc + 1]['len'], norm=documents[doc + 1]['norm'], terms=str(documents[doc + 1]['vector'])))


   1 | 10    8.96    {'cat': 3, 'bird': 7}
   2 | 4     2.87    {'lion': 4}
   3 | 12    6.34    {'dog': 4, 'lion': 8}
   4 | 9     6.02    {'dog': 9}
   5 | 30    20.04   {'cat': 9, 'horse': 6, 'ostrich': 10, 'tiger': 3, 'bird': 2}
   6 | 5     4.43    {'cat': 3, 'tiger': 2}
   7 | 27    13.60   {'dog': 9, 'cat': 8, 'ostrich': 4, 'tiger': 1, 'lion': 5}
   8 | 0     0.00    {}
   9 | 30    16.85   {'ostrich': 9, 'bear': 6, 'tiger': 8, 'lion': 7}
  10 | 22    18.04   {'horse': 7, 'bear': 9, 'lion': 1, 'bird': 5}
  11 | 20    13.64   {'dog': 1, 'ostrich': 7, 'lion': 3, 'bird': 9}
  12 | 14    10.94   {'ostrich': 5, 'tiger': 9}
  13 | 13    8.51    {'dog': 6, 'ostrich': 6, 'bear': 1}
  14 | 16    8.40    {'dog': 5, 'rabit': 1, 'tiger': 3, 'lion': 1, 'bird': 6}
  15 | 17    12.79   {'dog': 9, 'bear': 8}
  16 | 3     4.90    {'horse': 3}
  17 | 18    11.89   {'ostrich': 1, 'lion': 8, 'bird': 9}
  18 | 19    11.76   {'dog': 3, 'ostrich': 5, 'bear': 4, 'bird': 7}
  19 | 12    6.77    {'dog': 

### Evaluation

In [136]:
DEBUG = False

# initial step for "bird(2) horse"
query = {'bird': 2, 'horse':1}
k = 20
print(query)

# (optional) enable a predicate for the filtering step
predicate = None
# predicate = lambda doc: doc % 2 == 0
# predicate = lambda doc: doc % 2 == 1
selected_docs = None
# selected_docs = list(range(10))

# run query, display result, and get feedback
topk = VSModel_DAAT.query(query, k, 'cosine', predicate)
print_topk(topk)

{'bird': 2, 'horse': 1}

    r  id score  document
-------------------------------------
    1  23 (0.87) {'vector': {'dog': 1, 'cat': 7, 'lion': 1, 'bird': 10}, 'len': 19, 'norm': 14.025126073837832}
    2   4 (0.86) {'vector': {'rabit': 2, 'bird': 3}, 'len': 5, 'norm': 4.236190701513045}
    3   6 (0.77) {'vector': {'dog': 2, 'cat': 8, 'horse': 5, 'rabit': 3, 'lion': 7, 'bird': 8}, 'len': 33, 'norm': 15.239974756540674}
    4  14 (0.74) {'vector': {'dog': 5, 'horse': 6, 'rabit': 4, 'lion': 9, 'bird': 8}, 'len': 32, 'norm': 16.377357309671446}
    5  22 (0.73) {'vector': {'rabit': 6, 'bear': 2, 'lion': 7, 'bird': 9}, 'len': 24, 'norm': 15.075793772175903}
    6  36 (0.71) {'vector': {'tiger': 5, 'bird': 7}, 'len': 12, 'norm': 11.929040556051872}
    7  19 (0.55) {'vector': {'cat': 2, 'tiger': 7, 'bird': 6}, 'len': 15, 'norm': 13.295596288926044}
    8  24 (0.54) {'vector': {'dog': 1, 'rabit': 2, 'lion': 10, 'bird': 6}, 'len': 19, 'norm': 13.440831229986518}
    9   2 (0.45) {'vector':

## Small document example for DAAT and TAAT
### Create inverted index

In [21]:
# example from the exercise
nDocs = 0
index = {}
documents = [set()]
vocabulary = {}
stopwords = set([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he',
    'i', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', 'to', 'was',
    'were', 'will', 'with'
])

# helper function to rate if newly encountered document is relevant
def is_relevant(doc):
    return doc < 6

def add_document(text: str):
    global nDocs
    nDocs += 1
    terms = set()
    for term in set(text.lower().split(' ')):
        if term in stopwords:
            continue
        terms.add(term)
        if term not in vocabulary:
            index[term] = [nDocs]
            vocabulary[term] = 1
        else:
            index[term].append(nDocs)
            vocabulary[term] += 1
    documents.append(terms)

add_document("Human machine interface for Lab ABC computer applications")
add_document("A survey of user opinion of computer system response time")
add_document("The EPS user interface management system")
add_document("System and human system engineering testing of EPS")
add_document("Relation of user perceived response time to error measurement")

add_document("The generation of random binary unordered trees")
add_document("The intersection graph of paths in trees")
add_document("Graph minors IV Widths of trees and well quasi ordering")
add_document("Graph minors a survey")

In [22]:
from itertools import islice 

# print postings with term and list of documents
for term, posting in islice(index.items(), 20):
    print(term.ljust(14), str(len(posting)).ljust(4), sorted(posting[:25]))

interface      2    [1, 3]
human          2    [1, 4]
lab            1    [1]
machine        1    [1]
computer       2    [1, 2]
applications   1    [1]
abc            1    [1]
opinion        1    [2]
user           3    [2, 3, 5]
system         3    [2, 3, 4]
time           2    [2, 5]
response       2    [2, 5]
survey         2    [2, 9]
management     1    [3]
eps            2    [3, 4]
engineering    1    [4]
testing        1    [4]
measurement    1    [5]
relation       1    [5]
error          1    [5]


In [24]:
# print all documents
print()
for doc in range(len(documents) - 1):
    print(doc + 1, documents[doc + 1])


1 {'interface', 'human', 'machine', 'lab', 'computer', 'applications', 'abc'}
2 {'opinion', 'computer', 'user', 'system', 'time', 'response', 'survey'}
3 {'interface', 'management', 'user', 'system', 'eps'}
4 {'human', 'engineering', 'testing', 'system', 'eps'}
5 {'measurement', 'relation', 'user', 'time', 'error', 'response', 'perceived'}
6 {'trees', 'unordered', 'random', 'binary', 'generation'}
7 {'intersection', 'graph', 'paths', 'trees'}
8 {'widths', 'well', 'trees', 'graph', 'minors', 'quasi', 'iv', 'ordering'}
9 {'graph', 'minors', 'survey'}


### Initial step

In [25]:
DEBUG = False

# initial step for "cat dog"
query = ['human', 'computer', 'interaction']
k = 9
feedback = Feedback()
print(' '.join(query))

# set behavior
BIRModel.PRUNE_NEGATIVE_WEIGHTS     = True
BIRModel.PRUNE_WEIGHT_THRESHOLD     = False
BIRModel.PRUNE_TOPK                 = 5
BIRModel.PRUNE_NON_RELEVANT         = False

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback(k, all_docs = True)

human computer interaction
pruning negative weights
pruning top-k weights

  f  r  id score  document
-------------------------------------
  +  1   1 (2.45) {'interface', 'human', 'machine', 'lab', 'computer', 'applications', 'abc'}
  +  2   2 (1.22) {'opinion', 'computer', 'user', 'system', 'time', 'response', 'survey'}
  +  3   4 (1.22) {'human', 'engineering', 'testing', 'system', 'eps'}

        computer 1.2237754316221157
           human 1.2237754316221157

next feedback: +1, +2, +3, +4, +5, -6, -7, -8, -9


### Feedback step

Adjust weights with feedback. Repeat runs (`Ctrl+Enter` to stay on cell) + query expansion with terms from relevant documents

In [4]:
from functools import reduce
print(' '.join(query))
print_feedback()

# run query, display result, and get feedback
topk = BIRModel_DAAT.query(query, k, feedback)
display_and_get_feedback(k)

query = sorted(set(query) | reduce(lambda terms, doc: terms | documents[doc], feedback.relevant, set()))

NameError: name 'query' is not defined

In [5]:
random.randint(10, 40)

23

In [112]:
sim = CosineMeasure(query)
doc = 23

print(query)
print(documents[doc])
print(sim.query_vector)
print(sim.term_weights)
print(sim.query_norm)
print(sim.similarity(doc))
# print([sim.update_sum(term, documents[doc]['vector'].get(term, 0)) for term in query.keys()])
s = sum([sim.update_sum(term, documents[doc]['vector'].get(term, 0)) for term in query.keys()])
print(s)

{'bird': 2, 'horse': 1}
{'vector': {'dog': 1, 'cat': 7, 'lion': 1, 'bird': 10}, 'len': 19, 'norm': 14.025126073837832}
{'bird': 3.4620108520452124, 'horse': 1.1545819202215484}
[('bird', 1.7310054260226062), ('horse', 1.1545819202215484)]
2.842288448471051
0.8684673603499498
0.8684673603499498


In [54]:
vocabulary

{'dog': {'df': 19, 'idf': 0.7178397931503168},
 'cat': {'df': 13, 'idf': 1.074514737089049},
 'horse': {'df': 20, 'idf': 0.6690496289808848},
 'rabit': {'df': 10, 'idf': 1.3156767939059373},
 'ostrich': {'df': 6, 'idf': 1.7676619176489945},
 'bear': {'df': 8, 'idf': 1.5163474893680884},
 'tiger': {'df': 19, 'idf': 0.7178397931503168},
 'lion': {'df': 16, 'idf': 0.8803587226480917},
 'bird': {'df': 4, 'idf': 2.1041341542702074}}