In [144]:
%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Vector Space retrieval with inverted files

## Helper functions

### TopKList class

In [145]:
from heapq import heappop, heappush, nsmallest
from typing import Callable

class TopKList:
    """
        Maintains a list of top-k documents. Initializer accepts
        a list of tuples (term, weight) to provide information about
        weights used by retrieval model. Implements the iter() interface.
        Takes an optional predicate(doc_id: int) function to filter documents
        before returning them. 
    """
    def __init__(self, k: int, term_weights: list[tuple[str,float]] = None, predicate: Callable[[int], bool] = None):
        self.docs_heap = []
        self.k = k
        self.predicate = predicate
        if term_weights:
            self.term_weights = term_weights
            self.terms = [term for term, _ in self.term_weights]
            self.weights = dict(self.term_weights)
    
    def add(self, doc_id: int, score: float):
        heappush(self.docs_heap, (-score, doc_id, {'id': doc_id, 'score': score}))
        # optional (infrequent) pruning if heap grows too large

    def __iter__(self):
        rank = 0
        while rank < self.k and len(self.docs_heap) > 0:
            entry = heappop(self.docs_heap)[2]
            if self.predicate == None or self.predicate(entry['id']):
                rank += 1
                entry['rank'] = rank
                yield entry

### IDF implementations

In [146]:
import math
def idf(doc_freq: int, num_docs: int) -> float:
    return math.log((num_docs + 1) / (doc_freq + 1))

def idf_bm25(doc_freq: int, num_docs: int) -> float:
    return math.log((num_docs - doc_freq + 0.5) / (doc_freq + 0.5))

### Functions for pretty printing

In [147]:
# we use this global variables o drive the examples
DEBUG = False
nDocs = 100
index = {}
documents = []
vocabulary = {}

# helper function to display result and get feedback
def print_topk(topk):
    print("\n    r  id score  document\n-------------------------------------")
    for entry in topk:
        print("  {rank: >3d} {id: >3d} ({score:.2f})".format(**entry), documents[entry['id']])
    print()
    for term in sorted(topk.weights.keys(), key = lambda term: -topk.weights[term]):
        print(term.rjust(16), topk.weights[term])

## Vector Space Model Implementation

### Scoring functions


In [148]:
class VSMeasure: pass

# implements the cosine measure
class CosineMeasure(VSMeasure):
    def __init__(self, query_vector: dict[str, int]):
        self.query_vector_normalized = {}
        self.term_weights = []
        self.query_norm = 0
        for term in query_vector.keys():
            if term in vocabulary:
                idf_2 = vocabulary[term]['idf'] ** 2
                self.query_vector_normalized[term] = query_vector[term] * idf_2
                self.term_weights.append((term, vocabulary[term]['idf']))
                self.query_norm += idf_2 * query_vector[term] ** 2
        self.query_norm = self.query_norm ** 0.5
        for term in self.query_vector_normalized.keys():
            self.query_vector_normalized[term] /= self.query_norm
    
    def similarity(self, doc_id: int, doc_vector: dict[str, int] = None):
        if not doc_vector: 
            doc_vector = documents[doc_id]['vector']
        dot_product = sum([doc_vector.get(term, 0) * q for (term, q) in self.query_vector_normalized.items()])
        return dot_product / documents[doc_id]['norm']

# implements the dot product
class DotProduct(VSMeasure):
    def __init__(self, query_vector: dict[str, int]):
        self.query_vector_idf2 = {}
        self.term_weights = []
        for term in query_vector.keys():
            if term in vocabulary:
                idf = vocabulary[term]['idf']
                self.query_vector_idf2[term] = query_vector[term] * idf ** 2
                self.term_weights.append((term, idf))
    
    def similarity(self, doc_id: int, doc_vector: dict[str, int] = None):
        if not doc_vector:
            doc_vector = documents[doc_id]['vector']
        dot_product = sum([doc_vector.get(term, 0) * q for (term, q) in self.query_vector_idf2.items()])
        return dot_product

### Base class

In [149]:
import math

class VSModel:
    """
        Generic class for the evaluation of the Vector Space model, inherited by the document-at-a-time (DAAT) and 
        term-at-a-time (TAAT) implementation. This superclass defines the idf-weights including filtering the most
        important terms.
    """
    @staticmethod
    def get_similarity_measure(measure: str, query_vector: dict[str, int]) -> VSMeasure:
        return {
            'cosine': CosineMeasure(query_vector)
        }[measure]

### Document-at-a-time for Vector Space Model

In [150]:
class VSModel_DAAT(VSModel):
    """
        Implements the DAAT model for the Vector Space model using inverted index method.
    """
    @staticmethod
    def query(query_vector: dict[str, int], k: int, measure: str = 'dot', predicate: Callable[[int], bool] = None, selected_docs: set[int] = None):
        # determine simialrity measure for this query
        sim = VSModel.get_similarity_measure(measure, query_vector)
        
        # get iterators for each term and fetch first posting; postings have form (term, tf)
        iters = [iter(index[term]) for (term, _) in sim.term_weights]
        nexts = [next(iter, None) for iter in iters]

        # keep track of all retrieved documents and their score; stored as tuples (doc_id, score)
        topk = TopKList(k, sim.term_weights, predicate)
        while not all(e is None for e in nexts):
            # get smallest value from nexts, ignoring None values
            smallest = min(nexts, key = lambda x: x[0] if x is not None else float('inf'))[0]
            # create a document vector with only the query terms for the document with id = smallest
            doc_query_terms = {sim.term_weights[i][0]: nexts[i][1] for i in range(len(nexts)) if nexts[i] and nexts[i][0] == smallest}
            score = sim.similarity(smallest, doc_query_terms)
            # assert score == sim.similarity(smallest)
            topk.add(smallest, score)
            # for each entry in nexts, fetch next item if entry equals smallest
            for i, e in enumerate(nexts):
                if e and e[0] is smallest:
                    nexts[i] = next(iters[i], None)
        
        # finsihed, return topk for result iteration
        return topk

### Term-at-a-time for Vector Space Model

## Random data example
### Create inverted index
The next section generates random inverted index postings for a set of terms. It simulates the indexing process for Boolean retrieval by associating random document IDs with each term. The `vocabulary` dictionary defines terms and their desired document frequencies (as a %-figure). The generated postings are stored in the `index` dictionary, with each term mapped to a set of corresponding document IDs.

* `nDocs = 100`: Defines the total number of documents (document IDs) as 100.
* `index = {}`: Initializes an empty dictionary to store the postings for each term.
* `DEBUG = False`: A debug flag (we use it later to illustrate code execution).
* `vocabulary`: Defines a dictionary where each term is associated with its desired document frequency (expressed as a percentage).
* `documents`: List of all documents with each entry holding a dictionary {vector: dict, len: float, norm: float}
  - vector holds the term freqeuncies as dictionary (key=term, value=term frequency)
  - len is the number of terms in the document (its length)
  - norm is the 

`create_postings(term: str, docFreq: int = None)` takes a term (string) and an optional document frequency (docFreq, integer) as arguments. It generates random postings for the term by creating a set of document IDs. If docFreq is not provided, it generates a random document frequency between 1 and nDocs. The for-loop iterates through each term in the vocabulary dictionary and calls the create_postings function. For each term, it fetches the desired document frequency from the vocabulary (values are percentages) and passes it to create_postings.

`is_relevant(doc_id: int)` returns True if document is relevant ()

In [28]:
import random

DEBUG = False
nDocs = 40
index = {}
documents = {}
vocabulary = {}

# helper function to create random postings with given document frequency
def create_postings(term: str, docFreq: int = None):
    # create sets with random ids
    index[term] = []
    vocabulary[term] = {'df': docFreq, 'idf': 0}
    # extend feature vectors for documents with a random term frequency
    for doc_id in sorted(random.sample(sorted(documents.keys()), docFreq)):
        # select a random term frequency for the term
        tf = random.randint(1, 10)
        index[term].append((doc_id, tf))
        documents[doc_id]['vector'][term] = tf

# set all feature vectors of documents to empty. We use sets since BIR uses set-of-word model
for doc_id in range(1, nDocs + 1):
    documents[doc_id] = {'id': doc_id, 'vector': {}, 'len': 0, 'norm': 0}

# we use some animal terms to create random documents
terms = ['dog', 'cat', 'horse', 'rabit', 'ostrich', 'bear', 'tiger', 'lion', 'bird']

# call create_postings for each entry in vocabulary to create the inverted index
for term in terms:
    create_postings(term, random.randint(nDocs // 10, nDocs // 2))

# now calculate the idf for each term and the norm for each document
for item in vocabulary.values():
    item['idf'] = idf(item['df'], nDocs)
    item['idf_bm25'] = idf_bm25(item['df'], nDocs)
for doc in documents.values():
    doc['len'] = sum([tf for _, tf in doc['vector'].items()])
    doc['norm'] = sum([(tf * vocabulary[term]['idf']) ** 2 for term, tf in doc['vector'].items()]) ** 0.5

Let's have a look at the postings for each term:

In [32]:
# print vocabulary with df and idf
for term, item in vocabulary.items():
    print("{term:10} {df:<4d} {idf:<7.2f} {idf_bm25:<7.2f} {postings}".format(term=term.ljust(10), df=item['df'], idf=item['idf'], idf_bm25=item['idf_bm25'], postings=index[term]))

dog        9    1.41    1.20    [(7, 8), (16, 2), (17, 7), (21, 8), (22, 2), (26, 4), (31, 9), (34, 9), (38, 9)]
cat        15   0.94    0.50    [(2, 5), (3, 7), (4, 8), (6, 2), (13, 1), (15, 9), (18, 3), (22, 3), (26, 6), (28, 10), (30, 6), (31, 10), (36, 4), (39, 8), (40, 8)]
horse      11   1.23    0.94    [(4, 8), (9, 2), (11, 8), (14, 5), (17, 9), (19, 1), (20, 10), (27, 9), (35, 10), (37, 6), (39, 10)]
rabit      5    1.92    1.86    [(4, 10), (20, 10), (28, 6), (30, 7), (31, 1)]
ostrich    14   1.01    0.60    [(2, 9), (9, 3), (10, 5), (12, 5), (15, 1), (17, 7), (20, 1), (21, 9), (23, 7), (26, 6), (27, 4), (29, 2), (33, 10), (37, 2)]
bear       15   0.94    0.50    [(2, 3), (5, 2), (7, 2), (9, 7), (14, 6), (16, 4), (19, 7), (20, 4), (23, 3), (24, 9), (25, 6), (28, 4), (31, 2), (38, 9), (39, 3)]
tiger      8    1.52    1.34    [(10, 5), (13, 6), (16, 5), (22, 3), (30, 7), (32, 6), (37, 4), (38, 9)]
lion       12   1.15    0.82    [(1, 5), (2, 6), (9, 2), (12, 5), (16, 10), (17, 6

In [33]:
# print a few documents
for doc_id in range(1, 21):
    print("{id:>4} | {len:<5d} {norm:<7.2f} {terms}".format(id=doc_id, len=documents[doc_id]['len'], norm=documents[doc_id]['norm'], terms=str(documents[doc_id]['vector'])))

   1 | 5     5.74    {'lion': 5}
   2 | 23    12.63   {'cat': 5, 'ostrich': 9, 'bear': 3, 'lion': 6}
   3 | 11    8.22    {'cat': 7, 'bird': 4}
   4 | 28    22.99   {'cat': 8, 'horse': 8, 'rabit': 10, 'bird': 2}
   5 | 2     1.88    {'bear': 2}
   6 | 2     1.88    {'cat': 2}
   7 | 10    11.44   {'dog': 8, 'bear': 2}
   8 | 0     0.00    {}
   9 | 14    7.99    {'horse': 2, 'ostrich': 3, 'bear': 7, 'lion': 2}
  10 | 19    14.32   {'ostrich': 5, 'tiger': 5, 'bird': 9}
  11 | 8     9.83    {'horse': 8}
  12 | 10    7.63    {'ostrich': 5, 'lion': 5}
  13 | 12    11.02   {'cat': 1, 'tiger': 6, 'bird': 5}
  14 | 11    8.34    {'horse': 5, 'bear': 6}
  15 | 10    8.53    {'cat': 9, 'ostrich': 1}
  16 | 21    14.54   {'dog': 2, 'bear': 4, 'tiger': 5, 'lion': 10}
  17 | 29    17.80   {'dog': 7, 'horse': 9, 'ostrich': 7, 'lion': 6}
  18 | 3     2.82    {'cat': 3}
  19 | 17    12.93   {'horse': 1, 'bear': 7, 'bird': 9}
  20 | 25    23.14   {'horse': 10, 'rabit': 10, 'ostrich': 1, 'bear': 4}


### Evaluation

In [34]:
DEBUG = False

# initial step for "bird(2) horse"
query = {'bird': 2, 'horse':1}
k = 10
print(query)

# (optional) enable a predicate for the filtering step
predicate = None
# predicate = lambda doc_id: doc_id % 2 == 0
# predicate = lambda doc_id: doc_id % 2 == 1
selected_docs = None
# selected_docs = list(range(10))

# run query, display result, and get feedback
topk = VSModel_DAAT.query(query, k, 'cosine', predicate)
print_topk(topk)

{'bird': 2, 'horse': 1}

    r  id score  document
-------------------------------------
    1  19 (0.81) {'id': 19, 'vector': {'horse': 1, 'bear': 7, 'bird': 9}, 'len': 17, 'norm': 12.929646825221669}
    2  36 (0.80) {'id': 36, 'vector': {'cat': 4, 'bird': 6}, 'len': 10, 'norm': 8.277286265141177}
    3  10 (0.69) {'id': 10, 'vector': {'ostrich': 5, 'tiger': 5, 'bird': 9}, 'len': 19, 'norm': 14.319172933962935}
    4  40 (0.67) {'id': 40, 'vector': {'cat': 8, 'bird': 7}, 'len': 15, 'norm': 11.42978978465472}
    5   3 (0.53) {'id': 3, 'vector': {'cat': 7, 'bird': 4}, 'len': 11, 'norm': 8.218329050423852}
    6  13 (0.50) {'id': 13, 'vector': {'cat': 1, 'tiger': 6, 'bird': 5}, 'len': 12, 'norm': 11.018215228846154}
    7  11 (0.45) {'id': 11, 'vector': {'horse': 8}, 'len': 8, 'norm': 9.829323335330459}
    8  35 (0.45) {'id': 35, 'vector': {'horse': 10}, 'len': 10, 'norm': 12.286654169163073}
    9  37 (0.45) {'id': 37, 'vector': {'horse': 6, 'ostrich': 2, 'tiger': 4, 'bird': 1}, 'len

## IMDB Search Example
### Load the data set

In [276]:
from utils import stopwords
import re

def build_index(data):
    global nDocs, index, documents, vocabulary
    nDocs = len(data)
    index = {}
    documents = {}
    vocabulary = {}
    all_terms = []
    for doc in data:
        documents[doc['id']] = doc
        # get terms from all string properties of doc
        text = ' '.join([value for key, value in doc.items() if type(value) == str])
        text = re.sub(r'[,\.\-\?!\(\)\s:;_\'"]', ' ', text.lower())
        terms = filter(lambda t: t not in stopwords.english, text.split(' '))
        # create bag-of-words vector (dict)
        doc['vector'] = dict([(term, len(list(group))) for term, group in groupby(sorted(terms))])
        doc['len'] = sum([tf for _, tf in doc['vector'].items()])
        # update vocabulary (holding df values) and index (term -> postings)
        for term, tf in doc['vector'].items():
            vocabulary[term] = vocabulary.get(term, 0) + 1
            index[term] = index.get(term, []) + [(doc['id'], tf)]
    # update vocabulary -> create dict with df, idf, and idf_bm25 values
    vocabulary = dict([(term, {'df': df, 'idf': idf(df, nDocs), 'idf_bm25': idf_bm25(df, nDocs)}) for term, df in vocabulary.items()])
    # calculate norm of vectoir for cosine measure
    for doc in documents.values():
        doc['norm'] = sum([(tf * vocabulary[term]['idf']) ** 2 for term, tf in doc['vector'].items()]) ** 0.5        

In [277]:
# loading the imdb data set (1000 movies)
from datasets.docs import imdb
from utils import stopwords

data = imdb.load()
data[:1]

[{'id': 1,
  'title': 'The Shawshank Redemption',
  'year': 1994,
  'runtime': 142,
  'rating': 9.3,
  'genre': 'Drama',
  'actors': 'Tim Robbins Morgan Freeman Bob Gunton William Sadler',
  'summary': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'}]

In [278]:
build_index(data)
print('nDocs = {nDocs}, nTerms = {nTerms}, nPostings'.format(nDocs=nDocs, nTerms=len(vocabulary), nPostings=sum([len(postings) for postings in index.values()])))

nDocs = 1000, nTerms = 9906, nPostings


In [279]:
# print vocabulary with df and idf
print('term               df     idf idf_bm25    postings')
print('-' * 100)
for term, item in random.sample(list(vocabulary.items()), 25):
    print("{term:16} {df:>4d} {idf:>7.2f} {idf_bm25:>8.2f}    {postings}".format(term=term.ljust(10), df=item['df'], idf=item['idf'], idf_bm25=item['idf_bm25'], postings=index[term]))

term               df     idf idf_bm25    postings
----------------------------------------------------------------------------------------------------
wymark              1    6.22     6.50    [(855, 1)]
ostracized          1    6.22     6.50    [(198, 1)]
cents               1    6.22     6.50    [(300, 1)]
reminisce           1    6.22     6.50    [(129, 1)]
bueller             1    6.22     6.50    [(680, 1)]
club                4    5.30     5.40    [(10, 2), (360, 1), (684, 1), (698, 1)]
süleyman            1    6.22     6.50    [(55, 1)]
enemies             4    5.30     5.40    [(21, 1), (407, 1), (566, 1), (869, 1)]
assassinating       1    6.22     6.50    [(75, 1)]
neither             1    6.22     6.50    [(215, 1)]
paying              1    6.22     6.50    [(728, 1)]
forte               1    6.22     6.50    [(740, 1)]
grant              11    4.42     4.45    [(119, 1), (403, 1), (456, 1), (547, 1), (548, 1), (560, 1), (563, 1), (564, 1), (577, 1), (716, 1), (831, 1)]
mag

In [280]:
# print a few documents
for doc_id in range(1, 21):
    print("{id:>4} | {len:<5d} {norm:<7.2f} {terms}".format(id=doc_id, len=documents[doc_id]['len'], norm=documents[doc_id]['norm'], terms=str(documents[doc_id]['vector'])))

KeyError: 'norm'