In [5]:
# Load IPython's autoreload extension and reload modules before each cell runs (mode 2).
%load_ext autoreload
%autoreload 2

import os, sys
# Put the parent directory on sys.path (only once) so that modules located there can be
# imported. NOTE(review): presumably this is where `utils` and `datasets` live - confirm.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Vector Space retrieval with inverted files


## Helper functions

### Globals
* `DEBUG = False`: A debug flag for additional output (if needed)
* `nDocs: int`: number of documents
* `index: dict[term, list[tuple[int, int]]]`: inverted index mapping terms to postings. Postings contain doc_id and term frequency sorted by doc_id
* `vocabulary: dict[term, dict{df, idf, idf_bm25}]`: vocabulary with all terms. Values contain objects with document frequency, idf, and idf for BM25
* `documents: dict[int, dict{id, vector, len, norm}]`: collection of documents as a dictionary with doc_id as key. Each entry is a dictionary with the properties of the document (as loaded) and additional properties for the retrieval:
  - `id` holds the document id as generated when loading the document; corresponds to the key in `documents`
  - `vector` holds the term frequencies as a dictionary (key=term, value=term frequency)
  - `len` is the number of terms in the document (its length)
  - `norm` is the Euclidean length of the vector of term frequencies multiplied by their idf (only needed for the cosine measure)

In [6]:
import random, math
import ipywidgets as widgets
from typing import Callable

# Names of the shared module-level state; build_index() is the function that assigns them.
# NOTE: a `global` statement at module level has no effect - it only documents the names.
global nDocs, index, vocabulary, documents
# Debug flag for additional output. NOTE(review): no cell shown here reads it.
DEBUG = False

### TopKList class

In [7]:
from heapq import heappop, heappush

class TopKList:
    """
        Maintains a list of top-k documents. Initializer accepts
        a list of tuples (term, weight) to provide information about
        weights used by retrieval model. Implements the iter() interface.
        Takes an optional predicate(doc_id: int) function to filter documents
        before returning them.

        Attributes:
            docs_heap: min-heap of (-score, doc_id, entry) tuples, so the best score pops first
            k: maximum number of entries yielded per iteration
            predicate: optional filter applied to doc ids while iterating
            term_weights / terms / weights: the (term, weight) pairs as list, the terms
                only, and a term -> weight dict; always present (empty if none were given)
    """
    def __init__(self, k: int, term_weights: list[tuple[str,float]] = None, predicate: Callable[[int], bool] = None):
        self.docs_heap = []
        self.k = k
        self.predicate = predicate
        # Always define the weight attributes, also for None or an empty list, so that
        # consumers reading `.weights` do not fail for queries without any known term.
        self.term_weights = term_weights or []
        self.terms = [term for term, _ in self.term_weights]
        self.weights = dict(self.term_weights)

    def add(self, doc_id: int, score: float):
        """Register a scored document; the score is negated to get max-first ordering."""
        heappush(self.docs_heap, (-score, doc_id, {'id': doc_id, 'score': score}))
        # optional (infrequent) pruning if heap grows too large

    def __iter__(self):
        """
            Yield up to k entries ({'id', 'score', 'rank'}) in descending score order,
            skipping documents rejected by the predicate. Iteration works on a copy of
            the heap, so the list can be iterated more than once.
        """
        # a shallow copy of a heap-ordered list is itself a valid heap
        pending = list(self.docs_heap)
        rank = 0
        while rank < self.k and pending:
            entry = heappop(pending)[2]
            if self.predicate is None or self.predicate(entry['id']):
                rank += 1
                entry['rank'] = rank
                yield entry

### IDF implementations

In [8]:
def idf(doc_freq: int, num_docs: int) -> float:
    """Inverse document frequency with add-one smoothing: ln((num_docs + 1) / (doc_freq + 1))."""
    smoothed_ratio = (num_docs + 1) / (doc_freq + 1)
    return math.log(smoothed_ratio)

def idf_bm25(doc_freq: int, num_docs: int) -> float:
    """
    BM25 flavour of the idf: ln((num_docs - doc_freq + 0.5) / (doc_freq + 0.5)).

    The value is negative for terms occurring in more than half of the documents.
    """
    docs_without_term = num_docs - doc_freq + 0.5
    docs_with_term = doc_freq + 0.5
    return math.log(docs_without_term / docs_with_term)

### Functions for pretty printing

In [55]:
# helper function to display result and get feedback
def print_topk(topk, doc_format: Callable[[dict], str] = None):
    """
    Print a ranked result list followed by the query term weights.

    Args:
        topk: iterable of result entries (dicts with 'rank', 'id' and 'score'); if it
            has a `weights` attribute (term -> weight), the terms are listed below the
            results by descending weight.
        doc_format: optional function turning a document dict into the text shown next
            to the rank, id and score; defaults to str().

    Each document is looked up by its id in the global `documents`.
    """
    doc_format = doc_format or str
    print("\n    r  id score  document\n-------------------------------------")
    for entry in topk:
        print("  {rank: >3d} {id: >3d} ({score:.2f})".format(**entry), doc_format(documents[entry['id']]))
    print()
    # a result object may come without term weights (e.g. no query term was known);
    # treat a missing or empty `weights` attribute as "nothing to list"
    weights = getattr(topk, 'weights', None) or {}
    for term in sorted(weights, key=lambda term: -weights[term]):
        print(term.rjust(16), weights[term])

## Vector Space Model Implementation


### Scoring functions


In [10]:
class VSMeasure:
    """Common base type of the similarity measures; it defines no behavior of its own."""
    
# implements the cosine measure
class CosineMeasure(VSMeasure):
    """
    Cosine similarity between a query and a document with tf-idf weights.

    Query terms that are not in the global `vocabulary` are ignored. For each known
    term the constructor stores query_tf * idf^2 divided by the query norm, so that
    similarity() only has to multiply with the document's raw term frequencies and
    divide by the document norm.

    Attributes:
        query_vector_normalized: term -> query_tf * idf^2 / query_norm
        term_weights: list of (term, idf) for the known query terms
        query_norm: Euclidean length of the query vector with components query_tf * idf
    """

    def __init__(self, query_vector: dict[str, int]):
        self.query_vector_normalized = {}
        self.term_weights = []
        self.query_norm = 0
        for term in query_vector.keys():
            if term in vocabulary:
                idf_2 = vocabulary[term]['idf'] ** 2
                self.query_vector_normalized[term] = query_vector[term] * idf_2
                self.term_weights.append((term, vocabulary[term]['idf']))
                self.query_norm += idf_2 * query_vector[term] ** 2
        self.query_norm = self.query_norm ** 0.5
        # The norm is 0 when every known query term has idf 0 (term occurs in all
        # documents). All stored weights are 0 in that case, so skip the division
        # instead of raising ZeroDivisionError.
        if self.query_norm > 0:
            for term in self.query_vector_normalized.keys():
                self.query_vector_normalized[term] /= self.query_norm

    def similarity(self, doc_id: int, doc_vector: dict[str, int] = None):
        """
        Return the cosine similarity of the query and document `doc_id`.

        Args:
            doc_id: key of the document in the global `documents`.
            doc_vector: optional term -> tf mapping to use instead of the stored
                document vector (e.g. only the query terms); the stored vector is
                used when it is missing or empty.

        Returns:
            The similarity score; 0.0 if the document norm is 0.
        """
        if not doc_vector:
            doc_vector = documents[doc_id]['vector']
        dot_product = sum([doc_vector.get(term, 0) * q for (term, q) in self.query_vector_normalized.items()])
        doc_norm = documents[doc_id]['norm']
        # a document whose terms all have idf 0 has norm 0; it cannot match anything
        if not doc_norm:
            return 0.0
        return dot_product / doc_norm

# implements the dot product
class DotProduct(VSMeasure):
    """
    Unnormalized dot product of the tf-idf weighted query and document vectors.

    Attributes:
        query_vector_idf2: term -> query_tf * idf^2 for query terms found in `vocabulary`
        term_weights: list of (term, idf) for those terms
    """
    def __init__(self, query_vector: dict[str, int]):
        self.query_vector_idf2 = {}
        self.term_weights = []
        for term, query_tf in query_vector.items():
            # terms unknown to the collection cannot contribute to any score
            if term not in vocabulary:
                continue
            term_idf = vocabulary[term]['idf']
            self.query_vector_idf2[term] = query_tf * term_idf ** 2
            self.term_weights.append((term, term_idf))

    def similarity(self, doc_id: int, doc_vector: dict[str, int] = None):
        """Score document `doc_id`; uses the stored document vector if none (or an empty one) is passed."""
        if not doc_vector:
            doc_vector = documents[doc_id]['vector']
        score = 0
        for term, weight in self.query_vector_idf2.items():
            score += doc_vector.get(term, 0) * weight
        return score

### Base class

In [11]:
class VSModel:
    """
        Generic class for the evaluation of the Vector Space model, inherited by the document-at-a-time (DAAT) and 
        term-at-a-time (TAAT) implementation. This superclass defines the idf-weights including filtering the most
        important terms.
    """
    @staticmethod
    def get_similarity_measure(measure: str, query_vector: dict[str, int]) -> VSMeasure:
        """
            Create the similarity measure registered under `measure` for the given query.

            Supported names are 'cosine' (CosineMeasure) and 'dot' (DotProduct).
            Raises KeyError with the list of supported names for anything else.
        """
        # map names to classes so that only the requested measure is instantiated
        measures = {
            'cosine': CosineMeasure,
            'dot': DotProduct,
        }
        if measure not in measures:
            raise KeyError("unknown similarity measure {!r}; supported: {}".format(measure, ', '.join(sorted(measures))))
        return measures[measure](query_vector)

### Document-at-a-time for Vector Space Model

In [116]:
class VSModel_DAAT(VSModel):
    """
        Implements the DAAT model for the Vector Space model using inverted index method.
    """
    @staticmethod
    def query(query_vector: dict[str, int], k: int, measure: str = 'dot', predicate: Callable[[int], bool] = None, selected_docs: set[int] = None):
        """
            Evaluate a query document-at-a-time over the posting lists of its terms.

            Args:
                query_vector: term -> term frequency of the query
                k: number of results the returned TopKList yields
                measure: name of the similarity measure (see VSModel.get_similarity_measure)
                predicate: optional doc_id filter, applied by the TopKList while iterating
                selected_docs: optional collection of doc ids; documents outside of it
                    are not scored at all

            Returns:
                TopKList holding every scored document.
        """
        # determine similarity measure for this query
        sim = VSModel.get_similarity_measure(measure, query_vector)

        # get iterators for each term and fetch first posting; postings have form (doc_id, tf)
        iters = [iter(index[term]) for (term, _) in sim.term_weights]
        nexts = [next(postings, None) for postings in iters]

        # keep track of all retrieved documents and their score
        topk = TopKList(k, sim.term_weights, predicate)
        while not all(e is None for e in nexts):
            # smallest doc id among the current postings, ignoring exhausted lists
            smallest = min(e[0] for e in nexts if e is not None)
            # if selected_docs is given and smallest is not in selected_docs, skip this document
            if selected_docs is None or smallest in selected_docs:
                # create a document vector with only the query terms for the document with id = smallest
                doc_query_terms = {sim.term_weights[i][0]: e[1] for i, e in enumerate(nexts) if e is not None and e[0] == smallest}
                score = sim.similarity(smallest, doc_query_terms)
                # assert score == sim.similarity(smallest)
                topk.add(smallest, score)
            # advance every posting list that currently points at the smallest doc id;
            # compare by value (==): identity (`is`) is not reliable for equal ints
            for i, e in enumerate(nexts):
                if e is not None and e[0] == smallest:
                    nexts[i] = next(iters[i], None)

        # finished, return topk for result iteration
        return topk

### Term-at-a-time for Vector Space Model

## Building the index from document collection

In [117]:
from utils import stopwords
from itertools import groupby
import re

def tokenize(text: str) -> list[str]:
    """
    Split text into lower-case tokens and drop English stop words.

    The listed punctuation characters and all whitespace act as separators. Empty or
    punctuation-only text yields an empty list (no empty-string token).
    """
    # after this substitution a plain space is the only separator left in the text
    cleaned = re.sub(r'[,\.\-\?!\(\)\s:;_\'"\+\*\&\$]', ' ', text.lower())
    # runs of separators produce empty pieces; skip them together with the stop words
    return [token for token in cleaned.split(' ') if token and token not in stopwords.english]

def bag_of_words(tokens: list[str]) -> dict[str, int]:
    """Count the occurrences of each token; the keys of the result are in sorted order."""
    counts = {}
    for token in sorted(tokens):
        counts[token] = counts.get(token, 0) + 1
    return counts

def build_index(collection: list[dict]) -> None:
    """
    Build the global retrieval structures from a collection of documents.

    Resets and fills the globals `nDocs`, `index`, `documents` and `vocabulary`.
    Each document dict of the collection is modified in place: it receives the keys
    'id' (1-based, in collection order), 'vector' (term -> tf), 'len' (number of
    tokens) and 'norm' (length of the tf * idf vector). The text of a document is
    the concatenation of all its string-valued properties.
    """
    global nDocs, index, documents, vocabulary
    nDocs = 0
    index = {}
    documents = {}
    # document frequency per term, collected while scanning the collection
    doc_freqs = {}
    for doc in collection:
        nDocs += 1
        # add document to documents dict, with id as key and all properties
        doc_id = doc['id'] = nDocs
        documents[doc_id] = doc
        # get terms from all string properties of doc
        text = ' '.join(value for value in doc.values() if isinstance(value, str))
        doc['vector'] = bag_of_words(tokenize(text))
        doc['len'] = sum(doc['vector'].values())
        # update document frequencies and index (term -> postings); doc ids only grow,
        # so appending keeps every posting list sorted by doc_id
        for term, tf in doc['vector'].items():
            doc_freqs[term] = doc_freqs.get(term, 0) + 1
            # append in place instead of copying the posting list for every new entry
            index.setdefault(term, []).append((doc_id, tf))
    # create vocabulary entries with df, idf, and idf_bm25 values
    vocabulary = {
        term: {'df': df, 'idf': idf(df, nDocs), 'idf_bm25': idf_bm25(df, nDocs)}
        for term, df in doc_freqs.items()
    }
    # calculate norm of vector for cosine measure
    for doc in documents.values():
        doc['norm'] = sum((tf * vocabulary[term]['idf']) ** 2 for term, tf in doc['vector'].items()) ** 0.5

## IMDB Search Example
### Load the data set

In [118]:
# loading the imdb data set (1000 movies)
from datasets.docs import imdb
from utils import stopwords

# NOTE(review): `data` is presumably a list of dicts, one per movie - it is sliced and
# its first record is displayed below; confirm against `datasets.docs.imdb`.
data = imdb.load()
def doc_format_imdb(doc: dict) -> str:
    """
    Render a movie record as one line: title with year, runtime and rating (padded
    to 50 characters), genre (padded to 20), summary, and the actors in brackets.
    Titles longer than 30 and genres longer than 18 characters are cut and marked
    with an ellipsis.
    """
    def shorten(text, limit):
        # append the ellipsis only when something was actually cut off
        return text[:limit] + "\u2026" if len(text) > limit else text

    headline = '{title_short} ({year}, {runtime}m, {rating})'.format(title_short=shorten(doc['title'], 30), **doc)
    return '{title_ex:<50} {genre_short:<20} {summary} [{actors}]'.format(title_ex=headline, genre_short=shorten(doc['genre'], 18), **doc)

# preview the first five movies in the one-line layout of doc_format_imdb
for item in data[:5]:
    print(doc_format_imdb(item))

# last expression of the cell: display the first raw record
data[0]

The Shawshank Redemption (1994, 142m, 9.3)         Drama                Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. [Tim Robbins Morgan Freeman Bob Gunton William Sadler]
The Godfather (1972, 175m, 9.2)                    Crime Drama          An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son. [Marlon Brando Al Pacino James Caan Diane Keaton]
The Dark Knight (2008, 152m, 9.0)                  Action Crime Drama   When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice. [Christian Bale Heath Ledger Aaron Eckhart Michael Caine]
The Godfather: Part II (1974, 202m, 9.0)           Crime Drama          The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his 

{'title': 'The Shawshank Redemption',
 'year': 1994,
 'runtime': 142,
 'rating': 9.3,
 'genre': 'Drama',
 'actors': 'Tim Robbins Morgan Freeman Bob Gunton William Sadler',
 'summary': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'}

In [119]:
# build the inverted index for the IMDB collection and report its size: number of
# documents, number of distinct terms, and total number of postings over all terms
build_index(data)
print('nDocs = {nDocs}, nTerms = {nTerms}, nPostings = {nPostings}'.format(nDocs=nDocs, nTerms=len(vocabulary), nPostings=sum([len(postings) for postings in index.values()])))

nDocs = 1000, nTerms = 9900, nPostings


### Looking at top (and some random) terms in the vocabulary including postings from the index

In [120]:
# Vocabulary overview: the ten terms with the highest document frequency, followed by
# fifteen randomly drawn terms; each row shows df, idf, BM25 idf and the postings.
row_layout = "{term:16} {df:>4d} {idf:>7.2f} {idf_bm25:>8.2f}    {postings}"
print('term               df     idf idf_bm25    postings')
print('-' * 100)
most_frequent = sorted(vocabulary.items(), key=lambda entry: entry[1]['df'], reverse=True)[:10]
for term, item in most_frequent:
    print(row_layout.format(term=term.ljust(10), df=item['df'], idf=item['idf'], idf_bm25=item['idf_bm25'], postings=index[term]))
print()
random_terms = random.sample(list(vocabulary.items()), 15)
for term, item in random_terms:
    print(row_layout.format(term=term.ljust(10), df=item['df'], idf=item['idf'], idf_bm25=item['idf_bm25'], postings=index[term]))

term               df     idf idf_bm25    postings
----------------------------------------------------------------------------------------------------
drama             724    0.32    -0.96    [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (10, 1), (11, 1), (12, 1), (14, 1), (16, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (43, 1), (44, 1), (46, 1), (47, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (60, 1), (63, 1), (65, 1), (66, 1), (68, 1), (69, 1), (72, 1), (74, 1), (75, 1), (77, 1), (78, 1), (80, 1), (81, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (98, 1), (99, 1), (100, 1), (101, 1), (103, 1), (104, 1), (105, 1), (106, 1), (108, 1), (109, 1), (111, 1), (112, 1), (113, 1), (114, 1), (117, 1), (118, 1)

### Looking at some random documents

In [121]:
# Show 25 randomly chosen documents with their length, norm and term vector.
print('  id |  len     norm   vector')
print('-' * 100)
sampled_ids = random.sample(range(1, len(documents) + 1), 25)
for doc_id in sampled_ids:
    doc = documents[doc_id]
    print("{id:>4} | {len:>4d} {norm:>8.2f}   {vector}".format(id=doc_id, len=doc['len'], norm=doc['norm'], vector=str(doc['vector'])))

  id |  len     norm   vector
----------------------------------------------------------------------------------------------------
 864 |   19    20.91   {'bergen': 1, 'cape': 1, 'drama': 1, 'family': 1, 'fear': 1, 'gregory': 1, 'helped': 1, 'jail': 1, 'lawyer': 1, 'lori': 1, 'man': 1, 'martin': 1, 'mitchum': 1, 'peck': 1, 'polly': 1, 'put': 1, 'robert': 1, 'stalked': 1, 'thriller': 1}
 740 |   29    29.40   {'addled': 1, 'adventure': 1, 'aging': 1, 'bob': 1, 'booze': 1, 'bruce': 1, 'claim': 1, 'comedy': 1, 'dern': 1, 'dollar': 1, 'drama': 1, 'estranged': 1, 'father': 1, 'forte': 1, 'june': 1, 'makes': 1, 'marketing': 1, 'mega': 1, 'million': 1, 'montana': 1, 'nebraska': 2, 'odenkirk': 1, 'order': 1, 'prize': 1, 'son': 1, 'squibb': 1, 'sweepstakes': 1, 'trip': 1}
 781 |   24    23.11   {'anna': 1, 'bill': 1, 'bond': 1, 'comedy': 1, 'crossing': 1, 'drama': 1, 'faded': 1, 'faris': 1, 'form': 1, 'giovanni': 1, 'johansson': 1, 'lost': 1, 'movie': 1, 'murray': 1, 'neglected': 1, 'paths': 1,

### Queries

In [127]:
from typing import Callable
def search(query: str, k: int, measure: str, predicate: str, algorithm: str) -> None:
    """
    Run a free-text query against the index and print the top-k result list.

    Args:
        query: query text; tokenized the same way as the documents
        k: number of results to show
        measure: name of the similarity measure, passed on to the model's query()
        predicate: label of the selected restriction; the labels defined below map
            either to a doc_id filter function or to a set of selected doc ids, any
            other label means no restriction
        algorithm: name of the evaluation algorithm; only 'ddat' is mapped, every
            other value falls back to VSModel_DAAT

    Returns:
        None - the result is printed with print_topk().
    """
    query_vector = bag_of_words(tokenize(query))
    vsmodel = {'ddat': VSModel_DAAT}.get(algorithm.lower(), VSModel_DAAT)
    # restrictions evaluated per result document while iterating the top-k list
    predicate_func = {
        'year > 1990 (predicate)': lambda doc_id: documents[doc_id]['year'] > 1990,
        'year <= 1990 (predicate)': lambda doc_id: documents[doc_id]['year'] <= 1990,
    }.get(predicate)
    # restrictions applied before scoring: only these doc ids are considered
    selected_docs = {
        'top-100 (selection)': set(range(1, 101)),
    }.get(predicate)
    topk = vsmodel.query(query_vector, k, measure, predicate=predicate_func, selected_docs=selected_docs)
    print_topk(topk, doc_format_imdb)

# options for the dialog
queries = ['star wars', 'drama morgan freeman', 'comedy']
# NOTE(review): no BM25 measure class is defined in the cells shown here - confirm that
# 'bm25' can actually be selected without an error.
measures = ['cosine', 'dot', 'bm25']
# labels must match the dictionary keys used inside search(); '<none>' matches none of them
predicates = ['<none>', 'year > 1990 (predicate)', 'year <= 1990 (predicate)', 'top-100 (selection)']
# NOTE(review): search() only maps 'ddat'; 'taat' falls back to VSModel_DAAT as well.
algorithm = ['ddat', 'taat']

# interactive selection of scenario
# (the trailing semicolon suppresses the display of the value returned by interact)
widgets.interact(search, 
    query=widgets.Dropdown(options=queries), 
    k=widgets.IntSlider(min=5, max=50, step=5, value=20),
    measure=widgets.Dropdown(options=measures),
    predicate=widgets.Dropdown(options=predicates),
    algorithm=widgets.Dropdown(options=algorithm),
);


interactive(children=(Dropdown(description='query', options=('star wars', 'drama morgan freeman', 'comedy'), v…

In [36]:
# Scratch cell: prints the queries with their positions and creates a stand-alone dropdown.
queries = ['star wars', 'drama morgan freeman', 'comedy']
print(list(enumerate(queries)))
# NOTE(review): q_dropdown is not used by any other cell shown here and is not displayed.
q_dropdown = widgets.Dropdown(
    options=[('One', 1), ('Two', 2), ('Three', 3)],
    value=2,
    description='Number:',
)

[(0, 'star wars'), (1, 'drama morgan freeman'), (2, 'comedy')]


In [34]:
# NOTE(review): the recorded output of this cell lists documents of a small animal
# collection ('bird', 'horse', ...), so it was run against an index built from a
# different collection than the IMDB data - confirm which one before re-running.
DEBUG = False

# initial step for "bird(2) horse"
query = {'bird': 2, 'horse':1}
k = 10
print(query)

# (optional) enable a predicate for the filtering step
predicate = None
# predicate = lambda doc_id: doc_id % 2 == 0
# predicate = lambda doc_id: doc_id % 2 == 1
# (optional) restrict scoring to a set of document ids
selected_docs = None
# selected_docs = set(range(10))

# run query and display result; selected_docs is passed on so that the setting above
# takes effect (None keeps all documents)
topk = VSModel_DAAT.query(query, k, 'cosine', predicate, selected_docs)
print_topk(topk)

{'bird': 2, 'horse': 1}

    r  id score  document
-------------------------------------
    1  19 (0.81) {'id': 19, 'vector': {'horse': 1, 'bear': 7, 'bird': 9}, 'len': 17, 'norm': 12.929646825221669}
    2  36 (0.80) {'id': 36, 'vector': {'cat': 4, 'bird': 6}, 'len': 10, 'norm': 8.277286265141177}
    3  10 (0.69) {'id': 10, 'vector': {'ostrich': 5, 'tiger': 5, 'bird': 9}, 'len': 19, 'norm': 14.319172933962935}
    4  40 (0.67) {'id': 40, 'vector': {'cat': 8, 'bird': 7}, 'len': 15, 'norm': 11.42978978465472}
    5   3 (0.53) {'id': 3, 'vector': {'cat': 7, 'bird': 4}, 'len': 11, 'norm': 8.218329050423852}
    6  13 (0.50) {'id': 13, 'vector': {'cat': 1, 'tiger': 6, 'bird': 5}, 'len': 12, 'norm': 11.018215228846154}
    7  11 (0.45) {'id': 11, 'vector': {'horse': 8}, 'len': 8, 'norm': 9.829323335330459}
    8  35 (0.45) {'id': 35, 'vector': {'horse': 10}, 'len': 10, 'norm': 12.286654169163073}
    9  37 (0.45) {'id': 37, 'vector': {'horse': 6, 'ostrich': 2, 'tiger': 4, 'bird': 1}, 'len