# Vector Space Retrieval

In [1]:
# install the packages listed in requirements.txt into the kernel's environment
%pip install -r requirements.txt
# enable autoreload so that edits to imported modules are picked up on the next cell execution
%load_ext autoreload
%autoreload 2   

Note: you may need to restart the kernel to use updated packages.


In [2]:
from helpers import print_table
from vectorspace import VSRetriever, TopKList, VSRetriever_TAAT, VSRetriever_DAAT
from datasets import imdb as collection
import ipywidgets as widgets
import random
import math
from itertools import islice
from typing import Callable
from helpers import set_of_words
from collections import defaultdict
from functools import reduce
from IPython.display import clear_output

## IMDB data set

In [3]:
# load the data set and index the collection with VSRetriever
retriever = VSRetriever(collection.load())

# preview sizes: at most 10 documents and 20 vocabulary terms
n = min(10, retriever.n_docs)
m = min(20, retriever.n_terms)

# show the first documents of the collection
doc_rows = [collection.format(doc) for doc in retriever.documents.values()]
print_table(doc_rows, collection.headers(), max_rows = n)

# show a random sample of vocabulary terms with df, idf, and their weighted postings
term_rows = []
for term, term_data in retriever.vocabulary.items():
    posting = ', '.join([f'{doc_id} ({round(w, 2)})' for doc_id, w in retriever.index[term]])
    term_rows.append([term, term_data['df'], round(term_data['idf'], 2), posting])
print_table(random.sample(term_rows, m), ['term', 'df', 'idf', 'posting'], max_rows=m)

# collection statistics
print(f'{len(retriever.documents)} documents in collection')
print(f'{len(retriever.vocabulary)} distinct terms in collection')
print('{count} postings'.format(count=sum(len(postings) for postings in retriever.index.values())))

|   id | title                           |   year |   runtime |   rating | genre                 | actors                          | summary                                                                                               |
|-----:|:--------------------------------|-------:|----------:|---------:|:----------------------|:--------------------------------|:------------------------------------------------------------------------------------------------------|
|    1 | The Shawshank Redemption        |   1994 |       142 |      9.3 | Drama                 | Tim Robbins Morgan Freeman Bob… | Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts … |
|    2 | The Godfather                   |   1972 |       175 |      9.2 | Crime Drama           | Marlon Brando Al Pacino James … | An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his relu… |
|    3 | The Dark Knight                 |   2008 |       152 |      9   | Action Crime Drama    | Christian Bale Heath Ledger Aa… | When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accep… |
|    4 | The Godfather: Part II          |   1974 |       202 |      9   | Crime Drama           | Al Pacino Robert De Niro Rober… | The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Micha… |
|    5 | 12 Angry Men                    |   1957 |        96 |      9   | Crime Drama           | Henry Fonda Lee J. Cobb Martin… | A jury holdout attempts to prevent a miscarriage of justice by forcing his colleagues to reconsider … |
|    6 | The Lord of the Rings: The Ret… |   2003 |       201 |      8.9 | Action Adventure Dra… | Elijah Wood Viggo Mortensen Ia… | Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam … |
|    7 | Pulp Fiction                    |   1994 |       154 |      8.9 | Crime Drama           | John Travolta Uma Thurman Samu… | The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwin… |
|    8 | Schindler's List                |   1993 |       195 |      8.9 | Biography Drama Hist… | Liam Neeson Ralph Fiennes Ben … | In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes conce… |
|    9 | Inception                       |   2010 |       148 |      8.8 | Action Adventure Sci… | Leonardo DiCaprio Joseph Gordo… | A thief who steals corporate secrets through the use of dream-sharing technology is given the invers… |
|   10 | Fight Club                      |   1999 |       139 |      8.8 | Drama                 | Brad Pitt Edward Norton Meat L… | An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolve… |

| term        |   df |   idf | posting                            |
|:------------|-----:|------:|:-----------------------------------|
| grondin     |    1 |  6.22 | 501 (6.22)                         |
| uri         |    1 |  6.22 | 130 (6.22)                         |
| settling    |    1 |  6.22 | 839 (6.22)                         |
| hagerty     |    1 |  6.22 | 843 (6.22)                         |
| richter     |    1 |  6.22 | 115 (6.22)                         |
| dita        |    1 |  6.22 | 316 (6.22)                         |
| milegi      |    1 |  6.22 | 222 (6.22)                         |
| hood        |    3 |  5.52 | 565 (5.52), 607 (5.52), 670 (5.52) |
| ferrara     |    1 |  6.22 | 538 (6.22)                         |
| kalki       |    1 |  6.22 | 351 (6.22)                         |
| farahani    |    1 |  6.22 | 350 (6.22)                         |
| sylwan      |    1 |  6.22 | 286 (6.22)                         |
| slights     |    1 |  6.22 | 595 (6.22)                         |
| wookiee     |    1 |  6.22 | 30 (6.22)                          |
| jean-paul   |    1 |  6.22 | 709 (6.22)                         |
| ladykillers |    1 |  6.22 | 870 (6.22)                         |
| zone        |    1 |  6.22 | 175 (6.22)                         |
| maribel     |    2 |  5.81 | 151 (5.81), 947 (5.81)             |
| dunbar      |    1 |  6.22 | 403 (6.22)                         |
| komma       |    1 |  6.22 | 488 (6.22)                         |

1000 documents in collection
9836 distinct terms in collection
26200 postings


### IDF implementations and BM25 parameters
BM25 parameters are typically `k=1.2` and `b=0.75`, while `adl` (the average document length) must be set from the collection. If we leave `adl=None`, the term normalization does not take document length into account (which is fine if the documents in the collection have about equal length).

In [4]:
# BM25 settings read by normalize_doc_vector: k and b carry the typical values;
# while adl is None the length-aware bm25 branch is skipped
BM25 = dict(k=1.2, b=0.75, adl=None)

def idf(doc_freq: int, num_docs: int) -> float:
    """Return the smoothed inverse document frequency ln((N + 1) / (df + 1)).

    doc_freq is the number of documents containing the term (df) and
    num_docs the number of documents in the collection (N).
    """
    ratio = (num_docs + 1) / (doc_freq + 1)
    return math.log(ratio)

def idf_bm25(doc_freq: int, num_docs: int) -> float:
    """Return the BM25 idf ln((N - df + 0.5) / (df + 0.5)).

    The value is negative for terms that occur in more than half of the
    documents (df > N / 2).
    """
    without_term = num_docs - doc_freq + 0.5
    with_term = doc_freq + 0.5
    return math.log(without_term / with_term)
    
def idf_bm25_pos(doc_freq: int, num_docs: int) -> float:
    """Return the BM25 idf variant ln((N + 1) / (df + 0.5)).

    The ratio exceeds 1 whenever df <= N, so the result stays positive.
    """
    ratio = (num_docs + 1) / (doc_freq + 0.5)
    return math.log(ratio)

### TF normalization functions
We apply document normalization at index-building time. We also use normalized query vectors so that similarity becomes a simple dot product between the document and query vectors. The function below performs term normalization for a document given its bag of words and a vocabulary. The vocabulary maps each term to a dictionary that holds the idf value used by the dot-product and cosine measures.

In [5]:
def normalize_doc_vector(vector: dict[str, int], vocabulary: dict[str, dict], measure: str) -> dict[str, float]:
    """Turn a document's bag of words into a vector of normalized term weights.

    Args:
        vector: maps each term of the document to its term frequency (tf).
        vocabulary: maps each term to a dictionary whose 'idf' entry is used by
            the 'dot' and 'cosine' measures.
        measure: one of 'dot', 'cosine', 'bm25', 'bm25-pos', 'bm25-nolen'.

    Returns:
        A dictionary mapping each term of `vector` to its normalized weight.

    Raises:
        ValueError: if `measure` is not one of the supported names.
    """
    # dot-product: multiply each term's tf by its idf
    if measure == 'dot':
        return {term: tf * vocabulary[term]['idf'] for term, tf in vector.items()}

    # cosine-measure: multiply each term's tf by its idf and divide by total vector length
    if measure == 'cosine':
        weights = {term: tf * vocabulary[term]['idf'] for term, tf in vector.items()}
        norm = sum([weight ** 2 for weight in weights.values()]) ** 0.5
        if norm == 0:
            # every weight is zero (e.g. all terms have idf 0); dividing would raise
            # ZeroDivisionError, so return the all-zero vector instead
            return {term: 0.0 for term in weights}
        return {term: weight / norm for term, weight in weights.items()}

    # bm25: normalize with bm25 formula with document length (only if adl is configured)
    if measure in ('bm25', 'bm25-pos') and BM25['adl']:
        k, b, adl = BM25['k'], BM25['b'], BM25['adl']
        doc_len = sum(vector.values())
        return {term: tf * (k + 1) / (tf + k * (1 - b + b * doc_len / adl)) for term, tf in vector.items()}

    # bm25: normalize with bm25 formula without document length
    if measure in ('bm25', 'bm25-nolen', 'bm25-pos'):
        k = BM25['k']
        return {term: tf * (k + 1) / (tf + k) for term, tf in vector.items()}

    raise ValueError('Unknown normalization measure')

### Build the index

In [6]:
# demo configuration: queries, relevance assessments, predicates, and document selections
queries = ['star wars', 'drama morgan freeman', 'comedy']

def _field_contains(field: str, needle: str) -> Callable[[int], bool]:
    """Build a check that `needle` occurs in the lower-cased `field` of a document."""
    # `retriever` is resolved when the check is called, not when it is created
    return lambda id: needle in retriever.documents[id][field].lower()

# NOTE(review): the collection preview lists ids starting at 1, so `id < 100` and
# range(100) may cover only 99 existing documents — confirm against the retriever's ids
assessments = {
    'top-100': lambda id: id < 100,
    'star in title': _field_contains('title', 'star'),
    'morgan in actor': _field_contains('actors', 'morgan'),
    'comedy in genre': _field_contains('genre', 'comedy'),
}

# filters on document metadata, keyed by the label shown in the predicate dropdown
predicates = {
    'year < 1990': lambda id: retriever.documents[id]['year'] < 1990,
    'year >= 1990': lambda id: retriever.documents[id]['year'] >= 1990,
}

# explicit lists of document ids, keyed by the label shown in the selection dropdown
selections = {
    'top-100': list(range(100)),
    'top-250': list(range(250)),
}

def print_topk(topk: TopKList):
    """Print the entries of a top-k result list as a table.

    Each entry's document is formatted with collection.format, which also
    receives the entry's rank and its score rounded to two decimals.
    """
    rows = [
        collection.format(retriever.documents[hit['id']], [hit['rank'], round(hit['score'], 2)])
        for hit in topk
    ]
    print_table(rows, collection.headers('rel', 'score'), max_rows=len(rows))

# rebind `retriever` to a VSRetriever_DAAT instance created without arguments;
# its index is built later when rebuild_dataset() calls retriever.build_index(...)
retriever = VSRetriever_DAAT()

### Search with feedback iterations

In [8]:
# remove handlers if we re-execute this cell, need to do this before we overwrite function
try:
    opt_measure.unobserve(rebuild_dataset, 'value')
except (NameError, ValueError):
    # NameError: first execution, opt_measure / rebuild_dataset do not exist yet
    # ValueError: the handler is not registered on the widget (nothing to remove)
    # anything else (including KeyboardInterrupt) is no longer silently swallowed
    pass

def run_query(query: str, k:int, predicate: str, selection: str):
    """Run one search with the current form values and print the results.

    `predicate` and `selection` are dropdown labels; labels without an entry in
    `predicates` / `selections` (such as '<none>') are passed on as None.
    """
    topk = retriever.search(
        query,
        k,
        measure=opt_measure.value,
        predicate=predicates.get(predicate),
        selected_docs=selections.get(selection),
    )
    print_topk(topk)

    # list the term weights of the result, largest weight first
    ranked_terms = sorted(topk.weights.keys(), key = lambda name: -topk.weights[name])
    for term in ranked_terms:
        print(term.rjust(16), topk.weights[term])

def rebuild_dataset(*args):
    """Rebuild the index for the selected measure and refresh the query form.

    Used both as a direct call and as the observer of `opt_measure`, hence the
    ignored `*args` (the observer receives a change notification).
    """
    retriever.build_index(collection.load(), opt_measure.value, remove_stopwords=True)

    # reset the dropdown options of the query form
    f_query.options = list(queries)
    f_predicate.options = ['<none>'] + list(predicates.keys())
    f_selection.options = ['<none>'] + list(selections.keys())

    # Re-run the query against the new index. Toggling f_k.value to 0 and back does
    # not work when the slider sits at its minimum: 0 is clamped to the minimum, so
    # no change event fires. It also ran the query twice in all other cases.
    form_query.update()

# dataset form: changing the measure triggers rebuild_dataset
opt_measure = widgets.Dropdown(options=['dot', 'cosine', 'bm25', 'bm25-pos', 'bm25-nolen'])
opt_measure.observe(rebuild_dataset, 'value')
form_data = widgets.HBox([opt_measure], layout = {'margin': '0px 0px 20px'})

# query form: one input widget per run_query parameter
f_query = widgets.Dropdown(description='query', options=list(queries))
f_k = widgets.IntSlider(min=5, max=50, step=5, value=20)
f_predicate = widgets.Dropdown(
    description='predicate',
    options=['<none>'] + list(predicates.keys()),
)
f_selection = widgets.Dropdown(
    description='selection',
    options=['<none>'] + list(selections.keys()),
)
form_query = widgets.interactive(
    run_query,
    query=f_query,
    k=f_k,
    predicate=f_predicate,
    selection=f_selection,
)

# build the index once, then show both forms stacked vertically
rebuild_dataset()
display(widgets.VBox([form_data, form_query]))

VBox(children=(HBox(children=(Dropdown(options=('dot', 'cosine', 'bm25', 'bm25-pos', 'bm25-nolen'), value='dot…