# Binary Independence Retrieval (BIR) Model

In [12]:
%pip install -r requirements.txt
%load_ext autoreload
%autoreload 2

Note: you may need to restart the kernel to use updated packages.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from helpers import  print_table
from probabilistic import BIRRetriever, Feedback, TopKList
from datasets import imdb as collection
import ipywidgets as widgets
import random
import math
from itertools import islice
from typing import Callable
from helpers import set_of_words
from collections import defaultdict
from functools import reduce
from IPython.display import clear_output

## IMDB data set

In [14]:
# load data set and index the collection for BIR retrieval
retriever = BIRRetriever(collection.load())

# number of documents / terms to preview
n, m = min(10, retriever.n_docs), min(20, retriever.n_terms)

# show the first documents of the collection
print_table([collection.format(doc) for doc in retriever.documents.values()], collection.headers(), max_rows=n)

# show a random sample of the more frequent terms (df > 5) with their postings;
# fewer than m terms may pass the filter, and random.sample raises ValueError
# when asked for more items than the population holds, so clamp the sample size
frequent_terms = [[term, df, retriever.index[term]] for term, df in retriever.vocabulary.items() if df > 5]
print_table(random.sample(frequent_terms, min(m, len(frequent_terms))), ['term', 'df', 'posting'], max_rows=20)

# summary statistics of the index
print(f'{retriever.n_docs} documents in collection')
print(f'{retriever.n_terms} distinct terms in collection')
n_postings = sum(len(postings) for postings in retriever.index.values())
print(f'{n_postings} postings')

|   id | title                           |   year |   runtime |   rating | genre                 | actors                          | summary                                                                                               |
|-----:|:--------------------------------|-------:|----------:|---------:|:----------------------|:--------------------------------|:------------------------------------------------------------------------------------------------------|
|    1 | The Shawshank Redemption        |   1994 |       142 |      9.3 | Drama                 | Tim Robbins Morgan Freeman Bob… | Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts … |
|    2 | The Godfather                   |   1972 |       175 |      9.2 | Crime Drama           | Marlon Brando Al Pacino James … | An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his relu… |
|    3 | The Dark Knight                 |   2008 |       152 |      9   | Action Crime Drama    | Christian Bale Heath Ledger Aa… | When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accep… |
|    4 | The Godfather: Part II          |   1974 |       202 |      9   | Crime Drama           | Al Pacino Robert De Niro Rober… | The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Micha… |
|    5 | 12 Angry Men                    |   1957 |        96 |      9   | Crime Drama           | Henry Fonda Lee J. Cobb Martin… | A jury holdout attempts to prevent a miscarriage of justice by forcing his colleagues to reconsider … |
|    6 | The Lord of the Rings: The Ret… |   2003 |       201 |      8.9 | Action Adventure Dra… | Elijah Wood Viggo Mortensen Ia… | Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam … |
|    7 | Pulp Fiction                    |   1994 |       154 |      8.9 | Crime Drama           | John Travolta Uma Thurman Samu… | The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwin… |
|    8 | Schindler's List                |   1993 |       195 |      8.9 | Biography Drama Hist… | Liam Neeson Ralph Fiennes Ben … | In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes conce… |
|    9 | Inception                       |   2010 |       148 |      8.8 | Action Adventure Sci… | Leonardo DiCaprio Joseph Gordo… | A thief who steals corporate secrets through the use of dream-sharing technology is given the invers… |
|   10 | Fight Club                      |   1999 |       139 |      8.8 | Drama                 | Brad Pitt Edward Norton Meat L… | An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolve… |

| term    |   df | posting                                                                                                                                              |
|:--------|-----:|:-----------------------------------------------------------------------------------------------------------------------------------------------------|
| goes    |   20 | [50, 88, 101, 104, 119, 169, 194, 314, 338, 387, 424, 425, 567, 609, 814, 867, 879, 892, 957, 961]                                                   |
| lost    |    9 | [65, 73, 329, 470, 562, 690, 781, 943, 962]                                                                                                          |
| pair    |    6 | [7, 101, 599, 853, 913, 974]                                                                                                                         |
| drug    |   10 | [109, 233, 255, 357, 578, 623, 672, 853, 942, 980]                                                                                                   |
| deadly  |    8 | [28, 85, 141, 421, 511, 556, 886, 933]                                                                                                               |
| rob     |    6 | [44, 336, 531, 792, 796, 964]                                                                                                                        |
| burt    |    7 | [183, 280, 441, 518, 633, 852, 998]                                                                                                                  |
| german  |   19 | [63, 106, 111, 126, 172, 180, 193, 297, 316, 339, 446, 462, 574, 582, 617, 706, 857, 951, 964]                                                       |
| tries   |   28 | [41, 56, 79, 84, 101, 107, 118, 122, 137, 172, 183, 188, 255, 339, 378, 494, 521, 570, 615, 658, 742, 753, 760, 775, 790, 828, 879, 1000]            |
| brad    |   15 | [10, 28, 94, 97, 216, 368, 394, 601, 629, 677, 796, 880, 911, 961, 988]                                                                              |
| jared   |    6 | [98, 331, 360, 599, 625, 932]                                                                                                                        |
| british |   12 | [80, 153, 248, 305, 343, 347, 412, 574, 693, 758, 822, 861]                                                                                          |
| kate    |    8 | [95, 330, 504, 515, 653, 813, 941, 963]                                                                                                              |
| care    |    6 | [509, 581, 659, 671, 885, 982]                                                                                                                       |
| scott   |   10 | [79, 400, 435, 532, 536, 543, 593, 668, 687, 803]                                                                                                    |
| grant   |   11 | [119, 403, 456, 547, 548, 560, 563, 564, 577, 716, 831]                                                                                              |
| general |    9 | [40, 79, 81, 302, 304, 321, 543, 855, 911]                                                                                                           |
| steve   |   11 | [180, 256, 360, 419, 601, 627, 738, 866, 898, 907, 979]                                                                                              |
| best    |   11 | [77, 99, 166, 214, 375, 455, 534, 551, 558, 658, 936]                                                                                                |
| old     |   30 | [72, 92, 106, 142, 159, 168, 189, 192, 230, 233, 296, 297, 309, 310, 333, 386, 416, 424, 456, 485, 533, 545, 559, 597, 616, 820, 826, 840, 928, 949] |

1000 documents in collection
9836 distinct terms in collection
26200 postings


## Calculations of c_j-weights

In [15]:
class BIRRetriever(BIRRetriever):
    def cj_weight(self, term: str, feedback: Feedback):
        """
            Returns the c_j weight of `term`, i.e. log( r_j/(1-r_j) * (1-n_j)/n_j ).

            In the initial step (no feedback yet) r_j is fixed to 0.5 and n_j is
            estimated from the document frequency of the term in the collection.
            Afterwards both are estimated from the assessed documents in `feedback`,
            each smoothed with +0.5 in the numerator and +1 in the denominator.
        """
        doc_freq = len(self.index[term])

        if not feedback.is_initial_step():
            # postings as a set so that intersections with the feedback sets are simple
            term_docs = set(self.index[term])

            # relevant documents: how many contain the term / how many there are
            n_relevant_with_term = len(feedback.relevant & term_docs)
            n_relevant = len(feedback.relevant)

            # assessed documents: how many contain the term / how many there are
            n_assessed_with_term = len(feedback.assessed & term_docs)
            n_assessed = len(feedback.assessed)

            # r_j from the relevant documents, n_j from the assessed non-relevant ones
            r_j = (n_relevant_with_term + 0.5) / (n_relevant + 1)
            n_j = (n_assessed_with_term - n_relevant_with_term + 0.5) / (n_assessed - n_relevant + 1)
        else:
            r_j = 0.5
            n_j = (doc_freq + 0.5) / (len(self.documents) + 1)

        relevance_odds = r_j / (1 - r_j)
        return math.log(relevance_odds * (1 - n_j) / n_j)

### Term & document filtering with options
Pruning of terms and documents based on the following settings:
- `PRUNE_NEGATIVE_WEIGHTS: bool = False`, set this property to True to remove terms with negative weights
- `PRUNE_WEIGHT_THRESHOLD: bool | float = False`, set this property to a number to remove terms whose absolute weight does not exceed this value
- `PRUNE_TOPK: bool | int = False`, set this property to select top-k weights based on absolute values
- `PRUNE_NON_RELEVANT: bool = False`, set this property to true to prune non-relevant documents from result list

In [16]:
class BIRRetriever(BIRRetriever):
    # True: drop terms whose weight is negative
    PRUNE_NEGATIVE_WEIGHTS = False

    # number: drop terms whose absolute weight is not larger than this value
    PRUNE_WEIGHT_THRESHOLD  = False

    # integer k: keep only the k terms with the largest absolute weights
    PRUNE_TOPK = False

    # True: keep documents assessed as non-relevant out of the result list
    PRUNE_NON_RELEVANT = False

    def query_weights(self, terms: set[str], feedback: Feedback) -> list[tuple[str,float]]:
        """
            Returns (term, c_j weight) tuples for all query terms known to the
            vocabulary, reduced according to the PRUNE_* options of this retriever.
        """
        # unknown terms cannot contribute to a score
        known_terms = [term for term in terms if term in self.vocabulary]

        # pair every remaining term with its c_j weight
        weighted = [(term, self.cj_weight(term, feedback)) for term in known_terms]

        # optionally drop negative weights
        if self.PRUNE_NEGATIVE_WEIGHTS:
            weighted = [pair for pair in weighted if pair[1] >= 0]

        # optionally drop weights with a small absolute value
        if self.PRUNE_WEIGHT_THRESHOLD:
            weighted = [pair for pair in weighted if abs(pair[1]) > self.PRUNE_WEIGHT_THRESHOLD]

        # optionally keep only the top-k terms; ties on the absolute weight are
        # broken by shorter posting list first, then by the term itself
        if self.PRUNE_TOPK:
            def importance(pair):
                term, weight = pair
                return (-abs(weight), len(self.index[term]), term)

            weighted = sorted(weighted, key=importance)[:self.PRUNE_TOPK]

        return weighted

### Document-at-a-time (DAAT) for BIR Model
The DAAT implementation for the BIR model relies on sorted posting lists and processes the postings in ascending order of document ID (see the OR implementation of the Boolean model).

In [17]:
class BIRRetriever_DAAT(BIRRetriever):
    """
        Implements the DAAT model for the BIR model using inverted index method.
    """
    def search(self, query: str, k: int, feedback: Feedback, predicate: Callable[[int], bool] = None, selected_docs: set[int] = None) -> TopKList:
        """
            Evaluates `query` document-at-a-time and returns the top-k documents.

            All posting lists of the (pruned) query terms are traversed in parallel.
            In every round the smallest document id at the head of any list is scored
            with the sum of the c_j weights of all terms whose list currently points
            to that document. This assumes the posting lists are sorted in ascending
            order of document id.

            query:         query text, turned into terms by self._get_vector
            k:             number of results, passed on to TopKList
            feedback:      relevance feedback used for the c_j weights and, if
                           PRUNE_NON_RELEVANT is set, to skip non-relevant documents
            predicate:     optional document filter, passed on to TopKList
            selected_docs: optional set of document ids the search is restricted to
        """
        query_vector = self._get_vector(query)

        # filter terms and obtain c_j-weights as (term, weight) tuples
        term_weights = self.query_weights(query_vector, feedback)

        # one iterator per term, plus the current head (next unprocessed doc id) of each;
        # None marks an exhausted posting list
        cursors = [iter(self.index[term]) for (term, _) in term_weights]
        heads = [next(cursor, None) for cursor in cursors]

        # collects the scored documents and keeps the k best
        topk = TopKList(k, term_weights, predicate)

        # continue as long as at least one posting list is not exhausted; test against
        # None explicitly so that a document id of 0 is not mistaken for "exhausted"
        while any(head is not None for head in heads):
            # smallest document id among the non-exhausted lists
            smallest = min(head for head in heads if head is not None)

            # if we have feedback, make sure document is either relevant or not assessed so far; if we have selected_docs, make sure document is in it
            if not (self.PRUNE_NON_RELEVANT and feedback.is_not_relevant(smallest)) and (selected_docs is None or smallest in selected_docs):
                # score is the sum of the weights of all terms that contain the document
                score = sum(weight for (_, weight), head in zip(term_weights, heads) if head == smallest)
                topk.add(smallest, score)

            # advance every list that points to the processed document; compare by value
            # (==), not identity: equal ints from different lists need not be the same
            # object, and a list left behind would make the document be scored again
            for i, head in enumerate(heads):
                if head == smallest:
                    heads[i] = next(cursors[i], None)

        # finished, return topk for result iteration
        return topk

### Term-at-a-time for BIR 
This code block defines a class called `BIRRetriever_TAAT`, which implements a term-at-a-time (TAAT) approach for the Binary Independence Retrieval (BIR) model.

In [18]:
class BIRRetriever_TAAT(BIRRetriever):
    """
        Implements the TAAT model for the BIR model using inverted index method.
    """
    def search(self, query: str, k: int, feedback: Feedback, predicate: Callable[[int], bool] = None, selected_docs: set[int] = None) -> TopKList:
        """
            Evaluates `query` term-at-a-time and returns the top-k documents.

            The posting lists of the (pruned) query terms are processed one after
            the other; the c_j weight of a term is added to an accumulator for each
            document in its posting list. The accumulated scores are then handed
            to a TopKList.
        """
        # (term, c_j weight) tuples for the query terms that survive pruning
        term_weights = self.query_weights(self._get_vector(query), feedback)

        # accumulators: doc id -> sum of the weights of the matching terms
        accumulators = defaultdict(float)
        for term, weight in term_weights:
            for doc_id in self.index[term]:
                # skip documents assessed as non-relevant if pruning is enabled
                if self.PRUNE_NON_RELEVANT and feedback.is_not_relevant(doc_id):
                    continue
                # skip documents outside of the selection (if one is given)
                if selected_docs is not None and doc_id not in selected_docs:
                    continue
                accumulators[doc_id] += weight

        # no full sort of the accumulators required, the heap in TopKList keeps the best k
        topk = TopKList(k, term_weights, predicate)
        for doc_id in accumulators:
            topk.add(doc_id, accumulators[doc_id])

        # finished, return topk for result iteration
        return topk

## Putting all together for the movies collection

In [19]:
# dropdown to choose how queries are evaluated; its value is read
# in the next cell to pick the retriever class
opt_strategy = widgets.Dropdown(
    options=['document-at-a-time', 'term-at-a-time'],
)
display(opt_strategy)

Dropdown(options=('document-at-a-time', 'term-at-a-time'), value='document-at-a-time')

### Build the index

In [20]:
# select the evaluation strategy
if opt_strategy.value == 'document-at-a-time':
    retriever = BIRRetriever_DAAT()
else:
    retriever = BIRRetriever_TAAT()

# queries offered in the demo
queries = [
    'star wars',
    'drama morgan freeman',
    'comedy'
]

# assessment functions: decide for a document id whether the document counts as relevant.
# Document ids in this collection start at 1 (see the collection table above), hence
# "top-100" is ids 1..100.
assessments = {
    'top-100': lambda doc_id: doc_id <= 100,
    'star in title': lambda doc_id: 'star' in retriever.documents[doc_id]['title'].lower(),
    'morgan in actor': lambda doc_id: 'morgan' in retriever.documents[doc_id]['actors'].lower(),
    'comedy in genre': lambda doc_id: 'comedy' in retriever.documents[doc_id]['genre'].lower(),
}

# predicates: additional filter on documents, passed to the search as `predicate`
predicates = {
    'year < 1990': lambda doc_id: retriever.documents[doc_id]['year'] < 1990,
    'year >= 1990': lambda doc_id: retriever.documents[doc_id]['year'] >= 1990,
}

# selections: restrict the search to these document ids (passed as `selected_docs`);
# sets, because the search only performs membership tests on them
selections = {
    'top-100': set(range(1, 101)),
    'top-250': set(range(1, 251)),
}

# build index
retriever.build_index(collection.load())

# helper functions
def print_feedback(feedback: Feedback, func: str, text: str = 'feedback'):
    """
        Prints all assessed document ids on one line, relevant ones first (prefixed
        with '+'), then the others (prefixed with '-'), each group in ascending order.

        func: name of the assessment function, shown in parentheses
        text: label printed in front of the list
    """
    def relevant_first(doc_id):
        # False sorts before True, so relevant documents come first
        return (not feedback.is_relevant(doc_id), doc_id)

    labels = []
    for doc_id in sorted(feedback.assessed, key=relevant_first):
        sign = '+' if feedback.is_relevant(doc_id) else '-'
        labels.append(sign + str(doc_id))

    info = ", ".join(labels)
    print(f'{text} ({func}): {info}')

def print_topk(topk: TopKList, feedback: Feedback):
    """
        Prints the result list as a table. Each row is prefixed with a relevance
        marker ('+' relevant, '-' assessed but not relevant, ' ' not assessed),
        the rank and the score rounded to two digits.
    """
    def marker(doc_id):
        if feedback.is_relevant(doc_id):
            return '+'
        return '-' if feedback.is_assessed(doc_id) else ' '

    rows = [
        collection.format(
            retriever.documents[entry['id']],
            [marker(entry['id']), entry['rank'], round(entry['score'], 2)],
        )
        for entry in topk
    ]
    print_table(rows, collection.headers('rel', 'rank', 'score'), max_rows=len(rows))

def add_feedback(feedback, topk, n_feedback):
    """
        Assesses up to `n_feedback` documents that have not been assessed so far.

        Documents are taken from the result list `topk` first, in result order.
        If that does not yield enough documents, the remaining ones are taken
        from the collection in the iteration order of retriever.documents.
    """
    def candidates():
        # result list first, then the whole collection as fallback
        for entry in topk:
            yield entry['id']
        yield from retriever.documents.keys()

    remaining = n_feedback
    for doc_id in candidates():
        if remaining <= 0:
            return
        if not feedback.is_assessed(doc_id):
            feedback.assess(doc_id)
            remaining -= 1

### Search with feedback iterations

In [21]:
# remove the handler registered by a previous execution of this cell; this has to happen
# before on_start is redefined below, otherwise the old function could no longer be removed
try:
    f_query.unobserve(on_start, 'value')
except (NameError, KeyError, ValueError):
    # NameError: first execution of the cell, f_query / on_start do not exist yet.
    # KeyError / ValueError: handler was not registered (presumably depends on the
    # traitlets version whether this is raised at all).
    # Anything else, e.g. KeyboardInterrupt, is deliberately not swallowed.
    pass

# define globals
feedback = Feedback()

def run_query(query: str, k: int, assessment: str, predicate: str, selection: str, n_feedback: int):
    """
        Runs one retrieval step and prints its outcome.

        Prints the feedback used for this step, searches with it, assesses up to
        `n_feedback` further documents, then prints the result list, the term
        weights (largest first) and the feedback available for the next step.

        assessment, predicate, selection: keys into `assessments`, `predicates`
        and `selections`; unknown keys (e.g. '<none>') map to None.
    """
    global topk

    # feedback as used by this step
    feedback.assessment_func = assessments.get(assessment)
    print_feedback(feedback, assessment)
    print()

    # search, then extend the feedback with documents from the new result list
    topk = retriever.search(
        query,
        k,
        feedback=feedback,
        predicate=predicates.get(predicate),
        selected_docs=selections.get(selection),
    )
    add_feedback(feedback, topk, n_feedback)
    print_topk(topk, feedback)

    # term weights in descending order
    weights = topk.weights
    for term in sorted(weights.keys(), key=lambda t: -weights[t]):
        print(term.rjust(16), weights[term])

    print_feedback(feedback, assessment, "\nnext feedback")

def on_next(btn):
    """
        Handler of the 'next' button: copies the pruning options from the
        checkboxes to the retriever and runs the next retrieval step with the
        current feedback. `btn` is not used.
    """
    # transfer options from the dialog to the retriever
    retriever.PRUNE_NEGATIVE_WEIGHTS = opt_neg.value
    retriever.PRUNE_WEIGHT_THRESHOLD = 0.5 if opt_small.value else False
    retriever.PRUNE_TOPK = 10 if opt_topk.value else False
    retriever.PRUNE_NON_RELEVANT = opt_nonrel.value

    # optionally expand the query with all terms of the relevant documents
    query_text = f_query.value
    if opt_expand.value:
        expansion = set()
        for doc_id in feedback.relevant:
            expansion = expansion | retriever.documents[doc_id]['vector']
        query_text = query_text + ' ' + ' '.join(expansion)

    # replace the content of the output widget with the new results
    with out:
        clear_output()
        print(query_text)
        run_query(query_text, 20, f_assessment.value, f_predicate.value, f_selection.value, f_feedback.value)

def on_start(btn):
    """
        Discards all feedback collected so far and runs the first retrieval step.

        Registered below both as click handler of the 'start' button and as observer
        of the query dropdown; `btn` is only forwarded to on_next.
    """
    feedback.clear()
    on_next(btn)

# buttons: 'start' resets the feedback, 'next' continues with the current feedback
btn_start = widgets.Button(description=' start', icon='play')
btn_start.on_click(on_start)
btn_next = widgets.Button(description=' next', icon='step-forward')
btn_next.on_click(on_next)
buttons = widgets.HBox([btn_start, btn_next])

# left column: query and feedback settings
f_query = widgets.Dropdown(description='query', options=list(queries))
f_assessment = widgets.Dropdown(
    description='assessment',
    options=['<none>', *assessments],
)
f_feedback = widgets.IntSlider(description='feedback', min=5, max=50, step=5, value=5)
f_predicate = widgets.Dropdown(
    description='predicate',
    options=['<none>', *predicates],
)
f_selection = widgets.Dropdown(
    description='selection',
    options=['<none>', *selections],
)
left = widgets.VBox([f_query, f_assessment, f_feedback, f_predicate, f_selection])

# selecting another query restarts the feedback loop
f_query.observe(on_start, 'value')

# right column: pruning and expansion options
opt_neg = widgets.Checkbox(value=False, description='prune negative weights')
opt_small = widgets.Checkbox(value=False, description='prune small weights (abs < 0.5)')
opt_topk = widgets.Checkbox(value=False, description='keep top 10 weights')
opt_nonrel = widgets.Checkbox(value=False, description='prune non relevant documents')
opt_expand = widgets.Checkbox(value=False, description='expand query with feedback')
right = widgets.VBox([opt_neg, opt_small, opt_topk, opt_nonrel, opt_expand])

# show the dialog: buttons on top, the two columns side by side below
columns = widgets.HBox([left, right], layout={'margin': '20px'})
display(widgets.VBox([buttons, columns]))

# results are printed into this output widget
out = widgets.Output(layout={
    'border': '1px solid #eeeeee',
    'height': '800px',
    'overflow': 'auto',
    'padding': '0px 0px 0px 10px',
})
display(out)

VBox(children=(HBox(children=(Button(description=' start', icon='play', style=ButtonStyle()), Button(descripti…

Output(layout=Layout(border_bottom='1px solid #eeeeee', border_left='1px solid #eeeeee', border_right='1px sol…