In [38]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter
from searcharray.similarity import bm25_similarity, classic_similarity, default_bm25

msmarco_mrr10 = [0] * 8
NUM_QUERIES = 1000

In [2]:
# Will take a min or two to load, and a good hour to build an index first time
msmarco = indexed()

In [40]:
def or_query_search(corpus, fields, query, query_id=None, n=10, 
                    similarities=default_bm25):
    if not isinstance(fields, list):
        fields = [fields]
    if isinstance(similarities, list):
        assert len(similarities) == len(fields)
    if not isinstance(similarities, list):
        similarities = [similarities] * len(fields)
    start = perf_counter()
    scored = np.zeros(len(corpus))
    for field, similarity in zip(fields, similarities):
        boost = 1.0
        if '^' in field:
            field, boost = field.split('^')
            boost = float(boost)
        tokenizer = corpus[field].array.tokenizer
        tokens = tokenizer(query)
        scored += (np.sum([corpus[field].array.score(token, similarity=similarity) for token in tokens], axis=0) * boost)
    took = perf_counter() - start
    
    top_n_idx = np.argsort(scored)[::-1][:n]
    scores = scored[top_n_idx]
    top_n_idx, scores

    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx]
    top_n['scores'] = scores
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n['rank'] = np.arange(n) + 1
    return top_n

or_query_search(corpus=msmarco, fields="title_idx", query="cheese inside")

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1622353,Cheese & Cheese Products: Cheese,D2146882,5.24958,cheese inside,,0.046699,1
1409917,Inside,D3479997,4.808581,cheese inside,,0.046699,2
171414,Insider,D932824,4.808581,cheese inside,,0.046699,3
3021605,inside,D1820840,4.808581,cheese inside,,0.046699,4
940283,inside,D1820838,4.808581,cheese inside,,0.046699,5
3087670,Cheeses,D1534465,4.733851,cheese inside,,0.046699,6
1741404,cheese,D1303785,4.733851,cheese inside,,0.046699,7
634958,cheese,D2695337,4.733851,cheese inside,,0.046699,8
1884790,Cheese,D187458,4.733851,cheese inside,,0.046699,9
1386241,Cheese,D859447,4.733851,cheese inside,,0.046699,10


In [41]:
from msmarco.evaluate import grade_results, judgments

msmarco_judgments = judgments().sample(NUM_QUERIES)

In [42]:

def run_all(corpus, judgments, fields, similarities=default_bm25, n=10):
    results = []
    for idx, row in judgments.iterrows():
        top_n = or_query_search(corpus=msmarco, fields=fields, similarities=similarities, n=n,
                                query=row['query'], query_id=row['query_id'])
        results.append(top_n)
    results = pd.concat(results)
    graded = grade_results(msmarco_judgments, results)
    return graded

## Search title

Search a snowball tokenized version of title, using BM25, report MRR

In [43]:
graded = run_all(msmarco, msmarco_judgments,
                  fields=['title_idx'])

queries_judged = judge_queries(graded)
msmarco_mrr10[0] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[0]

0.178518253968254

## Search body

Search a snowball tokenized version of body, using BM25, report MRR

In [44]:
graded = run_all(msmarco, msmarco_judgments,
                  fields=['body_idx'])

queries_judged = judge_queries(graded)
msmarco_mrr10[1] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[1]

0.21866785714285714

## Search body + title

In [45]:
graded = run_all(msmarco, msmarco_judgments,
                  fields=['title_idx', 'body_idx'])

queries_judged = judge_queries(graded)
msmarco_mrr10[2] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[2]

0.2427626984126984

## Weigh body*10

In [46]:
graded = run_all(msmarco, msmarco_judgments,
                  fields=['title_idx', 'body_idx^10'])

queries_judged = judge_queries(graded)
msmarco_mrr10[3] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[3]

0.24909285714285714

## Weigh title*10

In [47]:
graded = run_all(msmarco, msmarco_judgments,
                  fields=['title_idx^10', 'body_idx'])

queries_judged = judge_queries(graded)
msmarco_mrr10[4] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[4]

0.19806666666666667

## No bias to shorter length

In [48]:
bm25_similarity_nolen = bm25_similarity(b=0.0)

graded = run_all(msmarco, msmarco_judgments,
                 similarities = bm25_similarity_nolen,
                 fields=['title_idx', 'body_idx'])

queries_judged = judge_queries(graded)
msmarco_mrr10[5] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[5]

0.13158968253968253

## Title-only ignore len

In [49]:
bm25_similarity_nolen = bm25_similarity(b=0.0)

graded = run_all(msmarco, msmarco_judgments,
                 similarities = [bm25_similarity_nolen, default_bm25],
                 fields=['title_idx', 'body_idx'])

queries_judged = judge_queries(graded)
msmarco_mrr10[6] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[6]

0.15383769841269843

## Body only ignore len

In [50]:
bm25_similarity_nolen = bm25_similarity(b=0.0)

graded = run_all(msmarco, msmarco_judgments,
                 similarities = [default_bm25, bm25_similarity_nolen],
                 fields=['title_idx', 'body_idx'])

queries_judged = judge_queries(graded)
msmarco_mrr10[7] = queries_judged.sum() / NUM_QUERIES
msmarco_mrr10[7]

0.23371706349206348