In [46]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter

# Number of judgment rows sampled for evaluation; also used as the denominator
# when averaging the per-query metric in the grading cells.
NUM_QUERIES = 1000

In [4]:
# Loading takes a minute or two. On the first run the index has to be built,
# which takes roughly an hour.
msmarco = indexed()

In [47]:
def or_query_search(corpus, fields, query, query_id=None, n=10):
    """Score every document against an OR query and return the best ``n`` hits.

    For each field, the field's own tokenizer splits ``query`` into tokens, the
    per-token score arrays are summed, multiplied by the field boost, and the
    result is accumulated across all fields.

    Parameters
    ----------
    corpus : DataFrame-like
        Must support ``len()``, ``corpus[['title', 'msmarco_id']]`` and
        ``corpus[field].array`` exposing ``tokenizer(query)`` and
        ``score(token)`` (one score per row).
    fields : str or list of str
        Field name(s) to search. A boost may be appended as ``"name^2.5"``;
        fields without a boost use 1.0.
    query : str
        Raw query text.
    query_id : optional
        Identifier copied onto every returned row.
    n : int
        Maximum number of hits to return.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows, best score first, with columns ``title``,
        ``msmarco_id``, ``scores``, ``query``, ``query_id``, ``took``
        (scoring time in seconds, excluding the sort) and ``rank`` (1-based).
    """
    if not isinstance(fields, list):
        fields = [fields]

    start = perf_counter()
    scored = np.zeros(len(corpus))
    for field in fields:
        boost = 1.0
        if '^' in field:
            field, boost = field.split('^')
            boost = float(boost)
        index = corpus[field].array
        tokens = index.tokenizer(query)
        scored += np.sum([index.score(token) for token in tokens], axis=0) * boost
    took = perf_counter() - start

    # Highest score first; the slice yields fewer than n entries for a small corpus.
    top_n_idx = np.argsort(scored)[::-1][:n]

    # Copy so the column assignments below never write through to (or warn
    # about) the corpus frame the slice was taken from.
    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx].copy()
    top_n['scores'] = scored[top_n_idx]
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    # Size the rank column from the rows actually returned, not from n, so a
    # corpus smaller than n does not raise a length-mismatch error.
    top_n['rank'] = np.arange(1, len(top_n) + 1)
    return top_n

# Smoke test: one ad-hoc query against the title index, using the default n=10.
or_query_search(corpus=msmarco, fields="title_idx", query="cheese inside")

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1622353,Cheese & Cheese Products: Cheese,D2146882,5.24958,cheese inside,,0.0449,1
1409917,Inside,D3479997,4.808581,cheese inside,,0.0449,2
171414,Insider,D932824,4.808581,cheese inside,,0.0449,3
3021605,inside,D1820840,4.808581,cheese inside,,0.0449,4
940283,inside,D1820838,4.808581,cheese inside,,0.0449,5
3087670,Cheeses,D1534465,4.733851,cheese inside,,0.0449,6
1741404,cheese,D1303785,4.733851,cheese inside,,0.0449,7
634958,cheese,D2695337,4.733851,cheese inside,,0.0449,8
1884790,Cheese,D187458,4.733851,cheese inside,,0.0449,9
1386241,Cheese,D859447,4.733851,cheese inside,,0.0449,10


In [48]:
from msmarco.evaluate import grade_results, judgments

# Fixed seed so the same judgments are sampled on every Restart & Run All;
# without it the evaluated query set, and therefore every reported metric,
# changes from run to run.
msmarco_judgments = judgments().sample(NUM_QUERIES, random_state=42)

## Search title

Search a Snowball-tokenized version of the title with BM25 and report the mean reciprocal rank (MRR).

In [49]:
# Run each sampled query against the title index and stack the per-query
# top hits into a single frame.
results = pd.concat([
    or_query_search(
        corpus=msmarco,
        fields=['title_idx'],
        query=judgment['query'],
        query_id=judgment['query_id'],
    )
    for _, judgment in msmarco_judgments.iterrows()
])
results

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1830509,Vitamins That Aid Digestion,D2143698,9.674032,foods that aid in digestion,188984,0.126615,1
749761,5 Simple Foods That Quickly Relieve Stomach Cr...,D39866,8.057952,foods that aid in digestion,188984,0.126615,2
1212028,10 Foods That Digest Fast,D3169726,8.017378,foods that aid in digestion,188984,0.126615,3
837206,10 Foods That Digest Quickly,D2002064,8.017378,foods that aid in digestion,188984,0.126615,4
51893,Digestive Aid Juice,D658202,7.936059,foods that aid in digestion,188984,0.126615,5
...,...,...,...,...,...,...,...
2986861,How many calories should a woman consume daily?,D1349431,9.515442,how many calories a day for a woman,275820,0.110465,6
795770,How Many Calories Should a Pregnant Woman Eat?,D2305232,9.515442,how many calories a day for a woman,275820,0.110465,7
3042068,How many calories should a woman eat for fat l...,D1312449,9.416354,how many calories a day for a woman,275820,0.110465,8
874718,How Many Calories Should a Woman Eat per Day t...,D967501,9.155397,how many calories a day for a woman,275820,0.110465,9


In [50]:
# Grade the title-only results against the sampled judgments, then reduce to
# one value per query (presumably the reciprocal rank, given the section text
# says MRR; confirm in msmarco.evaluate).
graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
# Averaged over NUM_QUERIES rather than over the number of values summed.
queries_judged.sum() / NUM_QUERIES

0.1914920634920635

## Search body

Search a Snowball-tokenized version of the body with BM25 and report the mean reciprocal rank (MRR).

In [None]:
# Run each sampled query against the body index and stack the per-query
# top hits into a single frame.
results = pd.concat([
    or_query_search(
        corpus=msmarco,
        fields=['body_idx'],
        query=judgment['query'],
        query_id=judgment['query_id'],
    )
    for _, judgment in msmarco_judgments.iterrows()
])
results

In [None]:
# Grade the body-only results against the sampled judgments, then reduce to
# one value per query (presumably the reciprocal rank, given the section text
# says MRR; confirm in msmarco.evaluate).
graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
# Averaged over NUM_QUERIES rather than over the number of values summed.
queries_judged.sum() / NUM_QUERIES

## Search body + title

In [None]:
# Run each sampled query against both the body and title indexes (scores are
# summed across fields) and stack the per-query top hits into a single frame.
results = pd.concat([
    or_query_search(
        corpus=msmarco,
        fields=['body_idx', 'title_idx'],
        query=judgment['query'],
        query_id=judgment['query_id'],
    )
    for _, judgment in msmarco_judgments.iterrows()
])
results

# Grade the combined body + title results against the sampled judgments, then
# reduce to one value per query (presumably the reciprocal rank; confirm in
# msmarco.evaluate).
graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
# Averaged over NUM_QUERIES rather than over the number of values summed.
queries_judged.sum() / NUM_QUERIES