In [1]:
import numpy as np
import pandas as pd
from msmarco.index import indexed

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the indexed dataset via msmarco.index.indexed(). Loading takes a minute
# or two; the first run also has to build the index, which takes about an hour.
msmarco = indexed()

In [3]:
# Grab the tokenizer exposed by the body index's backing array; or_query_search
# below uses it to split query text into tokens.
tokenizer = msmarco['body_idx'].array.tokenizer

def or_query_search(query, query_id=None, n=100):
    """Return the top ``n`` documents for ``query``, scored as an OR of its tokens.

    Each token produced by the module-level ``tokenizer`` is scored against
    ``msmarco['body_idx']``; the per-token scores are summed element-wise and
    rows are ranked by that sum, highest first. ``score(token)`` is assumed to
    return one value per row of ``msmarco``, in row order, since the resulting
    positions are used directly with ``msmarco.iloc``.

    Parameters
    ----------
    query : str
        Free-text query; also copied into the ``query`` column of the result.
    query_id : optional
        Identifier copied into the ``query_id`` column (``None`` by default).
    n : int, default 100
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``msmarco_id``, ``scores``, ``query``, ``query_id``
        and ``rank`` (1-based, best first). Holds fewer than ``n`` rows when
        the corpus is smaller than ``n``, and zero rows when the query yields
        no tokens.
    """
    # Look the index array up once instead of once per token.
    body_index = msmarco['body_idx'].array
    token_scores = [body_index.score(token) for token in tokenizer(query)]

    if token_scores:
        scored = np.sum(token_scores, axis=0)
        top_n_idx = np.argsort(scored)[::-1][:n]
        scores = scored[top_n_idx]
    else:
        # np.sum([]) collapses to a scalar that cannot be ranked or indexed;
        # a query with no tokens simply matches nothing.
        top_n_idx = np.empty(0, dtype=np.intp)
        scores = np.empty(0, dtype=float)

    # Work on an explicit copy so the column assignments never touch msmarco.
    top_n = msmarco[['title', 'msmarco_id']].iloc[top_n_idx].copy()
    top_n['scores'] = scores
    top_n['query'] = query
    top_n['query_id'] = query_id
    # Size the rank column from the rows actually returned: np.arange(n) raised
    # a length mismatch whenever fewer than n rows were available.
    top_n['rank'] = np.arange(len(top_n)) + 1
    return top_n

# Example query; as the cell's last expression, the returned frame is displayed.
or_query_search("cheese inside")

Unnamed: 0,title,msmarco_id,scores,query,query_id,rank
2301803,50 Panini,D491358,5.803620,cheese inside,,1
2044174,How To Make Twice-Baked Potatoes,D2419893,5.689438,cheese inside,,2
238792,.,D2613852,5.653761,cheese inside,,3
1365175,How many white foods and drinks can you name?,D3539166,5.648460,cheese inside,,4
2833655,Comfort food: Fried cheese recipes,D2396182,5.642127,cheese inside,,5
...,...,...,...,...,...,...
2330536,Can You Freeze Cream Cheese Frosting?,D3064354,5.257333,cheese inside,,96
1402971,Food Storage - How long can you keep...,D2766710,5.256273,cheese inside,,97
1579828,.,D2039867,5.256127,cheese inside,,98
1954062,How long does mozzarella cheese stay good once...,D2293852,5.255138,cheese inside,,99


In [4]:
from msmarco.evaluate import grade_results, mrr, judgments

# Evaluate on a random subset of 50 judgments. random_state is pinned so the
# same queries (and therefore the same MRR) come back after a kernel restart.
# Assumes judgments() returns a pandas DataFrame -- TODO confirm.
msmarco_judgments = judgments().sample(50, random_state=42)

In [5]:
# Run the OR search once per sampled judgment, tagging each result frame with
# the query text and query id it was produced for.
results = [
    or_query_search(query=judgment['query'], query_id=judgment['query_id'])
    for _, judgment in msmarco_judgments.iterrows()
]

In [6]:
# Stack the per-query result frames into one long frame and display it.
# Each frame's original row labels are kept, so the index is not unique.
results_df = pd.concat(results)
results_df

Unnamed: 0,title,msmarco_id,scores,query,query_id,rank
383609,"How Much Weight Can You Lose a Week on a 1,200...",D804509,13.864087,1200 calorie diet how much protein,632,1
1089296,"Ideal protein diet - safety, efficacy?",D1883982,13.509857,1200 calorie diet how much protein,632,2
1569806,New Here. What does 1200 calories look like?,D1218847,13.393041,1200 calorie diet how much protein,632,3
988637,Try This 3-Day 1200 Calorie Diet Plan,D804513,13.322729,1200 calorie diet how much protein,632,4
124496,Will Eating Chicken Breast Every Day Help You ...,D2923258,13.313102,1200 calorie diet how much protein,632,5
...,...,...,...,...,...,...
3095728,Bacterial cellular morphologies,D716324,9.478049,by what means do bacterial cells typically div...,2317,96
2191931,Bacterial cellular morphologies,D990616,9.471072,by what means do bacterial cells typically div...,2317,97
245297,Bacterial Growth/Nutrition,D1011878,9.464734,by what means do bacterial cells typically div...,2317,98
1644567,Frederick Griffith,D3422449,9.463410,by what means do bacterial cells typically div...,2317,99


In [8]:
# Grade the retrieved rows against the sampled judgments, then display the
# score returned by mrr() (presumably mean reciprocal rank -- confirm in
# msmarco.evaluate).
graded = grade_results(msmarco_judgments, results_df)
mrr(graded)

0.2105990821737933