In [15]:
import numpy as np
import pandas as pd
from msmarco.index import indexed

NUM_QUERIES = 50

In [2]:
# Will take a min or two to load, and a good hour to build an index first time
msmarco = indexed()

In [3]:
tokenizer = msmarco['body_idx'].array.tokenizer
from time import perf_counter

def or_query_search(query, query_id=None, n=100):
    start = perf_counter()
    tokens = tokenizer(query)
    scored = np.sum([msmarco['body_idx'].array.score(token) for token in tokens], axis=0)
    took = perf_counter() - start
    
    top_n_idx = np.argsort(scored)[::-1][:n]
    scores = scored[top_n_idx]
    top_n_idx, scores

    top_n = msmarco[['title', 'msmarco_id']].iloc[top_n_idx]
    top_n['scores'] = scores
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n['rank'] = np.arange(n) + 1
    return top_n

or_query_search("cheese inside")

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
2301803,50 Panini,D491358,5.803620,cheese inside,,0.03878,1
2044174,How To Make Twice-Baked Potatoes,D2419893,5.689438,cheese inside,,0.03878,2
238792,.,D2613852,5.653761,cheese inside,,0.03878,3
1365175,How many white foods and drinks can you name?,D3539166,5.648460,cheese inside,,0.03878,4
2833655,Comfort food: Fried cheese recipes,D2396182,5.642127,cheese inside,,0.03878,5
...,...,...,...,...,...,...,...
2330536,Can You Freeze Cream Cheese Frosting?,D3064354,5.257333,cheese inside,,0.03878,96
1402971,Food Storage - How long can you keep...,D2766710,5.256273,cheese inside,,0.03878,97
1579828,.,D2039867,5.256127,cheese inside,,0.03878,98
1954062,How long does mozzarella cheese stay good once...,D2293852,5.255137,cheese inside,,0.03878,99


In [5]:
from msmarco.evaluate import grade_results, judgments

msmarco_judgments = judgments().sample(NUM_QUERIES)

In [6]:
results = []

for idx, row in msmarco_judgments.iterrows():
    top_n = or_query_search(query=row['query'], query_id=row['query_id'])
    results.append(top_n)

In [7]:
results_df = pd.concat(results)
results_df

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
2980446,Plant Spacing,D2571034,14.488810,how far apart do you plant apple trees?,3599,0.173555,1
135861,Plant Spacing,D2468577,14.488810,how far apart do you plant apple trees?,3599,0.173555,2
2166909,Plant Spacing,D1349111,14.488810,how far apart do you plant apple trees?,3599,0.173555,3
2990535,Plant Spacing,D2689495,14.488810,how far apart do you plant apple trees?,3599,0.173555,4
1975353,Plant Spacing,D1988646,14.488810,how far apart do you plant apple trees?,3599,0.173555,5
...,...,...,...,...,...,...,...
2438137,About The 1946 D Wheat Penny,D468835,6.500823,1964 penny value,776,0.049639,96
1487479,How to Make Money off of Copper Pennies,D1765983,6.494459,1964 penny value,776,0.049639,97
2983354,How Much Is a 1936 Penny Worth?,D1463149,6.484992,1964 penny value,776,0.049639,98
1124614,Old Copper Pennies: Which Ones To Save & What ...,D1461790,6.463014,1964 penny value,776,0.049639,99


In [16]:
graded = grade_results(msmarco_judgments, results_df)
graded[graded['rank'] <= 100].groupby('query_id')['reciprical_rank'].max().sum() / NUM_QUERIES

0.23342281213637658

In [17]:
graded.sort_values('took', ascending=False)

Unnamed: 0,title,msmarco_id,scores,query_x,query_id,took,rank,q0,grade,query_y,reciprical_rank
22,Leachate,D238303,15.823506,define the term leachate. where does it origin...,2685,0.340442,1,0,1,define the term leachate. where does it origin...,1.0
19,Glossectomy,D2975099,16.018728,a hemiglossectomy is the surgical excision of ...,1791,0.267291,3,0,1,a hemiglossectomy is the surgical excision of ...,0.333333
2,Second Great Awakening,D1338391,16.512806,how did the second great awakening influence t...,3459,0.258079,12,0,1,how did the second great awakening influence t...,0.083333
5,.,D2662079,10.601771,a chemical in groundwater that causes cancer i...,1770,0.206866,27,0,1,a chemical in groundwater that causes cancer i...,0.037037
0,How Far Apart Should Apple Trees Be to Cross-P...,D1349110,13.898209,how far apart do you plant apple trees?,3599,0.173555,7,0,1,how far apart do you plant apple trees?,0.142857
23,Trade name,D783465,10.558008,commercial name for a drug; trademark or trade...,2544,0.172785,38,0,1,commercial name for a drug; trademark or trade...,0.026316
7,.,D2937374,19.220474,i reset my password and now cant trade steam,4406,0.159368,1,0,1,i reset my password and now cant trade steam,1.0
11,Softball,D404132,14.264502,how many innings are usually played for softball,3831,0.149567,19,0,1,how many innings are usually played for softball,0.052632
10,What are some examples of commodity money?,D357539,6.977769,.which is an example of a commodity,415,0.145351,13,0,1,.which is an example of a commodity,0.076923
17,Gigabyte HD 7870 2GB Graphic Card GV-R787OC-2GD,D2466702,19.736467,2gb graphic card for pc price in pakistan,1180,0.136721,1,0,1,2gb graphic card for pc price in pakistan,1.0
