In [1]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter
from searcharray.similarity import bm25_similarity, classic_similarity, default_bm25

NUM_QUERIES = 1000

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Will take a min or two to load, and a good hour to build an index first time
msmarco = indexed()

In [3]:
def or_query_search(corpus, fields, query, query_id=None, n=10, similarity=default_bm25):
    if not isinstance(fields, list):
        fields = [fields]
    start = perf_counter()
    scored = np.zeros(len(corpus))
    for field in fields:
        boost = 1.0
        if '^' in field:
            field, boost = field.split('^')
            boost = float(boost)
        tokenizer = corpus[field].array.tokenizer
        tokens = tokenizer(query)
        scored += (np.sum([corpus[field].array.score(token, similarity=similarity) for token in tokens], axis=0) * boost)
    took = perf_counter() - start
    
    top_n_idx = np.argsort(scored)[::-1][:n]
    scores = scored[top_n_idx]
    top_n_idx, scores

    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx]
    top_n['scores'] = scores
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n['rank'] = np.arange(n) + 1
    return top_n

or_query_search(corpus=msmarco, fields="title_idx", query="cheese inside")

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1622353,Cheese & Cheese Products: Cheese,D2146882,5.24958,cheese inside,,0.036862,1
1409917,Inside,D3479997,4.808581,cheese inside,,0.036862,2
171414,Insider,D932824,4.808581,cheese inside,,0.036862,3
3021605,inside,D1820840,4.808581,cheese inside,,0.036862,4
940283,inside,D1820838,4.808581,cheese inside,,0.036862,5
3087670,Cheeses,D1534465,4.733851,cheese inside,,0.036862,6
1741404,cheese,D1303785,4.733851,cheese inside,,0.036862,7
634958,cheese,D2695337,4.733851,cheese inside,,0.036862,8
1884790,Cheese,D187458,4.733851,cheese inside,,0.036862,9
1386241,Cheese,D859447,4.733851,cheese inside,,0.036862,10


In [4]:
from msmarco.evaluate import grade_results, judgments

msmarco_judgments = judgments().sample(NUM_QUERIES)

## Search title

Search a snowball tokenized version of title, using BM25, report MRR

In [5]:
results = []

for idx, row in msmarco_judgments.iterrows():
    top_n = or_query_search(corpus=msmarco, fields=['title_idx'], query=row['query'], query_id=row['query_id'])
    results.append(top_n)

results = pd.concat(results)
results

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
534334,Golf Shoes,D1520271,8.936840,what type of shoes are good for golf,914556,0.096347,1
2696236,Golf Shoes,D1520596,8.936840,what type of shoes are good for golf,914556,0.096347,2
528036,Golf Shoes,D3560283,8.936840,what type of shoes are good for golf,914556,0.096347,3
99999,What type of running shoes are acceptable?,D2554165,8.572004,what type of shoes are good for golf,914556,0.096347,4
175554,What Types of Woods are Good for Carving?,D1630275,8.482737,what type of shoes are good for golf,914556,0.096347,5
...,...,...,...,...,...,...,...
2547821,Tulsa Jail costs to house inmates keep growing,D3298740,8.595405,cost to put an inmate in jail for a year,110408,0.123366,6
3151544,How to Find Jail Inmates,D2079268,8.513981,cost to put an inmate in jail for a year,110408,0.123366,7
136970,Glendora Jail Inmate Search,D648758,8.105003,cost to put an inmate in jail for a year,110408,0.123366,8
1458410,Westland Jail Inmate Locator,D1424086,8.105003,cost to put an inmate in jail for a year,110408,0.123366,9


In [6]:
graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
queries_judged.sum() / NUM_QUERIES

0.21903095238095238

## Search body

Search a snowball tokenized version of body, using BM25, report MRR

In [7]:
results = []

for idx, row in msmarco_judgments.iterrows():
    top_n = or_query_search(corpus=msmarco, fields=['body_idx'], query=row['query'], query_id=row['query_id'])
    results.append(top_n)

results = pd.concat(results)
results

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
324020,Golf Shoes Buying Guide,D1785338,10.776132,what type of shoes are good for golf,914556,0.158226,1
44428,Best Golf Shoes for Walkingâ¦and more!,D1118151,10.377231,what type of shoes are good for golf,914556,0.158226,2
2330617,How to Buy Golf Shoes,D3560282,10.373627,what type of shoes are good for golf,914556,0.158226,3
2577299,Best golf shoes reviews,D1520273,10.270243,what type of shoes are good for golf,914556,0.158226,4
1228559,About Different Types of Soles for Sport Shoes,D1335783,9.973626,what type of shoes are good for golf,914556,0.158226,5
...,...,...,...,...,...,...,...
1373411,Scott County Jail Inmate Commissary,D2082355,12.921789,cost to put an inmate in jail for a year,110408,0.148651,6
676429,Pitt County Jail & Sheriff Inmate Commissary,D3447479,12.909848,cost to put an inmate in jail for a year,110408,0.148651,7
2879197,Meherrin River Regional Jail Inmate Commissary,D1891690,12.906716,cost to put an inmate in jail for a year,110408,0.148651,8
1059436,IT'S YOUR MONEY: Taxpayers Cover Cost of Housi...,D1534055,12.895087,cost to put an inmate in jail for a year,110408,0.148651,9


In [8]:
graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
queries_judged.sum() / NUM_QUERIES

0.22259325396825397

## Search body + title

In [9]:
results = []

for idx, row in msmarco_judgments.iterrows():
    top_n = or_query_search(corpus=msmarco, fields=['body_idx', 'title_idx'], query=row['query'], query_id=row['query_id'])
    results.append(top_n)

results = pd.concat(results)
results

graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
queries_judged.sum() / NUM_QUERIES

0.271956746031746

## Weigh body*10

In [10]:
results = []

for idx, row in msmarco_judgments.iterrows():
    top_n = or_query_search(corpus=msmarco,
                            fields=['body_idx^10', 'title_idx'], query=row['query'], query_id=row['query_id'])
    results.append(top_n)

results = pd.concat(results)
graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
queries_judged.sum() / NUM_QUERIES

0.2555924603174603

## Weigh title*10

In [11]:
results = []

for idx, row in msmarco_judgments.iterrows():
    top_n = or_query_search(corpus=msmarco,
                            fields=['body_idx', 'title_idx^10'], query=row['query'], query_id=row['query_id'])
    results.append(top_n)

results = pd.concat(results)
graded = grade_results(msmarco_judgments, results)
queries_judged = judge_queries(graded)
queries_judged.sum() / NUM_QUERIES

0.23200555555555552