In [2]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter
from searcharray.similarity import bm25_similarity, classic_similarity, default_bm25

NUM_QUERIES = 250

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Will take a min or two to load, and a good hour to build an index first time
msmarco = indexed()

Cannot find indexed file at /Users/douglas.turnbull/.msmarco/msmarco_indexed_snowball_tokenizer.pkl
Indexing
Saving ids
Getting URL
Getting titles


2024-03-22 09:45:38,633 - INFO - Indexing begins


Indexing body


2024-03-22 09:45:44,264 - INFO - 0 Batch Start
2024-03-22 09:45:44,264 - INFO - Tokenizing 100000 documents
2024-03-22 09:45:52,160 - INFO - Tokenized 10000 (10.0%)
2024-03-22 09:45:59,970 - INFO - Tokenized 20000 (20.0%)
2024-03-22 09:46:07,693 - INFO - Tokenized 30000 (30.0%)
2024-03-22 09:46:15,751 - INFO - Tokenized 40000 (40.0%)
2024-03-22 09:46:23,450 - INFO - Tokenized 50000 (50.0%)
2024-03-22 09:46:31,293 - INFO - Tokenized 60000 (60.0%)
2024-03-22 09:46:39,492 - INFO - Tokenized 70000 (70.0%)
2024-03-22 09:46:47,730 - INFO - Tokenized 80000 (80.0%)
2024-03-22 09:46:56,458 - INFO - Tokenized 90000 (90.0%)
2024-03-22 09:47:04,528 - INFO - Tokenization -- vstacking
2024-03-22 09:47:04,936 - INFO - Tokenization -- DONE
2024-03-22 09:47:05,229 - INFO - Inverting docs->terms
2024-03-22 09:48:23,199 - INFO - Encoding positions to bit array
2024-03-22 09:48:25,950 - INFO - 0 Batch Complete
2024-03-22 09:48:26,022 - INFO - Roaringish NBytes -- 810.74 MB
2024-03-22 09:48:26,023 - INFO -

Indexing title


2024-03-22 11:55:47,506 - INFO - 0 Batch Start
2024-03-22 11:55:47,507 - INFO - Tokenizing 100000 documents
2024-03-22 11:55:47,638 - INFO - Tokenized 10000 (10.0%)
2024-03-22 11:55:47,750 - INFO - Tokenized 20000 (20.0%)
2024-03-22 11:55:47,865 - INFO - Tokenized 30000 (30.0%)
2024-03-22 11:55:47,968 - INFO - Tokenized 40000 (40.0%)
2024-03-22 11:55:48,075 - INFO - Tokenized 50000 (50.0%)
2024-03-22 11:55:48,174 - INFO - Tokenized 60000 (60.0%)
2024-03-22 11:55:48,284 - INFO - Tokenized 70000 (70.0%)
2024-03-22 11:55:48,404 - INFO - Tokenized 80000 (80.0%)
2024-03-22 11:55:48,519 - INFO - Tokenized 90000 (90.0%)
2024-03-22 11:55:48,761 - INFO - Tokenization -- vstacking
2024-03-22 11:55:48,873 - INFO - Tokenization -- DONE
2024-03-22 11:55:48,878 - INFO - Inverting docs->terms
2024-03-22 11:55:48,959 - INFO - Encoding positions to bit array
2024-03-22 11:55:49,005 - INFO - 0 Batch Complete
2024-03-22 11:55:49,008 - INFO - Roaringish NBytes -- 4.02 MB
2024-03-22 11:55:49,009 - INFO - T

In [4]:
np.sort(msmarco['body_idx'].array.score('cheese'))

array([0.       , 0.       , 0.       , ..., 4.7565475, 5.0600863,
       8.568701 ], dtype=float32)

In [13]:
# %%prun

def or_query_search(corpus, fields, query, query_id=None, n=10, 
                    max_posn18s=None,
                    min_posn18s=None,
                    similarities=default_bm25):
    if not isinstance(fields, list):
        fields = [fields]
    if isinstance(similarities, list):
        assert len(similarities) == len(fields)
    if isinstance(max_posn18s, list):
        assert len(max_posn18s) == len(fields)
    if isinstance(min_posn18s, list):
        assert len(min_posn18s) == len(fields)
    if not isinstance(similarities, list):
        similarities = [similarities] * len(fields)
    if not isinstance(max_posn18s, list):
        max_posn18s = [max_posn18s] * len(fields)
    if not isinstance(min_posn18s, list):
        min_posn18s = [min_posn18s] * len(fields)

    
    start = perf_counter()
    scored = np.zeros(len(corpus))
    for field, similarity, max_posn18, min_posn18 in zip(fields, similarities, max_posn18s, min_posn18s):
        boost = 1.0
        if '^' in field:
            field, boost = field.split('^')
            boost = float(boost)
        tokenizer = corpus[field].array.tokenizer
        tokens = tokenizer(query)
        max_posn_arg = (max_posn18 * 18 - 1) if max_posn18 is not None else None
        min_posn_arg = (min_posn18 * 18) if min_posn18 is not None else None
        # print(max_posn_arg, min_posn_arg)

        for token in tokens:
            token_score = corpus[field].array.score(token,
                                                    max_posn=max_posn_arg,
                                                    min_posn=min_posn_arg,
                                                    similarity=similarity)
            # print(field, token, max_posn_arg, min_posn_arg)
            # print(np.sort(token_score))
            scored += token_score * boost
            
    took = perf_counter() - start

    top_n_idx = np.argpartition(-scored, n)[:n]
    # top_n_idx = np.argsort(scored)[::-1][:n]
    scores = scored[top_n_idx]

    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx]
    top_n['scores'] = scores
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n.sort_values('scores', ascending=False, inplace=True)
    # print(query, took)
    top_n['rank'] = np.arange(n) + 1
    return top_n

or_query_search(corpus=msmarco, fields=["title_idx^10"], query="Adjustable-rate mortgage", n=100)

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1782294,Adjustable Rate Mortgage,D578534,111.879284,Adjustable-rate mortgage,,0.04918,1
886385,Adjustable-rate mortgage,D1968890,111.879284,Adjustable-rate mortgage,,0.04918,2
1673472,Adjustable-rate mortgage,D578531,111.879284,Adjustable-rate mortgage,,0.04918,3
487631,adjustable rate mortgage,D578530,111.879284,Adjustable-rate mortgage,,0.04918,4
368379,Adjustable-Rate Mortgages,D1213211,111.879284,Adjustable-rate mortgage,,0.04918,5
...,...,...,...,...,...,...,...
2641962,Common Mortgage Insurance Premium Questions An...,D2106970,60.626610,Adjustable-rate mortgage,,0.04918,96
1057920,Mortgage Rates and Treasury Bonds,D2407381,59.265333,Adjustable-rate mortgage,,0.04918,97
2876082,Understanding Reverse Mortgage Interest Rates,D1912271,59.265333,Adjustable-rate mortgage,,0.04918,98
3058401,5-year Variable Mortgage Rates,D2906409,59.265333,Adjustable-rate mortgage,,0.04918,99


In [14]:
from msmarco.evaluate import grade_results, judgments

msmarco_judgments = judgments().sample(NUM_QUERIES)
msmarco_judgments

Unnamed: 0,query_id,q0,msmarco_id,grade,query
38233,125169,0,D855776,1,define periosteum
122908,423793,0,D724662,1,is simons town a marine protected area
93386,302147,0,D1047344,1,how much are corian countertops
80733,264041,0,D3013418,1,how long is manitou springs incline trail
312937,981469,0,D2301737,1,where is dobre gymnastics academy
...,...,...,...,...,...
256363,812873,0,D863307,1,what is the cost of owning a horse a month
324080,1010840,0,D1513864,1,which indian food contain more carbohydrate
225236,722552,0,D858654,1,what is basic human rights
305516,960936,0,D1010210,1,when was the first environmental law passed


In [15]:

def run_all(corpus, judgments, fields, max_posns=None, min_posns=None, similarities=default_bm25, n=10):
    results = []
    start = perf_counter()
    query_no = 0
    for idx, row in judgments.iterrows():
        top_n = or_query_search(corpus=msmarco, fields=fields, similarities=similarities, n=n,
                                min_posn18s=min_posns,
                                max_posn18s=max_posns,
                                query=row['query'], query_id=row['query_id'])
        results.append(top_n)
        if query_no > 0 and query_no % 50 == 0:
            print(f"-- {query_no} QPS: {query_no / (perf_counter() - start)} q:{row['query']}")
        query_no += 1
    results = pd.concat(results)
    graded = grade_results(msmarco_judgments, results)
    return graded

## Run random search

Try different parameters w/ random search

In [16]:
from random import uniform
# 0.3164444444444445 ['title_idx^1.7072861134255923', 'body_idx^9.467854536052052'] title: b,k1: 0.9054699170098017,1.3007711184048492 body: b,k1: 1.2957539903585236,1.8407221594355443

results = []

for _ in range(0, 100):
    title_boost = uniform(0, 10)
    body_boost = uniform(0, 10)
    b_title = uniform(0,2)
    k1_title = uniform(0.1,2)
    b_body = uniform(0,2)
    k1_body = uniform(0.1,2)

    min_posn_block_body = uniform(1, 10) // 1
    min_posn_block_title= uniform(1, 10) // 1
    max_posn_block_body = uniform(min_posn_block_body, 100) // 1
    max_posn_block_title= uniform(min_posn_block_title, 100) // 1
    if max_posn_block_body > 80:
        max_posn_block_body = None
    if max_posn_block_title > 80:
        max_posn_block_title = None

    if min_posn_block_body > 80:
        min_posn_block_body = None
    if min_posn_block_title > 80:
        min_posn_block_title = None
        
    max_posns = [None, None]
    min_posns = [None, None]
    
    # max_posns = [None, None]
    
    bm25_similarity_title = bm25_similarity(b=b_title,
                                            k1=k1_title)
    bm25_similarity_body = bm25_similarity(b=b_body,
                                           k1=k1_body)
    fields = [f"title_idx^{title_boost}", f"body_idx^{body_boost}"]
    similarities = [bm25_similarity_title, bm25_similarity_body]

    graded = run_all(msmarco, msmarco_judgments,
                     fields=fields,
                     n=100,
                     min_posns=min_posns,
                     max_posns=max_posns,
                     similarities=similarities)

    queries_judged = judge_queries(graded)
    mrr = queries_judged.sum() / len(msmarco_judgments)
    results.append({"mrr100": mrr, "title_boost": title_boost, "body_boost": body_boost,
                    "b_title": b_title, "k1_title": k1_title,
                    "b_body": b_body, "k1_body": k1_body,
                    "min_posn_body": min_posns[1], "min_posn_title": min_posns[0],
                    "max_posn_body": max_posns[1], "max_posn_title": max_posns[1]})
    print(mrr, fields, f"title: b,k1: {b_title},{k1_title},{min_posns[0]}-{max_posns[0]}",
          f"body: b,k1: {b_body},{k1_body},{min_posns[1]}-{max_posns[1]}")


-- 50 QPS: 4.752907432511487 q:how many steps to take in a day
-- 100 QPS: 4.8450716081529395 q:what hotels do hilton own
-- 150 QPS: 4.882798075696449 q:what is the basic function of a villi
-- 200 QPS: 4.9078320942091365 q:what are wolves known for
0.13735986678496828 ['title_idx^0.38941244599562497', 'body_idx^2.559805082996678'] title: b,k1: 0.44095006618723165,1.1848203168869542,None-None body: b,k1: 1.3083239943344258,1.5676166104574947,None-None
-- 50 QPS: 5.119643749521139 q:how many steps to take in a day
-- 100 QPS: 5.116892968104784 q:what hotels do hilton own
-- 150 QPS: 5.149663534050727 q:what is the basic function of a villi
-- 200 QPS: 5.127668744470373 q:what are wolves known for
0.22866999189539036 ['title_idx^7.665041235361406', 'body_idx^2.6063855881267326'] title: b,k1: 0.6846620698781061,1.4137009149376496,None-None body: b,k1: 0.8089368272395296,1.6228641331964733,None-None
-- 50 QPS: 5.00411035523077 q:how many steps to take in a day
-- 100 QPS: 5.03849279382391

In [17]:
# 0.2251567139820729 ['title_idx^8.303587107121324', 'body_idx^3.4224600274312156'] title: b,k1: 0.7560176405958086,0.3356740179960797,None body: b,k1: 0.9876853925433389,0.5993344811416361,28.0

# 0.3164444444444445 ['title_idx^1.7072861134255923', 'body_idx^9.467854536052052'] title: b,k1: 0.9054699170098017,1.3007711184048492 body: b,k1: 1.2957539903585236,1.8407221594355443

title_boost = 1.7072861134255923
body_boost = 9.467854536052052
b_title = 0.9054699170098017
k1_title = 1.3007711184048492
b_body = 1.2957539903585236
k1_body = 1.8407221594355443

bm25_similarity_title = bm25_similarity(b=b_title,
                                        k1=k1_title)
bm25_similarity_body = bm25_similarity(b=b_body,
                                       k1=k1_body)
fields = [f"title_idx^{title_boost}", f"body_idx^{body_boost}"]
similarities = [bm25_similarity_title, bm25_similarity_body]

graded = run_all(msmarco, msmarco_judgments,
                 fields=fields,
                 n=100,
                 similarities=default_bm25)

queries_judged = judge_queries(graded)
mrr = queries_judged.sum() / NUM_QUERIES

-- 50 QPS: 4.948395329274489 q:how many steps to take in a day
-- 100 QPS: 4.986313669952577 q:what hotels do hilton own
-- 150 QPS: 4.937820332872482 q:what is the basic function of a villi
-- 200 QPS: 4.942370904996126 q:what are wolves known for


In [94]:
mrr

0.06939648216483567

In [95]:
queries_judged

query
 is a unit price item for                 0.000000
adjustable mortgage loans definition      0.000000
adult sore throat causes                  0.028571
air filter arrestance definition          0.038462
aleah name meaning                        1.000000
                                            ...   
who made the original monopoly game       0.000000
who manufactures vizio televisions        0.083333
who wrote hurts so bad ronstadt           1.000000
why is it good to have dystopian books    0.000000
why passport denied                       0.000000
Name: reciprical_rank, Length: 250, dtype: float64

In [97]:
graded[graded['query'] == 'adjustable mortgage loans definition'].head(50)

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank,q0,grade,reciprical_rank
17100,.,D3491319,96.549774,adjustable mortgage loans definition,11653,0.070098,1,,0.0,0.0
17101,Definitions &Translations,D412648,94.490601,adjustable mortgage loans definition,11653,0.070098,2,,0.0,0.0
17102,Shared Flashcard Set,D2437325,93.984756,adjustable mortgage loans definition,11653,0.070098,3,,0.0,0.0
17103,eSSBASE,D660188,93.59166,adjustable mortgage loans definition,11653,0.070098,4,,0.0,0.0
17104,Definitions &Translations,D291685,93.118851,adjustable mortgage loans definition,11653,0.070098,5,,0.0,0.0
17105,definition,D2534010,92.898048,adjustable mortgage loans definition,11653,0.070098,6,,0.0,0.0
17106,Does Watching Television Cause Binge Eating?,D3090292,91.649757,adjustable mortgage loans definition,11653,0.070098,7,,0.0,0.0
17107,mortgagee,D2724946,90.544086,adjustable mortgage loans definition,11653,0.070098,8,,0.0,0.0
17108,The Difference Between Mortgagor & Mortgagee,D1712999,86.536993,adjustable mortgage loans definition,11653,0.070098,9,,0.0,0.0
17109,mortgagee,D1713001,80.501942,adjustable mortgage loans definition,11653,0.070098,10,,0.0,0.0


In [98]:
msmarco_judgments[msmarco_judgments['query_id'] == 11653]

Unnamed: 0,query_id,q0,msmarco_id,grade,query
2995,11653,0,D578531,1,adjustable mortgage loans definition


In [99]:
msmarco[msmarco['msmarco_id'] == 'D578531']

Unnamed: 0,msmarco_id,url,title,body_idx,title_idx
1673472,D578531,https://en.wikipedia.org/wiki/Adjustable-rate_...,Adjustable-rate mortgage,"Terms({'interv', 'kingdom', 'state', 'retriev'...","Terms({'mortgag', 'adjustabler'})"


In [1]:
from msmarco.tokenizers import snowball_tokenizer

snowball_tokenizer('Adjustable-rate mortgage')

['adjust', 'rate', 'mortgag']