In [1]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter
from searcharray.similarity import bm25_similarity, classic_similarity, default_bm25

NUM_QUERIES = 250

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the indexed corpus used by every cell below.
# Will take a min or two to load, and a good hour to build an index first time
msmarco = indexed()

In [3]:
# Sanity check: score the single term 'cheese' against the body index and
# show the per-document scores in ascending order.
np.sort(msmarco['body_idx'].array.score('cheese'))

array([0.       , 0.       , 0.       , ..., 4.7565475, 5.0600863,
       8.568701 ], dtype=float32)

In [None]:
from searcharray.solr import edismax

# Try searcharray's edismax helper on the corpus (this cell has no execution count).
# NOTE(review): qf/pf name 'title' and 'body', whereas the searchable columns
# used elsewhere in this notebook are 'title_idx' and 'body_idx' -- confirm
# which column names edismax expects here.
edismax(msmarco, qf=['title', 'body'], mm=2,
        pf='title')

In [6]:
# %%prun

def or_query_search(corpus, fields, query, query_id=None, n=10,
                    max_posn18s=None,
                    min_posn18s=None,
                    term_tie=0.1,
                    similarities=default_bm25):
    """Score every document with an OR over the query terms; return the top ``n``.

    For each field the field's own tokenizer splits ``query`` into terms. Each
    term is scored against that field, multiplied by the field boost, and
    summed into one score per document.

    Args:
        corpus: DataFrame with ``title`` and ``msmarco_id`` columns plus the
            searchable columns named in ``fields``. Each searchable column's
            ``.array`` must expose ``tokenizer`` and
            ``score(token, max_posn=, min_posn=, similarity=)``.
        fields: A field name or list of field names. A ``"name^2.5"`` suffix
            multiplies that field's scores by 2.5.
        query: Raw query string.
        query_id: Value copied into the ``query_id`` column of the result.
        n: Maximum number of rows to return.
        max_posn18s: ``None``, a single value, or one value per field. A value
            ``b`` is passed to ``score`` as ``max_posn = b * 18 - 1``.
        min_posn18s: Same shape as ``max_posn18s``. A value ``b`` is passed to
            ``score`` as ``min_posn = b * 18``.
        term_tie: Currently unused; kept so existing callers keep working.
        similarities: One similarity for all fields, or a list with one
            similarity per field.

    Returns:
        DataFrame with columns ``title``, ``msmarco_id``, ``scores``, ``query``,
        ``query_id``, ``took`` (scoring time in seconds) and ``rank`` (1-based),
        sorted by descending score. Has ``min(n, len(corpus))`` rows.

    Raises:
        AssertionError: If a per-field list does not have one entry per field.
    """
    if not isinstance(fields, list):
        fields = [fields]

    def per_field(value, name):
        # Broadcast a single setting to every field, or validate an explicit list.
        if isinstance(value, list):
            assert len(value) == len(fields), \
                f"{name} must have one entry per field"
            return value
        return [value] * len(fields)

    similarities = per_field(similarities, "similarities")
    max_posn18s = per_field(max_posn18s, "max_posn18s")
    min_posn18s = per_field(min_posn18s, "min_posn18s")

    start = perf_counter()
    scored = np.zeros(len(corpus))
    for field, similarity, max_posn18, min_posn18 in zip(fields, similarities,
                                                         max_posn18s, min_posn18s):
        boost = 1.0
        if '^' in field:
            # Split on the last caret only, so unpacking cannot fail.
            field, boost = field.rsplit('^', 1)
            boost = float(boost)
        field_array = corpus[field].array
        max_posn_arg = (max_posn18 * 18 - 1) if max_posn18 is not None else None
        min_posn_arg = (min_posn18 * 18) if min_posn18 is not None else None

        for token in field_array.tokenizer(query):
            token_score = field_array.score(token,
                                            max_posn=max_posn_arg,
                                            min_posn=min_posn_arg,
                                            similarity=similarity)
            scored += token_score * boost

    took = perf_counter() - start

    # np.argpartition needs kth < len(scored); when n covers the whole corpus
    # there is nothing to partition, so take every document.
    n_hits = min(n, len(scored))
    if n_hits < len(scored):
        top_n_idx = np.argpartition(-scored, n_hits)[:n_hits]
    else:
        top_n_idx = np.arange(len(scored))

    # Work on an explicit copy so the column assignments below never write
    # through to (or warn about) the corpus frame.
    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx].copy()
    top_n['scores'] = scored[top_n_idx]
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n = top_n.sort_values('scores', ascending=False)
    # Rank by the rows actually returned, which may be fewer than n.
    top_n['rank'] = np.arange(len(top_n)) + 1
    return top_n

# Smoke test: top 100 hits for one query, searching only the title index with a boost of 10.
or_query_search(corpus=msmarco, fields=["title_idx^10"], query="Adjustable-rate mortgage", n=100)

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1782294,Adjustable Rate Mortgage,D578534,111.879284,Adjustable-rate mortgage,,0.04406,1
886385,Adjustable-rate mortgage,D1968890,111.879284,Adjustable-rate mortgage,,0.04406,2
1673472,Adjustable-rate mortgage,D578531,111.879284,Adjustable-rate mortgage,,0.04406,3
487631,adjustable rate mortgage,D578530,111.879284,Adjustable-rate mortgage,,0.04406,4
368379,Adjustable-Rate Mortgages,D1213211,111.879284,Adjustable-rate mortgage,,0.04406,5
...,...,...,...,...,...,...,...
2641962,Common Mortgage Insurance Premium Questions An...,D2106970,60.626610,Adjustable-rate mortgage,,0.04406,96
1057920,Mortgage Rates and Treasury Bonds,D2407381,59.265333,Adjustable-rate mortgage,,0.04406,97
2876082,Understanding Reverse Mortgage Interest Rates,D1912271,59.265333,Adjustable-rate mortgage,,0.04406,98
3058401,5-year Variable Mortgage Rates,D2906409,59.265333,Adjustable-rate mortgage,,0.04406,99


In [7]:
from msmarco.evaluate import grade_results, judgments

# Fix the sampling seed so every kernel run evaluates the same queries;
# without it, MRR figures from separate runs are measured on different
# query sets and cannot be compared.
JUDGMENTS_SAMPLE_SEED = 42

msmarco_judgments = judgments().sample(NUM_QUERIES, random_state=JUDGMENTS_SAMPLE_SEED)
msmarco_judgments

Unnamed: 0,query_id,q0,msmarco_id,grade,query
84809,275996,0,D70534,1,how many calories are in a slice of domino's
7525,25784,0,D2728982,1,are there tile shower designs that can also be...
268696,849060,0,D1782225,1,what is the study of trees
15508,51405,0,D2965368,1,benicar medication side effects
4213,15637,0,D2815262,1,allah hu meaning
...,...,...,...,...,...
183033,605369,0,D491509,1,what county is east hampton ct in
185970,611596,0,D1264421,1,what county is pottsville pa in
225876,724372,0,D179203,1,what is bis chemistry
296607,932216,0,D2503653,1,what's the life span of a copper cable (house ...


In [8]:

def run_all(corpus, judgments, fields, max_posns=None, min_posns=None, similarities=default_bm25, n=10):
    """Run ``or_query_search`` for every judged query and grade the results.

    Args:
        corpus: Corpus DataFrame to search.
        judgments: DataFrame with at least ``query`` and ``query_id`` columns;
            one search is issued per row.
        fields: Field name(s), optionally boosted, passed to ``or_query_search``.
        max_posns: Forwarded as ``max_posn18s``.
        min_posns: Forwarded as ``min_posn18s``.
        similarities: Forwarded as ``similarities``.
        n: Number of results to keep per query.

    Returns:
        The output of ``grade_results(judgments, results)``, where ``results``
        is the concatenation of every query's top-``n`` frame.
    """
    results = []
    start = perf_counter()
    for query_no, (_, row) in enumerate(judgments.iterrows()):
        # Search the corpus that was passed in, not the notebook-level global.
        top_n = or_query_search(corpus=corpus, fields=fields, similarities=similarities, n=n,
                                min_posn18s=min_posns,
                                max_posn18s=max_posns,
                                query=row['query'], query_id=row['query_id'])
        results.append(top_n)
        # Progress line every 50 queries, with throughput so far.
        if query_no > 0 and query_no % 50 == 0:
            print(f"-- {query_no} QPS: {query_no / (perf_counter() - start)} q:{row['query']}")
    results = pd.concat(results)
    # Grade against the judgments that produced the queries.
    graded = grade_results(judgments, results)
    return graded

## Run random search

Randomly sample field boosts and BM25 parameters (`b`, `k1`) for the title and body fields, and record the MRR of each configuration.

In [None]:
from random import uniform
# 0.3164444444444445 ['title_idx^1.7072861134255923', 'body_idx^9.467854536052052'] title: b,k1: 0.9054699170098017,1.3007711184048492 body: b,k1: 1.2957539903585236,1.8407221594355443

# One dict per sampled configuration: its MRR plus the parameters that produced it.
results = []

for _ in range(0, 100):
    # Sample one boost per field and BM25 (b, k1) per field.
    title_boost = uniform(0, 10)
    body_boost = uniform(0, 10)
    b_title = uniform(0,2)
    k1_title = uniform(0.1,2)
    b_body = uniform(0,2)
    k1_body = uniform(0.1,2)

    # Position windows are switched off for this search. Entries are ordered
    # like `fields` below: index 0 is the title, index 1 is the body.
    max_posns = [None, None]
    min_posns = [None, None]

    bm25_similarity_title = bm25_similarity(b=b_title,
                                            k1=k1_title)
    bm25_similarity_body = bm25_similarity(b=b_body,
                                           k1=k1_body)
    fields = [f"title_idx^{title_boost}", f"body_idx^{body_boost}"]
    similarities = [bm25_similarity_title, bm25_similarity_body]

    graded = run_all(msmarco, msmarco_judgments,
                     fields=fields,
                     n=100,
                     min_posns=min_posns,
                     max_posns=max_posns,
                     similarities=similarities)

    queries_judged = judge_queries(graded)
    mrr = queries_judged.sum() / len(msmarco_judgments)
    # Log the title window from slot 0 and the body window from slot 1.
    results.append({"mrr100": mrr, "title_boost": title_boost, "body_boost": body_boost,
                    "b_title": b_title, "k1_title": k1_title,
                    "b_body": b_body, "k1_body": k1_body,
                    "min_posn_body": min_posns[1], "min_posn_title": min_posns[0],
                    "max_posn_body": max_posns[1], "max_posn_title": max_posns[0]})
    print(mrr, fields, f"title: b,k1: {b_title},{k1_title},{min_posns[0]}-{max_posns[0]}",
          f"body: b,k1: {b_body},{k1_body},{min_posns[1]}-{max_posns[1]}")


-- 50 QPS: 3.8204183570180583 q:what does igneous rock mean
-- 100 QPS: 3.938174286832919 q:what part of the body makes platelets site:webmd.com
-- 150 QPS: 4.039792825576454 q:what does sneezing do to your body
-- 200 QPS: 4.074020318341173 q:healthy amount of tumeric to take each day
0.15892067013706618 ['title_idx^5.696436972732263', 'body_idx^4.0434406840900134'] title: b,k1: 1.4031600415454564,1.542822527584481,None-None body: b,k1: 0.346121573072542,1.0933878708496338,None-None
-- 50 QPS: 3.876821629995696 q:what does igneous rock mean
-- 100 QPS: 4.061879958763216 q:what part of the body makes platelets site:webmd.com
-- 150 QPS: 4.207438474253044 q:what does sneezing do to your body
-- 200 QPS: 4.2624516097776635 q:healthy amount of tumeric to take each day
0.008131805961009075 ['title_idx^5.932241212094312', 'body_idx^2.384149965379807'] title: b,k1: 0.16523956445732169,1.416476497785739,None-None body: b,k1: 1.6193356764029605,1.5759213015753595,None-None
-- 50 QPS: 4.0777892

In [None]:
# MRR - 
# 0.2251567139820729 ['title_idx^8.303587107121324', 'body_idx^3.4224600274312156'] title: b,k1: 0.7560176405958086,0.3356740179960797,None body: b,k1: 0.9876853925433389,0.5993344811416361,28.0
# 0.2147149704345346 ['title_idx^8.656527070785664', 'body_idx^1.5575073825064867'] title: b,k1: 1.6880219450871794,0.2341927148628216,None-None body: b,k1: 0.5521578239164557,0.9016764440121112,None-None
# 0.20722922540247896 ['title_idx^6.0928972119917955', 'body_idx^3.93316526148468'] title: b,k1: 0.14107783588920553,0.5186770272416928,None-None body: b,k1: 0.40062278905019477,0.41099829275872124,None-None
# 0.20508620955342508 ['title_idx^5.776155923373016', 'body_idx^9.576567654275728'] title: b,k1: 1.0202850798293444,0.9002844873475122,None-None body: b,k1: 0.6412991523249352,0.9996043235465262,None-None
# 0.23036763573501912 ['title_idx^4.548151177254303', 'body_idx^9.986898643553745']
#                      title: b,k1: 0.81487604301131,0.9259702176710924,None-None 
#                      body: b,k1: 0.24949060704340464,0.21097965680803105,None-None
# 0.2402296751510475 ['title_idx^7.619485345377434', 'body_idx^6.282527958035832'] 
#                     title: b,k1: 1.0140068707767622,1.053043194102012,None-None
#                     body: b,k1: 0.34379806577386884,0.114212811677032,None-None
# 0.23114537478017397 ['title_idx^5.629508463041821', 'body_idx^3.6128533287082476']
#                     title: b,k1: 0.8948383473299681,0.7316806082971699,None-None body: b,k1: 0.02519371512474855,0.8821762081227371,None-None

# Re-evaluate one fixed configuration: field boosts plus per-field BM25 (b, k1).
title_boost = 1.7072861134255923
body_boost = 9.467854536052052
b_title = 0.9054699170098017
k1_title = 1.3007711184048492
b_body = 1.2957539903585236
k1_body = 1.8407221594355443

bm25_similarity_title = bm25_similarity(b=b_title,
                                        k1=k1_title)
bm25_similarity_body = bm25_similarity(b=b_body,
                                       k1=k1_body)
fields = [f"title_idx^{title_boost}", f"body_idx^{body_boost}"]
similarities = [bm25_similarity_title, bm25_similarity_body]

# Pass the per-field similarities built above; passing default_bm25 here
# would silently ignore the b/k1 values set in this cell.
graded = run_all(msmarco, msmarco_judgments,
                 fields=fields,
                 n=100,
                 similarities=similarities)

queries_judged = judge_queries(graded)
# Same denominator as the random-search loop: the number of judged queries sampled.
mrr = queries_judged.sum() / len(msmarco_judgments)

In [None]:
# Show the MRR of the last run: summed per-query values divided by the number of sampled queries.
mrr

In [95]:
# Show the per-query values returned by judge_queries(graded).
queries_judged

query
 is a unit price item for                 0.000000
adjustable mortgage loans definition      0.000000
adult sore throat causes                  0.028571
air filter arrestance definition          0.038462
aleah name meaning                        1.000000
                                            ...   
who made the original monopoly game       0.000000
who manufactures vizio televisions        0.083333
who wrote hurts so bad ronstadt           1.000000
why is it good to have dystopian books    0.000000
why passport denied                       0.000000
Name: reciprical_rank, Length: 250, dtype: float64

In [97]:
# Inspect the graded rows (first 50) for a single query.
graded[graded['query'] == 'adjustable mortgage loans definition'].head(50)

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank,q0,grade,reciprical_rank
17100,.,D3491319,96.549774,adjustable mortgage loans definition,11653,0.070098,1,,0.0,0.0
17101,Definitions &Translations,D412648,94.490601,adjustable mortgage loans definition,11653,0.070098,2,,0.0,0.0
17102,Shared Flashcard Set,D2437325,93.984756,adjustable mortgage loans definition,11653,0.070098,3,,0.0,0.0
17103,eSSBASE,D660188,93.59166,adjustable mortgage loans definition,11653,0.070098,4,,0.0,0.0
17104,Definitions &Translations,D291685,93.118851,adjustable mortgage loans definition,11653,0.070098,5,,0.0,0.0
17105,definition,D2534010,92.898048,adjustable mortgage loans definition,11653,0.070098,6,,0.0,0.0
17106,Does Watching Television Cause Binge Eating?,D3090292,91.649757,adjustable mortgage loans definition,11653,0.070098,7,,0.0,0.0
17107,mortgagee,D2724946,90.544086,adjustable mortgage loans definition,11653,0.070098,8,,0.0,0.0
17108,The Difference Between Mortgagor & Mortgagee,D1712999,86.536993,adjustable mortgage loans definition,11653,0.070098,9,,0.0,0.0
17109,mortgagee,D1713001,80.501942,adjustable mortgage loans definition,11653,0.070098,10,,0.0,0.0


In [98]:
# Show the judgment rows recorded for query_id 11653.
msmarco_judgments[msmarco_judgments['query_id'] == 11653]

Unnamed: 0,query_id,q0,msmarco_id,grade,query
2995,11653,0,D578531,1,adjustable mortgage loans definition


In [99]:
# Show the corpus row for document D578531, including its indexed title and body terms.
msmarco[msmarco['msmarco_id'] == 'D578531']

Unnamed: 0,msmarco_id,url,title,body_idx,title_idx
1673472,D578531,https://en.wikipedia.org/wiki/Adjustable-rate_...,Adjustable-rate mortgage,"Terms({'interv', 'kingdom', 'state', 'retriev'...","Terms({'mortgag', 'adjustabler'})"


In [1]:
from msmarco.tokenizers import snowball_tokenizer

# Show how the snowball tokenizer splits a hyphenated title.
snowball_tokenizer('Adjustable-rate mortgage')

['adjust', 'rate', 'mortgag']