In [1]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter
from searcharray.similarity import bm25_similarity, classic_similarity, default_bm25

NUM_QUERIES = 250

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Will take a min or two to load, and a good hour to build an index first time
msmarco = indexed()

In [3]:
# Sanity check: score every document's body for the single term 'cheese' and
# show the scores in ascending order (the highest-scoring documents come last).
np.sort(msmarco['body_idx'].array.score('cheese'))

array([0.       , 0.       , 0.       , ..., 4.7565475, 5.0600863,
       8.568701 ], dtype=float32)

In [4]:
# %%prun

def or_query_search(corpus, fields, query, query_id=None, n=10,
                    max_posn18s=None,
                    min_posn18s=None,
                    similarities=default_bm25):
    """Score `query` as an OR of its terms over one or more fields; return the top `n` rows.

    Every token of the query is scored independently against each field and the
    (boosted) per-token scores are summed per document.

    Parameters
    ----------
    corpus : DataFrame-like
        Must provide 'title' and 'msmarco_id' columns plus one column per searched
        field whose `.array` exposes `tokenizer` and `score(...)`.
    fields : str or list of str
        Field column name(s). A field may carry a boost suffix, e.g. "title_idx^10".
    query : str
        Raw query text; tokenized with each field's own tokenizer.
    query_id : optional
        Copied verbatim into the 'query_id' column of the result.
    n : int
        Number of results wanted. Clamped to the corpus size.
    max_posn18s, min_posn18s : int, None, or list
        Per-field position limits. A non-None value is multiplied by 18 before
        being passed to `score` as `max_posn` (minus 1) / `min_posn`.
    similarities : callable or list of callables
        Similarity passed to `score`, one per field or a single one for all fields.

    Returns
    -------
    DataFrame with columns title, msmarco_id, scores, query, query_id, took, rank,
    sorted by descending score. `took` is the scoring time in seconds (top-n
    selection is not included).

    Raises
    ------
    AssertionError
        If a list-valued per-field argument does not have one entry per field.
    """
    if not isinstance(fields, list):
        fields = [fields]

    def per_field(value):
        # A list must already line up with `fields`; anything else is broadcast.
        if isinstance(value, list):
            assert len(value) == len(fields)
            return value
        return [value] * len(fields)

    similarities = per_field(similarities)
    max_posn18s = per_field(max_posn18s)
    min_posn18s = per_field(min_posn18s)

    start = perf_counter()
    scored = np.zeros(len(corpus))
    for field, similarity, max_posn18, min_posn18 in zip(fields, similarities, max_posn18s, min_posn18s):
        boost = 1.0
        if '^' in field:
            # Split on the last '^' only, so a field name containing '^' still parses.
            field, boost = field.rsplit('^', 1)
            boost = float(boost)
        field_array = corpus[field].array
        tokens = field_array.tokenizer(query)
        max_posn_arg = (max_posn18 * 18 - 1) if max_posn18 is not None else None
        min_posn_arg = (min_posn18 * 18) if min_posn18 is not None else None

        for token in tokens:
            token_score = field_array.score(token,
                                            max_posn=max_posn_arg,
                                            min_posn=min_posn_arg,
                                            similarity=similarity)
            scored += token_score * boost

    took = perf_counter() - start

    # np.argpartition requires kth < len(array); when the caller asks for at least
    # as many rows as the corpus holds, simply take every row.
    n = max(0, min(n, len(scored)))
    if n < len(scored):
        top_n_idx = np.argpartition(-scored, n)[:n]
    else:
        top_n_idx = np.arange(len(scored))

    # Explicit copy so the column assignments below never write into `corpus`.
    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx].copy()
    top_n['scores'] = scored[top_n_idx]
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n = top_n.sort_values('scores', ascending=False)
    # Rank by the number of rows actually returned, which may be fewer than requested.
    top_n['rank'] = np.arange(len(top_n)) + 1
    return top_n

# Smoke test: search only the title field, boosted by 10, and show the top 100 rows.
or_query_search(corpus=msmarco, fields=["title_idx^10"], query="Adjustable-rate mortgage", n=100)

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1782294,Adjustable Rate Mortgage,D578534,111.879284,Adjustable-rate mortgage,,0.02831,1
886385,Adjustable-rate mortgage,D1968890,111.879284,Adjustable-rate mortgage,,0.02831,2
1673472,Adjustable-rate mortgage,D578531,111.879284,Adjustable-rate mortgage,,0.02831,3
487631,adjustable rate mortgage,D578530,111.879284,Adjustable-rate mortgage,,0.02831,4
368379,Adjustable-Rate Mortgages,D1213211,111.879284,Adjustable-rate mortgage,,0.02831,5
...,...,...,...,...,...,...,...
2641962,Common Mortgage Insurance Premium Questions An...,D2106970,60.626610,Adjustable-rate mortgage,,0.02831,96
1057920,Mortgage Rates and Treasury Bonds,D2407381,59.265333,Adjustable-rate mortgage,,0.02831,97
2876082,Understanding Reverse Mortgage Interest Rates,D1912271,59.265333,Adjustable-rate mortgage,,0.02831,98
3058401,5-year Variable Mortgage Rates,D2906409,59.265333,Adjustable-rate mortgage,,0.02831,99


In [5]:
from msmarco.evaluate import grade_results, judgments

# Draw NUM_QUERIES judgment rows at random to evaluate against.
# NOTE(review): no random_state is passed here, so this sample differs on every run.
msmarco_judgments = judgments().sample(NUM_QUERIES)
msmarco_judgments

Unnamed: 0,query_id,q0,msmarco_id,grade,query
138495,471217,0,D156862,1,pain doctor in tracy ca
14461,48444,0,D3001663,1,baker definition
149582,505913,0,D810322,1,sweep in debit define
256674,813726,0,D108472,1,what is the current cost of regular stamps
221019,710530,0,D418734,1,what is an oral contract
...,...,...,...,...,...
296767,932702,0,D311383,1,what's the pin number?
28780,94115,0,D1131946,1,code officials conference of michigan
217287,699787,0,D2013091,1,what is a shooter marble
254737,807966,0,D607428,1,what is the benefit of branching in storage po...


In [167]:
import random

def random_probes(columns, num_probes=10, feature_ranges=None):
    """Generate `num_probes` random parameter settings, one row per probe.

    Parameters
    ----------
    columns : iterable of str
        Parameter names; each becomes a column of the result. Duplicate names
        are collapsed, keeping first-seen order.
    num_probes : int
        Number of rows to generate.
    feature_ranges : dict, optional
        Maps a parameter name to a (low, high) pair. Parameters listed here are
        drawn with random.uniform(low, high); all others are drawn as
        random.random() * 10.

    Returns
    -------
    DataFrame with one column per parameter. The columns are present even when
    `num_probes` is 0, so downstream code can still select them.
    """
    if feature_ranges is None:
        feature_ranges = {}
    columns = list(dict.fromkeys(columns))

    def draw(param):
        if param in feature_ranges:
            return random.uniform(feature_ranges[param][0],
                                  feature_ranges[param][1])
        return random.random() * 10

    probes = [{param: draw(param) for param in columns}
              for _ in range(num_probes)]
    # Passing `columns` keeps the schema stable when `probes` is empty.
    return pd.DataFrame(probes, columns=columns)

# The feature names are spelled out here rather than read from `x_train.columns`:
# `x_train` is only created in a later cell, so referencing it at this point
# fails on a fresh kernel (Restart & Run All).
probes = random_probes(columns=['title_boost', 'body_boost',
                                'k1_title', 'b_title',
                                'k1_body', 'b_body'],
                       feature_ranges={'k1_title': (0.1,3.0)})
probes

Unnamed: 0,title_boost,body_boost,k1_title,b_title,k1_body,b_body
0,1.229735,9.826749,0.953737,2.589154,0.033105,8.854818
1,2.744897,1.342824,0.494599,2.435563,2.797624,9.527796
2,6.897106,8.139808,0.138604,2.008488,0.050843,3.461254
3,6.32095,9.438639,0.786106,5.623721,9.452991,4.413656
4,2.057281,1.626731,2.80089,2.058843,5.962469,9.897089
5,0.711873,8.668215,0.350461,0.616672,5.891562,8.381044
6,6.221129,6.588693,1.066311,2.655972,7.898815,6.085177
7,0.102079,4.767183,0.912597,7.834181,0.412063,5.542749
8,1.556326,0.916064,2.466466,1.314735,0.733756,3.422824
9,4.724504,9.801562,1.003174,0.800306,4.877991,6.381691


In [168]:

def run_all(corpus, judgments, fields, max_posns=None, min_posns=None, similarities=default_bm25, n=10):
    """Run `or_query_search` for every judged query and grade the combined results.

    Parameters
    ----------
    corpus : DataFrame-like
        Indexed corpus, forwarded to `or_query_search`.
    judgments : DataFrame
        One row per judgment; the 'query' and 'query_id' columns are read.
    fields, similarities, n :
        Forwarded unchanged to `or_query_search`.
    max_posns, min_posns :
        Forwarded as `max_posn18s` / `min_posn18s`.

    Returns
    -------
    Whatever `grade_results(judgments, results)` returns for the concatenated
    top-n frames of all queries.
    """
    results = []
    start = perf_counter()
    for query_no, (_, row) in enumerate(judgments.iterrows()):
        top_n = or_query_search(corpus=corpus, fields=fields, similarities=similarities, n=n,
                                min_posn18s=min_posns,
                                max_posn18s=max_posns,
                                query=row['query'], query_id=row['query_id'])
        results.append(top_n)
        # Progress line every 50 queries.
        if query_no > 0 and query_no % 50 == 0:
            print(f"-- {query_no} QPS: {query_no / (perf_counter() - start)} q:{row['query']}")
    results = pd.concat(results)
    # Grade against the judgments that were actually searched, not the
    # notebook-level `msmarco_judgments` global.
    graded = grade_results(judgments, results)
    return graded

def run_with_params(params,
                    msmarco,
                    msmarco_judgments,
                    n=100):
    """Evaluate one parameter setting over all judged queries.

    Parameters
    ----------
    params : mapping (dict or DataFrame row)
        Must provide 'title_boost', 'body_boost', 'k1_title', 'b_title',
        'k1_body' and 'b_body'.
    msmarco : DataFrame-like
        Indexed corpus with 'title_idx' and 'body_idx' columns.
    msmarco_judgments : DataFrame
        Judgments to search and grade against.
    n : int
        Number of results retrieved per query.

    Returns
    -------
    (graded results from `run_all`, `params`). The params are returned alongside
    the results so a caller that ran several settings concurrently can pair
    each result with its setting.
    """
    bm25_similarity_title = bm25_similarity(b=params['b_title'],
                                            k1=params['k1_title'])
    bm25_similarity_body = bm25_similarity(b=params['b_body'],
                                           k1=params['k1_body'])
    fields = [f"title_idx^{params['title_boost']}", f"body_idx^{params['body_boost']}"]
    similarities = [bm25_similarity_title, bm25_similarity_body]

    # Honour the caller's `n` instead of a hard-coded 100.
    results = run_all(msmarco,
                      msmarco_judgments,
                      fields=fields,
                      n=n,
                      similarities=similarities)
    return results, params


## Random parameter search

Seed the Bayesian search with one or more randomly sampled parameter settings, each scored by MRR@100.

In [169]:
from random import uniform

def random_search(msmarco, msmarco_judgments,
                  features, feature_ranges=None, times=1):
    """Evaluate `times` randomly drawn parameter settings.

    Parameters
    ----------
    msmarco : DataFrame-like
        Indexed corpus, forwarded to `run_with_params`.
    msmarco_judgments : DataFrame
        Judgments to search and grade against.
    features : list of str
        Parameter names to draw and to report.
    feature_ranges : dict, optional
        Per-parameter (low, high) ranges, forwarded to `random_probes`.
    times : int
        Number of random settings to evaluate.

    Returns
    -------
    DataFrame with one row per setting: the feature values plus 'mrr100',
    computed as the sum of `judge_queries(graded)` divided by the number of
    judgment rows.
    """
    if feature_ranges is None:
        feature_ranges = {}

    probes = random_probes(features, feature_ranges=feature_ranges, num_probes=times)
    print(probes)

    print(f"Random search over {len(probes)}")
    results = []
    for _, probe in probes.iterrows():
        graded, used_params = run_with_params(probe, msmarco, msmarco_judgments,
                                              n=100)
        queries_judged = judge_queries(graded)
        mrr = queries_judged.sum() / len(msmarco_judgments)

        result_dict = {param_name: used_params[param_name] for param_name in features}
        result_dict['mrr100'] = mrr
        results.append(result_dict)
        print(result_dict)

    return pd.DataFrame(results)

# Re-sample a smaller set of 100 judgment rows; this replaces the earlier
# `msmarco_judgments` sample.
# NOTE(review): unseeded, so the sample (and the resulting MRR) varies between runs.
msmarco_judgments = judgments().sample(100)
# `times` is left at its default of 1, so a single random setting is evaluated.
initial_probes = random_search(msmarco, msmarco_judgments,
                               features=['title_boost', 'body_boost',
                                         'k1_title', 'b_title',
                                         'k1_body', 'b_body'])

   title_boost  body_boost  k1_title   b_title   k1_body    b_body
0     0.944797    8.827906  7.481235  8.017114  8.993549  3.648977
Random search over 1
-- 50 QPS: 5.449955229556461 q:percentage of americans are hoarders
{'title_boost': 0.9447970299202191, 'body_boost': 8.827905887617753, 'k1_title': 7.481235396196984, 'b_title': 8.017114417573405, 'k1_body': 8.993549027662771, 'b_body': 3.648977335859639, 'mrr100': 0.0}


In [170]:
from sklearn.gaussian_process import GaussianProcessRegressor
import pandas as pd

# Target: the MRR measured for each evaluated setting.
y_train = initial_probes['mrr100']
# Inputs: every remaining column, i.e. the parameter values.
x_train = initial_probes.drop('mrr100', axis=1)


# Fit a Gaussian process (default settings) mapping parameters -> MRR.
gpr = GaussianProcessRegressor()
gpr.fit(x_train.to_numpy(), y_train.to_numpy())

In [171]:
x_train

Unnamed: 0,title_boost,body_boost,k1_title,b_title,k1_body,b_body
0,0.944797,8.827906,7.481235,8.017114,8.993549,3.648977


### Score probes

In [172]:
probes

Unnamed: 0,title_boost,body_boost,k1_title,b_title,k1_body,b_body
0,1.229735,9.826749,0.953737,2.589154,0.033105,8.854818
1,2.744897,1.342824,0.494599,2.435563,2.797624,9.527796
2,6.897106,8.139808,0.138604,2.008488,0.050843,3.461254
3,6.32095,9.438639,0.786106,5.623721,9.452991,4.413656
4,2.057281,1.626731,2.80089,2.058843,5.962469,9.897089
5,0.711873,8.668215,0.350461,0.616672,5.891562,8.381044
6,6.221129,6.588693,1.066311,2.655972,7.898815,6.085177
7,0.102079,4.767183,0.912597,7.834181,0.412063,5.542749
8,1.556326,0.916064,2.466466,1.314735,0.733756,3.422824
9,4.724504,9.801562,1.003174,0.800306,4.877991,6.381691


In [173]:
from scipy.stats import norm

def score_probes(gpr, probes, best_mrr, theta):
    """Rank candidate settings by their probability of beating `best_mrr`.

    Parameters
    ----------
    gpr : fitted regressor
        Must support `predict(X, return_std=True)` returning (mean, std).
    probes : DataFrame
        One row per candidate. All columns except the four derived ones written
        by this function are treated as model features. The frame is modified in
        place: 'prediction', 'std_dev', 'opportunity' and 'prob_of_improvement'
        are added or overwritten.
    best_mrr : float
        Best score observed so far.
    theta : float
        Margin subtracted from the predicted improvement.

    Returns
    -------
    A copy of `probes` sorted by descending 'prob_of_improvement', where
    prob_of_improvement = norm.cdf((prediction - best_mrr - theta) / std_dev).
    """
    derived = ("prediction", "std_dev", "opportunity", "prob_of_improvement")
    # Only feed the model its features, so a frame that has already been scored
    # once can be scored again without its derived columns leaking into `predict`.
    feature_cols = [col for col in probes.columns if col not in derived]
    predictions, std_devs = gpr.predict(probes[feature_cols].to_numpy(),
                                        return_std=True)

    probes["prediction"] = predictions
    probes["std_dev"] = std_devs

    probes['opportunity'] = probes['prediction'] - best_mrr - theta

    opportunity = probes['opportunity'].to_numpy(dtype=float)
    std_dev = probes['std_dev'].to_numpy(dtype=float)
    # A zero std_dev yields +/-inf (cdf -> 1 or 0), which is the intended limit;
    # silence the floating-point warnings for that case.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = opportunity / std_dev
    prob = np.asarray(norm.cdf(z_scores), dtype=float)
    # 0/0: the model is certain and predicts exactly best_mrr + theta, i.e. no
    # strict improvement. Report 0 instead of NaN.
    prob[(std_dev == 0) & (opportunity == 0)] = 0.0
    probes['prob_of_improvement'] = prob

    return probes.sort_values('prob_of_improvement', ascending=False)

# score_probes writes its derived columns into the frame it is given, so pass a
# copy to keep the displayed `probes` frame unchanged.
score_probes(gpr, probes.copy(), best_mrr=1.0, theta=0.9)



Unnamed: 0,title_boost,body_boost,k1_title,b_title,k1_body,b_body,prediction,std_dev,opportunity,prob_of_improvement
0,1.229735,9.826749,0.953737,2.589154,0.033105,8.854818,0.0,1.0,-1.9,0.028717
1,2.744897,1.342824,0.494599,2.435563,2.797624,9.527796,0.0,1.0,-1.9,0.028717
2,6.897106,8.139808,0.138604,2.008488,0.050843,3.461254,0.0,1.0,-1.9,0.028717
3,6.32095,9.438639,0.786106,5.623721,9.452991,4.413656,0.0,1.0,-1.9,0.028717
4,2.057281,1.626731,2.80089,2.058843,5.962469,9.897089,0.0,1.0,-1.9,0.028717
5,0.711873,8.668215,0.350461,0.616672,5.891562,8.381044,0.0,1.0,-1.9,0.028717
6,6.221129,6.588693,1.066311,2.655972,7.898815,6.085177,0.0,1.0,-1.9,0.028717
7,0.102079,4.767183,0.912597,7.834181,0.412063,5.542749,0.0,1.0,-1.9,0.028717
8,1.556326,0.916064,2.466466,1.314735,0.733756,3.422824,0.0,1.0,-1.9,0.028717
9,4.724504,9.801562,1.003174,0.800306,4.877991,6.381691,0.0,1.0,-1.9,0.028717


### Bayesian search loop

In [179]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def search_w_prob_params(probes, msmarco_judgments, max_workers=8, corpus=None):
    """Evaluate every row of `probes` concurrently and collect the measured MRR.

    Parameters
    ----------
    probes : DataFrame
        One parameter setting per row; each row is handed to `run_with_params`.
    msmarco_judgments : DataFrame
        Judgments to search and grade against.
    max_workers : int
        Size of the thread pool.
    corpus : DataFrame-like, optional
        Indexed corpus to search. Defaults to the notebook-level `msmarco`.

    Returns
    -------
    DataFrame with one row per probe, in completion order: 'mrr100' plus every
    entry of the probe row. 'mrr100' is the sum of `judge_queries(graded)`
    divided by the number of judgment rows.
    """
    if corpus is None:
        corpus = msmarco

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(run_with_params, row, corpus,
                                   msmarco_judgments, 100)
                   for _, row in probes.iterrows()]

        for future in as_completed(futures):
            graded, params = future.result()
            queries_judged = judge_queries(graded)
            mrr = queries_judged.sum() / len(msmarco_judgments)

            result_dict = {'mrr100': mrr}
            result_dict.update(params.items())
            results.append(result_dict)
            print(result_dict['mrr100'], result_dict)
    return pd.DataFrame(results)


In [None]:
# Seed used as random_state when sampling judgments for the Bayesian search run.
SEED=100
# Number of judgment rows sampled for this run (overrides the 250 set in the first cell).
NUM_QUERIES=1000
# Checkpoint file for evaluated probes; keyed by query count and seed so
# differently-configured runs do not share a file.
PICKLE_PATH = f"/tmp/msmarco_bayes_probes_{NUM_QUERIES}_{SEED}.pkl"

def bayesian_search(initial_probes,
                    msmarco_judgments,
                    feature_ranges,
                    theta=1.2,
                    rounds=10,
                    num_probes=4,
                    num_candidates=100,
                    pickle_path=None):
    """Iteratively propose and evaluate parameter settings guided by a Gaussian process.

    Each round fits a GaussianProcessRegressor on all settings evaluated so far,
    draws `num_candidates` random candidates, keeps the `num_probes` with the
    highest probability of improvement, evaluates those for real, and appends
    the outcomes to the training data.

    Parameters
    ----------
    initial_probes : DataFrame
        Already-evaluated settings: feature columns plus 'mrr100'.
    msmarco_judgments : DataFrame
        Judgments to search and grade against.
    feature_ranges : dict
        Per-parameter (low, high) ranges, forwarded to `random_probes`.
    theta : float
        Margin forwarded to `score_probes`.
    rounds : int
        Number of fit / propose / evaluate iterations.
    num_probes : int
        Candidates evaluated per round.
    num_candidates : int
        Random candidates scored by the model per round.
    pickle_path : str, optional
        Where to checkpoint the probes after each round. Defaults to the
        notebook-level PICKLE_PATH.

    Returns
    -------
    DataFrame of every evaluated setting (initial and new), with the same
    columns as `initial_probes`.
    """
    if pickle_path is None:
        pickle_path = PICKLE_PATH

    probes = initial_probes.copy()
    initial_cols = initial_probes.columns
    features = initial_probes.drop('mrr100', axis=1).columns
    print(features)
    for _ in range(rounds):
        x_train = probes[features]
        y_train = probes['mrr100']

        gpr = GaussianProcessRegressor()
        gpr.fit(x_train.to_numpy(), y_train.to_numpy())

        candidates = random_probes(features,
                                   num_probes=num_candidates,
                                   feature_ranges=feature_ranges)
        candidates = score_probes(gpr, candidates,
                                  best_mrr=probes['mrr100'].max(),
                                  theta=theta)
        # score_probes returns the candidates sorted best-first.
        best_candidates = candidates[:num_probes]
        print(best_candidates)
        evaluated = search_w_prob_params(best_candidates, msmarco_judgments)

        # Keep only the original columns (drops the scoring columns) and give
        # the combined frame unique index labels.
        probes = pd.concat([evaluated, probes], ignore_index=True)
        probes = probes[initial_cols]
        print(len(probes))
        print("CURRENT RESULTS")
        print(probes.sort_values('mrr100', ascending=False))
        # Checkpoint after every round so an interrupted run can resume.
        probes.to_pickle(pickle_path)
    return probes

# Resume from a previous run's checkpoint when one exists.
# NOTE(review): pd.read_pickle can execute arbitrary code from the file, and
# PICKLE_PATH is a predictable name under /tmp. Only load checkpoints you wrote
# yourself; consider a private directory instead.
probes = None
try:
    probes = pd.read_pickle(PICKLE_PATH)
    print("Loaded ", PICKLE_PATH)
    print(probes)
except FileNotFoundError:
    print("No probes file found, starting with random search")

# Sample NUM_QUERIES judgment rows, seeded so the run is repeatable.
msmarco_judgments = judgments().sample(NUM_QUERIES, random_state=SEED)
print("Random search")
feature_ranges={'k1_title': (0.1,3.0),
            'b_title':  (0.1,3.0),
            'k1_body': (0.1,3.0),
            'b_body':  (0.1,3.0),
            }
# A fresh batch of random settings is evaluated even when a checkpoint was loaded.
initial_probes = random_search(msmarco,
                               msmarco_judgments,
                               features=['title_boost', 'body_boost',
                                         'k1_title', 'b_title',
                                         'k1_body', 'b_body'],
                               feature_ranges=feature_ranges,
                               times=5)

print(initial_probes)

if probes is not None:
    # ignore_index gives the merged frame unique row labels.
    probes = pd.concat([probes, initial_probes], ignore_index=True)
else:
    probes = initial_probes

bayesian_search(probes,
                msmarco_judgments,
                feature_ranges=feature_ranges)

No probes file found, starting with random search
Random search
   title_boost  body_boost  k1_title   b_title   k1_body    b_body
0     8.340754    9.351995  2.444928  2.744155  1.086662  2.022452
1     8.958762    4.560379  0.810099  1.107320  2.799882  2.638216
2     4.655410    5.702038  1.735131  1.616303  0.797276  2.248234
3     7.365682    5.339424  0.309116  1.260438  2.076064  2.511225
4     8.390986    3.787052  0.693142  1.532799  1.508777  1.895207
Random search over 5
-- 50 QPS: 5.406344502057192 q:implant cost per tooth
-- 100 QPS: 5.434331827326387 q:how many expresso beans equals one shot of expeesso
-- 150 QPS: 5.5244640338196715 q:how long will a judgement stay on my credit report
-- 200 QPS: 5.514147796518769 q:what is the minimum credit hours i can take with my pell grant
-- 250 QPS: 5.513613588878563 q:what are argentina's cowboys called?
-- 300 QPS: 5.531970366952632 q:symptoms of a vaginal yeast infection
-- 350 QPS: 5.527894505200648 q:is rheumatoid arthritis s