In [1]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter
from searcharray.similarity import bm25_similarity, classic_similarity, default_bm25

NUM_QUERIES = 250

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Loading takes a minute or two once the index exists; the very first run
# also has to build the index, which takes about an hour.
msmarco = indexed()

In [3]:
# Sanity check: score every row's `body_idx` field for the single term
# 'cheese' and display the scores in ascending order.
np.sort(msmarco['body_idx'].array.score('cheese'))

array([0.       , 0.       , 0.       , ..., 4.7565475, 5.0600863,
       8.568701 ], dtype=float32)

In [4]:
# %%prun

def or_query_search(corpus, fields, query, query_id=None, n=10,
                    max_posn18s=None,
                    min_posn18s=None,
                    similarities=default_bm25):
    """Score `query` against `fields` as an OR of its terms and return the top `n` rows.

    Every query token is scored independently against every field and the
    per-token scores are summed (weighted by the field boost), so a row
    matching any term can be returned.

    Parameters
    ----------
    corpus : DataFrame
        Must contain 'title' and 'msmarco_id' columns plus one indexed column
        per entry in `fields`; each indexed column's `.array` must expose
        `tokenizer` and `score(token, max_posn=, min_posn=, similarity=)`.
    fields : str or list of str
        Column name(s) to search. A name may carry a boost suffix,
        e.g. ``"title_idx^10"`` multiplies that field's scores by 10.
    query : str
        Raw query text; tokenized with each field's own tokenizer.
    query_id : optional
        Copied verbatim into the 'query_id' column of the result.
    n : int
        Maximum number of rows to return. Clamped to ``len(corpus)``.
    max_posn18s, min_posn18s : int, None, or list thereof
        Position limits in units of 18 token positions: ``max_posn18 * 18 - 1``
        and ``min_posn18 * 18`` are forwarded as `max_posn` / `min_posn`.
        A scalar applies to every field; a list must match `fields` in length.
    similarities : callable or list of callables
        Similarity passed to `score`; scalar or one per field.

    Returns
    -------
    DataFrame
        Columns: title, msmarco_id, scores, query, query_id, took, rank.
        Sorted by descending score; `rank` starts at 1. `took` is the scoring
        time in seconds (top-n selection is not included).
    """
    if not isinstance(fields, list):
        fields = [fields]

    def _per_field(value, name):
        # Broadcast a scalar setting to every field; validate explicit lists.
        if isinstance(value, list):
            assert len(value) == len(fields), \
                f"{name} must have one entry per field"
            return value
        return [value] * len(fields)

    similarities = _per_field(similarities, "similarities")
    max_posn18s = _per_field(max_posn18s, "max_posn18s")
    min_posn18s = _per_field(min_posn18s, "min_posn18s")

    start = perf_counter()
    scored = np.zeros(len(corpus))
    for field, similarity, max_posn18, min_posn18 in zip(fields, similarities, max_posn18s, min_posn18s):
        boost = 1.0
        if '^' in field:
            field, boost = field.split('^')
            boost = float(boost)
        # Hoisted: the field's index array does not change between tokens.
        field_arr = corpus[field].array
        tokens = field_arr.tokenizer(query)
        max_posn_arg = (max_posn18 * 18 - 1) if max_posn18 is not None else None
        min_posn_arg = (min_posn18 * 18) if min_posn18 is not None else None

        for token in tokens:
            token_score = field_arr.score(token,
                                          max_posn=max_posn_arg,
                                          min_posn=min_posn_arg,
                                          similarity=similarity)
            scored += token_score * boost

    took = perf_counter() - start

    # np.argpartition requires kth < len(array); asking for at least as many
    # rows as the corpus holds simply means "return everything".
    n = min(n, len(scored))
    if n < len(scored):
        top_n_idx = np.argpartition(-scored, n)[:n]
    else:
        top_n_idx = np.arange(len(scored))
    scores = scored[top_n_idx]

    # Explicit copy so the column assignments below never touch `corpus`.
    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx].copy()
    top_n['scores'] = scores
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n = top_n.sort_values('scores', ascending=False)
    # Sized from the actual result so a clamped n cannot cause a length mismatch.
    top_n['rank'] = np.arange(len(top_n)) + 1
    return top_n

# Example query: search only the title index, boosted 10x, and ask for the top 100 rows.
or_query_search(corpus=msmarco, fields=["title_idx^10"], query="Adjustable-rate mortgage", n=100)

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1782294,Adjustable Rate Mortgage,D578534,111.879284,Adjustable-rate mortgage,,0.02831,1
886385,Adjustable-rate mortgage,D1968890,111.879284,Adjustable-rate mortgage,,0.02831,2
1673472,Adjustable-rate mortgage,D578531,111.879284,Adjustable-rate mortgage,,0.02831,3
487631,adjustable rate mortgage,D578530,111.879284,Adjustable-rate mortgage,,0.02831,4
368379,Adjustable-Rate Mortgages,D1213211,111.879284,Adjustable-rate mortgage,,0.02831,5
...,...,...,...,...,...,...,...
2641962,Common Mortgage Insurance Premium Questions An...,D2106970,60.626610,Adjustable-rate mortgage,,0.02831,96
1057920,Mortgage Rates and Treasury Bonds,D2407381,59.265333,Adjustable-rate mortgage,,0.02831,97
2876082,Understanding Reverse Mortgage Interest Rates,D1912271,59.265333,Adjustable-rate mortgage,,0.02831,98
3058401,5-year Variable Mortgage Rates,D2906409,59.265333,Adjustable-rate mortgage,,0.02831,99


In [5]:
from msmarco.evaluate import grade_results, judgments

# Sample a fixed-size evaluation set. The seed is pinned so that a kernel
# restart evaluates the same queries and MRR values stay comparable across runs.
msmarco_judgments = judgments().sample(NUM_QUERIES, random_state=42)
msmarco_judgments

Unnamed: 0,query_id,q0,msmarco_id,grade,query
138495,471217,0,D156862,1,pain doctor in tracy ca
14461,48444,0,D3001663,1,baker definition
149582,505913,0,D810322,1,sweep in debit define
256674,813726,0,D108472,1,what is the current cost of regular stamps
221019,710530,0,D418734,1,what is an oral contract
...,...,...,...,...,...
296767,932702,0,D311383,1,what's the pin number?
28780,94115,0,D1131946,1,code officials conference of michigan
217287,699787,0,D2013091,1,what is a shooter marble
254737,807966,0,D607428,1,what is the benefit of branching in storage po...


In [6]:

def run_all(corpus, judgments, fields, max_posns=None, min_posns=None, similarities=default_bm25, n=10):
    """Run every judged query through `or_query_search` and grade the results.

    Parameters
    ----------
    corpus : DataFrame
        Corpus handed to `or_query_search`.
    judgments : DataFrame
        One row per query; must provide 'query' and 'query_id' columns. The
        same frame is passed to `grade_results`.
    fields, similarities, n
        Forwarded unchanged to `or_query_search`.
    max_posns, min_posns
        Forwarded as `max_posn18s` / `min_posn18s`.

    Returns
    -------
    Whatever `grade_results(judgments, results)` returns for the concatenated
    per-query result frames.
    """
    results = []
    start = perf_counter()
    for query_no, (_, row) in enumerate(judgments.iterrows()):
        # Use the arguments, not notebook globals, so the function works with
        # whatever corpus / judgments the caller supplies.
        top_n = or_query_search(corpus=corpus, fields=fields, similarities=similarities, n=n,
                                min_posn18s=min_posns,
                                max_posn18s=max_posns,
                                query=row['query'], query_id=row['query_id'])
        results.append(top_n)
        # Progress line every 50 queries.
        if query_no > 0 and query_no % 50 == 0:
            print(f"-- {query_no} QPS: {query_no / (perf_counter() - start)} q:{row['query']}")
    results = pd.concat(results)
    graded = grade_results(judgments, results)
    return graded

## Random parameter search

Seed the Bayesian search with an initial batch of randomly sampled parameter settings.

In [27]:
from random import uniform

def random_search(times=10):
    """Evaluate `times` randomly drawn parameter settings.

    Each trial draws a boost in [0, 10] for the title and body fields, plus a
    BM25 `b` in [0, 2] and `k1` in [0.1, 2] for each field, runs the judged
    queries with n=100, and records the resulting score as 'mrr100'.

    Returns a DataFrame with one row per trial: 'mrr100' followed by the six
    sampled parameters.
    """
    trials = []

    for _ in range(times):
        # Dict entries are evaluated top to bottom, which fixes the order in
        # which values are pulled from the random stream.
        params = {
            "title_boost": uniform(0, 10),
            "body_boost": uniform(0, 10),
            "b_title": uniform(0, 2),
            "k1_title": uniform(0.1, 2),
            "b_body": uniform(0, 2),
            "k1_body": uniform(0.1, 2),
        }

        title_similarity = bm25_similarity(b=params["b_title"],
                                           k1=params["k1_title"])
        body_similarity = bm25_similarity(b=params["b_body"],
                                          k1=params["k1_body"])

        graded = run_all(
            msmarco,
            msmarco_judgments,
            fields=[f"title_idx^{params['title_boost']}",
                    f"body_idx^{params['body_boost']}"],
            n=100,
            similarities=[title_similarity, body_similarity],
        )

        # Sum of the per-query values divided by the number of sampled judgments.
        mrr = judge_queries(graded).sum() / len(msmarco_judgments)
        trials.append({"mrr100": mrr, **params})

    return pd.DataFrame(trials)

# Evaluate the default 10 random settings; these rows later serve as the
# training data for the Gaussian process.
initial_probes = random_search()

-- 50 QPS: 4.708193835941206 q:what kind of anesthetic did micheal jackson o.d. on
-- 100 QPS: 4.870967205839954 q:kohl's corporate headquarters customer service
-- 150 QPS: 4.9814547496730395 q:what is integral calculus
-- 200 QPS: 4.952206829562804 q:organizations definition sociology
-- 50 QPS: 4.90705963637589 q:what kind of anesthetic did micheal jackson o.d. on
-- 100 QPS: 5.102055523782536 q:kohl's corporate headquarters customer service
-- 150 QPS: 5.154873787026692 q:what is integral calculus
-- 200 QPS: 5.133572882045559 q:organizations definition sociology
-- 50 QPS: 4.987729375230765 q:what kind of anesthetic did micheal jackson o.d. on
-- 100 QPS: 5.168737907010805 q:kohl's corporate headquarters customer service
-- 150 QPS: 5.214755577877904 q:what is integral calculus
-- 200 QPS: 5.174968515572395 q:organizations definition sociology
-- 50 QPS: 4.947021461160821 q:what kind of anesthetic did micheal jackson o.d. on
-- 100 QPS: 5.120831481189596 q:kohl's corporate headqua

In [35]:
from sklearn.gaussian_process import GaussianProcessRegressor
import pandas as pd

# Target: the measured mrr100 of each random-search trial.
y_train = initial_probes['mrr100']
# Features: every other column, i.e. the sampled boosts and BM25 b / k1 values.
x_train = initial_probes.drop('mrr100', axis=1)


# Fit a Gaussian process (constructed with its defaults) on plain ndarrays.
gpr = GaussianProcessRegressor()
gpr.fit(x_train.to_numpy(), y_train.to_numpy())

In [36]:
x_train

Unnamed: 0,title_boost,body_boost,b_title,k1_title,b_body,k1_body
0,1.5089,3.025983,0.983151,1.474616,0.344192,1.340275
1,3.718998,2.718191,1.775831,0.921365,0.302024,1.570309
2,1.749108,9.756847,0.771192,1.881868,1.093976,1.557335
3,8.969764,6.855932,1.319435,0.237071,1.317911,1.596215
4,8.88369,8.03206,1.102976,0.828003,0.56487,1.362288
5,6.833801,1.451718,0.474847,1.039707,0.059683,0.77969
6,9.721925,2.604217,0.568969,0.385284,1.69986,0.895972
7,2.433199,7.07062,1.895361,0.17916,1.14243,1.595012
8,1.631043,4.465961,0.538408,0.794882,0.848509,1.057414
9,2.155212,4.039521,0.26091,1.40434,0.104368,1.703862


### Generate 100 probes

In [50]:
import random

def random_probes(columns, num_probes=100):
    """Generate `num_probes` random candidate parameter settings.

    Parameters
    ----------
    columns : iterable of str
        Parameter names; each becomes a column of the result.
    num_probes : int
        Number of candidate rows to generate (default 100).

    Returns
    -------
    DataFrame
        `num_probes` rows, one column per name in `columns`, every value drawn
        uniformly from [0, 10).
        NOTE(review): all parameters share the [0, 10) range, whereas
        `random_search` draws b from [0, 2] and k1 from [0.1, 2]; confirm
        that the wider range is intended.
    """
    columns = list(columns)
    # Honour `num_probes` rather than a hard-coded 100.
    probes = [
        {param: random.random() * 10 for param in columns}
        for _ in range(num_probes)
    ]
    # Passing `columns` keeps the schema intact even when no rows are generated.
    return pd.DataFrame(probes, columns=columns)

# Draw the default number of candidate settings, one column per training feature.
probes = random_probes(x_train.columns)
probes

Unnamed: 0,title_boost,body_boost,b_title,k1_title,b_body,k1_body
0,3.339401,0.067110,7.005538,7.190855,1.348288,1.546968
1,8.281183,3.610244,6.374368,7.261583,6.310868,1.531036
2,4.631252,8.137012,1.862359,5.661093,0.020002,3.101658
3,3.118723,9.632319,5.146642,9.977634,5.232420,9.314831
4,9.332810,3.947002,8.994759,6.111871,7.247212,7.840062
...,...,...,...,...,...,...
95,4.802095,0.165380,1.327447,4.379010,1.436073,5.242673
96,7.490920,2.593290,4.523872,5.879850,9.437544,0.915439
97,6.038166,4.967975,6.069409,7.929928,0.014055,2.141731
98,0.874957,5.133061,1.094922,7.347273,3.034614,4.307133


### Score probes

In [42]:
probes

Unnamed: 0,title_boost,body_boost,b_title,k1_title,b_body,k1_body
0,1.182575,0.063145,1.223727,5.631537,1.714028,3.691688
1,5.210438,3.337481,7.024862,5.418661,3.827144,5.911389
2,9.365677,7.733257,4.108540,2.099589,9.086515,2.321197
3,3.900006,5.761428,2.593221,3.242471,1.475084,0.800543
4,2.329488,0.153928,3.322898,5.947825,9.427683,2.728698
...,...,...,...,...,...,...
95,2.658741,6.533579,2.606372,4.179032,3.913661,8.549627
96,1.383045,9.607653,7.924148,5.631176,4.847452,0.335895
97,3.404830,7.032024,7.218068,8.170625,6.485972,5.466261
98,5.863768,0.194723,3.093331,6.171504,9.501430,2.138594


In [60]:
from scipy.stats import norm

def score_probes(gpr, probes, best_mrr, theta=0.9):
    """Rank candidate probes by their probability of improving on `best_mrr`.

    Parameters
    ----------
    gpr : fitted regressor
        Must support ``predict(X, return_std=True)`` returning
        ``(mean, std)`` arrays.
    probes : DataFrame
        One candidate per row; all columns are passed to the regressor as
        features. The frame is not modified.
    best_mrr : float
        Best score observed so far.
    theta : float
        Margin subtracted from the predicted improvement before computing the
        probability (default 0.9, the value previously hard-coded).
        NOTE(review): with scores on the scale shown in this notebook
        (about 0.1), a margin of 0.9 dominates the prediction, so every probe
        gets nearly the same probability; consider passing a much smaller value.

    Returns
    -------
    DataFrame
        A copy of `probes` with 'prediction', 'std_dev', 'opportunity' and
        'prob_of_improvement' columns added, sorted by descending
        'prob_of_improvement'.
    """
    # Predict on a bare ndarray so the input type matches a regressor that was
    # fit on ndarrays (avoids the feature-name mismatch warning).
    predictions, std_devs = gpr.predict(probes.to_numpy(), return_std=True)

    # Work on a copy so the caller's frame is left untouched.
    scored = probes.copy()
    scored["prediction"] = predictions
    scored["std_dev"] = std_devs

    scored['opportunity'] = scored['prediction'] - best_mrr - theta
    scored['prob_of_improvement'] = norm.cdf(scored['opportunity'] / scored['std_dev'])

    return scored.sort_values('prob_of_improvement', ascending=False)

# Score a copy of the probes against a best score of 1.0 and display the ranking.
score_probes(gpr, probes.copy(), 1.0)



Unnamed: 0,title_boost,body_boost,b_title,k1_title,b_body,k1_body,prediction,std_dev,opportunity,prob_of_improvement
22,0.950041,6.899567,4.265089,0.476040,0.916907,0.467657,1.395383e-03,0.999952,-1.898605,0.028802
8,3.639900,9.498604,3.624746,2.224948,0.670303,2.396666,2.378478e-04,0.999998,-1.899762,0.028732
5,4.746352,3.022795,1.675515,0.661855,0.144786,5.315071,6.013158e-05,1.000000,-1.899940,0.028720
39,5.158588,6.460012,3.249052,2.112647,3.990313,1.073688,2.743960e-06,1.000000,-1.899997,0.028717
74,4.450672,7.245138,2.567452,2.983035,1.075761,5.850741,3.472225e-08,1.000000,-1.900000,0.028717
...,...,...,...,...,...,...,...,...,...,...
36,8.152942,4.500614,7.979392,8.014107,9.714303,4.070044,1.883478e-42,1.000000,-1.900000,0.028717
35,1.300267,7.790530,6.221722,5.874095,5.783709,8.135010,1.608232e-26,1.000000,-1.900000,0.028717
33,2.178062,3.741472,7.834258,5.048637,6.564873,8.574381,4.277616e-33,1.000000,-1.900000,0.028717
32,8.908587,0.714399,3.643433,5.184738,7.643956,6.312056,1.363155e-23,1.000000,-1.900000,0.028717


### Bayesian search loop

In [None]:
def search_w_prob_params(probes):
    """Evaluate each probe row against the judged queries.

    Every row must supply 'title_boost', 'body_boost', 'b_title', 'k1_title',
    'b_body' and 'k1_body'. For each row the judged queries are run with
    n=100 and the resulting score is stored as 'mrr100'.

    Returns a DataFrame with one row per probe: 'mrr100' followed by every
    column of the input row. Each result is also printed as it completes.
    """
    def _evaluate(row):
        # Build one BM25 similarity per field from the row's b / k1 values.
        title_similarity = bm25_similarity(b=row['b_title'],
                                           k1=row['k1_title'])
        body_similarity = bm25_similarity(b=row['b_body'],
                                          k1=row['k1_body'])
        graded = run_all(
            msmarco,
            msmarco_judgments,
            fields=[f"title_idx^{row['title_boost']}",
                    f"body_idx^{row['body_boost']}"],
            n=100,
            similarities=[title_similarity, body_similarity],
        )
        return judge_queries(graded).sum() / len(msmarco_judgments)

    evaluated = []
    for _, row in probes.iterrows():
        result_dict = {'mrr100': _evaluate(row)}
        # Carry every input column through alongside the measured score.
        result_dict.update(row.items())
        evaluated.append(result_dict)
        print(result_dict['mrr100'], result_dict)
    return pd.DataFrame(evaluated)


def bayesian_search(initial_probes, rounds=10):
    """Iteratively refine parameters using a Gaussian-process surrogate.

    Each round fits a regressor on all evaluated probes, generates random
    candidates, ranks them with `score_probes`, evaluates the two
    highest-ranked candidates with `search_w_prob_params`, and adds those
    results to the evaluated set.

    Parameters
    ----------
    initial_probes : DataFrame
        Already evaluated settings; must contain an 'mrr100' column. All other
        columns are used as features. Not modified.
    rounds : int
        Number of fit / propose / evaluate iterations.

    Returns
    -------
    DataFrame
        Every evaluated probe (initial rows plus those added each round).
        Columns added by `score_probes` are NaN for the initial rows.
    """
    probes = initial_probes.copy()
    features = initial_probes.drop('mrr100', axis=1).columns
    print(features)
    for _ in range(rounds):
        y_train = probes['mrr100']
        # Restrict to the original feature columns: rows added in earlier
        # rounds also carry the scoring columns, which must not be features.
        x_train = probes[features]

        # NOTE(review): constructed with defaults, as before; the displayed
        # predictions sit near 0 with std near 1, so kernel / normalize_y
        # settings may be worth revisiting.
        gpr = GaussianProcessRegressor()
        gpr.fit(x_train.to_numpy(), y_train.to_numpy())

        new_probes = random_probes(features)
        new_probes = score_probes(gpr, new_probes, best_mrr=probes['mrr100'].max())
        # Only the two most promising candidates are actually evaluated.
        new_probes = search_w_prob_params(new_probes[:2])

        probes = pd.concat([new_probes, probes])
        # .iloc[0] selects the best row; plain [0] would be a column lookup
        # and raise KeyError.
        print(probes.sort_values('mrr100', ascending=False).iloc[0])
    return probes

# Run the search loop with the default 10 rounds, starting from the random-search results.
bayesian_search(initial_probes)


Index(['title_boost', 'body_boost', 'b_title', 'k1_title', 'b_body',
       'k1_body'],
      dtype='object')




-- 50 QPS: 5.26689369356321 q:what kind of anesthetic did micheal jackson o.d. on
-- 100 QPS: 5.326570340944882 q:kohl's corporate headquarters customer service
-- 150 QPS: 5.404446530000407 q:what is integral calculus
-- 200 QPS: 5.354387755775813 q:organizations definition sociology
0.1177072083600246 {'mrr100': 0.1177072083600246, 'title_boost': 8.669017580880253, 'body_boost': 2.060912131332545, 'b_title': 2.137324907137704, 'k1_title': 0.5664849525529381, 'b_body': 1.7259717402894448, 'k1_body': 0.8838268411699612, 'prediction': 0.02148381986928959, 'std_dev': 0.9897633724451352, 'opportunity': -1.0544790798342403, 'prob_of_improvement': 0.14335086424862664}
-- 50 QPS: 5.092208696637235 q:what kind of anesthetic did micheal jackson o.d. on
-- 100 QPS: 5.287354440464743 q:kohl's corporate headquarters customer service
