In [1]:
import numpy as np
import pandas as pd
from msmarco.index import indexed
from msmarco.evaluate import grade_results, judge_queries
from time import perf_counter
from searcharray.similarity import bm25_similarity, classic_similarity, default_bm25

NUM_QUERIES = 250

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Will take a min or two to load, and a good hour to build an index first time
msmarco = indexed()

In [3]:
np.sort(msmarco['body_idx'].array.score('cheese'))

array([0.       , 0.       , 0.       , ..., 4.7565475, 5.0600863,
       8.568701 ], dtype=float32)

In [5]:
from searcharray.solr import edismax

edismax(msmarco, q='cheese curds',
        qf=['title_idx', 'body_idx'],
        mm=2,
        tie=0.2,
        pf='title_idx')

(array([0., 0., 0., ..., 0., 0., 0.]),
 '((title_idx:chees^1 | body_idx:chees^1) (title_idx:curd^1 | body_idx:curd^1))~2 (title_idx:"chees curd")^1')

In [22]:
# %%prun

def search(corpus, fields, query, query_id=None,
           pf2_fields=None,
           pf_fields=None,
           tie=0,
           mm=1,
           n=10, 
           similarities=default_bm25):
    if not isinstance(fields, list):
        fields = [fields]

    start = perf_counter()
    scored, explain = edismax(msmarco,
                              q=query,
                              qf=fields,
                              tie=tie,
                              pf=pf_fields,
                              pf2=pf2_fields,
                              mm=mm,
                              similarity=similarities)
    # print(explain)
    took = perf_counter() - start

    top_n_idx = np.argpartition(-scored, n)[:n]
    # top_n_idx = np.argsort(scored)[::-1][:n]
    scores = scored[top_n_idx]

    top_n = corpus[['title', 'msmarco_id']].iloc[top_n_idx]
    top_n['scores'] = scores
    top_n['query'] = query
    top_n['query_id'] = query_id
    top_n['took'] = took
    top_n.sort_values('scores', ascending=False, inplace=True)
    # print(query, took)
    top_n['rank'] = np.arange(n) + 1
    return top_n

search(corpus=msmarco, fields=["title_idx^10", "body_idx"], query="Adjustable-rate mortgage", n=100)

Unnamed: 0,title,msmarco_id,scores,query,query_id,took,rank
1162730,Adjustable-rate mortgages,D3468865,111.879284,Adjustable-rate mortgage,,0.182185,1
487631,adjustable rate mortgage,D578530,111.879284,Adjustable-rate mortgage,,0.182185,2
2340434,Adjustable-rate mortgages,D520595,111.879284,Adjustable-rate mortgage,,0.182185,3
368379,Adjustable-Rate Mortgages,D1213211,111.879284,Adjustable-rate mortgage,,0.182185,4
1782294,Adjustable Rate Mortgage,D578534,111.879284,Adjustable-rate mortgage,,0.182185,5
...,...,...,...,...,...,...,...
2632208,Fed Hike Means Adjustable Rate Mortgages Will ...,D2009272,61.422127,Adjustable-rate mortgage,,0.182185,96
3182752,Current Wells Fargo Mortgage Rates,D3303400,61.417705,Adjustable-rate mortgage,,0.182185,97
1636509,Historical Adjusted Cohort Graduation Rate,D3411347,61.387037,Adjustable-rate mortgage,,0.182185,98
3075301,Mortgage annual percentage rate calculator,D292964,61.299828,Adjustable-rate mortgage,,0.182185,99


In [23]:
from msmarco.evaluate import grade_results, judgments

msmarco_judgments = judgments().sample(NUM_QUERIES)
msmarco_judgments

Unnamed: 0,query_id,q0,msmarco_id,grade,query
96795,313303,0,D3007057,1,how much does bundy owe the blm
47383,152513,0,D3060305,1,diseases of the mouth
11705,38976,0,D2766775,1,average monthly cost of an iphone
226179,725209,0,D1084558,1,what is bondplex treatment
231927,741645,0,D1327492,1,what is dubbin made of
...,...,...,...,...,...
45066,144994,0,D2748755,1,did russia and china sign the paris accords
13169,43684,0,D2667190,1,average salary welder florida
60277,194017,0,D105725,1,genes are chains of _________________.
272645,860758,0,D1863675,1,what is vermicelli


In [24]:
import random

def random_probes(features, num_probes=10, feature_ranges={}):
    
    probes = []
    for _ in range(num_probes):
        param_dict = {}
        for param in features:
            if param in feature_ranges:
                param_dict[param] = random.uniform(feature_ranges[param][0],
                                                   feature_ranges[param][1])
            else:
                param_dict[param] = random.random() * 10
        probes.append(param_dict)
    return pd.DataFrame(probes)

probes = random_probes(features=['title_boost', 'body_boost',
                                 'title_pf_boost', 'body_pf_boost',
                                 'title_pf2_boost', 'body_pf2_boost',
                                 'tie', 'mm',
                                 'k1_title', 'b_title',
                                 'k1_body', 'b_body'],
                       feature_ranges={'k1_title': (0.1,3.0),
                                       'tie': (0,1),
                                       'mm': (0,100)})
probes

Unnamed: 0,title_boost,body_boost,title_pf_boost,body_pf_boost,title_pf2_boost,body_pf2_boost,tie,mm,k1_title,b_title,k1_body,b_body
0,3.206206,8.311243,4.598652,5.501628,5.308989,1.836496,0.54809,46.467339,2.760748,9.475886,6.824235,8.693596
1,1.712852,6.428464,4.146929,2.786072,4.106736,5.675825,0.873325,78.627747,0.258889,1.317354,5.511437,4.488625
2,6.177034,3.141446,1.885128,4.26715,3.980968,8.63639,0.36404,3.823525,0.181938,3.773695,9.677949,3.568646
3,5.318557,0.59508,6.44521,2.750301,2.534763,4.448689,0.227572,44.898603,2.12279,5.782894,0.61556,7.293539
4,0.383731,8.539289,4.794543,7.731636,6.131814,4.86305,0.288931,12.441354,0.806682,7.447746,9.746709,7.503958
5,9.330354,6.521421,3.366908,4.290321,7.414552,9.290418,0.194229,91.03176,0.995194,5.736639,9.338661,9.586159
6,7.47295,7.94146,9.113808,2.340323,6.17557,1.65068,0.132107,52.357594,0.258533,3.458661,9.544361,3.923416
7,2.052693,5.522872,0.162006,0.917884,6.288,6.8929,0.611424,4.189258,1.771623,7.677413,2.267047,9.320895
8,9.421807,6.698163,7.170703,1.869882,9.212642,7.585908,0.035754,19.041931,2.152808,7.270945,6.600519,8.025584
9,8.104065,9.08123,1.041837,7.624807,3.478139,6.026378,0.285031,45.562888,2.725685,7.826303,8.010206,5.295017


In [39]:

def run_all(corpus, judgments, fields, pf_fields, pf2_fields,
            tie=0, mm=1,
            max_posns=None, min_posns=None, similarities=default_bm25, n=10):
    results = []
    start = perf_counter()
    query_no = 0
    for idx, row in judgments.iterrows():
        top_n = search(corpus=corpus,
                       fields=fields,
                       pf_fields=pf_fields,
                       pf2_fields=pf2_fields,
                       similarities=similarities,
                       n=n,
                       query=row['query'],
                       query_id=row['query_id'])
        results.append(top_n)
        if query_no > 0 and query_no % 50 == 0:
            print(f"-- {query_no} QPS: {query_no / (perf_counter() - start)} q:{row['query']}")
        query_no += 1
    results = pd.concat(results)
    graded = grade_results(msmarco_judgments, results)
    return graded

def run_with_params(params,
                    msmarco,
                    msmarco_judgments,
                    n=100):
    bm25_similarity_title = bm25_similarity(b=params['b_title'],
                                            k1=params['k1_title'])
    bm25_similarity_body = bm25_similarity(b=params['b_body'],
                                           k1=params['k1_body'])
    fields = [f"title_idx^{params['title_boost']}", f"body_idx^{params['body_boost']}"]
    pf_fields = [f"title_idx^{params['title_pf_boost']}", f"body_idx^{params['body_pf_boost']}"]
    pf2_fields = [f"title_idx^{params['title_pf2_boost']}", f"body_idx^{params['body_pf2_boost']}"]
    mm=f"{int(params['mm'])}%"

    similarities = {"title_idx": bm25_similarity_title,
                    "body_idx": bm25_similarity_body}

    results = run_all(msmarco,
                      msmarco_judgments,
                      fields=fields,
                      pf2_fields=pf2_fields,
                      pf_fields=pf_fields,
                      mm=mm,
                      tie=params['tie'],
                      n=100,
                      similarities=similarities)
    return results, params


## Random search param

Seed the search with a random search

In [40]:
from random import uniform

def random_search(msmarco, msmarco_judgments, 
                  features, feature_ranges={}, times=1):
    
    results = []
    futures = []

    params = random_probes(features, feature_ranges=feature_ranges, num_probes=times)
    print(params)

    print(f"Random search over {len(params)}")
    for _, row in params.iterrows():
        graded, params = run_with_params(row, msmarco, msmarco_judgments,
                                         n=100)
        queries_judged = judge_queries(graded)
        mrr = queries_judged.sum() / len(msmarco_judgments)

        result_dict = {param_name: params[param_name] for param_name in features} 
        result_dict['mrr100'] = mrr
        results.append(result_dict)
        print(result_dict)

    return pd.DataFrame(results)

msmarco_judgments = judgments().sample(100)
initial_probes = random_search(msmarco, msmarco_judgments,
                               features=['title_boost', 'body_boost',
                                         'title_pf_boost', 'body_pf_boost',
                                         'title_pf2_boost', 'body_pf2_boost',
                                         'mm', 'tie',
                                         'k1_title', 'b_title',
                                         'k1_body', 'b_body'])

   title_boost  body_boost  title_pf_boost  body_pf_boost  title_pf2_boost  \
0     6.355857    0.295742        3.697541       8.018303         2.049598   

   body_pf2_boost        mm       tie  k1_title   b_title   k1_body    b_body  
0        2.668719  3.570001  1.254645  4.534043  9.469413  0.994516  2.602612  
Random search over 1
-- 50 QPS: 1.2218744957476984 q:what is medical credentialing definition
{'title_boost': 6.355856753851537, 'body_boost': 0.29574186346326026, 'title_pf_boost': 3.697541147767671, 'body_pf_boost': 8.018303288356677, 'title_pf2_boost': 2.049598436141534, 'body_pf2_boost': 2.6687187405486124, 'mm': 3.570000930194409, 'tie': 1.2546445466702993, 'k1_title': 4.534042881185389, 'b_title': 9.46941324090416, 'k1_body': 0.9945155064478661, 'b_body': 2.6026124095742755, 'mrr100': 0.0031111111111111114}


In [41]:
from sklearn.gaussian_process import GaussianProcessRegressor
import pandas as pd

y_train = initial_probes['mrr100']
x_train = initial_probes.drop('mrr100', axis=1)


gpr = GaussianProcessRegressor()
gpr.fit(x_train.to_numpy(), y_train.to_numpy())

In [42]:
x_train

Unnamed: 0,title_boost,body_boost,title_pf_boost,body_pf_boost,title_pf2_boost,body_pf2_boost,mm,tie,k1_title,b_title,k1_body,b_body
0,6.355857,0.295742,3.697541,8.018303,2.049598,2.668719,3.570001,1.254645,4.534043,9.469413,0.994516,2.602612


### Score probes

In [43]:
probes

In [45]:
from scipy.stats import norm

def score_probes(gpr, probes, best_mrr, theta):
    predictions, std_devs = gpr.predict(probes, return_std=True)
    
    probes["prediction"] = predictions
    probes["std_dev"] = std_devs
        
    probes['opportunity'] = probes['prediction'] - best_mrr - theta
    probes['prob_of_improvement'] = norm.cdf( probes['opportunity'] / probes['std_dev'])
    
    return probes.sort_values('prob_of_improvement', ascending=False)

score_probes(gpr, probes.copy(), best_mrr=1.0, theta=0.9)

AttributeError: 'NoneType' object has no attribute 'copy'

### Bayesian search loop

In [46]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def search_w_prob_params(probes, msmarco_judgments):
    results = []
    futures = []
    future_params = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        for idx, row in probes.iterrows():
            future = executor.submit(run_with_params, row, msmarco,
                                     msmarco_judgments, 100)
            futures.append(future)

        for future in as_completed(futures):
            graded, params = future.result()
            queries_judged = judge_queries(graded)
            mrr = queries_judged.sum() / len(msmarco_judgments)
    
            result_dict = {'mrr100': mrr}
            for param, value in params.items():
                result_dict[param] = value
            results.append(result_dict)
            print(result_dict['mrr100'], result_dict)
    return pd.DataFrame(results)


In [None]:
SEED=100
NUM_QUERIES=100
FEATURES = ['title_boost', 'body_boost',
            'title_pf_boost', 'body_pf_boost',
            'title_pf2_boost', 'body_pf2_boost',
            'tie', 'mm',
            'k1_title', 'b_title',
            'k1_body', 'b_body']
PICKLE_PATH = f"/tmp/msmarco_bayes_probes_{len(FEATURES)}_{NUM_QUERIES}_{SEED}.pkl"

def bayesian_search(initial_probes, 
                    msmarco_judgments,
                    feature_ranges,
                    theta=1.2,
                    rounds=10,
                    num_probes=4):
    probes = initial_probes.copy()
    initial_cols = initial_probes.columns
    features = initial_probes.drop('mrr100', axis=1).columns
    print(features)
    for _ in range(rounds):
        y_train = probes['mrr100']
        x_train = probes.drop('mrr100', axis=1)
        x_train = probes[features]
        
        gpr = GaussianProcessRegressor()
        gpr.fit(x_train.to_numpy(), y_train.to_numpy())

        new_probes = random_probes(x_train.columns,
                                   num_probes=100, 
                                   feature_ranges=feature_ranges)
        new_probes = score_probes(gpr, new_probes,
                                  best_mrr=probes['mrr100'].max(),
                                  theta=theta)
        new_probes = new_probes[:num_probes]
        print(new_probes.sort_values('prob_of_improvement', ascending=False))
        new_probes = search_w_prob_params(new_probes, msmarco_judgments)

        # Concat, dropping scoring, etc
        probes = pd.concat([new_probes, probes])
        probes = probes[initial_cols]
        print(len(probes))
        print("CURRENT RESULTS")
        print(probes.sort_values('mrr100', ascending=False))
        probes.to_pickle(PICKLE_PATH)
    return probes

probes = None
try:
    probes = pd.read_pickle(PICKLE_PATH)
    print("Loaded ", PICKLE_PATH)
    print(probes)
except FileNotFoundError:
    print(f"No probes file found at {PICKLE_PATH}, starting with random search")
    pass

# Run with 1000 queries
msmarco_judgments = judgments().sample(NUM_QUERIES, random_state=SEED)
print("Random search")
feature_ranges={'k1_title': (0.1,3.0),
            'b_title':  (0.1,3.0),
            'k1_body': (0.1,3.0),
            'b_body':  (0.1,3.0),
            'tie': (0,1.0),
            'mm': (25,100)
            }
initial_probes = random_search(msmarco,
                               msmarco_judgments,
                               features=['title_boost', 'body_boost',
                                         'title_pf_boost', 'body_pf_boost',
                                         'title_pf2_boost', 'body_pf2_boost',
                                         'tie', 'mm',
                                         'k1_title', 'b_title',
                                         'k1_body', 'b_body'],
                               feature_ranges=feature_ranges,
                               times=5)

print(initial_probes)

if probes is not None:
    probes = pd.concat([probes, initial_probes])
else:
    probes = initial_probes

bayesian_search(probes,
                msmarco_judgments,
                feature_ranges=feature_ranges)

No probes file found at /tmp/msmarco_bayes_probes_12_100_100.pkl, starting with random search
Random search
   title_boost  body_boost  title_pf_boost  body_pf_boost  title_pf2_boost  \
0     2.009451    5.606873        5.573777       6.813848         6.309039   
1     8.788216    9.740913        4.299888       0.863447         7.753185   
2     6.997906    8.280280        1.072774       5.674602         9.524545   
3     1.638995    6.890582        2.264802       0.838019         8.882419   
4     0.268310    1.055917        0.333586       0.190848         7.792238   

   body_pf2_boost       tie         mm  k1_title   b_title   k1_body    b_body  
0        3.803295  0.067184  82.317148  1.495766  0.186580  0.803608  1.666073  
1        9.932568  0.342664  91.372716  2.226936  0.289951  1.024538  2.747943  
2        3.121435  0.146006  84.695770  0.793583  1.665811  2.737528  2.209444  
3        3.021330  0.409958  61.501931  0.519415  2.397687  1.020432  2.961566  
4        5.532931 