### Notebook to process benchmar results

Please run this notebook after running all the benchmarks and storing them in the `results` dir. This will export them in the desired format for the single node benchmark plots of [qdrant.tech/benchmarks](https://qdrant.tech/benchmarks)

In [64]:
from pathlib import Path
import re
import json
import pandas as pd
from datetime import datetime, timezone

In [65]:
DATA_DIR = Path().resolve().parent / "results"
DATA_DIR, list(DATA_DIR.glob("*.json"))[0].name

(PosixPath('/home/rigazilla/git/vector-db-benchmark/results'),
 'redis-m-32-ef-128-gist-960-euclidean-upload-2024-03-18-18-57-07.json')

In [66]:
PATH_REGEX = re.compile(r"(?P<engine_name>("
                        r"?P<engine>[a-z\-]+)"
                        r"\-m\-(?P<m>[0-9]+)"
                        r"\-ef\-(?P<ef>[0-9]+)"
                        r")"
                        r"\-(?P<dataset>[a-zA-Z0-9\-]+)"
                        r"\-(?P<operation>(search)|(upload))"
                        r"(\-(?P<search_index>[0-9]{1,2})\-)?"
                        r"\-?(?P<date>.*)\.json")

In [67]:
upload_results, search_results = [], []

for path in DATA_DIR.glob("*.json"):
    match = PATH_REGEX.match(path.name)
    if match is None:
        continue
        
    experiment = match.groupdict()
    
    with open(path, "r") as fp:
        stats = json.load(fp)

    entry = [match["engine"], match["m"], match["ef"], 
             match["dataset"], match["search_index"], match["date"], 
             stats["params"], stats["results"]]
    if experiment["operation"] == "search":
        search_results.append(entry)
    elif experiment["operation"] == "upload":
        upload_results.append(entry)

len(upload_results), len(search_results)

(2, 16)

In [68]:
column_names = ["engine", "m", "ef", "dataset", "search_index", "date", "params", "results"]

In [69]:
upload_df = pd.DataFrame(upload_results, columns=column_names) \
    .drop(columns="search_index")
upload_df["date"] = pd.to_datetime(upload_df["date"], format="%Y-%m-%d-%H-%M-%S")
upload_df = upload_df.sort_values("date", ascending=False) \
    .groupby(["engine", "m", "ef", "dataset"]) \
    .last()
upload_df = pd.concat([upload_df, upload_df["results"].apply(pd.Series)], axis=1)
upload_df = upload_df.drop(columns="results")

print(len(upload_df))

upload_df.sort_values("total_time", ascending=True).head(n=5)

2


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,date,params,post_upload,upload_time,total_time
engine,m,ef,dataset,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
redis,32,128,gist-960-euclidean,2024-03-18 18:57:07,"{'experiment': 'redis-m-32-ef-128', 'engine': ...",{},1968.103895,1968.103937
infinispan,32,128,gist-960-euclidean,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674


In [70]:
search_df = pd.DataFrame(search_results, columns=column_names)
search_df["date"] = pd.to_datetime(search_df["date"], format="%Y-%m-%d-%H-%M-%S")
search_df = search_df.sort_values("date", ascending=False) \
    .groupby(["engine", "m", "ef", "dataset", "search_index"]) \
    .first()

print(len(search_df))

for column_name in ["params", "results","date", "dataset", "engine"]:
    search_df = pd.concat([search_df, search_df[column_name].apply(pd.Series)], axis=1)
    search_df = search_df.drop(columns=column_name)
search_df.sort_values("rps", ascending=False).head(n=16)

16


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,experiment,parallel,search_params,total_time,mean_time,mean_precisions,std_time,min_time,max_time,rps,p95_time,p99_time,0,0,0
engine,m,ef,dataset,search_index,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
redis,32,128,gist-960-euclidean,4,redis-m-32-ef-128,100,{'ef': 64},2.04713,0.183394,0.75632,0.057327,0.013735,0.384822,488.488837,0.291484,0.37902,2024-03-18 18:57:44,gist-960-euclidean,redis
redis,32,128,gist-960-euclidean,5,redis-m-32-ef-128,100,{'ef': 128},2.426658,0.220274,0.79936,0.050495,0.011361,0.413207,412.089469,0.256568,0.40243,2024-03-18 18:58:02,gist-960-euclidean,redis
redis,32,128,gist-960-euclidean,0,redis-m-32-ef-128,1,{'ef': 64},3.134228,0.003046,0.75632,0.000373,0.001923,0.005209,319.057856,0.003583,0.003778,2024-03-18 18:57:10,gist-960-euclidean,redis
redis,32,128,gist-960-euclidean,1,redis-m-32-ef-128,1,{'ef': 128},3.434508,0.003345,0.79936,0.000437,0.002062,0.004802,291.162509,0.003982,0.004193,2024-03-18 18:57:13,gist-960-euclidean,redis
redis,32,128,gist-960-euclidean,6,redis-m-32-ef-128,100,{'ef': 256},3.893497,0.363886,0.89231,0.068511,0.013783,0.654896,256.838509,0.419571,0.644122,2024-03-18 18:58:21,gist-960-euclidean,redis
redis,32,128,gist-960-euclidean,2,redis-m-32-ef-128,1,{'ef': 256},5.077234,0.004971,0.89231,0.000825,0.002479,0.0074,196.957632,0.006126,0.006502,2024-03-18 18:57:18,gist-960-euclidean,redis
redis,32,128,gist-960-euclidean,7,redis-m-32-ef-128,100,{'ef': 512},6.497608,0.618295,0.94885,0.086542,0.018249,0.73637,153.902783,0.706901,0.731007,2024-03-18 18:58:43,gist-960-euclidean,redis
redis,32,128,gist-960-euclidean,3,redis-m-32-ef-128,1,{'ef': 512},8.099951,0.007972,0.94885,0.005054,0.003178,0.160063,123.457534,0.009996,0.0115,2024-03-18 18:57:26,gist-960-euclidean,redis
infinispan,32,128,gist-960-euclidean,5,infinispan-m-32-ef-128,100,{'ef': 128},10.47589,1.007671,0.1,0.432922,0.091883,3.241671,95.457288,1.690915,1.958981,2024-03-19 16:00:42,gist-960-euclidean,infinispan
infinispan,32,128,gist-960-euclidean,6,infinispan-m-32-ef-128,100,{'ef': 256},13.086889,1.250723,0.1,0.526012,0.112938,2.976911,76.412353,2.117653,2.452173,2024-03-19 16:01:11,gist-960-euclidean,infinispan


In [71]:
_search = search_df.reset_index()
_upload = upload_df.reset_index()

joined_df = _search.merge(_upload, on=["engine", "m", "ef", "dataset"], how="left", suffixes=("_search", "_upload"))
print(len(joined_df))
joined_df

16


Unnamed: 0,engine,m,ef,dataset,search_index,experiment,parallel,search_params,total_time_search,mean_time,...,p95_time,p99_time,0,0.1,0.2,date,params,post_upload,upload_time,total_time_upload
0,infinispan,32,128,gist-960-euclidean,0,infinispan-m-32-ef-128,1,{'ef': 64},79.762096,0.079463,...,0.09677,0.115982,2024-03-19 15:56:33,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
1,infinispan,32,128,gist-960-euclidean,1,infinispan-m-32-ef-128,1,{'ef': 128},63.955368,0.063692,...,0.073856,0.077532,2024-03-19 15:57:37,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
2,infinispan,32,128,gist-960-euclidean,2,infinispan-m-32-ef-128,1,{'ef': 256},65.581345,0.065305,...,0.08055,0.093705,2024-03-19 15:58:43,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
3,infinispan,32,128,gist-960-euclidean,3,infinispan-m-32-ef-128,1,{'ef': 512},63.704724,0.063441,...,0.073783,0.077441,2024-03-19 15:59:46,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
4,infinispan,32,128,gist-960-euclidean,4,infinispan-m-32-ef-128,100,{'ef': 64},14.121041,1.353599,...,2.304622,2.616821,2024-03-19 16:00:16,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
5,infinispan,32,128,gist-960-euclidean,5,infinispan-m-32-ef-128,100,{'ef': 128},10.47589,1.007671,...,1.690915,1.958981,2024-03-19 16:00:42,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
6,infinispan,32,128,gist-960-euclidean,6,infinispan-m-32-ef-128,100,{'ef': 256},13.086889,1.250723,...,2.117653,2.452173,2024-03-19 16:01:11,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
7,infinispan,32,128,gist-960-euclidean,7,infinispan-m-32-ef-128,100,{'ef': 512},15.118512,1.449361,...,2.145022,2.455565,2024-03-19 16:01:42,gist-960-euclidean,infinispan,2024-03-19 15:55:13,"{'experiment': 'infinispan-m-32-ef-128', 'engi...",{},4692.670652,4692.670674
8,redis,32,128,gist-960-euclidean,0,redis-m-32-ef-128,1,{'ef': 64},3.134228,0.003046,...,0.003583,0.003778,2024-03-18 18:57:10,gist-960-euclidean,redis,2024-03-18 18:57:07,"{'experiment': 'redis-m-32-ef-128', 'engine': ...",{},1968.103895,1968.103937
9,redis,32,128,gist-960-euclidean,1,redis-m-32-ef-128,1,{'ef': 128},3.434508,0.003345,...,0.003982,0.004193,2024-03-18 18:57:13,gist-960-euclidean,redis,2024-03-18 18:57:07,"{'experiment': 'redis-m-32-ef-128', 'engine': ...",{},1968.103895,1968.103937


In [72]:
json_all = []
json_1_or_100_thread = []

for index, row in joined_df.reset_index().iterrows():
    engine_params = {}
    if isinstance(row['search_params'], dict):
        engine_params.update(row['search_params'])
    if isinstance(row['params'], dict):
        engine_params.update(row['params'])

    engine_name = row['engine']

    if engine_name == "qdrant-rps" or engine_name == "qdrant-bq-rps" or engine_name == "qdrant-sq-rps":
        engine_name = "qdrant"

    json_object = {
        "engine_name": engine_name,
        "setup_name": f"{row['engine']}-m-{row['m']}-ef-{row['ef']}",
        "dataset_name": row['dataset'],
        # "search_idx": row['search_index'],
        "upload_time": row['upload_time'],
        "total_upload_time": row['total_time_upload'],
        "p95_time": row['p95_time'],
        "rps": row['rps'],
        "parallel": row['parallel'],
        "p99_time": row['p99_time'],
        "mean_time": row['mean_time'],
        "mean_precisions": row['mean_precisions'],
        "engine_params": engine_params,
    }
    json_all.append(json_object)
    
    parallel = row['parallel']

    if parallel == 1 or parallel == 100:
        json_1_or_100_thread.append(json_object)

format = '%Y-%M-%d' # T%H:%M:%S
now = datetime.now().replace(tzinfo=timezone.utc).strftime(format)

Path(f"results-{now}.json").write_text(json.dumps(json_all, indent=2))
Path(f"results-1-100-threads-{now}.json").write_text(json.dumps(json_1_or_100_thread, indent=2))

json_1_or_100_thread[-1], len(json_all), len(json_1_or_100_thread)

({'engine_name': 'redis',
  'setup_name': 'redis-m-32-ef-128',
  'dataset_name': 'gist-960-euclidean',
  'upload_time': 1968.1038949000067,
  'total_upload_time': 1968.1039367000049,
  'p95_time': 0.7069005783705506,
  'rps': 153.90278313303938,
  'parallel': 100,
  'p99_time': 0.7310065932184807,
  'mean_time': 0.6182948905912635,
  'mean_precisions': 0.9488499999999999,
  'engine_params': {'ef': 512,
   'experiment': 'redis-m-32-ef-128',
   'engine': 'redis',
   'dataset': 'gist-960-euclidean',
   'parallel': 16,
   'hnsw_config': {'M': 32, 'EF_CONSTRUCTION': 128}}},
 16,
 16)