### Notebook to process benchmark results

Please run this notebook after running all the benchmarks and storing their output in the `results` directory. It exports the results in the format required by the single-node benchmark plots at [qdrant.tech/benchmarks](https://qdrant.tech/benchmarks).

In [None]:
from pathlib import Path
import re
import json
import pandas as pd
from datetime import datetime, timezone

In [None]:
# Result files are read from the `results` directory that is a sibling of the
# current working directory.
DATA_DIR = Path().resolve().parent / "results"

# Sanity check: show the resolved directory and the name of one result file.
# `next(..., None)` avoids building the full file list and shows None instead of
# raising IndexError when no result file exists yet.
DATA_DIR, next((path.name for path in DATA_DIR.glob("*.json")), None)

In [None]:
# Result file names are matched against the layout
#   <engine>-m-<m>-ef-<ef>-<dataset>-<search|upload>[-<search_index>-]<date>.json
# The fragments below are joined into a single pattern string.
PATH_REGEX = re.compile("".join((
    # `engine_name` wraps the engine together with its m / ef values ...
    r"(?P<engine_name>",
    r"(?P<engine>[a-z\-]+)",
    r"\-m\-(?P<m>[0-9]+)",
    r"\-ef\-(?P<ef>[0-9]+)",
    r")",  # ... and is closed here
    r"\-(?P<dataset>[a-zA-Z0-9\-]+)",
    r"\-(?P<operation>(search)|(upload))",
    # optional one- or two-digit index surrounded by dashes
    r"(\-(?P<search_index>[0-9]{1,2})\-)?",
    # everything left before the extension is captured as the date
    r"\-?(?P<date>.*)\.json",
)))

In [None]:
# Rows collected from the result files, one list per benchmark operation.
search_results = []
upload_results = []
rows_by_operation = {"search": search_results, "upload": upload_results}

for result_path in DATA_DIR.glob("*.json"):
    parsed = PATH_REGEX.match(result_path.name)
    if parsed is None:
        # File name does not follow the expected layout; ignore the file.
        continue

    name_fields = parsed.groupdict()

    with result_path.open("r") as handle:
        payload = json.load(handle)

    # `dataset` and `engine` are moved out of the params dict into their own
    # columns; whatever remains in the dict is stored as the `params` column.
    run_params = payload['params']
    dataset_name = run_params.pop("dataset")
    engine_label = run_params.pop("engine")

    # The pattern only admits "search" or "upload" as the operation, so the
    # lookup below always finds its target list.
    rows_by_operation[name_fields["operation"]].append([
        engine_label,
        parsed["m"],
        parsed["ef"],
        dataset_name,
        parsed["search_index"],
        parsed["date"],
        run_params,
        payload["results"],
    ])

len(upload_results), len(search_results)

In [None]:
# Column labels, in the same order as the values stored in each collected row.
column_names = [
    "engine",
    "m",
    "ef",
    "dataset",
    "search_index",
    "date",
    "params",
    "results",
]

In [None]:
# Peek at one collected row of each kind. Slicing keeps the output short (the
# full upload list is no longer echoed) and does not raise IndexError when a
# list is empty.
upload_results[:1], search_results[:1]

In [None]:
# Upload table: one row per (engine, m, ef, dataset) setup.
upload_df = pd.DataFrame(upload_results, columns=column_names) \
    .drop(columns="search_index")
upload_df["date"] = pd.to_datetime(upload_df["date"], format="%Y-%m-%d-%H-%M-%S")

# Rows are sorted newest-first, so `.first()` keeps the most recent run of each
# setup, matching how the search results are de-duplicated. `.last()` would
# keep the oldest run instead.
upload_df = upload_df.sort_values("date", ascending=False) \
    .groupby(["engine", "m", "ef", "dataset"]) \
    .first()

# Expand every `results` entry into its own columns, then drop the raw column.
upload_df = pd.concat([upload_df, upload_df["results"].apply(pd.Series)], axis=1)
upload_df = upload_df.drop(columns="results")

print(len(upload_df))

# Preview: the five setups with the smallest `total_time`.
upload_df.sort_values("total_time", ascending=True).head(n=5)

In [None]:
# Search table built from the collected search rows.
search_df = pd.DataFrame(search_results, columns=column_names)
search_df["date"] = pd.to_datetime(search_df["date"], format="%Y-%m-%d-%H-%M-%S")

# Sort newest-first and keep the first row of every setup / search_index group.
group_keys = ["engine", "m", "ef", "dataset", "search_index"]
search_df = (
    search_df
    .sort_values("date", ascending=False)
    .groupby(group_keys)
    .first()
)

print(len(search_df))

# Replace the `params` and `results` columns by the columns obtained from
# expanding their entries with pd.Series.
for nested_column in ("params", "results"):
    expanded = search_df[nested_column].apply(pd.Series)
    search_df = pd.concat([search_df, expanded], axis=1)
    search_df = search_df.drop(columns=nested_column)

# Preview: the ten rows with the highest `rps`.
search_df.sort_values("rps", ascending=False).head(n=10)

In [None]:
# Turn the group-by index levels back into ordinary columns so they can be
# used as join keys.
_upload = upload_df.reset_index()
_search = search_df.reset_index()

# Left join: every search row is kept and receives the upload columns of the
# matching setup. Column names present on both sides get a suffix.
joined_df = pd.merge(
    _search,
    _upload,
    how="left",
    on=["engine", "m", "ef", "dataset"],
    suffixes=("_search", "_upload"),
)
print(len(joined_df))
joined_df

In [None]:
# Remove the column display limit so the one-row preview shows every column.
pd.set_option("display.max_columns", None)

joined_df.head(n=1)

In [None]:
# Engine labels that are exported under the plain "qdrant" engine name.
qdrant_variants = {"qdrant-rps", "qdrant-bq-rps", "qdrant-sq-rps"}

# One JSON object per row of the joined search/upload table.
json_results = []

for index, row in joined_df.reset_index().iterrows():
    # Keys from `config` override same-named keys from `params`, because
    # `config` is applied last. Non-dict values are skipped.
    engine_params = {}
    if isinstance(row['params'], dict):
        engine_params.update(row['params'])
    if isinstance(row['config'], dict): # Search config
        engine_params.update(row['config'])

    engine_name = row['engine']
    if engine_name in qdrant_variants:
        engine_name = "qdrant"

    json_results.append({
        "engine_name": engine_name,
        # The setup name keeps the original (un-normalised) engine label.
        "setup_name": f"{row['engine']}-m-{row['m']}-ef-{row['ef']}",
        "dataset_name": row['dataset'],
        # "search_idx": row['search_index'],
        "upload_time": row['upload_time'],
        "total_upload_time": row['total_time_upload'],
        "p95_time": row['p95_time'],
        "rps": row['rps'],
        "parallel": row['parallel'],
        "p99_time": row['p99_time'],
        "mean_time": row['mean_time'],
        "mean_precisions": row['mean_precisions'],
        "engine_params": engine_params,
    })

# strftime codes: `%m` is the month and `%M` the minute. The date part must use
# `%m`; with `%M` the file name carried the minute in the month position.
timestamp_format = '%Y-%m-%dT%H:%M:%S'
# Take the current time in UTC directly. Replacing tzinfo on a naive local time
# would only relabel local wall-clock time as UTC.
now = datetime.now(timezone.utc).strftime(timestamp_format)

# Serialise once and write the same payload to both the stable file name and
# the timestamped copy.
serialized = json.dumps(json_results, indent=2)
Path("results.json").write_text(serialized)
Path(f"results-{now}.json").write_text(serialized)

# Show the last exported object (None when nothing was exported) and the count.
(json_results[-1] if json_results else None), len(json_results)